From aed9a1a4f7106ff99a882ad06318cebfa71016a2 Mon Sep 17 00:00:00 2001 From: Mohamed Ahmed Date: Thu, 9 May 2024 23:43:52 +0300 Subject: [PATCH 0001/1578] drm/nouveau: use tile_mode and pte_kind for VM_BIND bo allocations Allow PTE kind and tile mode on BO create with VM_BIND, and add a GETPARAM to indicate this change. This is needed to support modifiers in NVK and ensure correctness when dealing with the nouveau GL driver. The userspace modifiers implementation this is for can be found here: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24795 Fixes: b88baab82871 ("drm/nouveau: implement new VM_BIND uAPI") Signed-off-by: Mohamed Ahmed Reviewed-by: Faith Ekstrand Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240509204352.7597-1-mohamedahmedegypt2001@gmail.com --- drivers/gpu/drm/nouveau/nouveau_abi16.c | 3 ++ drivers/gpu/drm/nouveau/nouveau_bo.c | 44 +++++++++++-------------- include/uapi/drm/nouveau_drm.h | 7 ++++ 3 files changed, 29 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_abi16.c b/drivers/gpu/drm/nouveau/nouveau_abi16.c index 80f74ee0fc7867..47e53e17b4e586 100644 --- a/drivers/gpu/drm/nouveau/nouveau_abi16.c +++ b/drivers/gpu/drm/nouveau/nouveau_abi16.c @@ -272,6 +272,9 @@ nouveau_abi16_ioctl_getparam(ABI16_IOCTL_ARGS) getparam->value = (u64)ttm_resource_manager_usage(vram_mgr); break; } + case NOUVEAU_GETPARAM_HAS_VMA_TILEMODE: + getparam->value = 1; + break; default: NV_PRINTK(dbg, cli, "unknown parameter %lld\n", getparam->param); return -EINVAL; diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c index db8cbf6151129d..186add400ea5ff 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bo.c +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c @@ -241,28 +241,28 @@ nouveau_bo_alloc(struct nouveau_cli *cli, u64 *size, int *align, u32 domain, } nvbo->contig = !(tile_flags & NOUVEAU_GEM_TILE_NONCONTIG); - if (!nouveau_cli_uvmm(cli) || internal) { - /* for BO noVM allocs, don't assign kinds */ - if (cli->device.info.family >= NV_DEVICE_INFO_V0_FERMI) { - nvbo->kind = (tile_flags & 0x0000ff00) >> 8; - if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) { - kfree(nvbo); - return ERR_PTR(-EINVAL); - } - nvbo->comp = mmu->kind[nvbo->kind] != nvbo->kind; - } else if (cli->device.info.family >= NV_DEVICE_INFO_V0_TESLA) { - nvbo->kind = (tile_flags & 0x00007f00) >> 8; - nvbo->comp = (tile_flags & 0x00030000) >> 16; - if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) { - kfree(nvbo); - return ERR_PTR(-EINVAL); - } - } else { - nvbo->zeta = (tile_flags & 0x00000007); + if (cli->device.info.family >= NV_DEVICE_INFO_V0_FERMI) { + nvbo->kind = (tile_flags & 0x0000ff00) >> 8; + if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) { + kfree(nvbo); + return ERR_PTR(-EINVAL); + } + + nvbo->comp = mmu->kind[nvbo->kind] != nvbo->kind; + } else if (cli->device.info.family >= NV_DEVICE_INFO_V0_TESLA) { + nvbo->kind = (tile_flags & 0x00007f00) >> 8; + nvbo->comp = (tile_flags & 0x00030000) >> 16; + if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) { + kfree(nvbo); + return ERR_PTR(-EINVAL); } - nvbo->mode = tile_mode; + } else { + nvbo->zeta = (tile_flags & 0x00000007); + } + nvbo->mode = tile_mode; + if (!nouveau_cli_uvmm(cli) || internal) { /* Determine the desirable target GPU page size for the buffer. */ for (i = 0; i < vmm->page_nr; i++) { /* Because we cannot currently allow VMM maps to fail @@ -304,12 +304,6 @@ nouveau_bo_alloc(struct nouveau_cli *cli, u64 *size, int *align, u32 domain, } nvbo->page = vmm->page[pi].shift; } else { - /* reject other tile flags when in VM mode. */ - if (tile_mode) - return ERR_PTR(-EINVAL); - if (tile_flags & ~NOUVEAU_GEM_TILE_NONCONTIG) - return ERR_PTR(-EINVAL); - /* Determine the desirable target GPU page size for the buffer. */ for (i = 0; i < vmm->page_nr; i++) { /* Because we cannot currently allow VMM maps to fail diff --git a/include/uapi/drm/nouveau_drm.h b/include/uapi/drm/nouveau_drm.h index cd84227f1b42ff..5402f77ee85948 100644 --- a/include/uapi/drm/nouveau_drm.h +++ b/include/uapi/drm/nouveau_drm.h @@ -68,6 +68,13 @@ extern "C" { */ #define NOUVEAU_GETPARAM_VRAM_USED 19 +/* + * NOUVEAU_GETPARAM_HAS_VMA_TILEMODE + * + * Query whether tile mode and PTE kind are accepted with VM allocs or not. + */ +#define NOUVEAU_GETPARAM_HAS_VMA_TILEMODE 20 + struct drm_nouveau_getparam { __u64 param; __u64 value; From 9f365cb8bbd0162963d6852651d7c9e30adcb7b5 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 14 May 2024 13:47:23 -0700 Subject: [PATCH 0002/1578] scsi: mpi3mr: Use proper format specifier in mpi3mr_sas_port_add() When building for a 32-bit platform such as ARM or i386, for which size_t is unsigned int, there is a warning due to using an unsigned long format specifier: drivers/scsi/mpi3mr/mpi3mr_transport.c:1370:11: error: format specifies type 'unsigned long' but the argument has type 'unsigned int' [-Werror,-Wformat] 1369 | ioc_warn(mrioc, "skipping port %u, max allowed value is %lu\n", | ~~~ | %u 1370 | i, sizeof(mr_sas_port->phy_mask) * 8); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Use the proper format specifier for size_t, %zu, to resolve the warning for all platforms. Fixes: 3668651def2c ("scsi: mpi3mr: Sanitise num_phys") Signed-off-by: Nathan Chancellor Link: https://lore.kernel.org/r/20240514-mpi3mr-fix-wformat-v1-1-f1ad49217e5e@kernel.org Signed-off-by: Martin K. Petersen --- drivers/scsi/mpi3mr/mpi3mr_transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/mpi3mr/mpi3mr_transport.c b/drivers/scsi/mpi3mr/mpi3mr_transport.c index 7ca9a7c2709cfe..5d261c2f2d2005 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_transport.c +++ b/drivers/scsi/mpi3mr/mpi3mr_transport.c @@ -1366,7 +1366,7 @@ static struct mpi3mr_sas_port *mpi3mr_sas_port_add(struct mpi3mr_ioc *mrioc, continue; if (i > sizeof(mr_sas_port->phy_mask) * 8) { - ioc_warn(mrioc, "skipping port %u, max allowed value is %lu\n", + ioc_warn(mrioc, "skipping port %u, max allowed value is %zu\n", i, sizeof(mr_sas_port->phy_mask) * 8); goto out_fail; } From 10157b1fc1a762293381e9145041253420dfc6ad Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Tue, 14 May 2024 16:03:44 +0200 Subject: [PATCH 0003/1578] scsi: core: alua: I/O errors for ALUA state transitions When a host is configured with a few LUNs and I/O is running, injecting FC faults repeatedly leads to path recovery problems. The LUNs have 4 paths each and 3 of them come back active after say an FC fault which makes 2 of the paths go down, instead of all 4. This happens after several iterations of continuous FC faults. Reason here is that we're returning an I/O error whenever we're encountering sense code 06/04/0a (LOGICAL UNIT NOT ACCESSIBLE, ASYMMETRIC ACCESS STATE TRANSITION) instead of retrying. [mwilck: The original patch was developed by Rajashekhar M A and Hannes Reinecke. I moved the code to alua_check_sense() as suggested by Mike Christie [1]. Evan Milne had raised the question whether pg->state should be set to transitioning in the UA case [2]. I believe that doing this is correct. SCSI_ACCESS_STATE_TRANSITIONING by itself doesn't cause I/O errors. Our handler schedules an RTPG, which will only result in an I/O error condition if the transitioning timeout expires.] [1] https://lore.kernel.org/all/0bc96e82-fdda-4187-148d-5b34f81d4942@oracle.com/ [2] https://lore.kernel.org/all/CAGtn9r=kicnTDE2o7Gt5Y=yoidHYD7tG8XdMHEBJTBraVEoOCw@mail.gmail.com/ Co-developed-by: Rajashekhar M A Co-developed-by: Hannes Reinecke Signed-off-by: Hannes Reinecke Signed-off-by: Martin Wilck Link: https://lore.kernel.org/r/20240514140344.19538-1-mwilck@suse.com Reviewed-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Mike Christie Signed-off-by: Martin K. Petersen --- drivers/scsi/device_handler/scsi_dh_alua.c | 31 +++++++++++++++------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c index a226dc1b65d715..4eb0837298d4d2 100644 --- a/drivers/scsi/device_handler/scsi_dh_alua.c +++ b/drivers/scsi/device_handler/scsi_dh_alua.c @@ -414,28 +414,40 @@ static char print_alua_state(unsigned char state) } } -static enum scsi_disposition alua_check_sense(struct scsi_device *sdev, - struct scsi_sense_hdr *sense_hdr) +static void alua_handle_state_transition(struct scsi_device *sdev) { struct alua_dh_data *h = sdev->handler_data; struct alua_port_group *pg; + rcu_read_lock(); + pg = rcu_dereference(h->pg); + if (pg) + pg->state = SCSI_ACCESS_STATE_TRANSITIONING; + rcu_read_unlock(); + alua_check(sdev, false); +} + +static enum scsi_disposition alua_check_sense(struct scsi_device *sdev, + struct scsi_sense_hdr *sense_hdr) +{ switch (sense_hdr->sense_key) { case NOT_READY: if (sense_hdr->asc == 0x04 && sense_hdr->ascq == 0x0a) { /* * LUN Not Accessible - ALUA state transition */ - rcu_read_lock(); - pg = rcu_dereference(h->pg); - if (pg) - pg->state = SCSI_ACCESS_STATE_TRANSITIONING; - rcu_read_unlock(); - alua_check(sdev, false); + alua_handle_state_transition(sdev); return NEEDS_RETRY; } break; case UNIT_ATTENTION: + if (sense_hdr->asc == 0x04 && sense_hdr->ascq == 0x0a) { + /* + * LUN Not Accessible - ALUA state transition + */ + alua_handle_state_transition(sdev); + return NEEDS_RETRY; + } if (sense_hdr->asc == 0x29 && sense_hdr->ascq == 0x00) { /* * Power On, Reset, or Bus Device Reset. @@ -502,7 +514,8 @@ static int alua_tur(struct scsi_device *sdev) retval = scsi_test_unit_ready(sdev, ALUA_FAILOVER_TIMEOUT * HZ, ALUA_FAILOVER_RETRIES, &sense_hdr); - if (sense_hdr.sense_key == NOT_READY && + if ((sense_hdr.sense_key == NOT_READY || + sense_hdr.sense_key == UNIT_ATTENTION) && sense_hdr.asc == 0x04 && sense_hdr.ascq == 0x0a) return SCSI_DH_RETRY; else if (retval) From 9fad9d560af5c654bb38e0b07ee54a4e9acdc5cd Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Wed, 8 May 2024 17:22:51 +0000 Subject: [PATCH 0004/1578] scsi: sr: Fix unintentional arithmetic wraparound Running syzkaller with the newly reintroduced signed integer overflow sanitizer produces this report: [ 65.194362] ------------[ cut here ]------------ [ 65.197752] UBSAN: signed-integer-overflow in ../drivers/scsi/sr_ioctl.c:436:9 [ 65.203607] -2147483648 * 177 cannot be represented in type 'int' [ 65.207911] CPU: 2 PID: 10416 Comm: syz-executor.1 Not tainted 6.8.0-rc2-00035-gb3ef86b5a957 #1 [ 65.213585] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [ 65.219923] Call Trace: [ 65.221556] [ 65.223029] dump_stack_lvl+0x93/0xd0 [ 65.225573] handle_overflow+0x171/0x1b0 [ 65.228219] sr_select_speed+0xeb/0xf0 [ 65.230786] ? __pm_runtime_resume+0xe6/0x130 [ 65.233606] sr_block_ioctl+0x15d/0x1d0 ... Historically, the signed integer overflow sanitizer did not work in the kernel due to its interaction with `-fwrapv` but this has since been changed [1] in the newest version of Clang. It was re-enabled in the kernel with Commit 557f8c582a9b ("ubsan: Reintroduce signed overflow sanitizer"). Firstly, let's change the type of "speed" to unsigned long as sr_select_speed()'s only caller passes in an unsigned long anyways. $ git grep '\.select_speed' | drivers/scsi/sr.c: .select_speed = sr_select_speed, ... | static int cdrom_ioctl_select_speed(struct cdrom_device_info *cdi, | unsigned long arg) | { | ... | return cdi->ops->select_speed(cdi, arg); | } Next, let's add an extra check to make sure we don't exceed 0xffff/177 (350) since 0xffff is the max speed. This has two benefits: 1) we deal with integer overflow before it happens and 2) we properly respect the max speed of 0xffff. There are some "magic" numbers here but I did not want to change more than what was necessary. Link: https://github.com/llvm/llvm-project/pull/82432 [1] Closes: https://github.com/KSPP/linux/issues/357 Cc: linux-hardening@vger.kernel.org Signed-off-by: Justin Stitt Link: https://lore.kernel.org/r/20240508-b4-b4-sio-sr_select_speed-v2-1-00b68f724290@google.com Reviewed-by: Kees Cook Signed-off-by: Martin K. Petersen --- Documentation/cdrom/cdrom-standard.rst | 4 ++-- drivers/scsi/sr.h | 2 +- drivers/scsi/sr_ioctl.c | 5 ++++- include/linux/cdrom.h | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/Documentation/cdrom/cdrom-standard.rst b/Documentation/cdrom/cdrom-standard.rst index 7964fe134277b8..6c1303cff159e1 100644 --- a/Documentation/cdrom/cdrom-standard.rst +++ b/Documentation/cdrom/cdrom-standard.rst @@ -217,7 +217,7 @@ current *struct* is:: int (*media_changed)(struct cdrom_device_info *, int); int (*tray_move)(struct cdrom_device_info *, int); int (*lock_door)(struct cdrom_device_info *, int); - int (*select_speed)(struct cdrom_device_info *, int); + int (*select_speed)(struct cdrom_device_info *, unsigned long); int (*get_last_session) (struct cdrom_device_info *, struct cdrom_multisession *); int (*get_mcn)(struct cdrom_device_info *, struct cdrom_mcn *); @@ -396,7 +396,7 @@ action need be taken, and the return value should be 0. :: - int select_speed(struct cdrom_device_info *cdi, int speed) + int select_speed(struct cdrom_device_info *cdi, unsigned long speed) Some CD-ROM drives are capable of changing their head-speed. There are several reasons for changing the speed of a CD-ROM drive. Badly diff --git a/drivers/scsi/sr.h b/drivers/scsi/sr.h index 1175f2e213b566..dc899277b3a441 100644 --- a/drivers/scsi/sr.h +++ b/drivers/scsi/sr.h @@ -65,7 +65,7 @@ int sr_disk_status(struct cdrom_device_info *); int sr_get_last_session(struct cdrom_device_info *, struct cdrom_multisession *); int sr_get_mcn(struct cdrom_device_info *, struct cdrom_mcn *); int sr_reset(struct cdrom_device_info *); -int sr_select_speed(struct cdrom_device_info *cdi, int speed); +int sr_select_speed(struct cdrom_device_info *cdi, unsigned long speed); int sr_audio_ioctl(struct cdrom_device_info *, unsigned int, void *); int sr_is_xa(Scsi_CD *); diff --git a/drivers/scsi/sr_ioctl.c b/drivers/scsi/sr_ioctl.c index 5b0b35e60e61fe..a0d2556a27bba3 100644 --- a/drivers/scsi/sr_ioctl.c +++ b/drivers/scsi/sr_ioctl.c @@ -425,11 +425,14 @@ int sr_reset(struct cdrom_device_info *cdi) return 0; } -int sr_select_speed(struct cdrom_device_info *cdi, int speed) +int sr_select_speed(struct cdrom_device_info *cdi, unsigned long speed) { Scsi_CD *cd = cdi->handle; struct packet_command cgc; + /* avoid exceeding the max speed or overflowing integer bounds */ + speed = clamp(0, speed, 0xffff / 177); + if (speed == 0) speed = 0xffff; /* set to max */ else diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index 98c6fd0b39b634..fdfb61ccf55aef 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -77,7 +77,7 @@ struct cdrom_device_ops { unsigned int clearing, int slot); int (*tray_move) (struct cdrom_device_info *, int); int (*lock_door) (struct cdrom_device_info *, int); - int (*select_speed) (struct cdrom_device_info *, int); + int (*select_speed) (struct cdrom_device_info *, unsigned long); int (*get_last_session) (struct cdrom_device_info *, struct cdrom_multisession *); int (*get_mcn) (struct cdrom_device_info *, From 51071f0831ea975fc045526dd7e17efe669dc6e1 Mon Sep 17 00:00:00 2001 From: Saurav Kashyap Date: Wed, 15 May 2024 14:40:59 +0530 Subject: [PATCH 0005/1578] scsi: qedf: Don't process stag work during unload and recovery Stag work can cause issues during unload and recovery, hence don't process it. Signed-off-by: Saurav Kashyap Signed-off-by: Nilesh Javali Link: https://lore.kernel.org/r/20240515091101.18754-2-skashyap@marvell.com Signed-off-by: Martin K. Petersen --- drivers/scsi/qedf/qedf_main.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/scsi/qedf/qedf_main.c b/drivers/scsi/qedf/qedf_main.c index fd12439cbaab6b..c457caae07b38f 100644 --- a/drivers/scsi/qedf/qedf_main.c +++ b/drivers/scsi/qedf/qedf_main.c @@ -3997,6 +3997,22 @@ void qedf_stag_change_work(struct work_struct *work) struct qedf_ctx *qedf = container_of(work, struct qedf_ctx, stag_work.work); + if (!qedf) { + QEDF_ERR(&qedf->dbg_ctx, "qedf is NULL"); + return; + } + + if (test_bit(QEDF_IN_RECOVERY, &qedf->flags)) { + QEDF_ERR(&qedf->dbg_ctx, + "Already is in recovery, hence not calling software context reset.\n"); + return; + } + + if (test_bit(QEDF_UNLOADING, &qedf->flags)) { + QEDF_ERR(&qedf->dbg_ctx, "Driver unloading\n"); + return; + } + printk_ratelimited("[%s]:[%s:%d]:%d: Performing software context reset.", dev_name(&qedf->pdev->dev), __func__, __LINE__, qedf->dbg_ctx.host_no); From 78e88472b60936025b83eba57cffa59d3501dc07 Mon Sep 17 00:00:00 2001 From: Saurav Kashyap Date: Wed, 15 May 2024 14:41:00 +0530 Subject: [PATCH 0006/1578] scsi: qedf: Wait for stag work during unload If stag work is already scheduled and unload is called, it can lead to issues as unload cleans up the work element. Wait for stag work to get completed before cleanup during unload. Signed-off-by: Saurav Kashyap Signed-off-by: Nilesh Javali Link: https://lore.kernel.org/r/20240515091101.18754-3-skashyap@marvell.com Signed-off-by: Martin K. Petersen --- drivers/scsi/qedf/qedf.h | 1 + drivers/scsi/qedf/qedf_main.c | 30 +++++++++++++++++++++++++++--- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/qedf/qedf.h b/drivers/scsi/qedf/qedf.h index 5058e01b65a273..98afdfe6360031 100644 --- a/drivers/scsi/qedf/qedf.h +++ b/drivers/scsi/qedf/qedf.h @@ -363,6 +363,7 @@ struct qedf_ctx { #define QEDF_IN_RECOVERY 5 #define QEDF_DBG_STOP_IO 6 #define QEDF_PROBING 8 +#define QEDF_STAG_IN_PROGRESS 9 unsigned long flags; /* Miscellaneous state flags */ int fipvlan_retries; u8 num_queues; diff --git a/drivers/scsi/qedf/qedf_main.c b/drivers/scsi/qedf/qedf_main.c index c457caae07b38f..eca0190d9a826a 100644 --- a/drivers/scsi/qedf/qedf_main.c +++ b/drivers/scsi/qedf/qedf_main.c @@ -318,11 +318,18 @@ static struct fc_seq *qedf_elsct_send(struct fc_lport *lport, u32 did, */ if (resp == fc_lport_flogi_resp) { qedf->flogi_cnt++; + qedf->flogi_pending++; + + if (test_bit(QEDF_UNLOADING, &qedf->flags)) { + QEDF_ERR(&qedf->dbg_ctx, "Driver unloading\n"); + qedf->flogi_pending = 0; + } + if (qedf->flogi_pending >= QEDF_FLOGI_RETRY_CNT) { schedule_delayed_work(&qedf->stag_work, 2); return NULL; } - qedf->flogi_pending++; + return fc_elsct_send(lport, did, fp, op, qedf_flogi_resp, arg, timeout); } @@ -912,13 +919,14 @@ void qedf_ctx_soft_reset(struct fc_lport *lport) struct qedf_ctx *qedf; struct qed_link_output if_link; + qedf = lport_priv(lport); + if (lport->vport) { + clear_bit(QEDF_STAG_IN_PROGRESS, &qedf->flags); printk_ratelimited("Cannot issue host reset on NPIV port.\n"); return; } - qedf = lport_priv(lport); - qedf->flogi_pending = 0; /* For host reset, essentially do a soft link up/down */ atomic_set(&qedf->link_state, QEDF_LINK_DOWN); @@ -938,6 +946,7 @@ void qedf_ctx_soft_reset(struct fc_lport *lport) if (!if_link.link_up) { QEDF_INFO(&qedf->dbg_ctx, QEDF_LOG_DISC, "Physical link is not up.\n"); + clear_bit(QEDF_STAG_IN_PROGRESS, &qedf->flags); return; } /* Flush and wait to make sure link down is processed */ @@ -950,6 +959,7 @@ void qedf_ctx_soft_reset(struct fc_lport *lport) "Queue link up work.\n"); queue_delayed_work(qedf->link_update_wq, &qedf->link_update, 0); + clear_bit(QEDF_STAG_IN_PROGRESS, &qedf->flags); } /* Reset the host by gracefully logging out and then logging back in */ @@ -3721,6 +3731,7 @@ static void __qedf_remove(struct pci_dev *pdev, int mode) { struct qedf_ctx *qedf; int rc; + int cnt = 0; if (!pdev) { QEDF_ERR(NULL, "pdev is NULL.\n"); @@ -3738,6 +3749,17 @@ static void __qedf_remove(struct pci_dev *pdev, int mode) return; } +stag_in_prog: + if (test_bit(QEDF_STAG_IN_PROGRESS, &qedf->flags)) { + QEDF_ERR(&qedf->dbg_ctx, "Stag in progress, cnt=%d.\n", cnt); + cnt++; + + if (cnt < 5) { + msleep(500); + goto stag_in_prog; + } + } + if (mode != QEDF_MODE_RECOVERY) set_bit(QEDF_UNLOADING, &qedf->flags); @@ -4013,6 +4035,8 @@ void qedf_stag_change_work(struct work_struct *work) return; } + set_bit(QEDF_STAG_IN_PROGRESS, &qedf->flags); + printk_ratelimited("[%s]:[%s:%d]:%d: Performing software context reset.", dev_name(&qedf->pdev->dev), __func__, __LINE__, qedf->dbg_ctx.host_no); From 6c3bb589debd763dc4b94803ddf3c13b4fcca776 Mon Sep 17 00:00:00 2001 From: Saurav Kashyap Date: Wed, 15 May 2024 14:41:01 +0530 Subject: [PATCH 0007/1578] scsi: qedf: Set qed_slowpath_params to zero before use Zero qed_slowpath_params before use. Signed-off-by: Saurav Kashyap Signed-off-by: Nilesh Javali Link: https://lore.kernel.org/r/20240515091101.18754-4-skashyap@marvell.com Signed-off-by: Martin K. Petersen --- drivers/scsi/qedf/qedf_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/qedf/qedf_main.c b/drivers/scsi/qedf/qedf_main.c index eca0190d9a826a..49adddf978cc73 100644 --- a/drivers/scsi/qedf/qedf_main.c +++ b/drivers/scsi/qedf/qedf_main.c @@ -3473,6 +3473,7 @@ static int __qedf_probe(struct pci_dev *pdev, int mode) } /* Start the Slowpath-process */ + memset(&slowpath_params, 0, sizeof(struct qed_slowpath_params)); slowpath_params.int_mode = QED_INT_MODE_MSIX; slowpath_params.drv_major = QEDF_DRIVER_MAJOR_VER; slowpath_params.drv_minor = QEDF_DRIVER_MINOR_VER; From e4f5f8298cf6ddae43210d236ad65ac2c6379559 Mon Sep 17 00:00:00 2001 From: Deming Wang Date: Mon, 13 May 2024 07:59:56 -0400 Subject: [PATCH 0008/1578] scsi: mpt3sas: Add missing kerneldoc parameter descriptions Add missing kerneldoc parameter descriptions to _scsih_set_debug_level(). Signed-off-by: Deming Wang Link: https://lore.kernel.org/r/20240513115956.1576-1-wangdeming@inspur.com Signed-off-by: Martin K. Petersen --- drivers/scsi/mpt3sas/mpt3sas_scsih.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c index 89ef43a5ef862d..12d08d8ba5382d 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -302,8 +302,8 @@ struct _scsi_io_transfer { /** * _scsih_set_debug_level - global setting of ioc->logging_level. - * @val: ? - * @kp: ? + * @val: value of the parameter to be set + * @kp: pointer to kernel_param structure * * Note: The logging levels are defined in mpt3sas_debug.h. */ From 637c435f08ea7e77c53a2ad590b651d0de225e3b Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 16 May 2024 11:30:41 +0300 Subject: [PATCH 0009/1578] wifi: ath11k: Fix error path in ath11k_pcic_ext_irq_config If one of the dummy allocation fails in ath11k_pcic_ext_irq_config(), the previous allocated devices might leak due to returning without deallocating the devices. Instead of returning on the error path, deallocate all the previously allocated net_devices and then return. Fixes: bca592ead825 ("wifi: ath11k: allocate dummy net_device dynamically") Signed-off-by: Breno Leitao Reviewed-by: Simon Horman Signed-off-by: Kalle Valo Link: https://msgid.link/20240508185902.70975-1-leitao@debian.org --- drivers/net/wireless/ath/ath11k/pcic.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/pcic.c b/drivers/net/wireless/ath/ath11k/pcic.c index 79eb3f9c902f4b..debe7c5919ef00 100644 --- a/drivers/net/wireless/ath/ath11k/pcic.c +++ b/drivers/net/wireless/ath/ath11k/pcic.c @@ -561,6 +561,7 @@ static int ath11k_pcic_ext_irq_config(struct ath11k_base *ab) { int i, j, n, ret, num_vectors = 0; u32 user_base_data = 0, base_vector = 0; + struct ath11k_ext_irq_grp *irq_grp; unsigned long irq_flags; ret = ath11k_pcic_get_user_msi_assignment(ab, "DP", &num_vectors, @@ -574,14 +575,16 @@ static int ath11k_pcic_ext_irq_config(struct ath11k_base *ab) irq_flags |= IRQF_NOBALANCING; for (i = 0; i < ATH11K_EXT_IRQ_GRP_NUM_MAX; i++) { - struct ath11k_ext_irq_grp *irq_grp = &ab->ext_irq_grp[i]; + irq_grp = &ab->ext_irq_grp[i]; u32 num_irq = 0; irq_grp->ab = ab; irq_grp->grp_id = i; irq_grp->napi_ndev = alloc_netdev_dummy(0); - if (!irq_grp->napi_ndev) - return -ENOMEM; + if (!irq_grp->napi_ndev) { + ret = -ENOMEM; + goto fail_allocate; + } netif_napi_add(irq_grp->napi_ndev, &irq_grp->napi, ath11k_pcic_ext_grp_napi_poll); @@ -606,11 +609,8 @@ static int ath11k_pcic_ext_irq_config(struct ath11k_base *ab) int irq = ath11k_pcic_get_msi_irq(ab, vector); if (irq < 0) { - for (n = 0; n <= i; n++) { - irq_grp = &ab->ext_irq_grp[n]; - free_netdev(irq_grp->napi_ndev); - } - return irq; + ret = irq; + goto fail_irq; } ab->irq_num[irq_idx] = irq; @@ -635,6 +635,15 @@ static int ath11k_pcic_ext_irq_config(struct ath11k_base *ab) } return 0; +fail_irq: + /* i ->napi_ndev was properly allocated. Free it also */ + i += 1; +fail_allocate: + for (n = 0; n < i; n++) { + irq_grp = &ab->ext_irq_grp[n]; + free_netdev(irq_grp->napi_ndev); + } + return ret; } int ath11k_pcic_config_irq(struct ath11k_base *ab) From 117bbc0e43adc6f76a3fc39a98f75a811a853459 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Thu, 29 Feb 2024 10:51:13 +0000 Subject: [PATCH 0010/1578] drm/buddy: stop using PAGE_SIZE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The drm_buddy minimum page-size requirements should be distinct from the CPU PAGE_SIZE. Only restriction is that the minimum page-size is at least 4K. Signed-off-by: Matthew Auld Cc: Arunpravin Paneer Selvam Cc: Christian König Cc: Arnd Bergmann Reviewed-by: Arunpravin Paneer Selvam Acked-by: Arnd Bergmann Link: https://patchwork.freedesktop.org/patch/msgid/20240229105112.250077-3-matthew.auld@intel.com Signed-off-by: Christian König --- drivers/gpu/drm/drm_buddy.c | 2 +- include/drm/drm_buddy.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 5ebdd6f8f36e6b..f999568d69c10d 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -102,7 +102,7 @@ int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size) if (size < chunk_size) return -EINVAL; - if (chunk_size < PAGE_SIZE) + if (chunk_size < SZ_4K) return -EINVAL; if (!is_power_of_2(chunk_size)) diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h index a5b39fc01003a6..19ed661a32f3bd 100644 --- a/include/drm/drm_buddy.h +++ b/include/drm/drm_buddy.h @@ -53,8 +53,8 @@ struct drm_buddy_block { struct list_head tmp_link; }; -/* Order-zero must be at least PAGE_SIZE */ -#define DRM_BUDDY_MAX_ORDER (63 - PAGE_SHIFT) +/* Order-zero must be at least SZ_4K */ +#define DRM_BUDDY_MAX_ORDER (63 - 12) /* * Binary Buddy System. @@ -82,7 +82,7 @@ struct drm_buddy { unsigned int n_roots; unsigned int max_order; - /* Must be at least PAGE_SIZE */ + /* Must be at least SZ_4K */ u64 chunk_size; u64 size; u64 avail; From 520fb7f183e9b4d0ad7a2f084f3c4987845425e2 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Thu, 29 Feb 2024 10:51:14 +0000 Subject: [PATCH 0011/1578] drm/tests/buddy: stop using PAGE_SIZE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gives the wrong impression that min page-size has to be tied to the CPU PAGE_SIZE. Signed-off-by: Matthew Auld Cc: Arunpravin Paneer Selvam Cc: Christian König Cc: Arnd Bergmann Reviewed-by: Arunpravin Paneer Selvam Link: https://patchwork.freedesktop.org/patch/msgid/20240229105112.250077-4-matthew.auld@intel.com Signed-off-by: Christian König --- drivers/gpu/drm/tests/drm_buddy_test.c | 42 +++++++++++++------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/tests/drm_buddy_test.c b/drivers/gpu/drm/tests/drm_buddy_test.c index e48863a445564d..f2397696d25267 100644 --- a/drivers/gpu/drm/tests/drm_buddy_test.c +++ b/drivers/gpu/drm/tests/drm_buddy_test.c @@ -329,8 +329,8 @@ static void drm_test_buddy_alloc_pathological(struct kunit *test) * Eventually we will have a fully 50% fragmented mm. */ - mm_size = PAGE_SIZE << max_order; - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, PAGE_SIZE), + mm_size = SZ_4K << max_order; + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), "buddy_init failed\n"); KUNIT_EXPECT_EQ(test, mm.max_order, max_order); @@ -344,7 +344,7 @@ static void drm_test_buddy_alloc_pathological(struct kunit *test) } for (order = top; order--;) { - size = get_size(order, PAGE_SIZE); + size = get_size(order, mm.chunk_size); KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), @@ -358,7 +358,7 @@ static void drm_test_buddy_alloc_pathological(struct kunit *test) } /* There should be one final page for this sub-allocation */ - size = get_size(0, PAGE_SIZE); + size = get_size(0, mm.chunk_size); KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc hit -ENOMEM for hole\n"); @@ -368,7 +368,7 @@ static void drm_test_buddy_alloc_pathological(struct kunit *test) list_move_tail(&block->link, &holes); - size = get_size(top, PAGE_SIZE); + size = get_size(top, mm.chunk_size); KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc unexpectedly succeeded at top-order %d/%d, it should be full!", @@ -379,7 +379,7 @@ static void drm_test_buddy_alloc_pathological(struct kunit *test) /* Nothing larger than blocks of chunk_size now available */ for (order = 1; order <= max_order; order++) { - size = get_size(order, PAGE_SIZE); + size = get_size(order, mm.chunk_size); KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc unexpectedly succeeded at order %d, it should be full!", @@ -408,14 +408,14 @@ static void drm_test_buddy_alloc_pessimistic(struct kunit *test) * page left. */ - mm_size = PAGE_SIZE << max_order; - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, PAGE_SIZE), + mm_size = SZ_4K << max_order; + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), "buddy_init failed\n"); KUNIT_EXPECT_EQ(test, mm.max_order, max_order); for (order = 0; order < max_order; order++) { - size = get_size(order, PAGE_SIZE); + size = get_size(order, mm.chunk_size); KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc hit -ENOMEM with order=%d\n", @@ -428,7 +428,7 @@ static void drm_test_buddy_alloc_pessimistic(struct kunit *test) } /* And now the last remaining block available */ - size = get_size(0, PAGE_SIZE); + size = get_size(0, mm.chunk_size); KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc hit -ENOMEM on final alloc\n"); @@ -440,7 +440,7 @@ static void drm_test_buddy_alloc_pessimistic(struct kunit *test) /* Should be completely full! */ for (order = max_order; order--;) { - size = get_size(order, PAGE_SIZE); + size = get_size(order, mm.chunk_size); KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc unexpectedly succeeded, it should be full!"); @@ -456,7 +456,7 @@ static void drm_test_buddy_alloc_pessimistic(struct kunit *test) list_del(&block->link); drm_buddy_free_block(&mm, block); - size = get_size(order, PAGE_SIZE); + size = get_size(order, mm.chunk_size); KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc hit -ENOMEM with order=%d\n", @@ -471,7 +471,7 @@ static void drm_test_buddy_alloc_pessimistic(struct kunit *test) } /* To confirm, now the whole mm should be available */ - size = get_size(max_order, PAGE_SIZE); + size = get_size(max_order, mm.chunk_size); KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc (realloc) hit -ENOMEM with order=%d\n", @@ -502,15 +502,15 @@ static void drm_test_buddy_alloc_optimistic(struct kunit *test) * try to allocate them all. */ - mm_size = PAGE_SIZE * ((1 << (max_order + 1)) - 1); + mm_size = SZ_4K * ((1 << (max_order + 1)) - 1); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, PAGE_SIZE), + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), "buddy_init failed\n"); KUNIT_EXPECT_EQ(test, mm.max_order, max_order); for (order = 0; order <= max_order; order++) { - size = get_size(order, PAGE_SIZE); + size = get_size(order, mm.chunk_size); KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc hit -ENOMEM with order=%d\n", @@ -523,7 +523,7 @@ static void drm_test_buddy_alloc_optimistic(struct kunit *test) } /* Should be completely full! */ - size = get_size(0, PAGE_SIZE); + size = get_size(0, mm.chunk_size); KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc unexpectedly succeeded, it should be full!"); @@ -540,7 +540,7 @@ static void drm_test_buddy_alloc_limit(struct kunit *test) LIST_HEAD(allocated); struct drm_buddy mm; - KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, size, PAGE_SIZE)); + KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, size, SZ_4K)); KUNIT_EXPECT_EQ_MSG(test, mm.max_order, DRM_BUDDY_MAX_ORDER, "mm.max_order(%d) != %d\n", mm.max_order, @@ -548,7 +548,7 @@ static void drm_test_buddy_alloc_limit(struct kunit *test) size = mm.chunk_size << mm.max_order; KUNIT_EXPECT_FALSE(test, drm_buddy_alloc_blocks(&mm, start, size, size, - PAGE_SIZE, &allocated, flags)); + mm.chunk_size, &allocated, flags)); block = list_first_entry_or_null(&allocated, struct drm_buddy_block, link); KUNIT_EXPECT_TRUE(test, block); @@ -558,10 +558,10 @@ static void drm_test_buddy_alloc_limit(struct kunit *test) drm_buddy_block_order(block), mm.max_order); KUNIT_EXPECT_EQ_MSG(test, drm_buddy_block_size(&mm, block), - BIT_ULL(mm.max_order) * PAGE_SIZE, + BIT_ULL(mm.max_order) * mm.chunk_size, "block size(%llu) != %llu\n", drm_buddy_block_size(&mm, block), - BIT_ULL(mm.max_order) * PAGE_SIZE); + BIT_ULL(mm.max_order) * mm.chunk_size); drm_buddy_free_list(&mm, &allocated); drm_buddy_fini(&mm); From dc21c6cc3d6986d938efbf95de62473982c98dec Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 15 May 2024 13:23:39 +0000 Subject: [PATCH 0012/1578] netfilter: nfnetlink_queue: acquire rcu_read_lock() in instance_destroy_rcu() syzbot reported that nf_reinject() could be called without rcu_read_lock() : WARNING: suspicious RCU usage 6.9.0-rc7-syzkaller-02060-g5c1672705a1a #0 Not tainted net/netfilter/nfnetlink_queue.c:263 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 2 locks held by syz-executor.4/13427: #0: ffffffff8e334f60 (rcu_callback){....}-{0:0}, at: rcu_lock_acquire include/linux/rcupdate.h:329 [inline] #0: ffffffff8e334f60 (rcu_callback){....}-{0:0}, at: rcu_do_batch kernel/rcu/tree.c:2190 [inline] #0: ffffffff8e334f60 (rcu_callback){....}-{0:0}, at: rcu_core+0xa86/0x1830 kernel/rcu/tree.c:2471 #1: ffff88801ca92958 (&inst->lock){+.-.}-{2:2}, at: spin_lock_bh include/linux/spinlock.h:356 [inline] #1: ffff88801ca92958 (&inst->lock){+.-.}-{2:2}, at: nfqnl_flush net/netfilter/nfnetlink_queue.c:405 [inline] #1: ffff88801ca92958 (&inst->lock){+.-.}-{2:2}, at: instance_destroy_rcu+0x30/0x220 net/netfilter/nfnetlink_queue.c:172 stack backtrace: CPU: 0 PID: 13427 Comm: syz-executor.4 Not tainted 6.9.0-rc7-syzkaller-02060-g5c1672705a1a #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/02/2024 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114 lockdep_rcu_suspicious+0x221/0x340 kernel/locking/lockdep.c:6712 nf_reinject net/netfilter/nfnetlink_queue.c:323 [inline] nfqnl_reinject+0x6ec/0x1120 net/netfilter/nfnetlink_queue.c:397 nfqnl_flush net/netfilter/nfnetlink_queue.c:410 [inline] instance_destroy_rcu+0x1ae/0x220 net/netfilter/nfnetlink_queue.c:172 rcu_do_batch kernel/rcu/tree.c:2196 [inline] rcu_core+0xafd/0x1830 kernel/rcu/tree.c:2471 handle_softirqs+0x2d6/0x990 kernel/softirq.c:554 __do_softirq kernel/softirq.c:588 [inline] invoke_softirq kernel/softirq.c:428 [inline] __irq_exit_rcu+0xf4/0x1c0 kernel/softirq.c:637 irq_exit_rcu+0x9/0x30 kernel/softirq.c:649 instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1043 [inline] sysvec_apic_timer_interrupt+0xa6/0xc0 arch/x86/kernel/apic/apic.c:1043 Fixes: 9872bec773c2 ("[NETFILTER]: nfnetlink: use RCU for queue instances hash") Reported-by: syzbot Signed-off-by: Eric Dumazet Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_queue.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 00f4bd21c59b41..f1c31757e4969e 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -169,7 +169,9 @@ instance_destroy_rcu(struct rcu_head *head) struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance, rcu); + rcu_read_lock(); nfqnl_flush(inst, NULL, 0); + rcu_read_unlock(); kfree(inst); module_put(THIS_MODULE); } From 21ae74e1bf18331ae5e279bd96304b3630828009 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 17 May 2024 10:00:28 +0300 Subject: [PATCH 0013/1578] wifi: ath10k: fix QCOM_RPROC_COMMON dependency If ath10k_snoc is built-in, while Qualcomm remoteprocs are built as modules, compilation fails with: /usr/bin/aarch64-linux-gnu-ld: drivers/net/wireless/ath/ath10k/snoc.o: in function `ath10k_modem_init': drivers/net/wireless/ath/ath10k/snoc.c:1534: undefined reference to `qcom_register_ssr_notifier' /usr/bin/aarch64-linux-gnu-ld: drivers/net/wireless/ath/ath10k/snoc.o: in function `ath10k_modem_deinit': drivers/net/wireless/ath/ath10k/snoc.c:1551: undefined reference to `qcom_unregister_ssr_notifier' Add corresponding dependency to ATH10K_SNOC Kconfig entry so that it's built as module if QCOM_RPROC_COMMON is built as module too. Fixes: 747ff7d3d742 ("ath10k: Don't always treat modem stop events as crashes") Cc: stable@vger.kernel.org Signed-off-by: Dmitry Baryshkov Signed-off-by: Kalle Valo Link: https://msgid.link/20240511-ath10k-snoc-dep-v1-1-9666e3af5c27@linaro.org --- drivers/net/wireless/ath/ath10k/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/ath/ath10k/Kconfig b/drivers/net/wireless/ath/ath10k/Kconfig index e6ea884cafc190..4f385f4a8cef2a 100644 --- a/drivers/net/wireless/ath/ath10k/Kconfig +++ b/drivers/net/wireless/ath/ath10k/Kconfig @@ -45,6 +45,7 @@ config ATH10K_SNOC depends on ATH10K depends on ARCH_QCOM || COMPILE_TEST depends on QCOM_SMEM + depends on QCOM_RPROC_COMMON || QCOM_RPROC_COMMON=n select QCOM_SCM select QCOM_QMI_HELPERS help From c1193d9bbbd379defe9be3c6de566de684de8a6f Mon Sep 17 00:00:00 2001 From: Alexander Maltsev Date: Wed, 17 Apr 2024 18:51:41 +0500 Subject: [PATCH 0014/1578] netfilter: ipset: Add list flush to cancel_gc Flushing list in cancel_gc drops references to other lists right away, without waiting for RCU to destroy list. Fixes race when referenced ipsets can't be destroyed while referring list is scheduled for destroy. Fixes: 97f7cf1cd80e ("netfilter: ipset: fix performance regression in swap operation") Signed-off-by: Alexander Maltsev Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_list_set.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 6c3f28bc59b325..54e2a1dd7f5f51 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -549,6 +549,9 @@ list_set_cancel_gc(struct ip_set *set) if (SET_WITH_TIMEOUT(set)) timer_shutdown_sync(&map->gc); + + /* Flush list to drop references to other ipsets */ + list_set_flush(set); } static const struct ip_set_type_variant set_variant = { From aff5c01fa1284d606f8e7cbdaafeef2511bb46c1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 9 May 2024 23:02:24 +0200 Subject: [PATCH 0015/1578] netfilter: nft_payload: restore vlan q-in-q match support Revert f6ae9f120dad ("netfilter: nft_payload: add C-VLAN support"). f41f72d09ee1 ("netfilter: nft_payload: simplify vlan header handling") already allows to match on inner vlan tags by subtract the vlan header size to the payload offset which has been popped and stored in skbuff metadata fields. Fixes: f6ae9f120dad ("netfilter: nft_payload: add C-VLAN support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_payload.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 0a689c8e0295df..a3cb5dbcb362cc 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -45,36 +45,27 @@ nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len) int mac_off = skb_mac_header(skb) - skb->data; u8 *vlanh, *dst_u8 = (u8 *) d; struct vlan_ethhdr veth; - u8 vlan_hlen = 0; - - if ((skb->protocol == htons(ETH_P_8021AD) || - skb->protocol == htons(ETH_P_8021Q)) && - offset >= VLAN_ETH_HLEN && offset < VLAN_ETH_HLEN + VLAN_HLEN) - vlan_hlen += VLAN_HLEN; vlanh = (u8 *) &veth; - if (offset < VLAN_ETH_HLEN + vlan_hlen) { + if (offset < VLAN_ETH_HLEN) { u8 ethlen = len; - if (vlan_hlen && - skb_copy_bits(skb, mac_off, &veth, VLAN_ETH_HLEN) < 0) - return false; - else if (!nft_payload_rebuild_vlan_hdr(skb, mac_off, &veth)) + if (!nft_payload_rebuild_vlan_hdr(skb, mac_off, &veth)) return false; - if (offset + len > VLAN_ETH_HLEN + vlan_hlen) - ethlen -= offset + len - VLAN_ETH_HLEN - vlan_hlen; + if (offset + len > VLAN_ETH_HLEN) + ethlen -= offset + len - VLAN_ETH_HLEN; - memcpy(dst_u8, vlanh + offset - vlan_hlen, ethlen); + memcpy(dst_u8, vlanh + offset, ethlen); len -= ethlen; if (len == 0) return true; dst_u8 += ethlen; - offset = ETH_HLEN + vlan_hlen; + offset = ETH_HLEN; } else { - offset -= VLAN_HLEN + vlan_hlen; + offset -= VLAN_HLEN; } return skb_copy_bits(skb, offset + mac_off, dst_u8, len) == 0; From 2a1b02bcba78f8498ab00d6142e1238d85b01591 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 20 May 2024 13:28:33 -1000 Subject: [PATCH 0016/1578] workqueue: Refactor worker ID formatting and make wq_worker_comm() use full ID string Currently, worker ID formatting is open coded in create_worker(), init_rescuer() and worker_thread() (for %WORKER_DIE case). The formatted ID is saved into task->comm and wq_worker_comm() uses it as the base name to append extra information to when generating the name to be shown to userspace. However, TASK_COMM_LEN is only 16 leading to badly truncated names for rescuers. For example, the rescuer for the inet_frag_wq workqueue becomes: $ ps -ef | grep '[k]worker/R-inet' root 483 2 0 Apr26 ? 00:00:00 [kworker/R-inet_] Even for non-rescue workers, it's easy to run over 15 characters on moderately large machines. Fit it by consolidating worker ID formatting into a new helper format_worker_id() and calling it from wq_worker_comm() to obtain the untruncated worker ID string. $ ps -ef | grep '[k]worker/R-inet' root 60 2 0 12:10 ? 00:00:00 [kworker/R-inet_frag_wq] Signed-off-by: Tejun Heo Reported-and-tested-by: Jan Engelhardt Suggested-by: Linus Torvalds --- kernel/workqueue.c | 51 ++++++++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 003474c9a77d09..3fbaecfc88c28f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -125,6 +125,7 @@ enum wq_internal_consts { HIGHPRI_NICE_LEVEL = MIN_NICE, WQ_NAME_LEN = 32, + WORKER_ID_LEN = 10 + WQ_NAME_LEN, /* "kworker/R-" + WQ_NAME_LEN */ }; /* @@ -2742,6 +2743,26 @@ static void worker_detach_from_pool(struct worker *worker) complete(detach_completion); } +static int format_worker_id(char *buf, size_t size, struct worker *worker, + struct worker_pool *pool) +{ + if (worker->rescue_wq) + return scnprintf(buf, size, "kworker/R-%s", + worker->rescue_wq->name); + + if (pool) { + if (pool->cpu >= 0) + return scnprintf(buf, size, "kworker/%d:%d%s", + pool->cpu, worker->id, + pool->attrs->nice < 0 ? "H" : ""); + else + return scnprintf(buf, size, "kworker/u%d:%d", + pool->id, worker->id); + } else { + return scnprintf(buf, size, "kworker/dying"); + } +} + /** * create_worker - create a new workqueue worker * @pool: pool the new worker will belong to @@ -2758,7 +2779,6 @@ static struct worker *create_worker(struct worker_pool *pool) { struct worker *worker; int id; - char id_buf[23]; /* ID is needed to determine kthread name */ id = ida_alloc(&pool->worker_ida, GFP_KERNEL); @@ -2777,17 +2797,14 @@ static struct worker *create_worker(struct worker_pool *pool) worker->id = id; if (!(pool->flags & POOL_BH)) { - if (pool->cpu >= 0) - snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id, - pool->attrs->nice < 0 ? "H" : ""); - else - snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id); + char id_buf[WORKER_ID_LEN]; + format_worker_id(id_buf, sizeof(id_buf), worker, pool); worker->task = kthread_create_on_node(worker_thread, worker, - pool->node, "kworker/%s", id_buf); + pool->node, "%s", id_buf); if (IS_ERR(worker->task)) { if (PTR_ERR(worker->task) == -EINTR) { - pr_err("workqueue: Interrupted when creating a worker thread \"kworker/%s\"\n", + pr_err("workqueue: Interrupted when creating a worker thread \"%s\"\n", id_buf); } else { pr_err_once("workqueue: Failed to create a worker thread: %pe", @@ -3350,7 +3367,6 @@ static int worker_thread(void *__worker) raw_spin_unlock_irq(&pool->lock); set_pf_worker(false); - set_task_comm(worker->task, "kworker/dying"); ida_free(&pool->worker_ida, worker->id); worker_detach_from_pool(worker); WARN_ON_ONCE(!list_empty(&worker->entry)); @@ -5542,6 +5558,7 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, static int init_rescuer(struct workqueue_struct *wq) { struct worker *rescuer; + char id_buf[WORKER_ID_LEN]; int ret; if (!(wq->flags & WQ_MEM_RECLAIM)) @@ -5555,7 +5572,9 @@ static int init_rescuer(struct workqueue_struct *wq) } rescuer->rescue_wq = wq; - rescuer->task = kthread_create(rescuer_thread, rescuer, "kworker/R-%s", wq->name); + format_worker_id(id_buf, sizeof(id_buf), rescuer, NULL); + + rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", id_buf); if (IS_ERR(rescuer->task)) { ret = PTR_ERR(rescuer->task); pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe", @@ -6384,19 +6403,15 @@ void show_freezable_workqueues(void) /* used to show worker information through /proc/PID/{comm,stat,status} */ void wq_worker_comm(char *buf, size_t size, struct task_struct *task) { - int off; - - /* always show the actual comm */ - off = strscpy(buf, task->comm, size); - if (off < 0) - return; - /* stabilize PF_WQ_WORKER and worker pool association */ mutex_lock(&wq_pool_attach_mutex); if (task->flags & PF_WQ_WORKER) { struct worker *worker = kthread_data(task); struct worker_pool *pool = worker->pool; + int off; + + off = format_worker_id(buf, size, worker, pool); if (pool) { raw_spin_lock_irq(&pool->lock); @@ -6415,6 +6430,8 @@ void wq_worker_comm(char *buf, size_t size, struct task_struct *task) } raw_spin_unlock_irq(&pool->lock); } + } else { + strscpy(buf, task->comm, size); } mutex_unlock(&wq_pool_attach_mutex); From e8dc41afca161b988e6d462f4d0803d247e22250 Mon Sep 17 00:00:00 2001 From: Shengjiu Wang Date: Sat, 11 May 2024 10:55:25 +0800 Subject: [PATCH 0017/1578] pmdomain: imx: gpcv2: Add delay after power up handshake AudioMix BLK-CTRL on i.MX8MP encountered an accessing register issue after power up. [ 2.181035] Kernel panic - not syncing: Asynchronous SError Interrupt [ 2.181038] CPU: 1 PID: 48 Comm: kworker/u16:2 Not tainted 6.9.0-rc5-next-20240424-00003-g21cec88845c6 #171 [ 2.181047] Hardware name: NXP i.MX8MPlus EVK board (DT) [ 2.181050] Workqueue: events_unbound deferred_probe_work_func [ 2.181064] Call trace: [...] [ 2.181142] arm64_serror_panic+0x6c/0x78 [ 2.181149] do_serror+0x3c/0x70 [ 2.181157] el1h_64_error_handler+0x30/0x48 [ 2.181164] el1h_64_error+0x64/0x68 [ 2.181171] clk_imx8mp_audiomix_runtime_resume+0x34/0x44 [ 2.181183] __genpd_runtime_resume+0x30/0x80 [ 2.181195] genpd_runtime_resume+0x110/0x244 [ 2.181205] __rpm_callback+0x48/0x1d8 [ 2.181213] rpm_callback+0x68/0x74 [ 2.181224] rpm_resume+0x468/0x6c0 [ 2.181234] __pm_runtime_resume+0x50/0x94 [ 2.181243] pm_runtime_get_suppliers+0x60/0x8c [ 2.181258] __driver_probe_device+0x48/0x12c [ 2.181268] driver_probe_device+0xd8/0x15c [ 2.181278] __device_attach_driver+0xb8/0x134 [ 2.181290] bus_for_each_drv+0x84/0xe0 [ 2.181302] __device_attach+0x9c/0x188 [ 2.181312] device_initial_probe+0x14/0x20 [ 2.181323] bus_probe_device+0xac/0xb0 [ 2.181334] deferred_probe_work_func+0x88/0xc0 [ 2.181344] process_one_work+0x150/0x290 [ 2.181357] worker_thread+0x2f8/0x408 [ 2.181370] kthread+0x110/0x114 [ 2.181381] ret_from_fork+0x10/0x20 [ 2.181391] SMP: stopping secondary CPUs According to comments in power up handshake: /* request the ADB400 to power up */ if (domain->bits.hskreq) { regmap_update_bits(domain->regmap, domain->regs->hsk, domain->bits.hskreq, domain->bits.hskreq); /* * ret = regmap_read_poll_timeout(domain->regmap, domain->regs->hsk, reg_val, * (reg_val & domain->bits.hskack), 0, * USEC_PER_MSEC); * Technically we need the commented code to wait handshake. But that needs * the BLK-CTL module BUS clk-en bit being set. * * There is a separate BLK-CTL module and we will have such a driver for it, * that driver will set the BUS clk-en bit and handshake will be triggered * automatically there. Just add a delay and suppose the handshake finish * after that. */ } The BLK-CTL module needs to add delay to wait for a handshake request finished. For some BLK-CTL module (eg. AudioMix on i.MX8MP) doesn't have BUS clk-en bit, it is better to add delay in this driver, as the BLK-CTL module doesn't need to care about how it is powered up. regmap_read_bypassed() is to make sure the above write IO transaction already reaches target before udelay(). Fixes: 1496dd413b2e ("clk: imx: imx8mp: Add pm_runtime support for power saving") Reported-by: Francesco Dolcini Closes: https://lore.kernel.org/all/66293535.170a0220.21fe.a2e7@mx.google.com/ Suggested-by: Frank Li Signed-off-by: Shengjiu Wang Tested-by: Adam Ford Tested-by: Alexander Stein Link: https://lore.kernel.org/r/1715396125-3724-1-git-send-email-shengjiu.wang@nxp.com Signed-off-by: Ulf Hansson --- drivers/pmdomain/imx/gpcv2.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/pmdomain/imx/gpcv2.c b/drivers/pmdomain/imx/gpcv2.c index 4b828d74a6064c..856eaac0ec140d 100644 --- a/drivers/pmdomain/imx/gpcv2.c +++ b/drivers/pmdomain/imx/gpcv2.c @@ -393,6 +393,17 @@ static int imx_pgc_power_up(struct generic_pm_domain *genpd) * automatically there. Just add a delay and suppose the handshake finish * after that. */ + + /* + * For some BLK-CTL module (eg. AudioMix on i.MX8MP) doesn't have BUS + * clk-en bit, it is better to add delay here, as the BLK-CTL module + * doesn't need to care about how it is powered up. + * + * regmap_read_bypassed() is to make sure the above write IO transaction + * already reaches target before udelay() + */ + regmap_read_bypassed(domain->regmap, domain->regs->hsk, ®_val); + udelay(5); } /* Disable reset clocks for all devices in the domain */ From 25460d6f39024cc3b8241b14c7ccf0d6f11a736a Mon Sep 17 00:00:00 2001 From: Nikita Zhandarovich Date: Mon, 8 Apr 2024 07:10:39 -0700 Subject: [PATCH 0018/1578] net/9p: fix uninit-value in p9_client_rpc() Syzbot with the help of KMSAN reported the following error: BUG: KMSAN: uninit-value in trace_9p_client_res include/trace/events/9p.h:146 [inline] BUG: KMSAN: uninit-value in p9_client_rpc+0x1314/0x1340 net/9p/client.c:754 trace_9p_client_res include/trace/events/9p.h:146 [inline] p9_client_rpc+0x1314/0x1340 net/9p/client.c:754 p9_client_create+0x1551/0x1ff0 net/9p/client.c:1031 v9fs_session_init+0x1b9/0x28e0 fs/9p/v9fs.c:410 v9fs_mount+0xe2/0x12b0 fs/9p/vfs_super.c:122 legacy_get_tree+0x114/0x290 fs/fs_context.c:662 vfs_get_tree+0xa7/0x570 fs/super.c:1797 do_new_mount+0x71f/0x15e0 fs/namespace.c:3352 path_mount+0x742/0x1f20 fs/namespace.c:3679 do_mount fs/namespace.c:3692 [inline] __do_sys_mount fs/namespace.c:3898 [inline] __se_sys_mount+0x725/0x810 fs/namespace.c:3875 __x64_sys_mount+0xe4/0x150 fs/namespace.c:3875 do_syscall_64+0xd5/0x1f0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 Uninit was created at: __alloc_pages+0x9d6/0xe70 mm/page_alloc.c:4598 __alloc_pages_node include/linux/gfp.h:238 [inline] alloc_pages_node include/linux/gfp.h:261 [inline] alloc_slab_page mm/slub.c:2175 [inline] allocate_slab mm/slub.c:2338 [inline] new_slab+0x2de/0x1400 mm/slub.c:2391 ___slab_alloc+0x1184/0x33d0 mm/slub.c:3525 __slab_alloc mm/slub.c:3610 [inline] __slab_alloc_node mm/slub.c:3663 [inline] slab_alloc_node mm/slub.c:3835 [inline] kmem_cache_alloc+0x6d3/0xbe0 mm/slub.c:3852 p9_tag_alloc net/9p/client.c:278 [inline] p9_client_prepare_req+0x20a/0x1770 net/9p/client.c:641 p9_client_rpc+0x27e/0x1340 net/9p/client.c:688 p9_client_create+0x1551/0x1ff0 net/9p/client.c:1031 v9fs_session_init+0x1b9/0x28e0 fs/9p/v9fs.c:410 v9fs_mount+0xe2/0x12b0 fs/9p/vfs_super.c:122 legacy_get_tree+0x114/0x290 fs/fs_context.c:662 vfs_get_tree+0xa7/0x570 fs/super.c:1797 do_new_mount+0x71f/0x15e0 fs/namespace.c:3352 path_mount+0x742/0x1f20 fs/namespace.c:3679 do_mount fs/namespace.c:3692 [inline] __do_sys_mount fs/namespace.c:3898 [inline] __se_sys_mount+0x725/0x810 fs/namespace.c:3875 __x64_sys_mount+0xe4/0x150 fs/namespace.c:3875 do_syscall_64+0xd5/0x1f0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 If p9_check_errors() fails early in p9_client_rpc(), req->rc.tag will not be properly initialized. However, trace_9p_client_res() ends up trying to print it out anyway before p9_client_rpc() finishes. Fix this issue by assigning default values to p9_fcall fields such as 'tag' and (just in case KMSAN unearths something new) 'id' during the tag allocation stage. Reported-and-tested-by: syzbot+ff14db38f56329ef68df@syzkaller.appspotmail.com Fixes: 348b59012e5c ("net/9p: Convert net/9p protocol dumps to tracepoints") Signed-off-by: Nikita Zhandarovich Reviewed-by: Christian Schoenebeck Cc: stable@vger.kernel.org Message-ID: <20240408141039.30428-1-n.zhandarovich@fintech.ru> Signed-off-by: Dominique Martinet --- net/9p/client.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/9p/client.c b/net/9p/client.c index f7e90b4769bba9..b05f73c291b4b9 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -235,6 +235,8 @@ static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc, if (!fc->sdata) return -ENOMEM; fc->capacity = alloc_msize; + fc->id = 0; + fc->tag = P9_NOTAG; return 0; } From 39bc27bd688066a63e56f7f64ad34fae03fbe3b8 Mon Sep 17 00:00:00 2001 From: "Wachowski, Karol" Date: Mon, 20 May 2024 12:05:14 +0200 Subject: [PATCH 0019/1578] drm/shmem-helper: Fix BUG_ON() on mmap(PROT_WRITE, MAP_PRIVATE) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lack of check for copy-on-write (COW) mapping in drm_gem_shmem_mmap allows users to call mmap with PROT_WRITE and MAP_PRIVATE flag causing a kernel panic due to BUG_ON in vmf_insert_pfn_prot: BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); Return -EINVAL early if COW mapping is detected. This bug affects all drm drivers using default shmem helpers. It can be reproduced by this simple example: void *ptr = mmap(0, size, PROT_WRITE, MAP_PRIVATE, fd, mmap_offset); ptr[0] = 0; Fixes: 2194a63a818d ("drm: Add library for shmem backed GEM objects") Cc: Noralf Trønnes Cc: Eric Anholt Cc: Rob Herring Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Thomas Zimmermann Cc: David Airlie Cc: Daniel Vetter Cc: dri-devel@lists.freedesktop.org Cc: # v5.2+ Signed-off-by: Wachowski, Karol Signed-off-by: Jacek Lawrynowicz Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20240520100514.925681-1-jacek.lawrynowicz@linux.intel.com --- drivers/gpu/drm/drm_gem_shmem_helper.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/drm_gem_shmem_helper.c b/drivers/gpu/drm/drm_gem_shmem_helper.c index e435f986cd135b..1ff0678be7c75b 100644 --- a/drivers/gpu/drm/drm_gem_shmem_helper.c +++ b/drivers/gpu/drm/drm_gem_shmem_helper.c @@ -610,6 +610,9 @@ int drm_gem_shmem_mmap(struct drm_gem_shmem_object *shmem, struct vm_area_struct return ret; } + if (is_cow_mapping(vma->vm_flags)) + return -EINVAL; + dma_resv_lock(shmem->base.resv, NULL); ret = drm_gem_shmem_get_pages(shmem); dma_resv_unlock(shmem->base.resv); From d3a043733f25d743f3aa617c7f82dbcb5ee2211a Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Thu, 16 May 2024 17:43:51 +0530 Subject: [PATCH 0020/1578] nvme-multipath: find NUMA path only for online numa-node In current native multipath design when a shared namespace is created, we loop through each possible numa-node, calculate the NUMA distance of that node from each nvme controller and then cache the optimal IO path for future reference while sending IO. The issue with this design is that we may refer to the NUMA distance table for an offline node which may not be populated at the time and so we may inadvertently end up finding and caching a non-optimal path for IO. Then latter when the corresponding numa-node becomes online and hence the NUMA distance table entry for that node is created, ideally we should re-calculate the multipath node distance for the newly added node however that doesn't happen unless we rescan/reset the controller. So essentially, we may keep using non-optimal IO path for a node which is made online after namespace is created. This patch helps fix this issue ensuring that when a shared namespace is created, we calculate the multipath node distance for each online numa-node instead of each possible numa-node. Then latter when a node becomes online and we receive any IO on that newly added node, we would calculate the multipath node distance for newly added node but this time NUMA distance table would have been already populated for newly added node. Hence we would be able to correctly calculate the multipath node distance and choose the optimal path for the IO. Signed-off-by: Nilay Shroff Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/multipath.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index d16e976ae1a473..9c1e135b8df3b1 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -595,7 +595,7 @@ static void nvme_mpath_set_live(struct nvme_ns *ns) int node, srcu_idx; srcu_idx = srcu_read_lock(&head->srcu); - for_each_node(node) + for_each_online_node(node) __nvme_find_path(head, node); srcu_read_unlock(&head->srcu, srcu_idx); } From 8d00547ea8754afdc4a550af2fb7af2e3ba93cf8 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Thu, 16 May 2024 10:09:28 +0800 Subject: [PATCH 0021/1578] MAINTAINERS: Add myself as reviewer of ARM64 BPF JIT I am working on ARM64 BPF JIT for a while, hence add myself as reviewer. Signed-off-by: Xu Kuohai Acked-by: Hengqi Chen Link: https://lore.kernel.org/r/20240516020928.156125-1-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 3fdc3b09c17174..5e279e9ff63f7d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3816,6 +3816,7 @@ BPF JIT for ARM64 M: Daniel Borkmann M: Alexei Starovoitov M: Puranjay Mohan +R: Xu Kuohai L: bpf@vger.kernel.org S: Supported F: arch/arm64/net/ From aad11473f8f4be3df86461081ce35ec5b145ba68 Mon Sep 17 00:00:00 2001 From: Dmitry Mastykin Date: Wed, 22 May 2024 10:45:24 +0300 Subject: [PATCH 0022/1578] NFSv4: Fix memory leak in nfs4_set_security_label We leak nfs_fattr and nfs4_label every time we set a security xattr. Signed-off-by: Dmitry Mastykin Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c93c12063b3af2..94c07875aa3f42 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6268,6 +6268,7 @@ nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen) if (status == 0) nfs_setsecurity(inode, fattr); + nfs_free_fattr(fattr); return status; } #endif /* CONFIG_NFS_V4_SECURITY_LABEL */ From 6cbe14f42be3b596e9590d48e12436982ba26e4b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 22 May 2024 11:27:32 -0400 Subject: [PATCH 0023/1578] MAINTAINERS: Change email address for Trond Myklebust Ensure that patches are sent to an email server that won't corrupt them. Signed-off-by: Trond Myklebust --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 28e20975c26f5c..481d76fbf16ccf 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15549,7 +15549,7 @@ F: drivers/nfc/virtual_ncidev.c F: tools/testing/selftests/nci/ NFS, SUNRPC, AND LOCKD CLIENTS -M: Trond Myklebust +M: Trond Myklebust M: Anna Schumaker L: linux-nfs@vger.kernel.org S: Maintained From 134d0b3f2440cdddd12fc3444c9c0f62331ce6fc Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 21 May 2024 15:58:40 +0300 Subject: [PATCH 0024/1578] nfs: propagate readlink errors in nfs_symlink_filler There is an inherent race where a symlink file may have been overriden (by a different client) between lookup and readlink, resulting in a spurious EIO error returned to userspace. Fix this by propagating back ESTALE errors such that the vfs will retry the lookup/get_link (similar to nfs4_file_open) at least once. Cc: Dan Aloni Signed-off-by: Sagi Grimberg Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/symlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index 0e27a2e4e68b84..13818129d268fe 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -41,7 +41,7 @@ static int nfs_symlink_filler(struct file *file, struct folio *folio) error: folio_set_error(folio); folio_unlock(folio); - return -EIO; + return error; } static const char *nfs_get_link(struct dentry *dentry, From 5134acb15d9ef27aa2b90aad46d4e89fcef79fdc Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 22 May 2024 10:32:43 -0700 Subject: [PATCH 0025/1578] efi/libstub: zboot.lds: Discard .discard sections When building ARCH=loongarch defconfig + CONFIG_UNWINDER_ORC=y using LLVM, there is a warning from ld.lld when linking the EFI zboot image due to the use of unreachable() in number() in vsprintf.c: ld.lld: warning: drivers/firmware/efi/libstub/lib.a(vsprintf.stub.o):(.discard.unreachable+0x0): has non-ABS relocation R_LARCH_32_PCREL against symbol '' If the compiler cannot eliminate the default case for any reason, the .discard.unreachable section will remain in the final binary but the entire point of any section prefixed with .discard is that it is only used at compile time, so it can be discarded via /DISCARD/ in a linker script. The asm-generic vmlinux.lds.h includes .discard and .discard.* in the COMMON_DISCARDS macro but that is not used for zboot.lds, as it is not a kernel image linker script. Add .discard and .discard.* to /DISCARD/ in zboot.lds, so that any sections meant to be discarded at link time are not included in the final zboot image. This issue is not specific to LoongArch, it is just the first architecture to select CONFIG_OBJTOOL, which defines annotate_unreachable() as an asm statement to add the .discard.unreachable section, and use the EFI stub. Closes: https://github.com/ClangBuiltLinux/linux/issues/2023 Signed-off-by: Nathan Chancellor Acked-by: Huacai Chen Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/libstub/zboot.lds | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/firmware/efi/libstub/zboot.lds b/drivers/firmware/efi/libstub/zboot.lds index ac8c0ef851581f..af2c82f7bd9024 100644 --- a/drivers/firmware/efi/libstub/zboot.lds +++ b/drivers/firmware/efi/libstub/zboot.lds @@ -41,6 +41,7 @@ SECTIONS } /DISCARD/ : { + *(.discard .discard.*) *(.modinfo .init.modinfo) } } From 7c23b186ab892088f76a3ad9dbff1685ffe2e832 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Sun, 19 May 2024 13:33:53 -0300 Subject: [PATCH 0026/1578] efi: pstore: Return proper errors on UEFI failures Right now efi-pstore either returns 0 (success) or -EIO; but we do have a function to convert UEFI errors in different standard error codes, helping to narrow down potential issues more accurately. So, let's use this helper here. Signed-off-by: Guilherme G. Piccoli Reviewed-by: Kees Cook Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/efi-pstore.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/firmware/efi/efi-pstore.c b/drivers/firmware/efi/efi-pstore.c index 5b9dc26e6bcb96..552c78f5f05902 100644 --- a/drivers/firmware/efi/efi-pstore.c +++ b/drivers/firmware/efi/efi-pstore.c @@ -136,7 +136,7 @@ static int efi_pstore_read_func(struct pstore_record *record, &size, record->buf); if (status != EFI_SUCCESS) { kfree(record->buf); - return -EIO; + return efi_status_to_err(status); } /* @@ -189,7 +189,7 @@ static ssize_t efi_pstore_read(struct pstore_record *record) return 0; if (status != EFI_SUCCESS) - return -EIO; + return efi_status_to_err(status); /* skip variables that don't concern us */ if (efi_guidcmp(guid, LINUX_EFI_CRASH_GUID)) @@ -227,7 +227,7 @@ static int efi_pstore_write(struct pstore_record *record) record->size, record->psi->buf, true); efivar_unlock(); - return status == EFI_SUCCESS ? 0 : -EIO; + return efi_status_to_err(status); }; static int efi_pstore_erase(struct pstore_record *record) @@ -238,7 +238,7 @@ static int efi_pstore_erase(struct pstore_record *record) PSTORE_EFI_ATTRIBUTES, 0, NULL); if (status != EFI_SUCCESS && status != EFI_NOT_FOUND) - return -EIO; + return efi_status_to_err(status); return 0; } From c898afdc15645efb555acb6d85b484eb40a45409 Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Tue, 21 May 2024 21:13:36 +0900 Subject: [PATCH 0027/1578] 9p: add missing locking around taking dentry fid list Fix a use-after-free on dentry's d_fsdata fid list when a thread looks up a fid through dentry while another thread unlinks it: UAF thread: refcount_t: addition on 0; use-after-free. p9_fid_get linux/./include/net/9p/client.h:262 v9fs_fid_find+0x236/0x280 linux/fs/9p/fid.c:129 v9fs_fid_lookup_with_uid linux/fs/9p/fid.c:181 v9fs_fid_lookup+0xbf/0xc20 linux/fs/9p/fid.c:314 v9fs_vfs_getattr_dotl+0xf9/0x360 linux/fs/9p/vfs_inode_dotl.c:400 vfs_statx+0xdd/0x4d0 linux/fs/stat.c:248 Freed by: p9_fid_destroy (inlined) p9_client_clunk+0xb0/0xe0 linux/net/9p/client.c:1456 p9_fid_put linux/./include/net/9p/client.h:278 v9fs_dentry_release+0xb5/0x140 linux/fs/9p/vfs_dentry.c:55 v9fs_remove+0x38f/0x620 linux/fs/9p/vfs_inode.c:518 vfs_unlink+0x29a/0x810 linux/fs/namei.c:4335 The problem is that d_fsdata was not accessed under d_lock, because d_release() normally is only called once the dentry is otherwise no longer accessible but since we also call it explicitly in v9fs_remove that lock is required: move the hlist out of the dentry under lock then unref its fids once they are no longer accessible. Fixes: 154372e67d40 ("fs/9p: fix create-unlink-getattr idiom") Cc: stable@vger.kernel.org Reported-by: Meysam Firouzi Reported-by: Amirmohammad Eftekhar Reviewed-by: Christian Schoenebeck Message-ID: <20240521122947.1080227-1-asmadeus@codewreck.org> Signed-off-by: Dominique Martinet --- fs/9p/vfs_dentry.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index f16f7358163490..01338d4c2d9e6f 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -48,12 +48,17 @@ static int v9fs_cached_dentry_delete(const struct dentry *dentry) static void v9fs_dentry_release(struct dentry *dentry) { struct hlist_node *p, *n; + struct hlist_head head; p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p)\n", dentry, dentry); - hlist_for_each_safe(p, n, (struct hlist_head *)&dentry->d_fsdata) + + spin_lock(&dentry->d_lock); + hlist_move_list((struct hlist_head *)&dentry->d_fsdata, &head); + spin_unlock(&dentry->d_lock); + + hlist_for_each_safe(p, n, &head) p9_fid_put(hlist_entry(p, struct p9_fid, dlist)); - dentry->d_fsdata = NULL; } static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags) From 9f788ba457b45b0ce422943fcec9fa35c4587764 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 22 May 2024 20:09:49 +0300 Subject: [PATCH 0028/1578] spi: Don't mark message DMA mapped when no transfer in it is There is no need to set the DMA mapped flag of the message if it has no mapped transfers. Moreover, it may give the code a chance to take the wrong paths, i.e. to exercise DMA related APIs on unmapped data. Make __spi_map_msg() to bail earlier on the above mentioned cases. Fixes: 99adef310f68 ("spi: Provide core support for DMA mapping transfers") Signed-off-by: Andy Shevchenko Link: https://msgid.link/r/20240522171018.3362521-2-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- drivers/spi/spi.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index a8966caed84142..d40ce0fdb1a81e 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -1243,6 +1243,7 @@ static int __spi_map_msg(struct spi_controller *ctlr, struct spi_message *msg) else rx_dev = ctlr->dev.parent; + ret = -ENOMSG; list_for_each_entry(xfer, &msg->transfers, transfer_list) { /* The sync is done before each transfer. */ unsigned long attrs = DMA_ATTR_SKIP_CPU_SYNC; @@ -1272,6 +1273,9 @@ static int __spi_map_msg(struct spi_controller *ctlr, struct spi_message *msg) } } } + /* No transfer has been mapped, bail out with success */ + if (ret) + return 0; ctlr->cur_rx_dma_dev = rx_dev; ctlr->cur_tx_dma_dev = tx_dev; From da560097c05612f8d360f86528f6213629b9c395 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 22 May 2024 20:09:50 +0300 Subject: [PATCH 0029/1578] spi: Check if transfer is mapped before calling DMA sync APIs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The resent update to remove the orig_nents checks revealed that not all DMA sync backends can cope with the unallocated SG list, while supplying orig_nents == 0 (the commit 861370f49ce4 ("iommu/dma: force bouncing if the size is not cacheline-aligned"), for example, makes that happen for the IOMMU case). It means we have to check if the buffers are DMA mapped before trying to sync them. Re-introduce that check in a form of calling ->can_dma() in the same way as it's done in the DMA mapping loop for the SPI transfers. Reported-by: Nícolas F. R. A. Prado Reported-by: Neil Armstrong Closes: https://lore.kernel.org/r/8ae675b5-fcf9-4c9b-b06a-4462f70e1322@linaro.org Closes: https://lore.kernel.org/all/d3679496-2e4e-4a7c-97ed-f193bd53af1d@notapiano Fixes: 8cc3bad9d9d6 ("spi: Remove unneded check for orig_nents") Suggested-by: Nícolas F. R. A. Prado Tested-by: Nícolas F. R. A. Prado Signed-off-by: Andy Shevchenko Link: https://msgid.link/r/20240522171018.3362521-3-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- drivers/spi/spi.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index d40ce0fdb1a81e..b18a4c871e217f 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -1311,7 +1311,7 @@ static int __spi_unmap_msg(struct spi_controller *ctlr, struct spi_message *msg) return 0; } -static void spi_dma_sync_for_device(struct spi_controller *ctlr, +static void spi_dma_sync_for_device(struct spi_controller *ctlr, struct spi_message *msg, struct spi_transfer *xfer) { struct device *rx_dev = ctlr->cur_rx_dma_dev; @@ -1320,11 +1320,14 @@ static void spi_dma_sync_for_device(struct spi_controller *ctlr, if (!ctlr->cur_msg_mapped) return; + if (!ctlr->can_dma(ctlr, msg->spi, xfer)) + return; + dma_sync_sgtable_for_device(tx_dev, &xfer->tx_sg, DMA_TO_DEVICE); dma_sync_sgtable_for_device(rx_dev, &xfer->rx_sg, DMA_FROM_DEVICE); } -static void spi_dma_sync_for_cpu(struct spi_controller *ctlr, +static void spi_dma_sync_for_cpu(struct spi_controller *ctlr, struct spi_message *msg, struct spi_transfer *xfer) { struct device *rx_dev = ctlr->cur_rx_dma_dev; @@ -1333,6 +1336,9 @@ static void spi_dma_sync_for_cpu(struct spi_controller *ctlr, if (!ctlr->cur_msg_mapped) return; + if (!ctlr->can_dma(ctlr, msg->spi, xfer)) + return; + dma_sync_sgtable_for_cpu(rx_dev, &xfer->rx_sg, DMA_FROM_DEVICE); dma_sync_sgtable_for_cpu(tx_dev, &xfer->tx_sg, DMA_TO_DEVICE); } @@ -1350,11 +1356,13 @@ static inline int __spi_unmap_msg(struct spi_controller *ctlr, } static void spi_dma_sync_for_device(struct spi_controller *ctrl, + struct spi_message *msg, struct spi_transfer *xfer) { } static void spi_dma_sync_for_cpu(struct spi_controller *ctrl, + struct spi_message *msg, struct spi_transfer *xfer) { } @@ -1626,10 +1634,10 @@ static int spi_transfer_one_message(struct spi_controller *ctlr, reinit_completion(&ctlr->xfer_completion); fallback_pio: - spi_dma_sync_for_device(ctlr, xfer); + spi_dma_sync_for_device(ctlr, msg, xfer); ret = ctlr->transfer_one(ctlr, msg->spi, xfer); if (ret < 0) { - spi_dma_sync_for_cpu(ctlr, xfer); + spi_dma_sync_for_cpu(ctlr, msg, xfer); if (ctlr->cur_msg_mapped && (xfer->error & SPI_TRANS_FAIL_NO_START)) { @@ -1654,7 +1662,7 @@ static int spi_transfer_one_message(struct spi_controller *ctlr, msg->status = ret; } - spi_dma_sync_for_cpu(ctlr, xfer); + spi_dma_sync_for_cpu(ctlr, msg, xfer); } else { if (xfer->len) dev_err(&msg->spi->dev, From a827ad9b3c2fc243e058595533f91ce41a312527 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 23 May 2024 12:33:25 +0200 Subject: [PATCH 0030/1578] spi: stm32: Revert change that enabled controller before asserting CS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On stm32mp157 enabling the controller before asserting CS makes the hardware trigger spurious interrupts in a tight loop and the transfers fail. Revert the commit that swapped the order of enable and CS. This reintroduces the problem that swapping was supposed to fix, which however is less grave. Reported-by: Leonard Göhrs Link: https://lore.kernel.org/all/39033ed7-3e57-4339-80b4-fc8919e26aa7@pengutronix.de/ Fixes: 52b62e7a5d4f ("spi: stm32: enable controller before asserting CS") Signed-off-by: Uwe Kleine-König Link: https://msgid.link/r/20240523103326.792907-2-u.kleine-koenig@pengutronix.de Signed-off-by: Mark Brown --- drivers/spi/spi-stm32.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-stm32.c b/drivers/spi/spi-stm32.c index 4a68abcdcc3535..e4e7ddb7524a99 100644 --- a/drivers/spi/spi-stm32.c +++ b/drivers/spi/spi-stm32.c @@ -1016,8 +1016,10 @@ static irqreturn_t stm32fx_spi_irq_event(int irq, void *dev_id) static irqreturn_t stm32fx_spi_irq_thread(int irq, void *dev_id) { struct spi_controller *ctrl = dev_id; + struct stm32_spi *spi = spi_controller_get_devdata(ctrl); spi_finalize_current_transfer(ctrl); + stm32fx_spi_disable(spi); return IRQ_HANDLED; } @@ -1185,8 +1187,6 @@ static int stm32_spi_prepare_msg(struct spi_controller *ctrl, ~clrb) | setb, spi->base + spi->cfg->regs->cpol.reg); - stm32_spi_enable(spi); - spin_unlock_irqrestore(&spi->lock, flags); return 0; @@ -1204,6 +1204,7 @@ static void stm32fx_spi_dma_tx_cb(void *data) if (spi->cur_comm == SPI_SIMPLEX_TX || spi->cur_comm == SPI_3WIRE_TX) { spi_finalize_current_transfer(spi->ctrl); + stm32fx_spi_disable(spi); } } @@ -1218,6 +1219,7 @@ static void stm32_spi_dma_rx_cb(void *data) struct stm32_spi *spi = data; spi_finalize_current_transfer(spi->ctrl); + spi->cfg->disable(spi); } /** @@ -1305,6 +1307,8 @@ static int stm32fx_spi_transfer_one_irq(struct stm32_spi *spi) stm32_spi_set_bits(spi, STM32FX_SPI_CR2, cr2); + stm32_spi_enable(spi); + /* starting data transfer when buffer is loaded */ if (spi->tx_buf) spi->cfg->write_tx(spi); @@ -1341,6 +1345,8 @@ static int stm32h7_spi_transfer_one_irq(struct stm32_spi *spi) spin_lock_irqsave(&spi->lock, flags); + stm32_spi_enable(spi); + /* Be sure to have data in fifo before starting data transfer */ if (spi->tx_buf) stm32h7_spi_write_txfifo(spi); @@ -1372,6 +1378,8 @@ static void stm32fx_spi_transfer_one_dma_start(struct stm32_spi *spi) */ stm32_spi_set_bits(spi, STM32FX_SPI_CR2, STM32FX_SPI_CR2_ERRIE); } + + stm32_spi_enable(spi); } /** @@ -1405,6 +1413,8 @@ static void stm32h7_spi_transfer_one_dma_start(struct stm32_spi *spi) stm32_spi_set_bits(spi, STM32H7_SPI_IER, ier); + stm32_spi_enable(spi); + if (STM32_SPI_HOST_MODE(spi)) stm32_spi_set_bits(spi, STM32H7_SPI_CR1, STM32H7_SPI_CR1_CSTART); } From e6722ea6b9ed731f7392277d76ca912dfffca7ee Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 6 Jan 2024 13:48:25 +0100 Subject: [PATCH 0031/1578] i2c: synquacer: Remove a clk reference from struct synquacer_i2c 'pclk' is only used locally in the probe. Remove it from the 'synquacer_i2c' structure. Also remove a useless debug message. Signed-off-by: Christophe JAILLET Acked-by: Ard Biesheuvel Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-synquacer.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/i2c/busses/i2c-synquacer.c b/drivers/i2c/busses/i2c-synquacer.c index 31ecb2c7e97835..4eccbcd0fbfc00 100644 --- a/drivers/i2c/busses/i2c-synquacer.c +++ b/drivers/i2c/busses/i2c-synquacer.c @@ -138,7 +138,6 @@ struct synquacer_i2c { int irq; struct device *dev; void __iomem *base; - struct clk *pclk; u32 pclkrate; u32 speed_khz; u32 timeout_ms; @@ -535,6 +534,7 @@ static const struct i2c_adapter synquacer_i2c_ops = { static int synquacer_i2c_probe(struct platform_device *pdev) { struct synquacer_i2c *i2c; + struct clk *pclk; u32 bus_speed; int ret; @@ -550,13 +550,12 @@ static int synquacer_i2c_probe(struct platform_device *pdev) device_property_read_u32(&pdev->dev, "socionext,pclk-rate", &i2c->pclkrate); - i2c->pclk = devm_clk_get_enabled(&pdev->dev, "pclk"); - if (IS_ERR(i2c->pclk)) - return dev_err_probe(&pdev->dev, PTR_ERR(i2c->pclk), + pclk = devm_clk_get_enabled(&pdev->dev, "pclk"); + if (IS_ERR(pclk)) + return dev_err_probe(&pdev->dev, PTR_ERR(pclk), "failed to get and enable clock\n"); - dev_dbg(&pdev->dev, "clock source %p\n", i2c->pclk); - i2c->pclkrate = clk_get_rate(i2c->pclk); + i2c->pclkrate = clk_get_rate(pclk); if (i2c->pclkrate < SYNQUACER_I2C_MIN_CLK_RATE || i2c->pclkrate > SYNQUACER_I2C_MAX_CLK_RATE) From e61bcf42d290e73025bab38e0e55a5586c2d8ad5 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 15 Apr 2024 22:50:27 +0200 Subject: [PATCH 0032/1578] i2c: Remove I2C_CLASS_SPD Remove this class after all users have been gone. Signed-off-by: Heiner Kallweit Signed-off-by: Andi Shyti --- include/linux/i2c.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 5e6cd43a6dbdd9..9709537370ee9b 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -852,7 +852,6 @@ static inline void i2c_mark_adapter_resumed(struct i2c_adapter *adap) /* i2c adapter classes (bitmask) */ #define I2C_CLASS_HWMON (1<<0) /* lm_sensors, ... */ -#define I2C_CLASS_SPD (1<<7) /* Memory modules */ /* Warn users that the adapter doesn't support classes anymore */ #define I2C_CLASS_DEPRECATED (1<<8) From 2360497238261f17d4a3f6cbc02d6dbd8951c23c Mon Sep 17 00:00:00 2001 From: Zhang Lixu Date: Thu, 23 May 2024 09:14:01 +0800 Subject: [PATCH 0033/1578] HID: intel-ish-hid: Fix build error for COMPILE_TEST kernel test robot reported build error due to a pointer type mismatch: .../ishtp/loader.c:172:8: error: incompatible pointer types passing '__le64 *' (aka 'unsigned long long *') to parameter of type 'dma_addr_t *' (aka 'unsigned int *') The issue arises because the driver, which is primarily intended for x86-64, is also built for i386 when COMPILE_TEST is enabled. Resolve type mismatch by using a temporary dma_addr_t variable to hold the DMA address. Populate this temporary variable in dma_alloc_coherent() function, and then convert and store the address in the fragment->fragment_tbl[i].ddr_adrs field in the correct format. Similarly, convert the ddr_adrs field back to dma_addr_t when freeing the DMA buffer with dma_free_coherent(). Fixes: 579a267e4617 ("HID: intel-ish-hid: Implement loading firmware from host feature") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202405201313.SAStVPrT-lkp@intel.com/ Signed-off-by: Zhang Lixu Signed-off-by: Jiri Kosina --- drivers/hid/intel-ish-hid/ishtp/loader.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/hid/intel-ish-hid/ishtp/loader.c b/drivers/hid/intel-ish-hid/ishtp/loader.c index 993f8b390e57ad..2785b04a2f5a0b 100644 --- a/drivers/hid/intel-ish-hid/ishtp/loader.c +++ b/drivers/hid/intel-ish-hid/ishtp/loader.c @@ -138,12 +138,13 @@ static void release_dma_bufs(struct ishtp_device *dev, struct loader_xfer_dma_fragment *fragment, void **dma_bufs, u32 fragment_size) { + dma_addr_t dma_addr; int i; for (i = 0; i < FRAGMENT_MAX_NUM; i++) { if (dma_bufs[i]) { - dma_free_coherent(dev->devc, fragment_size, dma_bufs[i], - fragment->fragment_tbl[i].ddr_adrs); + dma_addr = le64_to_cpu(fragment->fragment_tbl[i].ddr_adrs); + dma_free_coherent(dev->devc, fragment_size, dma_bufs[i], dma_addr); dma_bufs[i] = NULL; } } @@ -164,15 +165,16 @@ static int prepare_dma_bufs(struct ishtp_device *dev, struct loader_xfer_dma_fragment *fragment, void **dma_bufs, u32 fragment_size) { + dma_addr_t dma_addr; u32 offset = 0; int i; for (i = 0; i < fragment->fragment_cnt && offset < ish_fw->size; i++) { - dma_bufs[i] = dma_alloc_coherent(dev->devc, fragment_size, - &fragment->fragment_tbl[i].ddr_adrs, GFP_KERNEL); + dma_bufs[i] = dma_alloc_coherent(dev->devc, fragment_size, &dma_addr, GFP_KERNEL); if (!dma_bufs[i]) return -ENOMEM; + fragment->fragment_tbl[i].ddr_adrs = cpu_to_le64(dma_addr); fragment->fragment_tbl[i].length = clamp(ish_fw->size - offset, 0, fragment_size); fragment->fragment_tbl[i].fw_off = offset; memcpy(dma_bufs[i], ish_fw->data + offset, fragment->fragment_tbl[i].length); From 0a3f9f7fc59feb8a91a2793b8b60977895c72365 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Wed, 15 May 2024 11:30:51 +0800 Subject: [PATCH 0034/1578] HID: nvidia-shield: Add missing check for input_ff_create_memless Add check for the return value of input_ff_create_memless() and return the error if it fails in order to catch the error. Fixes: 09308562d4af ("HID: nvidia-shield: Initial driver implementation with Thunderstrike support") Signed-off-by: Chen Ni Reviewed-by: Rahul Rameshbabu Signed-off-by: Jiri Kosina --- drivers/hid/hid-nvidia-shield.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/hid/hid-nvidia-shield.c b/drivers/hid/hid-nvidia-shield.c index 58b15750dbb0ac..ff9078ad196112 100644 --- a/drivers/hid/hid-nvidia-shield.c +++ b/drivers/hid/hid-nvidia-shield.c @@ -283,7 +283,9 @@ static struct input_dev *shield_haptics_create( return haptics; input_set_capability(haptics, EV_FF, FF_RUMBLE); - input_ff_create_memless(haptics, NULL, play_effect); + ret = input_ff_create_memless(haptics, NULL, play_effect); + if (ret) + goto err; ret = input_register_device(haptics); if (ret) From ed281c6ab6eb8a914f06c74dfeaebde15b34a3f4 Mon Sep 17 00:00:00 2001 From: Carl Huang Date: Tue, 21 May 2024 11:08:10 +0300 Subject: [PATCH 0035/1578] wifi: ath11k: fix WCN6750 firmware crash caused by 17 num_vdevs WCN6750 firmware crashes because of num_vdevs changed from 4 to 17 in ath11k_init_wmi_config_qca6390() as the ab->hw_params.num_vdevs is 17. This is caused by commit f019f4dff2e4 ("wifi: ath11k: support 2 station interfaces") which assigns ab->hw_params.num_vdevs directly to config->num_vdevs in ath11k_init_wmi_config_qca6390(), therefore WCN6750 firmware crashes as it can't support such a big num_vdevs. Fix it by assign 3 to num_vdevs in hw_params for WCN6750 as 3 is sufficient too. Tested-on: WCN6855 hw2.0 PCI WLAN.HSP.1.1-03125-QCAHSPSWPL_V1_V2_SILICONZ_LITE-3 Tested-on: WCN6750 hw1.0 AHB WLAN.MSL.1.0.1-01371-QCAMSLSWPLZ-1 Fixes: f019f4dff2e4 ("wifi: ath11k: support 2 station interfaces") Reported-by: Luca Weiss Tested-by: Luca Weiss Closes: https://lore.kernel.org/r/D15TIIDIIESY.D1EKKJLZINMA@fairphone.com/ Signed-off-by: Carl Huang Signed-off-by: Kalle Valo Link: https://msgid.link/20240520030757.2209395-1-quic_cjhuang@quicinc.com --- drivers/net/wireless/ath/ath11k/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath11k/core.c b/drivers/net/wireless/ath/ath11k/core.c index 3cc817a3b4a404..b82e8fb2854130 100644 --- a/drivers/net/wireless/ath/ath11k/core.c +++ b/drivers/net/wireless/ath/ath11k/core.c @@ -604,7 +604,7 @@ static const struct ath11k_hw_params ath11k_hw_params[] = { .coldboot_cal_ftm = true, .cbcal_restart_fw = false, .fw_mem_mode = 0, - .num_vdevs = 16 + 1, + .num_vdevs = 3, .num_peers = 512, .supports_suspend = false, .hal_desc_sz = sizeof(struct hal_rx_desc_qcn9074), From 6e16782d6b4a724f9c9dcd49471219643593b60c Mon Sep 17 00:00:00 2001 From: Baochen Qiang Date: Tue, 21 May 2024 11:08:11 +0300 Subject: [PATCH 0036/1578] wifi: ath11k: move power type check to ASSOC stage when connecting to 6 GHz AP With commit bc8a0fac8677 ("wifi: mac80211: don't set bss_conf in parsing") ath11k fails to connect to 6 GHz AP. This is because currently ath11k checks AP's power type in ath11k_mac_op_assign_vif_chanctx() which would be called in AUTH stage. However with above commit power type is not available until ASSOC stage. As a result power type check fails and therefore connection fails. Fix this by moving power type check to ASSOC stage, also move regulatory rules update there because it depends on power type. Tested-on: WCN6855 hw2.0 PCI WLAN.HSP.1.1-03125-QCAHSPSWPL_V1_V2_SILICONZ_LITE-3.6510.30 Fixes: bc8a0fac8677 ("wifi: mac80211: don't set bss_conf in parsing") Signed-off-by: Baochen Qiang Acked-by: Jeff Johnson Signed-off-by: Kalle Valo Link: https://msgid.link/20240424064019.4847-1-quic_bqiang@quicinc.com --- drivers/net/wireless/ath/ath11k/mac.c | 38 ++++++++++++++++++--------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index 4f62e38ba48b3a..9b96dbb21d8336 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -7988,8 +7988,6 @@ ath11k_mac_op_assign_vif_chanctx(struct ieee80211_hw *hw, struct ath11k_base *ab = ar->ab; struct ath11k_vif *arvif = ath11k_vif_to_arvif(vif); int ret; - struct cur_regulatory_info *reg_info; - enum ieee80211_ap_reg_power power_type; mutex_lock(&ar->conf_mutex); @@ -8000,17 +7998,6 @@ ath11k_mac_op_assign_vif_chanctx(struct ieee80211_hw *hw, if (ath11k_wmi_supports_6ghz_cc_ext(ar) && ctx->def.chan->band == NL80211_BAND_6GHZ && arvif->vdev_type == WMI_VDEV_TYPE_STA) { - reg_info = &ab->reg_info_store[ar->pdev_idx]; - power_type = vif->bss_conf.power_type; - - ath11k_dbg(ab, ATH11K_DBG_MAC, "chanctx power type %d\n", power_type); - - if (power_type == IEEE80211_REG_UNSET_AP) { - ret = -EINVAL; - goto out; - } - - ath11k_reg_handle_chan_list(ab, reg_info, power_type); arvif->chanctx = *ctx; ath11k_mac_parse_tx_pwr_env(ar, vif, ctx); } @@ -9626,6 +9613,8 @@ static int ath11k_mac_op_sta_state(struct ieee80211_hw *hw, struct ath11k *ar = hw->priv; struct ath11k_vif *arvif = ath11k_vif_to_arvif(vif); struct ath11k_sta *arsta = ath11k_sta_to_arsta(sta); + enum ieee80211_ap_reg_power power_type; + struct cur_regulatory_info *reg_info; struct ath11k_peer *peer; int ret = 0; @@ -9705,6 +9694,29 @@ static int ath11k_mac_op_sta_state(struct ieee80211_hw *hw, ath11k_warn(ar->ab, "Unable to authorize peer %pM vdev %d: %d\n", sta->addr, arvif->vdev_id, ret); } + + if (!ret && + ath11k_wmi_supports_6ghz_cc_ext(ar) && + arvif->vdev_type == WMI_VDEV_TYPE_STA && + arvif->chanctx.def.chan && + arvif->chanctx.def.chan->band == NL80211_BAND_6GHZ) { + reg_info = &ar->ab->reg_info_store[ar->pdev_idx]; + power_type = vif->bss_conf.power_type; + + if (power_type == IEEE80211_REG_UNSET_AP) { + ath11k_warn(ar->ab, "invalid power type %d\n", + power_type); + ret = -EINVAL; + } else { + ret = ath11k_reg_handle_chan_list(ar->ab, + reg_info, + power_type); + if (ret) + ath11k_warn(ar->ab, + "failed to handle chan list with power type %d\n", + power_type); + } + } } else if (old_state == IEEE80211_STA_AUTHORIZED && new_state == IEEE80211_STA_ASSOC) { spin_lock_bh(&ar->ab->base_lock); From bb9025f4432f8c158322cf2c04c2b492f23eb511 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Sat, 4 May 2024 14:47:01 +0300 Subject: [PATCH 0037/1578] dma-mapping: benchmark: fix up kthread-related error handling kthread creation failure is invalidly handled inside do_map_benchmark(). The put_task_struct() calls on the error path are supposed to balance the get_task_struct() calls which only happen after all the kthreads are successfully created. Rollback using kthread_stop() for already created kthreads in case of such failure. In normal situation call kthread_stop_put() to gracefully stop kthreads and put their task refcounts. This should be done for all started kthreads. Found by Linux Verification Center (linuxtesting.org). Fixes: 65789daa8087 ("dma-mapping: add benchmark support for streaming DMA APIs") Suggested-by: Robin Murphy Signed-off-by: Fedor Pchelkin Reviewed-by: Robin Murphy Signed-off-by: Christoph Hellwig --- kernel/dma/map_benchmark.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index 02205ab53b7e93..2478957cf9f839 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -118,6 +118,8 @@ static int do_map_benchmark(struct map_benchmark_data *map) if (IS_ERR(tsk[i])) { pr_err("create dma_map thread failed\n"); ret = PTR_ERR(tsk[i]); + while (--i >= 0) + kthread_stop(tsk[i]); goto out; } @@ -139,13 +141,17 @@ static int do_map_benchmark(struct map_benchmark_data *map) msleep_interruptible(map->bparam.seconds * 1000); - /* wait for the completion of benchmark threads */ + /* wait for the completion of all started benchmark threads */ for (i = 0; i < threads; i++) { - ret = kthread_stop(tsk[i]); - if (ret) - goto out; + int kthread_ret = kthread_stop_put(tsk[i]); + + if (kthread_ret) + ret = kthread_ret; } + if (ret) + goto out; + loops = atomic64_read(&map->loops); if (likely(loops > 0)) { u64 map_variance, unmap_variance; @@ -170,8 +176,6 @@ static int do_map_benchmark(struct map_benchmark_data *map) } out: - for (i = 0; i < threads; i++) - put_task_struct(tsk[i]); put_device(map->dev); kfree(tsk); return ret; From f7c9ccaadffd13066353332c13d7e9bf73b8f92d Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Sat, 4 May 2024 14:47:02 +0300 Subject: [PATCH 0038/1578] dma-mapping: benchmark: avoid needless copy_to_user if benchmark fails If do_map_benchmark() has failed, there is nothing useful to copy back to userspace. Suggested-by: Barry Song <21cnbao@gmail.com> Signed-off-by: Fedor Pchelkin Acked-by: Robin Murphy Signed-off-by: Christoph Hellwig --- kernel/dma/map_benchmark.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index 2478957cf9f839..a6edb1ef98c8a6 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -256,6 +256,9 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd, * dma_mask changed by benchmark */ dma_set_mask(map->dev, old_dma_mask); + + if (ret) + return ret; break; default: return -EINVAL; From 1ff05e723f7ca30644b8ec3fb093f16312e408ad Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Sat, 4 May 2024 14:47:03 +0300 Subject: [PATCH 0039/1578] dma-mapping: benchmark: fix node id validation While validating node ids in map_benchmark_ioctl(), node_possible() may be provided with invalid argument outside of [0,MAX_NUMNODES-1] range leading to: BUG: KASAN: wild-memory-access in map_benchmark_ioctl (kernel/dma/map_benchmark.c:214) Read of size 8 at addr 1fffffff8ccb6398 by task dma_map_benchma/971 CPU: 7 PID: 971 Comm: dma_map_benchma Not tainted 6.9.0-rc6 #37 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) Call Trace: dump_stack_lvl (lib/dump_stack.c:117) kasan_report (mm/kasan/report.c:603) kasan_check_range (mm/kasan/generic.c:189) variable_test_bit (arch/x86/include/asm/bitops.h:227) [inline] arch_test_bit (arch/x86/include/asm/bitops.h:239) [inline] _test_bit at (include/asm-generic/bitops/instrumented-non-atomic.h:142) [inline] node_state (include/linux/nodemask.h:423) [inline] map_benchmark_ioctl (kernel/dma/map_benchmark.c:214) full_proxy_unlocked_ioctl (fs/debugfs/file.c:333) __x64_sys_ioctl (fs/ioctl.c:890) do_syscall_64 (arch/x86/entry/common.c:83) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) Compare node ids with sane bounds first. NUMA_NO_NODE is considered a special valid case meaning that benchmarking kthreads won't be bound to a cpuset of a given node. Found by Linux Verification Center (linuxtesting.org). Fixes: 65789daa8087 ("dma-mapping: add benchmark support for streaming DMA APIs") Signed-off-by: Fedor Pchelkin Reviewed-by: Robin Murphy Signed-off-by: Christoph Hellwig --- kernel/dma/map_benchmark.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index a6edb1ef98c8a6..9f6c15f3f16811 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -212,7 +212,8 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd, } if (map->bparam.node != NUMA_NO_NODE && - !node_possible(map->bparam.node)) { + (map->bparam.node < 0 || map->bparam.node >= MAX_NUMNODES || + !node_possible(map->bparam.node))) { pr_err("invalid numa node\n"); return -EINVAL; } From e64746e74f717961250a155e14c156616fcd981f Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Sat, 4 May 2024 14:47:04 +0300 Subject: [PATCH 0040/1578] dma-mapping: benchmark: handle NUMA_NO_NODE correctly cpumask_of_node() can be called for NUMA_NO_NODE inside do_map_benchmark() resulting in the following sanitizer report: UBSAN: array-index-out-of-bounds in ./arch/x86/include/asm/topology.h:72:28 index -1 is out of range for type 'cpumask [64][1]' CPU: 1 PID: 990 Comm: dma_map_benchma Not tainted 6.9.0-rc6 #29 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) Call Trace: dump_stack_lvl (lib/dump_stack.c:117) ubsan_epilogue (lib/ubsan.c:232) __ubsan_handle_out_of_bounds (lib/ubsan.c:429) cpumask_of_node (arch/x86/include/asm/topology.h:72) [inline] do_map_benchmark (kernel/dma/map_benchmark.c:104) map_benchmark_ioctl (kernel/dma/map_benchmark.c:246) full_proxy_unlocked_ioctl (fs/debugfs/file.c:333) __x64_sys_ioctl (fs/ioctl.c:890) do_syscall_64 (arch/x86/entry/common.c:83) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) Use cpumask_of_node() in place when binding a kernel thread to a cpuset of a particular node. Note that the provided node id is checked inside map_benchmark_ioctl(). It's just a NUMA_NO_NODE case which is not handled properly later. Found by Linux Verification Center (linuxtesting.org). Fixes: 65789daa8087 ("dma-mapping: add benchmark support for streaming DMA APIs") Signed-off-by: Fedor Pchelkin Acked-by: Barry Song Signed-off-by: Christoph Hellwig --- kernel/dma/map_benchmark.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index 9f6c15f3f16811..4950e0b622b1f3 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -101,7 +101,6 @@ static int do_map_benchmark(struct map_benchmark_data *map) struct task_struct **tsk; int threads = map->bparam.threads; int node = map->bparam.node; - const cpumask_t *cpu_mask = cpumask_of_node(node); u64 loops; int ret = 0; int i; @@ -124,7 +123,7 @@ static int do_map_benchmark(struct map_benchmark_data *map) } if (node != NUMA_NO_NODE) - kthread_bind_mask(tsk[i], cpu_mask); + kthread_bind_mask(tsk[i], cpumask_of_node(node)); } /* clear the old value in the previous benchmark */ From 803482f472ccc9576c0e606143725d1d8c61019d Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Fri, 17 May 2024 17:04:22 -0700 Subject: [PATCH 0041/1578] KVM: x86/mmu: Use SHADOW_NONPRESENT_VALUE for atomic zap in TDP MMU Use SHADOW_NONPRESENT_VALUE when zapping TDP MMU SPTEs with mmu_lock held for read, tdp_mmu_zap_spte_atomic() was simply missed during the initial development. Fixes: 7f01cab84928 ("KVM: x86/mmu: Allow non-zero value for non-present SPTE and removed SPTE") Signed-off-by: Isaku Yamahata [sean: write changelog] Signed-off-by: Sean Christopherson Reviewed-by: Kai Huang Message-ID: <20240518000430.1118488-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 1259dd63defc8e..36539c1b36cd63 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -626,7 +626,7 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, * SPTEs. */ handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, - 0, iter->level, true); + SHADOW_NONPRESENT_VALUE, iter->level, true); return 0; } From 40e8a6901a2c983e3e541a363865e02cf591d458 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 May 2024 17:04:28 -0700 Subject: [PATCH 0042/1578] KVM: VMX: Don't kill the VM on an unexpected #VE Don't terminate the VM on an unexpected #VE, as it's extremely unlikely the #VE is fatal to the guest, and even less likely that it presents a danger to the host. Simply resume the guest on "failure", as the #VE info page's BUSY field will prevent converting any more EPT Violations to #VEs for the vCPU (at least, that's what the BUSY field is supposed to do). Signed-off-by: Sean Christopherson Message-ID: <20240518000430.1118488-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 51b2cd13250a2d..069fbbc1e04a00 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5217,8 +5217,8 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) if (is_invalid_opcode(intr_info)) return handle_ud(vcpu); - if (KVM_BUG_ON(is_ve_fault(intr_info), vcpu->kvm)) - return -EIO; + if (WARN_ON_ONCE(is_ve_fault(intr_info))) + return 1; error_code = 0; if (intr_info & INTR_INFO_DELIVER_CODE_MASK) From d1b32ecdc8ad0346ac551866d9f6831995fd70de Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 May 2024 17:04:23 -0700 Subject: [PATCH 0043/1578] KVM: nVMX: Initialize #VE info page for vmcs02 when proving #VE support Point vmcs02.VE_INFORMATION_ADDRESS at the vCPU's #VE info page when initializing vmcs02, otherwise KVM will run L2 with EPT Violation #VE enabled and a VE info address pointing at pfn 0. Fixes: 8131cf5b4fd8 ("KVM: VMX: Introduce test mode related to EPT violation VE") Signed-off-by: Sean Christopherson Message-ID: <20240518000430.1118488-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d5b832126e3458..6798fadaa33502 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2242,6 +2242,9 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); + if (vmx->ve_info) + vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info)); + /* All VMFUNCs are currently emulated through L0 vmexits. */ if (cpu_has_vmx_vmfunc()) vmcs_write64(VM_FUNCTION_CONTROL, 0); From 9031b42139b9d45ef806c9a7fee166c1b6443c3c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 May 2024 17:04:24 -0700 Subject: [PATCH 0044/1578] KVM: nVMX: Always handle #VEs in L0 (never forward #VEs from L2 to L1) Always handle #VEs, e.g. due to prove EPT Violation #VE failures, in L0, as KVM does not expose any #VE capabilities to L1, i.e. any and all #VEs are KVM's responsibility. Fixes: 8131cf5b4fd8 ("KVM: VMX: Introduce test mode related to EPT violation VE") Signed-off-by: Sean Christopherson Message-ID: <20240518000430.1118488-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 6798fadaa33502..643935a0f70ab7 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -6233,6 +6233,8 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, else if (is_alignment_check(intr_info) && !vmx_guest_inject_ac(vcpu)) return true; + else if (is_ve_fault(intr_info)) + return true; return false; case EXIT_REASON_EXTERNAL_INTERRUPT: return true; From 837d557aba6b816985141ddbeb7649444ed26d3b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 May 2024 17:04:25 -0700 Subject: [PATCH 0045/1578] KVM: x86/mmu: Add sanity checks that KVM doesn't create EPT #VE SPTEs Assert that KVM doesn't set a SPTE to a value that could trigger an EPT Violation #VE on a non-MMIO SPTE, e.g. to help detect bugs even without KVM_INTEL_PROVE_VE enabled, and to help debug actual #VE failures. Note, this will run afoul of TDX support, which needs to reflect emulated MMIO accesses into the guest as #VEs (which was the whole point of adding EPT Violation #VE support in KVM). The obvious fix for that is to exempt MMIO SPTEs, but that's annoyingly difficult now that is_mmio_spte() relies on a per-VM value. However, resolving that conundrum is a future problem, whereas getting KVM_INTEL_PROVE_VE healthy is a current problem. Signed-off-by: Sean Christopherson Message-ID: <20240518000430.1118488-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 3 +++ arch/x86/kvm/mmu/spte.h | 9 +++++++++ arch/x86/kvm/mmu/tdp_iter.h | 2 ++ 3 files changed, 14 insertions(+) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 662f62dfb2aa9f..1ae1c1328268b8 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -336,16 +336,19 @@ static int is_cpuid_PSE36(void) #ifdef CONFIG_X86_64 static void __set_spte(u64 *sptep, u64 spte) { + KVM_MMU_WARN_ON(is_ept_ve_possible(spte)); WRITE_ONCE(*sptep, spte); } static void __update_clear_spte_fast(u64 *sptep, u64 spte) { + KVM_MMU_WARN_ON(is_ept_ve_possible(spte)); WRITE_ONCE(*sptep, spte); } static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) { + KVM_MMU_WARN_ON(is_ept_ve_possible(spte)); return xchg(sptep, spte); } diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 5dd5405fa07ac9..52fa004a1fbc9f 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -3,6 +3,8 @@ #ifndef KVM_X86_MMU_SPTE_H #define KVM_X86_MMU_SPTE_H +#include + #include "mmu.h" #include "mmu_internal.h" @@ -276,6 +278,13 @@ static inline bool is_shadow_present_pte(u64 pte) return !!(pte & SPTE_MMU_PRESENT_MASK); } +static inline bool is_ept_ve_possible(u64 spte) +{ + return (shadow_present_mask & VMX_EPT_SUPPRESS_VE_BIT) && + !(spte & VMX_EPT_SUPPRESS_VE_BIT) && + (spte & VMX_EPT_RWX_MASK) != VMX_EPT_MISCONFIG_WX_VALUE; +} + /* * Returns true if A/D bits are supported in hardware and are enabled by KVM. * When enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index fae559559a806a..2880fd392e0cbb 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -21,11 +21,13 @@ static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep) static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte) { + KVM_MMU_WARN_ON(is_ept_ve_possible(new_spte)); return xchg(rcu_dereference(sptep), new_spte); } static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte) { + KVM_MMU_WARN_ON(is_ept_ve_possible(new_spte)); WRITE_ONCE(*rcu_dereference(sptep), new_spte); } From 743f1773366461cb44de297b1caf0a4292eb8fda Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 May 2024 17:04:26 -0700 Subject: [PATCH 0046/1578] KVM: VMX: Dump VMCS on unexpected #VE Dump the VMCS on an unexpected #VE, otherwise it's practically impossible to figure out why the #VE occurred. Signed-off-by: Sean Christopherson Message-ID: <20240518000430.1118488-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 069fbbc1e04a00..a02b2720023a3f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5217,8 +5217,10 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) if (is_invalid_opcode(intr_info)) return handle_ud(vcpu); - if (WARN_ON_ONCE(is_ve_fault(intr_info))) + if (WARN_ON_ONCE(is_ve_fault(intr_info))) { + dump_vmcs(vcpu); return 1; + } error_code = 0; if (intr_info & INTR_INFO_DELIVER_CODE_MASK) From bca99c0356524a620126b789edbaf25934415467 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 May 2024 17:04:27 -0700 Subject: [PATCH 0047/1578] KVM: x86/mmu: Print SPTEs on unexpected #VE Print the SPTEs that correspond to the faulting GPA on an unexpected EPT Violation #VE to help the user debug failures, e.g. to pinpoint which SPTE didn't have SUPPRESS_VE set. Opportunistically assert that the underlying exit reason was indeed an EPT Violation, as the CPU has *really* gone off the rails if a #VE occurs due to a completely unexpected exit reason. Signed-off-by: Sean Christopherson Message-ID: <20240518000430.1118488-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu/mmu.c | 40 ++++++++++++++++++++++++++------- arch/x86/kvm/vmx/vmx.c | 5 +++++ 3 files changed, 38 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ece45b3f6f2073..f8ca74e7678f3a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2154,6 +2154,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len); +void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, u64 addr, unsigned long roots); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 1ae1c1328268b8..b7b4426a7221df 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4104,23 +4104,31 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level return leaf; } -/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */ -static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) +static int get_sptes_lockless(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + int *root_level) { - u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; - struct rsvd_bits_validate *rsvd_check; - int root, leaf, level; - bool reserved = false; + int leaf; walk_shadow_page_lockless_begin(vcpu); if (is_tdp_mmu_active(vcpu)) - leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); + leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root_level); else - leaf = get_walk(vcpu, addr, sptes, &root); + leaf = get_walk(vcpu, addr, sptes, root_level); walk_shadow_page_lockless_end(vcpu); + return leaf; +} +/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */ +static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) +{ + u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; + struct rsvd_bits_validate *rsvd_check; + int root, leaf, level; + bool reserved = false; + + leaf = get_sptes_lockless(vcpu, addr, sptes, &root); if (unlikely(leaf < 0)) { *sptep = 0ull; return reserved; @@ -5924,6 +5932,22 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); +void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg) +{ + u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; + int root_level, leaf, level; + + leaf = get_sptes_lockless(vcpu, gpa, sptes, &root_level); + if (unlikely(leaf < 0)) + return; + + pr_err("%s %llx", msg, gpa); + for (level = root_level; level >= leaf; level--) + pr_cont(", spte[%d] = 0x%llx", level, sptes[level]); + pr_cont("\n"); +} +EXPORT_SYMBOL_GPL(kvm_mmu_print_sptes); + static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, u64 addr, hpa_t root_hpa) { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a02b2720023a3f..58832aae224872 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5218,7 +5218,12 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) return handle_ud(vcpu); if (WARN_ON_ONCE(is_ve_fault(intr_info))) { + struct vmx_ve_information *ve_info = vmx->ve_info; + + WARN_ONCE(ve_info->exit_reason != EXIT_REASON_EPT_VIOLATION, + "Unexpected #VE on VM-Exit reason 0x%x", ve_info->exit_reason); dump_vmcs(vcpu); + kvm_mmu_print_sptes(vcpu, ve_info->guest_physical_address, "#VE"); return 1; } From a5dc0c9b557573315633bc78bacf8f548352f95b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 May 2024 17:04:29 -0700 Subject: [PATCH 0048/1578] KVM: VMX: Enumerate EPT Violation #VE support in /proc/cpuinfo Don't suppress printing EPT_VIOLATION_VE in /proc/cpuinfo, knowing whether or not KVM_INTEL_PROVE_VE actually does anything is extremely valuable. A privileged user can get at the information by reading the raw MSR, but the whole point of the VMX flags is to avoid needing to glean information from raw MSR reads. Signed-off-by: Sean Christopherson Message-ID: <20240518000430.1118488-9-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmxfeatures.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h index 266daf5b5b842d..695f3666488996 100644 --- a/arch/x86/include/asm/vmxfeatures.h +++ b/arch/x86/include/asm/vmxfeatures.h @@ -77,7 +77,7 @@ #define VMX_FEATURE_ENCLS_EXITING ( 2*32+ 15) /* "" VM-Exit on ENCLS (leaf dependent) */ #define VMX_FEATURE_RDSEED_EXITING ( 2*32+ 16) /* "" VM-Exit on RDSEED */ #define VMX_FEATURE_PAGE_MOD_LOGGING ( 2*32+ 17) /* "pml" Log dirty pages into buffer */ -#define VMX_FEATURE_EPT_VIOLATION_VE ( 2*32+ 18) /* "" Conditionally reflect EPT violations as #VE exceptions */ +#define VMX_FEATURE_EPT_VIOLATION_VE ( 2*32+ 18) /* Conditionally reflect EPT violations as #VE exceptions */ #define VMX_FEATURE_PT_CONCEAL_VMX ( 2*32+ 19) /* "" Suppress VMX indicators in Processor Trace */ #define VMX_FEATURE_XSAVES ( 2*32+ 20) /* "" Enable XSAVES and XRSTORS in guest */ #define VMX_FEATURE_MODE_BASED_EPT_EXEC ( 2*32+ 22) /* "ept_mode_based_exec" Enable separate EPT EXEC bits for supervisor vs. user */ From 6af6142e3a62efd6074905e4a94d64956a3f4b7c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 May 2024 17:04:30 -0700 Subject: [PATCH 0049/1578] KVM: x86: Disable KVM_INTEL_PROVE_VE by default Disable KVM's "prove #VE" support by default, as it provides no functional value, and even its sanity checking benefits are relatively limited. I.e. it should be fully opt-in even on debug kernels, especially since EPT Violation #VE suppression appears to be buggy on some CPUs. Opportunistically add a line in the help text to make it abundantly clear that KVM_INTEL_PROVE_VE should never be enabled in a production environment. Suggested-by: Paolo Bonzini Signed-off-by: Sean Christopherson Message-ID: <20240518000430.1118488-10-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Kconfig | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index d64fb2b3eb69e3..7f47233ddf46c3 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -97,15 +97,17 @@ config KVM_INTEL config KVM_INTEL_PROVE_VE bool "Check that guests do not receive #VE exceptions" - default KVM_PROVE_MMU || DEBUG_KERNEL - depends on KVM_INTEL + depends on KVM_INTEL && EXPERT help - Checks that KVM's page table management code will not incorrectly let guests receive a virtualization exception. Virtualization exceptions will be trapped by the hypervisor rather than injected in the guest. + Note: some CPUs appear to generate spurious EPT Violations #VEs + that trigger KVM's WARN, in particular with eptad=0 and/or nested + virtualization. + If unsure, say N. config X86_SGX_KVM From 76d5363c20eeeb937b56c0ac6c61e697bd1bf154 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 May 2024 11:03:41 -0700 Subject: [PATCH 0050/1578] KVM: x86: Force KVM_WERROR if the global WERROR is enabled Force KVM_WERROR if the global WERROR is enabled to avoid pestering the user about a Kconfig that will ultimately be ignored. Force KVM_WERROR instead of making it mutually exclusive with WERROR to avoid generating a .config builds KVM with -Werror, but has KVM_WERROR=n. Suggested-by: Linus Torvalds Signed-off-by: Sean Christopherson Message-ID: <20240517180341.974251-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 7f47233ddf46c3..fec95a7702703f 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -44,6 +44,7 @@ config KVM select KVM_VFIO select HAVE_KVM_PM_NOTIFIER if PM select KVM_GENERIC_HARDWARE_ENABLING + select KVM_WERROR if WERROR help Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent @@ -66,7 +67,7 @@ config KVM_WERROR # FRAME_WARN, i.e. KVM_WERROR=y with KASAN=y requires special tuning. # Building KVM with -Werror and KASAN is still doable via enabling # the kernel-wide WERROR=y. - depends on KVM && EXPERT && !KASAN + depends on KVM && ((EXPERT && !KASAN) || WERROR) help Add -Werror to the build flags for KVM. From b4bd556467477420ee3a91fbcba73c579669edc6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 21 May 2024 19:14:35 -0700 Subject: [PATCH 0051/1578] KVM: SVM: WARN on vNMI + NMI window iff NMIs are outright masked When requesting an NMI window, WARN on vNMI support being enabled if and only if NMIs are actually masked, i.e. if the vCPU is already handling an NMI. KVM's ABI for NMIs that arrive simultanesouly (from KVM's point of view) is to inject one NMI and pend the other. When using vNMI, KVM pends the second NMI simply by setting V_NMI_PENDING, and lets the CPU do the rest (hardware automatically sets V_NMI_BLOCKING when an NMI is injected). However, if KVM can't immediately inject an NMI, e.g. because the vCPU is in an STI shadow or is running with GIF=0, then KVM will request an NMI window and trigger the WARN (but still function correctly). Whether or not the GIF=0 case makes sense is debatable, as the intent of KVM's behavior is to provide functionality that is as close to real hardware as possible. E.g. if two NMIs are sent in quick succession, the probability of both NMIs arriving in an STI shadow is infinitesimally low on real hardware, but significantly larger in a virtual environment, e.g. if the vCPU is preempted in the STI shadow. For GIF=0, the argument isn't as clear cut, because the window where two NMIs can collide is much larger in bare metal (though still small). That said, KVM should not have divergent behavior for the GIF=0 case based on whether or not vNMI support is enabled. And KVM has allowed simultaneous NMIs with GIF=0 for over a decade, since commit 7460fb4a3400 ("KVM: Fix simultaneous NMIs"). I.e. KVM's GIF=0 handling shouldn't be modified without a *really* good reason to do so, and if KVM's behavior were to be modified, it should be done irrespective of vNMI support. Fixes: fa4c027a7956 ("KVM: x86: Add support for SVM's Virtual NMI") Cc: stable@vger.kernel.org Cc: Santosh Shukla Cc: Maxim Levitsky Signed-off-by: Sean Christopherson Message-ID: <20240522021435.1684366-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c8dc25886c1658..1d3c8be39479b3 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3846,16 +3846,27 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); /* - * KVM should never request an NMI window when vNMI is enabled, as KVM - * allows at most one to-be-injected NMI and one pending NMI, i.e. if - * two NMIs arrive simultaneously, KVM will inject one and set - * V_NMI_PENDING for the other. WARN, but continue with the standard - * single-step approach to try and salvage the pending NMI. + * If NMIs are outright masked, i.e. the vCPU is already handling an + * NMI, and KVM has not yet intercepted an IRET, then there is nothing + * more to do at this time as KVM has already enabled IRET intercepts. + * If KVM has already intercepted IRET, then single-step over the IRET, + * as NMIs aren't architecturally unmasked until the IRET completes. + * + * If vNMI is enabled, KVM should never request an NMI window if NMIs + * are masked, as KVM allows at most one to-be-injected NMI and one + * pending NMI. If two NMIs arrive simultaneously, KVM will inject one + * NMI and set V_NMI_PENDING for the other, but if and only if NMIs are + * unmasked. KVM _will_ request an NMI window in some situations, e.g. + * if the vCPU is in an STI shadow or if GIF=0, KVM can't immediately + * inject the NMI. In those situations, KVM needs to single-step over + * the STI shadow or intercept STGI. */ - WARN_ON_ONCE(is_vnmi_enabled(svm)); + if (svm_get_nmi_mask(vcpu)) { + WARN_ON_ONCE(is_vnmi_enabled(svm)); - if (svm_get_nmi_mask(vcpu) && !svm->awaiting_iret_completion) - return; /* IRET will cause a vm exit */ + if (!svm->awaiting_iret_completion) + return; /* IRET will cause a vm exit */ + } /* * SEV-ES guests are responsible for signaling when a vCPU is ready to From 2fe7b422460d14b33027d8770f7be8d26bcb2639 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 21 May 2024 09:50:47 -0700 Subject: [PATCH 0052/1578] nvme: fix multipath batched completion accounting Batched completions were missing the io stats accounting and bio trace events. Move the common code to a helper and call it from the batched and non-batched functions. Fixes: d4d957b53d91ee ("nvme-multipath: support io stats on the mpath device") Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 954f850f113a12..79cdd34dfa18e6 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -414,6 +414,14 @@ static inline void nvme_end_req_zoned(struct request *req) } } +static inline void __nvme_end_req(struct request *req) +{ + nvme_end_req_zoned(req); + nvme_trace_bio_complete(req); + if (req->cmd_flags & REQ_NVME_MPATH) + nvme_mpath_end_request(req); +} + static inline void nvme_end_req(struct request *req) { blk_status_t status = nvme_error_status(nvme_req(req)->status); @@ -424,10 +432,7 @@ static inline void nvme_end_req(struct request *req) else nvme_log_error(req); } - nvme_end_req_zoned(req); - nvme_trace_bio_complete(req); - if (req->cmd_flags & REQ_NVME_MPATH) - nvme_mpath_end_request(req); + __nvme_end_req(req); blk_mq_end_request(req, status); } @@ -476,7 +481,7 @@ void nvme_complete_batch_req(struct request *req) { trace_nvme_complete_rq(req); nvme_cleanup_cmd(req); - nvme_end_req_zoned(req); + __nvme_end_req(req); } EXPORT_SYMBOL_GPL(nvme_complete_batch_req); From a2e4c5f5f68dbd206f132bc709b98dea64afc3b8 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 21 May 2024 11:02:28 -0700 Subject: [PATCH 0053/1578] nvme-multipath: fix io accounting on failover There are io stats accounting that needs to be handled, so don't call blk_mq_end_request() directly. Use the existing nvme_end_req() helper that already handles everything. Fixes: d4d957b53d91ee ("nvme-multipath: support io stats on the mpath device") Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/multipath.c | 3 ++- drivers/nvme/host/nvme.h | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 79cdd34dfa18e6..7706df23734945 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -422,7 +422,7 @@ static inline void __nvme_end_req(struct request *req) nvme_mpath_end_request(req); } -static inline void nvme_end_req(struct request *req) +void nvme_end_req(struct request *req) { blk_status_t status = nvme_error_status(nvme_req(req)->status); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 9c1e135b8df3b1..1bee176fd850e3 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -118,7 +118,8 @@ void nvme_failover_req(struct request *req) blk_steal_bios(&ns->head->requeue_list, req); spin_unlock_irqrestore(&ns->head->requeue_lock, flags); - blk_mq_end_request(req, 0); + nvme_req(req)->status = 0; + nvme_end_req(req); kblockd_schedule_work(&ns->head->requeue_work); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index cacc56f4bbf442..fc31bd340a63a2 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -767,6 +767,7 @@ static inline bool nvme_state_terminal(struct nvme_ctrl *ctrl) } } +void nvme_end_req(struct request *req); void nvme_complete_rq(struct request *req); void nvme_complete_batch_req(struct request *req); From f97914e35fd98b2b18fb8a092e0a0799f73afdfe Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 21 May 2024 23:20:28 +0300 Subject: [PATCH 0054/1578] nvmet: fix ns enable/disable possible hang When disabling an nvmet namespace, there is a period where the subsys->lock is released, as the ns disable waits for backend IO to complete, and the ns percpu ref to be properly killed. The original intent was to avoid taking the subsystem lock for a prolong period as other processes may need to acquire it (for example new incoming connections). However, it opens up a window where another process may come in and enable the ns, (re)intiailizing the ns percpu_ref, causing the disable sequence to hang. Solve this by taking the global nvmet_config_sem over the entire configfs enable/disable sequence. Fixes: a07b4970f464 ("nvmet: add a generic NVMe target") Signed-off-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/target/configfs.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 7c43a0ad68771f..bd87dfd173a4cb 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -676,10 +676,18 @@ static ssize_t nvmet_ns_enable_store(struct config_item *item, if (kstrtobool(page, &enable)) return -EINVAL; + /* + * take a global nvmet_config_sem because the disable routine has a + * window where it releases the subsys-lock, giving a chance to + * a parallel enable to concurrently execute causing the disable to + * have a misaccounting of the ns percpu_ref. + */ + down_write(&nvmet_config_sem); if (enable) ret = nvmet_ns_enable(ns); else nvmet_ns_disable(ns); + up_write(&nvmet_config_sem); return ret ? ret : count; } From ec58991054e899c9d86f7e3c8a96cb602d4b5938 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Tue, 21 May 2024 15:03:02 +0800 Subject: [PATCH 0055/1578] drm/amdgpu: correct hbm field in boot status hbm filed takes bit 13 and bit 14 in boot status. Signed-off-by: Hawking Zhang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index c8980d5f6540a6..7021c4a66fb5e9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -46,7 +46,7 @@ struct amdgpu_iv_entry; #define AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(x) AMDGPU_GET_REG_FIELD(x, 7, 7) #define AMDGPU_RAS_GPU_ERR_SOCKET_ID(x) AMDGPU_GET_REG_FIELD(x, 10, 8) #define AMDGPU_RAS_GPU_ERR_AID_ID(x) AMDGPU_GET_REG_FIELD(x, 12, 11) -#define AMDGPU_RAS_GPU_ERR_HBM_ID(x) AMDGPU_GET_REG_FIELD(x, 13, 13) +#define AMDGPU_RAS_GPU_ERR_HBM_ID(x) AMDGPU_GET_REG_FIELD(x, 14, 13) #define AMDGPU_RAS_GPU_ERR_BOOT_STATUS(x) AMDGPU_GET_REG_FIELD(x, 31, 31) #define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT 1000 From 8195979d2dd995d60c2663adf54c69c1bf4eadd1 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 8 May 2024 16:45:35 -0500 Subject: [PATCH 0056/1578] drm/amd/display: Enable colorspace property for MST connectors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MST colorspace property support was disabled due to a series of warnings that came up when the device was plugged in since the properties weren't made at device creation. Create the properties in advance instead. Suggested-by: Ville Syrjälä Fixes: 69a959610229 ("drm/amd/display: Temporary Disable MST DP Colorspace Property"). Reported-and-tested-by: Tyler Schneider Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3353 Reviewed-by: Harry Wentland Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c index 0b03e659fdf3f2..8b0e997ebdaeb8 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c @@ -613,6 +613,9 @@ dm_dp_add_mst_connector(struct drm_dp_mst_topology_mgr *mgr, &connector->base, dev->mode_config.tile_property, 0); + connector->colorspace_property = master->base.colorspace_property; + if (connector->colorspace_property) + drm_connector_attach_colorspace_property(connector); drm_connector_set_path_property(connector, pathprop); From 699646734ab51bf5b1cd4a7a30c20074f6e74f6e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 20 May 2024 22:30:17 -0700 Subject: [PATCH 0057/1578] uprobes: prevent mutex_lock() under rcu_read_lock() Recent changes made uprobe_cpu_buffer preparation lazy, and moved it deeper into __uprobe_trace_func(). This is problematic because __uprobe_trace_func() is called inside rcu_read_lock()/rcu_read_unlock() block, which then calls prepare_uprobe_buffer() -> uprobe_buffer_get() -> mutex_lock(&ucb->mutex), leading to a splat about using mutex under non-sleepable RCU: BUG: sleeping function called from invalid context at kernel/locking/mutex.c:585 in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 98231, name: stress-ng-sigq preempt_count: 0, expected: 0 RCU nest depth: 1, expected: 0 ... Call Trace: dump_stack_lvl+0x3d/0xe0 __might_resched+0x24c/0x270 ? prepare_uprobe_buffer+0xd5/0x1d0 __mutex_lock+0x41/0x820 ? ___perf_sw_event+0x206/0x290 ? __perf_event_task_sched_in+0x54/0x660 ? __perf_event_task_sched_in+0x54/0x660 prepare_uprobe_buffer+0xd5/0x1d0 __uprobe_trace_func+0x4a/0x140 uprobe_dispatcher+0x135/0x280 ? uprobe_dispatcher+0x94/0x280 uprobe_notify_resume+0x650/0xec0 ? atomic_notifier_call_chain+0x21/0x110 ? atomic_notifier_call_chain+0xf8/0x110 irqentry_exit_to_user_mode+0xe2/0x1e0 asm_exc_int3+0x35/0x40 RIP: 0033:0x7f7e1d4da390 Code: 33 04 00 0f 1f 80 00 00 00 00 f3 0f 1e fa b9 01 00 00 00 e9 b2 fc ff ff 66 90 f3 0f 1e fa 31 c9 e9 a5 fc ff ff 0f 1f 44 00 00 0f 1e fa b8 27 00 00 00 0f 05 c3 0f 1f 40 00 f3 0f 1e fa b8 6e RSP: 002b:00007ffd2abc3608 EFLAGS: 00000246 RAX: 0000000000000000 RBX: 0000000076d325f1 RCX: 0000000000000000 RDX: 0000000076d325f1 RSI: 000000000000000a RDI: 00007ffd2abc3690 RBP: 000000000000000a R08: 00017fb700000000 R09: 00017fb700000000 R10: 00017fb700000000 R11: 0000000000000246 R12: 0000000000017ff2 R13: 00007ffd2abc3610 R14: 0000000000000000 R15: 00007ffd2abc3780 Luckily, it's easy to fix by moving prepare_uprobe_buffer() to be called slightly earlier: into uprobe_trace_func() and uretprobe_trace_func(), outside of RCU locked section. This still keeps this buffer preparation lazy and helps avoid the overhead when it's not needed. E.g., if there is only BPF uprobe handler installed on a given uprobe, buffer won't be initialized. Note, the other user of prepare_uprobe_buffer(), __uprobe_perf_func(), is not affected, as it doesn't prepare buffer under RCU read lock. Link: https://lore.kernel.org/all/20240521053017.3708530-1-andrii@kernel.org/ Fixes: 1b8f85defbc8 ("uprobes: prepare uprobe args buffer lazily") Reported-by: Breno Leitao Signed-off-by: Andrii Nakryiko Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_uprobe.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8541fa1494ae3d..c98e3b3386badb 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -970,19 +970,17 @@ static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu, static void __uprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer **ucbp, + struct uprobe_cpu_buffer *ucb, struct trace_event_file *trace_file) { struct uprobe_trace_entry_head *entry; struct trace_event_buffer fbuffer; - struct uprobe_cpu_buffer *ucb; void *data; int size, esize; struct trace_event_call *call = trace_probe_event_call(&tu->tp); WARN_ON(call != trace_file->event_call); - ucb = prepare_uprobe_buffer(tu, regs, ucbp); if (WARN_ON_ONCE(ucb->dsize > PAGE_SIZE)) return; @@ -1014,13 +1012,16 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, struct uprobe_cpu_buffer **ucbp) { struct event_file_link *link; + struct uprobe_cpu_buffer *ucb; if (is_ret_probe(tu)) return 0; + ucb = prepare_uprobe_buffer(tu, regs, ucbp); + rcu_read_lock(); trace_probe_for_each_link_rcu(link, &tu->tp) - __uprobe_trace_func(tu, 0, regs, ucbp, link->file); + __uprobe_trace_func(tu, 0, regs, ucb, link->file); rcu_read_unlock(); return 0; @@ -1031,10 +1032,13 @@ static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct uprobe_cpu_buffer **ucbp) { struct event_file_link *link; + struct uprobe_cpu_buffer *ucb; + + ucb = prepare_uprobe_buffer(tu, regs, ucbp); rcu_read_lock(); trace_probe_for_each_link_rcu(link, &tu->tp) - __uprobe_trace_func(tu, func, regs, ucbp, link->file); + __uprobe_trace_func(tu, func, regs, ucb, link->file); rcu_read_unlock(); } From d09c05aa35909adb7d29f92f0cd79fdcd1338ef0 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Mon, 20 May 2024 22:30:40 -0400 Subject: [PATCH 0058/1578] scsi: core: Handle devices which return an unusually large VPD page count Peter Schneider reported that a system would no longer boot after updating to 6.8.4. Peter bisected the issue and identified commit b5fc07a5fb56 ("scsi: core: Consult supported VPD page list prior to fetching page") as being the culprit. Turns out the enclosure device in Peter's system reports a byteswapped page length for VPD page 0. It reports "02 00" as page length instead of "00 02". This causes us to attempt to access 516 bytes (page length + header) of information despite only 2 pages being present. Limit the page search scope to the size of our VPD buffer to guard against devices returning a larger page count than requested. Link: https://lore.kernel.org/r/20240521023040.2703884-1-martin.petersen@oracle.com Fixes: b5fc07a5fb56 ("scsi: core: Consult supported VPD page list prior to fetching page") Cc: stable@vger.kernel.org Reported-by: Peter Schneider Closes: https://lore.kernel.org/all/eec6ebbf-061b-4a7b-96dc-ea748aa4d035@googlemail.com/ Tested-by: Peter Schneider Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index 3e0c0381277acd..f0464db3f9de99 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -350,6 +350,13 @@ static int scsi_get_vpd_size(struct scsi_device *sdev, u8 page) if (result < SCSI_VPD_HEADER_SIZE) return 0; + if (result > sizeof(vpd)) { + dev_warn_once(&sdev->sdev_gendev, + "%s: long VPD page 0 length: %d bytes\n", + __func__, result); + result = sizeof(vpd); + } + result -= SCSI_VPD_HEADER_SIZE; if (!memchr(&vpd[SCSI_VPD_HEADER_SIZE], page, result)) return 0; From 06e785aeb9ea8a43d0a3967c1ba6e69d758e82d4 Mon Sep 17 00:00:00 2001 From: Matt Jan Date: Tue, 14 May 2024 12:10:46 +0800 Subject: [PATCH 0059/1578] connector: Fix invalid conversion in cn_proc.h The implicit conversion from unsigned int to enum proc_cn_event is invalid, so explicitly cast it for compilation in a C++ compiler. /usr/include/linux/cn_proc.h: In function 'proc_cn_event valid_event(proc_cn_event)': /usr/include/linux/cn_proc.h:72:17: error: invalid conversion from 'unsigned int' to 'proc_cn_event' [-fpermissive] 72 | ev_type &= PROC_EVENT_ALL; | ^ | | | unsigned int Signed-off-by: Matt Jan Signed-off-by: David S. Miller --- include/uapi/linux/cn_proc.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/uapi/linux/cn_proc.h b/include/uapi/linux/cn_proc.h index f2afb7cc4926cd..18e3745b86cd48 100644 --- a/include/uapi/linux/cn_proc.h +++ b/include/uapi/linux/cn_proc.h @@ -69,8 +69,7 @@ struct proc_input { static inline enum proc_cn_event valid_event(enum proc_cn_event ev_type) { - ev_type &= PROC_EVENT_ALL; - return ev_type; + return (enum proc_cn_event)(ev_type & PROC_EVENT_ALL); } /* From 128d54fbcb14b8717ecf596d3dbded327b9980b3 Mon Sep 17 00:00:00 2001 From: Mathieu Othacehe Date: Tue, 21 May 2024 08:54:06 +0200 Subject: [PATCH 0060/1578] net: phy: micrel: set soft_reset callback to genphy_soft_reset for KSZ8061 Following a similar reinstate for the KSZ8081 and KSZ9031. Older kernels would use the genphy_soft_reset if the PHY did not implement a .soft_reset. The KSZ8061 errata described here: https://ww1.microchip.com/downloads/en/DeviceDoc/KSZ8061-Errata-DS80000688B.pdf and worked around with 232ba3a51c ("net: phy: Micrel KSZ8061: link failure after cable connect") is back again without this soft reset. Fixes: 6e2d85ec0559 ("net: phy: Stop with excessive soft reset") Tested-by: Karim Ben Houcine Signed-off-by: Mathieu Othacehe Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/micrel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 13e30ea7eec5d1..1d769322b05938 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -5327,6 +5327,7 @@ static struct phy_driver ksphy_driver[] = { /* PHY_BASIC_FEATURES */ .probe = kszphy_probe, .config_init = ksz8061_config_init, + .soft_reset = genphy_soft_reset, .config_intr = kszphy_config_intr, .handle_interrupt = kszphy_handle_interrupt, .suspend = kszphy_suspend, From 9b038d004ce95551cb35381c49fe896c5bc11ffe Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 21 May 2024 14:37:43 +0100 Subject: [PATCH 0061/1578] netfs: Fix io_uring based write-through This can be triggered by mounting a cifs filesystem with a cache=strict mount option and then, using the fsx program from xfstests, doing: ltp/fsx -A -d -N 1000 -S 11463 -P /tmp /cifs-mount/foo \ --replay-ops=gen112-fsxops Where gen112-fsxops holds: fallocate 0x6be7 0x8fc5 0x377d3 copy_range 0x9c71 0x77e8 0x2edaf 0x377d3 write 0x2776d 0x8f65 0x377d3 The problem is that netfs_io_request::len is being used for two purposes and ends up getting set to the amount of data we transferred, not the amount of data the caller asked to be transferred (for various reasons, such as mmap'd writes, we might end up rounding out the data written to the server to include the entire folio at each end). Fix this by keeping the amount we were asked to write in ->len and using ->submitted to track what we issued ops for. Then, when we come to calling ->ki_complete(), ->len is the right size. This also required netfs_cleanup_dio_write() to change since we're no longer advancing wreq->len. Use wreq->transferred instead as we might have done a short read. With this, the generic/112 xfstest passes if cifs is forced to put all non-DIO opens into write-through mode. Fixes: 288ace2f57c9 ("netfs: New writeback implementation") Signed-off-by: David Howells Link: https://lore.kernel.org/r/295086.1716298663@warthog.procyon.org.uk cc: Jeff Layton cc: Steve French cc: Enzo Matsumiya cc: netfs@lists.linux.dev cc: v9fs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: linux-cifs@vger.kernel.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/direct_write.c | 2 +- fs/netfs/write_collect.c | 7 ++++--- fs/netfs/write_issue.c | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index 608ba6416919a8..28163516bf037f 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -12,7 +12,7 @@ static void netfs_cleanup_dio_write(struct netfs_io_request *wreq) { struct inode *inode = wreq->inode; - unsigned long long end = wreq->start + wreq->len; + unsigned long long end = wreq->start + wreq->transferred; if (!wreq->error && i_size_read(inode) < end) { diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 60112e4b2c5eb7..426cf87aaf2ecd 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -510,7 +510,7 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq) * stream has a gap that can be jumped. */ if (notes & SOME_EMPTY) { - unsigned long long jump_to = wreq->start + wreq->len; + unsigned long long jump_to = wreq->start + READ_ONCE(wreq->submitted); for (s = 0; s < NR_IO_STREAMS; s++) { stream = &wreq->io_streams[s]; @@ -690,10 +690,11 @@ void netfs_write_collection_worker(struct work_struct *work) wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS); if (wreq->iocb) { - wreq->iocb->ki_pos += wreq->transferred; + size_t written = min(wreq->transferred, wreq->len); + wreq->iocb->ki_pos += written; if (wreq->iocb->ki_complete) wreq->iocb->ki_complete( - wreq->iocb, wreq->error ? wreq->error : wreq->transferred); + wreq->iocb, wreq->error ? wreq->error : written); wreq->iocb = VFS_PTR_POISON; } diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index e190043bc0daa7..86dad7e4202b5c 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -254,7 +254,7 @@ static void netfs_issue_write(struct netfs_io_request *wreq, stream->construct = NULL; if (subreq->start + subreq->len > wreq->start + wreq->submitted) - wreq->len = wreq->submitted = subreq->start + subreq->len - wreq->start; + WRITE_ONCE(wreq->submitted, subreq->start + subreq->len - wreq->start); netfs_do_issue_write(stream, subreq); } From 2c6b531020f0590db3b6b4950a41c692e9aa4f4a Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 21 May 2024 14:36:27 +0100 Subject: [PATCH 0062/1578] netfs: Fix AIO error handling when doing write-through If an error occurs whilst we're doing an AIO write in write-through mode, we may end up calling ->ki_complete() *and* returning an error from ->write_iter(). This can result in either a UAF (the ->ki_complete() func pointer may get overwritten, for example) or a refcount underflow in io_submit() as ->ki_complete is called twice. Fix this by making netfs_end_writethrough() - and thus netfs_perform_write() - unconditionally return -EIOCBQUEUED if we're doing an AIO write and wait for completion if we're not. Fixes: 288ace2f57c9 ("netfs: New writeback implementation") Signed-off-by: David Howells Link: https://lore.kernel.org/r/295052.1716298587@warthog.procyon.org.uk cc: Jeff Layton cc: Enzo Matsumiya cc: netfs@lists.linux.dev cc: v9fs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: linux-cifs@vger.kernel.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/write_issue.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 86dad7e4202b5c..3aa86e268f40d4 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -636,7 +636,12 @@ int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_contr mutex_unlock(&ictx->wb_lock); - ret = wreq->error; + if (wreq->iocb) { + ret = -EIOCBQUEUED; + } else { + wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE); + ret = wreq->error; + } netfs_put_request(wreq, false, netfs_rreq_trace_put_return); return ret; } From 79c137454815ba5554caa8eeb4ad5c94e96e45ce Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Tue, 21 May 2024 19:49:38 +0800 Subject: [PATCH 0063/1578] filemap: add helper mapping_max_folio_size() Add mapping_max_folio_size() to get the maximum folio size for this pagecache mapping. Fixes: 5d8edfb900d5 ("iomap: Copy larger chunks from userspace") Cc: stable@vger.kernel.org Reviewed-by: Darrick J. Wong Signed-off-by: Xu Yang Link: https://lore.kernel.org/r/20240521114939.2541461-1-xu.yang_2@nxp.com Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Christoph Hellwig Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- include/linux/pagemap.h | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 3d69589c00a4bb..ee633712bba0b9 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -346,6 +346,19 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) m->gfp_mask = mask; } +/* + * There are some parts of the kernel which assume that PMD entries + * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then, + * limit the maximum allocation order to PMD size. I'm not aware of any + * assumptions about maximum order if THP are disabled, but 8 seems like + * a good order (that's 1MB if you're using 4kB pages) + */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER +#else +#define MAX_PAGECACHE_ORDER 8 +#endif + /** * mapping_set_large_folios() - Indicate the file supports large folios. * @mapping: The file. @@ -372,6 +385,14 @@ static inline bool mapping_large_folio_support(struct address_space *mapping) test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); } +/* Return the maximum folio size for this pagecache mapping, in bytes. */ +static inline size_t mapping_max_folio_size(struct address_space *mapping) +{ + if (mapping_large_folio_support(mapping)) + return PAGE_SIZE << MAX_PAGECACHE_ORDER; + return PAGE_SIZE; +} + static inline int filemap_nr_thps(struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS @@ -530,19 +551,6 @@ static inline void *detach_page_private(struct page *page) return folio_detach_private(page_folio(page)); } -/* - * There are some parts of the kernel which assume that PMD entries - * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then, - * limit the maximum allocation order to PMD size. I'm not aware of any - * assumptions about maximum order if THP are disabled, but 8 seems like - * a good order (that's 1MB if you're using 4kB pages) - */ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER -#else -#define MAX_PAGECACHE_ORDER 8 -#endif - #ifdef CONFIG_NUMA struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order); #else From 4e527d5841e24623181edc7fd6f6598ffa810e10 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Tue, 21 May 2024 19:49:39 +0800 Subject: [PATCH 0064/1578] iomap: fault in smaller chunks for non-large folio mappings Since commit (5d8edfb900d5 "iomap: Copy larger chunks from userspace"), iomap will try to copy in larger chunks than PAGE_SIZE. However, if the mapping doesn't support large folio, only one page of maximum 4KB will be created and 4KB data will be writen to pagecache each time. Then, next 4KB will be handled in next iteration. This will cause potential write performance problem. If chunk is 2MB, total 512 pages need to be handled finally. During this period, fault_in_iov_iter_readable() is called to check iov_iter readable validity. Since only 4KB will be handled each time, below address space will be checked over and over again: start end - buf, buf+2MB buf+4KB, buf+2MB buf+8KB, buf+2MB ... buf+2044KB buf+2MB Obviously the checking size is wrong since only 4KB will be handled each time. So this will get a correct chunk to let iomap work well in non-large folio case. With this change, the write speed will be stable. Tested on ARM64 device. Before: - dd if=/dev/zero of=/dev/sda bs=400K count=10485 (334 MB/s) - dd if=/dev/zero of=/dev/sda bs=800K count=5242 (278 MB/s) - dd if=/dev/zero of=/dev/sda bs=1600K count=2621 (204 MB/s) - dd if=/dev/zero of=/dev/sda bs=2200K count=1906 (170 MB/s) - dd if=/dev/zero of=/dev/sda bs=3000K count=1398 (150 MB/s) - dd if=/dev/zero of=/dev/sda bs=4500K count=932 (139 MB/s) After: - dd if=/dev/zero of=/dev/sda bs=400K count=10485 (339 MB/s) - dd if=/dev/zero of=/dev/sda bs=800K count=5242 (330 MB/s) - dd if=/dev/zero of=/dev/sda bs=1600K count=2621 (332 MB/s) - dd if=/dev/zero of=/dev/sda bs=2200K count=1906 (333 MB/s) - dd if=/dev/zero of=/dev/sda bs=3000K count=1398 (333 MB/s) - dd if=/dev/zero of=/dev/sda bs=4500K count=932 (333 MB/s) Fixes: 5d8edfb900d5 ("iomap: Copy larger chunks from userspace") Cc: stable@vger.kernel.org Reviewed-by: Darrick J. Wong Signed-off-by: Xu Yang Link: https://lore.kernel.org/r/20240521114939.2541461-2-xu.yang_2@nxp.com Reviewed-by: Christoph Hellwig Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 41c8f0c68ef564..c5802a4593345a 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -898,11 +898,11 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) { loff_t length = iomap_length(iter); - size_t chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER; loff_t pos = iter->pos; ssize_t total_written = 0; long status = 0; struct address_space *mapping = iter->inode->i_mapping; + size_t chunk = mapping_max_folio_size(mapping); unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0; do { From f826bc9d6fc2f0e089fb8d104415d72e4d2e204c Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Mon, 20 May 2024 12:08:18 +0300 Subject: [PATCH 0065/1578] signalfd: fix error return code If anon_inode_getfile() fails, return appropriate error code. This looks like a single typo: the similar code changes in timerfd and userfaultfd are okay. Found by Linux Verification Center (linuxtesting.org). Fixes: fbe38120eb1d ("signalfd: convert to ->read_iter()") Signed-off-by: Fedor Pchelkin Link: https://lore.kernel.org/r/20240520090819.76342-1-pchelkin@ispras.ru Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- fs/signalfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/signalfd.c b/fs/signalfd.c index 4a5614442dbfa7..65fe5eed0be4cd 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -282,7 +282,7 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags) if (IS_ERR(file)) { put_unused_fd(ufd); kfree(ctx); - return ufd; + return PTR_ERR(file); } file->f_mode |= FMODE_NOWAIT; From 65bea9953715b19371164a8bec4f74fdd22c9e5a Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Mon, 20 May 2024 12:08:19 +0300 Subject: [PATCH 0066/1578] signalfd: drop an obsolete comment Commit fbe38120eb1d ("signalfd: convert to ->read_iter()") removed the call to anon_inode_getfd() by splitting fd setup into two parts. Drop the comment referencing the internal details of that function. Signed-off-by: Fedor Pchelkin Link: https://lore.kernel.org/r/20240520090819.76342-2-pchelkin@ispras.ru Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- fs/signalfd.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/signalfd.c b/fs/signalfd.c index 65fe5eed0be4cd..ec7b2da2477a41 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -286,10 +286,6 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags) } file->f_mode |= FMODE_NOWAIT; - /* - * When we call this, the initialization must be complete, since - * anon_inode_getfd() will install the fd. - */ fd_install(ufd, file); } else { struct fd f = fdget(ufd); From c596bea1452ddf172ec9b588e4597228e9a1f4d5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 21 May 2024 16:49:46 +0100 Subject: [PATCH 0067/1578] netfs: Fix setting of BDP_ASYNC from iocb flags Fix netfs_perform_write() to set BDP_ASYNC if IOCB_NOWAIT is set rather than if IOCB_SYNC is not set. It reflects asynchronicity in the sense of not waiting rather than synchronicity in the sense of not returning until the op is complete. Without this, generic/590 fails on cifs in strict caching mode with a complaint that one of the writes fails with EAGAIN. The test can be distilled down to: mount -t cifs /my/share /mnt -ostuff xfs_io -i -c 'falloc 0 8191M -c fsync -f /mnt/file xfs_io -i -c 'pwrite -b 1M -W 0 8191M' /mnt/file Fixes: c38f4e96e605 ("netfs: Provide func to copy data to pagecache for buffered write") Signed-off-by: David Howells Link: https://lore.kernel.org/r/316306.1716306586@warthog.procyon.org.uk Reviewed-by: Jens Axboe cc: Jeff Layton cc: Enzo Matsumiya cc: Jens Axboe cc: Matthew Wilcox cc: netfs@lists.linux.dev cc: v9fs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: linux-cifs@vger.kernel.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/buffered_write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 1121601536d184..07bc1fd4353092 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -181,7 +181,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, struct folio *folio, *writethrough = NULL; enum netfs_how_to_modify howto; enum netfs_folio_trace trace; - unsigned int bdp_flags = (iocb->ki_flags & IOCB_SYNC) ? 0: BDP_ASYNC; + unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? BDP_ASYNC : 0; ssize_t written = 0, ret, ret2; loff_t i_size, pos = iocb->ki_pos, from, to; size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER; From 712182b67e831912f90259102ae334089e7bccd1 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 21 May 2024 21:00:44 +0200 Subject: [PATCH 0068/1578] swap: yield device immediately Otherwise we can cause spurious EBUSY issues when trying to mount the rootfs later on. Link: https://bugzilla.kernel.org/show_bug.cgi?id=218845 Reported-by: Petri Kaukasoina Signed-off-by: Christian Brauner --- kernel/power/swap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 5bc04bfe2db1d3..c6f24d17866d83 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1600,7 +1600,7 @@ int swsusp_check(bool exclusive) put: if (error) - fput(hib_resume_bdev_file); + bdev_fput(hib_resume_bdev_file); else pr_debug("Image signature found, resuming\n"); } else { From 51ef9305b8f40946d65c40368ffb4c14636d369a Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Wed, 22 May 2024 22:26:52 +0300 Subject: [PATCH 0069/1578] net/mlx5: Lag, do bond only if slaves agree on roce state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the driver does not enforce that lag bond slaves must have matching roce capabilities. Yet, in mlx5_do_bond(), the driver attempts to enable roce on all vports of the bond slaves, causing the following syndrome when one slave has no roce fw support: mlx5_cmd_out_err:809:(pid 25427): MODIFY_NIC_VPORT_CONTEXT(0×755) op_mod(0×0) failed, status bad parameter(0×3), syndrome (0xc1f678), err(-22) Thus, create HW lag only if bond's slaves agree on roce state, either all slaves have roce support resulting in a roce lag bond, or none do, resulting in a raw eth bond. Fixes: 7907f23adc18 ("net/mlx5: Implement RoCE LAG feature") Signed-off-by: Maher Sanalla Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index f7f0476a4a58d3..d0871c46b8c543 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -719,6 +719,7 @@ bool mlx5_lag_check_prereq(struct mlx5_lag *ldev) struct mlx5_core_dev *dev; u8 mode; #endif + bool roce_support; int i; for (i = 0; i < ldev->ports; i++) @@ -743,6 +744,11 @@ bool mlx5_lag_check_prereq(struct mlx5_lag *ldev) if (mlx5_sriov_is_enabled(ldev->pf[i].dev)) return false; #endif + roce_support = mlx5_get_roce_state(ldev->pf[MLX5_LAG_P1].dev); + for (i = 1; i < ldev->ports; i++) + if (mlx5_get_roce_state(ldev->pf[i].dev) != roce_support) + return false; + return true; } @@ -910,8 +916,10 @@ static void mlx5_do_bond(struct mlx5_lag *ldev) } else if (roce_lag) { dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV; mlx5_rescan_drivers_locked(dev0); - for (i = 1; i < ldev->ports; i++) - mlx5_nic_vport_enable_roce(ldev->pf[i].dev); + for (i = 1; i < ldev->ports; i++) { + if (mlx5_get_roce_state(ldev->pf[i].dev)) + mlx5_nic_vport_enable_roce(ldev->pf[i].dev); + } } else if (shared_fdb) { int i; From fca3b4791850b7e2181f0b3195b66d53df83151b Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Wed, 22 May 2024 22:26:53 +0300 Subject: [PATCH 0070/1578] net/mlx5: Do not query MPIR on embedded CPU function A proper query to MPIR needs to set the correct value in the depth field. On embedded CPU this value is not necessarily zero. As there is no real use case for multi-PF netdev on the embedded CPU of the smart NIC, block this option. This fixes the following failure: ACCESS_REG(0x805) op_mod(0x1) failed, status bad system state(0x4), syndrome (0x685f19), err(-5) Fixes: 678eb448055a ("net/mlx5: SD, Implement basic query and instantiation") Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c index dd5d186dc6148f..f6deb5a3f82024 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c @@ -100,10 +100,6 @@ static bool ft_create_alias_supported(struct mlx5_core_dev *dev) static bool mlx5_sd_is_supported(struct mlx5_core_dev *dev, u8 host_buses) { - /* Feature is currently implemented for PFs only */ - if (!mlx5_core_is_pf(dev)) - return false; - /* Honor the SW implementation limit */ if (host_buses > MLX5_SD_MAX_GROUP_SZ) return false; @@ -162,6 +158,14 @@ static int sd_init(struct mlx5_core_dev *dev) bool sdm; int err; + /* Feature is currently implemented for PFs only */ + if (!mlx5_core_is_pf(dev)) + return 0; + + /* Block on embedded CPU PFs */ + if (mlx5_core_is_ecpf(dev)) + return 0; + if (!MLX5_CAP_MCAM_REG(dev, mpir)) return 0; From 1b9f86c6d53245dab087f1b2c05727b5982142ff Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 22 May 2024 22:26:54 +0300 Subject: [PATCH 0071/1578] net/mlx5: Fix MTMP register capability offset in MCAM register The MTMP register (0x900a) capability offset is off-by-one, move it to the right place. Fixes: 1f507e80c700 ("net/mlx5: Expose NIC temperature via hardware monitoring kernel API") Signed-off-by: Gal Pressman Reviewed-by: Cosmin Ratiu Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/mlx5/mlx5_ifc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f468763478ae61..5df52e15f7d6ce 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10308,9 +10308,9 @@ struct mlx5_ifc_mcam_access_reg_bits { u8 mfrl[0x1]; u8 regs_39_to_32[0x8]; - u8 regs_31_to_10[0x16]; + u8 regs_31_to_11[0x15]; u8 mtmp[0x1]; - u8 regs_8_to_0[0x9]; + u8 regs_9_to_0[0xa]; }; struct mlx5_ifc_mcam_access_reg_bits1 { From 16d66a4fa81da07bc4ed19f4e53b87263c2f8d38 Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Wed, 22 May 2024 22:26:55 +0300 Subject: [PATCH 0072/1578] net/mlx5: Use mlx5_ipsec_rx_status_destroy to correctly delete status rules rx_create no longer allocates a modify_hdr instance that needs to be cleaned up. The mlx5_modify_header_dealloc call will lead to a NULL pointer dereference. A leak in the rules also previously occurred since there are now two rules populated related to status. BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 109907067 P4D 109907067 PUD 116890067 PMD 0 Oops: 0000 [#1] SMP CPU: 1 PID: 484 Comm: ip Not tainted 6.9.0-rc2-rrameshbabu+ #254 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS Arch Linux 1.16.3-1-1 04/01/2014 RIP: 0010:mlx5_modify_header_dealloc+0xd/0x70 Call Trace: ? show_regs+0x60/0x70 ? __die+0x24/0x70 ? page_fault_oops+0x15f/0x430 ? free_to_partial_list.constprop.0+0x79/0x150 ? do_user_addr_fault+0x2c9/0x5c0 ? exc_page_fault+0x63/0x110 ? asm_exc_page_fault+0x27/0x30 ? mlx5_modify_header_dealloc+0xd/0x70 rx_create+0x374/0x590 rx_add_rule+0x3ad/0x500 ? rx_add_rule+0x3ad/0x500 ? mlx5_cmd_exec+0x2c/0x40 ? mlx5_create_ipsec_obj+0xd6/0x200 mlx5e_accel_ipsec_fs_add_rule+0x31/0xf0 mlx5e_xfrm_add_state+0x426/0xc00 Fixes: 94af50c0a9bb ("net/mlx5e: Unify esw and normal IPsec status table creation/destruction") Signed-off-by: Rahul Rameshbabu Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c index 41a2543a52cda0..e51b03d4c717f1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c @@ -750,8 +750,7 @@ static int rx_create(struct mlx5_core_dev *mdev, struct mlx5e_ipsec *ipsec, err_fs_ft: if (rx->allow_tunnel_mode) mlx5_eswitch_unblock_encap(mdev); - mlx5_del_flow_rules(rx->status.rule); - mlx5_modify_header_dealloc(mdev, rx->status.modify_hdr); + mlx5_ipsec_rx_status_destroy(ipsec, rx); err_add: mlx5_destroy_flow_table(rx->ft.status); err_fs_ft_status: From 9a52f6d44f4521773b4699b4ed34b8e21d5a175c Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Wed, 22 May 2024 22:26:56 +0300 Subject: [PATCH 0073/1578] net/mlx5e: Fix IPsec tunnel mode offload feature check Remove faulty check disabling checksum offload and GSO for offload of simple IPsec tunnel L4 traffic. Comment previously describing the deleted code incorrectly claimed the check prevented double tunnel (or three layers of ip headers). Fixes: f1267798c980 ("net/mlx5: Fix checksum issue of VXLAN and IPsec crypto offload") Signed-off-by: Rahul Rameshbabu Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- .../mellanox/mlx5/core/en_accel/ipsec_rxtx.h | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h index 82064614846f5f..359050f0b54ddc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h @@ -97,18 +97,11 @@ mlx5e_ipsec_feature_check(struct sk_buff *skb, netdev_features_t features) if (!x || !x->xso.offload_handle) goto out_disable; - if (xo->inner_ipproto) { - /* Cannot support tunnel packet over IPsec tunnel mode - * because we cannot offload three IP header csum - */ - if (x->props.mode == XFRM_MODE_TUNNEL) - goto out_disable; - - /* Only support UDP or TCP L4 checksum */ - if (xo->inner_ipproto != IPPROTO_UDP && - xo->inner_ipproto != IPPROTO_TCP) - goto out_disable; - } + /* Only support UDP or TCP L4 checksum */ + if (xo->inner_ipproto && + xo->inner_ipproto != IPPROTO_UDP && + xo->inner_ipproto != IPPROTO_TCP) + goto out_disable; return features; From f55cd31287e5f77f226c91d2f7756bafa0d583ed Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Wed, 22 May 2024 22:26:57 +0300 Subject: [PATCH 0074/1578] net/mlx5e: Do not use ptp structure for tx ts stats when not initialized The ptp channel instance is only initialized when ptp traffic is first processed by the driver. This means that there is a window in between when port timestamping is enabled and ptp traffic is sent where the ptp channel instance is not initialized. Accessing statistics during this window will lead to an access violation (NULL + member offset). Check the validity of the instance before attempting to query statistics. BUG: unable to handle page fault for address: 0000000000003524 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 109dfc067 P4D 109dfc067 PUD 1064ef067 PMD 0 Oops: 0000 [#1] SMP CPU: 0 PID: 420 Comm: ethtool Not tainted 6.9.0-rc2-rrameshbabu+ #245 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS Arch Linux 1.16.3-1-1 04/01/204 RIP: 0010:mlx5e_stats_ts_get+0x4c/0x130 Call Trace: ? show_regs+0x60/0x70 ? __die+0x24/0x70 ? page_fault_oops+0x15f/0x430 ? do_user_addr_fault+0x2c9/0x5c0 ? exc_page_fault+0x63/0x110 ? asm_exc_page_fault+0x27/0x30 ? mlx5e_stats_ts_get+0x4c/0x130 ? mlx5e_stats_ts_get+0x20/0x130 mlx5e_get_ts_stats+0x15/0x20 Fixes: 3579032c08c1 ("net/mlx5e: Implement ethtool hardware timestamping statistics") Signed-off-by: Rahul Rameshbabu Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c index e211c41cec06a8..e1ed214e86517a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c @@ -1186,6 +1186,9 @@ void mlx5e_stats_ts_get(struct mlx5e_priv *priv, ts_stats->err = 0; ts_stats->lost = 0; + if (!ptp) + goto out; + /* Aggregate stats across all TCs */ for (i = 0; i < ptp->num_tc; i++) { struct mlx5e_ptp_cq_stats *stats = @@ -1214,6 +1217,7 @@ void mlx5e_stats_ts_get(struct mlx5e_priv *priv, } } +out: mutex_unlock(&priv->state_lock); } From 5c74195d5dd977e97556e6fa76909b831c241230 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Wed, 22 May 2024 22:26:58 +0300 Subject: [PATCH 0075/1578] net/mlx5e: Use rx_missed_errors instead of rx_dropped for reporting buffer exhaustion Previously, the driver incorrectly used rx_dropped to report device buffer exhaustion. According to the documentation, rx_dropped should not be used to count packets dropped due to buffer exhaustion, which is the purpose of rx_missed_errors. Use rx_missed_errors as intended for counting packets dropped due to buffer exhaustion. Fixes: 269e6b3af3bf ("net/mlx5e: Report additional error statistics in get stats ndo") Signed-off-by: Carolina Jubran Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index b758bc72ac36bb..c53c99dde55872 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3886,7 +3886,7 @@ mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) mlx5e_fold_sw_stats64(priv, stats); } - stats->rx_dropped = priv->stats.qcnt.rx_out_of_buffer; + stats->rx_missed_errors = priv->stats.qcnt.rx_out_of_buffer; stats->rx_length_errors = PPORT_802_3_GET(pstats, a_in_range_length_errors) + From 83fea49f2711fc90c0d115b0ed04046b45155b65 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 22 May 2024 22:26:59 +0300 Subject: [PATCH 0076/1578] net/mlx5e: Fix UDP GSO for encapsulated packets When the skb is encapsulated, adjust the inner UDP header instead of the outer one, and account for UDP header (instead of TCP) in the inline header size calculation. Fixes: 689adf0d4892 ("net/mlx5e: Add UDP GSO support") Reported-by: Jason Baron Closes: https://lore.kernel.org/netdev/c42961cb-50b9-4a9a-bd43-87fe48d88d29@akamai.com/ Signed-off-by: Gal Pressman Reviewed-by: Dragos Tatulea Reviewed-by: Boris Pismenny Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h | 8 +++++++- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 6 +++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h index caa34b9c161e51..33e32584b07f57 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h @@ -102,8 +102,14 @@ static inline void mlx5e_udp_gso_handle_tx_skb(struct sk_buff *skb) { int payload_len = skb_shinfo(skb)->gso_size + sizeof(struct udphdr); + struct udphdr *udphdr; - udp_hdr(skb)->len = htons(payload_len); + if (skb->encapsulation) + udphdr = (struct udphdr *)skb_inner_transport_header(skb); + else + udphdr = udp_hdr(skb); + + udphdr->len = htons(payload_len); } struct mlx5e_accel_tx_state { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index 099bf107888990..b09e9abd39f37f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -153,7 +153,11 @@ mlx5e_tx_get_gso_ihs(struct mlx5e_txqsq *sq, struct sk_buff *skb, int *hopbyhop) *hopbyhop = 0; if (skb->encapsulation) { - ihs = skb_inner_tcp_all_headers(skb); + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) + ihs = skb_inner_transport_offset(skb) + + sizeof(struct udphdr); + else + ihs = skb_inner_tcp_all_headers(skb); stats->tso_inner_packets++; stats->tso_inner_bytes += skb->len - ihs; } else { From b794918961516f667b0c745aebdfebbb8a98df39 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sun, 5 May 2024 23:08:31 +0900 Subject: [PATCH 0077/1578] dma-buf/sw-sync: don't enable IRQ from sync_print_obj() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit a6aa8fca4d79 ("dma-buf/sw-sync: Reduce irqsave/irqrestore from known context") by error replaced spin_unlock_irqrestore() with spin_unlock_irq() for both sync_debugfs_show() and sync_print_obj() despite sync_print_obj() is called from sync_debugfs_show(), lockdep complains inconsistent lock state warning. Use plain spin_{lock,unlock}() for sync_print_obj(), for sync_debugfs_show() is already using spin_{lock,unlock}_irq(). Reported-by: syzbot Closes: https://syzkaller.appspot.com/bug?extid=a225ee3df7e7f9372dbe Fixes: a6aa8fca4d79 ("dma-buf/sw-sync: Reduce irqsave/irqrestore from known context") Signed-off-by: Tetsuo Handa Reviewed-by: Christian König Link: https://patchwork.freedesktop.org/patch/msgid/c2e46020-aaa6-4e06-bf73-f05823f913f0@I-love.SAKURA.ne.jp Signed-off-by: Christian König --- drivers/dma-buf/sync_debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma-buf/sync_debug.c b/drivers/dma-buf/sync_debug.c index 101394f16930f8..237bce21d1e724 100644 --- a/drivers/dma-buf/sync_debug.c +++ b/drivers/dma-buf/sync_debug.c @@ -110,12 +110,12 @@ static void sync_print_obj(struct seq_file *s, struct sync_timeline *obj) seq_printf(s, "%s: %d\n", obj->name, obj->value); - spin_lock_irq(&obj->lock); + spin_lock(&obj->lock); /* Caller already disabled IRQ. */ list_for_each(pos, &obj->pt_list) { struct sync_pt *pt = container_of(pos, struct sync_pt, link); sync_print_fence(s, &pt->base, false); } - spin_unlock_irq(&obj->lock); + spin_unlock(&obj->lock); } static void sync_print_sync_file(struct seq_file *s, From 44382b3ed6b2787710c8ade06c0e97f5970a47c8 Mon Sep 17 00:00:00 2001 From: Friedrich Vock Date: Tue, 14 May 2024 09:09:31 +0200 Subject: [PATCH 0078/1578] bpf: Fix potential integer overflow in resolve_btfids err is a 32-bit integer, but elf_update returns an off_t, which is 64-bit at least on 64-bit platforms. If symbols_patch is called on a binary between 2-4GB in size, the result will be negative when cast to a 32-bit integer, which the code assumes means an error occurred. This can wrongly trigger build failures when building very large kernel images. Fixes: fbbb68de80a4 ("bpf: Add resolve_btfids tool to resolve BTF IDs in ELF object") Signed-off-by: Friedrich Vock Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240514070931.199694-1-friedrich.vock@gmx.de --- tools/bpf/resolve_btfids/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index d9520cb826b31f..af393c7dee1f18 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -728,7 +728,7 @@ static int sets_patch(struct object *obj) static int symbols_patch(struct object *obj) { - int err; + off_t err; if (__symbols_patch(obj, &obj->structs) || __symbols_patch(obj, &obj->unions) || From 64e3d02b43b17390f0fa9af6708a4eafdf20ba01 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Fri, 24 May 2024 16:04:48 +0530 Subject: [PATCH 0079/1578] nvme: remove sgs and sws sgs/sws are unused, so remove these from nvme_ns_head structure. Signed-off-by: Kanchan Joshi Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/nvme.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index fc31bd340a63a2..c43a30753d87a8 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -471,8 +471,6 @@ struct nvme_ns_head { u8 pi_type; u8 pi_offset; u8 guard_type; - u16 sgs; - u32 sws; #ifdef CONFIG_BLK_DEV_ZONED u64 zsze; #endif From 1bd293fcf3af84674e82ed022c049491f3768840 Mon Sep 17 00:00:00 2001 From: Kundan Kumar Date: Thu, 23 May 2024 17:01:49 +0530 Subject: [PATCH 0080/1578] nvme: adjust multiples of NVME_CTRL_PAGE_SIZE in offset bio_vec start offset may be relatively large particularly when large folio gets added to the bio. A bigger offset will result in avoiding the single-segment mapping optimization and end up using expensive mempool_alloc further. Rather than using absolute value, adjust bv_offset by NVME_CTRL_PAGE_SIZE while checking if segment can be fitted into one/two PRP entries. Suggested-by: Christoph Hellwig Signed-off-by: Kundan Kumar Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 710043086dffa5..102a9fb0c65fff 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -778,7 +778,8 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, struct bio_vec bv = req_bvec(req); if (!is_pci_p2pdma_page(bv.bv_page)) { - if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2) + if ((bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) + + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2) return nvme_setup_prp_simple(dev, req, &cmnd->rw, &bv); From a527c3ba41c4c61e2069bfce4091e5515f06a8dd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 24 May 2024 18:14:19 +0200 Subject: [PATCH 0081/1578] nfs: Avoid flushing many pages with NFS_FILE_SYNC When we are doing WB_SYNC_ALL writeback, nfs submits write requests with NFS_FILE_SYNC flag to the server (which then generally treats it as an O_SYNC write). This helps to reduce latency for single requests but when submitting more requests, additional fsyncs on the server side hurt latency. NFS generally avoids this additional overhead by not setting NFS_FILE_SYNC if desc->pg_moreio is set. However this logic doesn't always work. When we do random 4k writes to a huge file and then call fsync(2), each page writeback is going to be sent with NFS_FILE_SYNC because after preparing one page for writeback, we start writing back next, nfs_do_writepage() will call nfs_pageio_cond_complete() which finds the page is not contiguous with previously prepared IO and submits is *without* setting desc->pg_moreio. Hence NFS_FILE_SYNC is used resulting in poor performance. Fix the problem by setting desc->pg_moreio in nfs_pageio_cond_complete() before submitting outstanding IO. This improves throughput of fsync-after-random-writes on my test SSD from ~70MB/s to ~250MB/s. Signed-off-by: Jan Kara Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 6efb5068c116e0..040b6b79c75e59 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -1545,6 +1545,11 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) continue; } else if (index == prev->wb_index + 1) continue; + /* + * We will submit more requests after these. Indicate + * this to the underlying layers. + */ + desc->pg_moreio = 1; nfs_pageio_complete(desc); break; } From 0c8c7c559740d2d8b66048162af6c4dba8f0c88c Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Thu, 23 May 2024 15:01:22 -0400 Subject: [PATCH 0082/1578] nfs: don't invalidate dentries on transient errors This is a slight variation on a patch previously proposed by Neil Brown that never got merged. Prior to commit 5ceb9d7fdaaf ("NFS: Refactor nfs_lookup_revalidate()"), any error from nfs_lookup_verify_inode() other than -ESTALE would result in nfs_lookup_revalidate() returning that error (-ESTALE is mapped to zero). Since that commit, all errors result in nfs_lookup_revalidate() returning zero, resulting in dentries being invalidated where they previously were not (particularly in the case of -ERESTARTSYS). Fix it by passing the actual error code to nfs_lookup_revalidate_done(), and leaving the decision on whether to map the error code to zero or one to nfs_lookup_revalidate_done(). A simple reproducer is to run the following python code in a subdirectory of an NFS mount (not in the root of the NFS mount): ---8<--- import os import multiprocessing import time if __name__=="__main__": multiprocessing.set_start_method("spawn") count = 0 while True: try: os.getcwd() pool = multiprocessing.Pool(10) pool.close() pool.terminate() count += 1 except Exception as e: print(f"Failed after {count} iterations") print(e) break ---8<--- Prior to commit 5ceb9d7fdaaf, the above code would run indefinitely. After commit 5ceb9d7fdaaf, it fails almost immediately with -ENOENT. Signed-off-by: Scott Mayhew Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 3429309962267f..788077a4feb9a6 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1627,7 +1627,16 @@ nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry, switch (error) { case 1: break; - case 0: + case -ETIMEDOUT: + if (inode && (IS_ROOT(dentry) || + NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL)) + error = 1; + break; + case -ESTALE: + case -ENOENT: + error = 0; + fallthrough; + default: /* * We can't d_drop the root of a disconnected tree: * its d_hash is on the s_anon list and d_drop() would hide @@ -1682,18 +1691,8 @@ static int nfs_lookup_revalidate_dentry(struct inode *dir, dir_verifier = nfs_save_change_attribute(dir); ret = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr); - if (ret < 0) { - switch (ret) { - case -ESTALE: - case -ENOENT: - ret = 0; - break; - case -ETIMEDOUT: - if (NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL) - ret = 1; - } + if (ret < 0) goto out; - } /* Request help from readdirplus */ nfs_lookup_advise_force_readdirplus(dir, flags); @@ -1737,7 +1736,7 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode; - int error; + int error = 0; nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); inode = d_inode(dentry); @@ -1782,7 +1781,7 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, out_bad: if (flags & LOOKUP_RCU) return -ECHILD; - return nfs_lookup_revalidate_done(dir, dentry, inode, 0); + return nfs_lookup_revalidate_done(dir, dentry, inode, error); } static int From 29be9100aca2915fab54b5693309bc42956542e5 Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Fri, 24 May 2024 17:17:55 +0100 Subject: [PATCH 0083/1578] afs: Don't cross .backup mountpoint from backup volume Don't cross a mountpoint that explicitly specifies a backup volume (target is .backup) when starting from a backup volume. It it not uncommon to mount a volume's backup directly in the volume itself. This can cause tools that are not paying attention to get into a loop mounting the volume onto itself as they attempt to traverse the tree, leading to a variety of problems. This doesn't prevent the general case of loops in a sequence of mountpoints, but addresses a common special case in the same way as other afs clients. Reported-by: Jan Henrik Sylvester Link: http://lists.infradead.org/pipermail/linux-afs/2024-May/008454.html Reported-by: Markus Suvanto Link: http://lists.infradead.org/pipermail/linux-afs/2024-February/008074.html Signed-off-by: Marc Dionne Signed-off-by: David Howells Link: https://lore.kernel.org/r/768760.1716567475@warthog.procyon.org.uk Reviewed-by: Jeffrey Altman cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- fs/afs/mntpt.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 97f50e9fd9eb01..297487ee832317 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -140,6 +140,11 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) put_page(page); if (ret < 0) return ret; + + /* Don't cross a backup volume mountpoint from a backup volume */ + if (src_as->volume && src_as->volume->type == AFSVL_BACKVOL && + ctx->type == AFSVL_BACKVOL) + return -ENODEV; } return 0; From 46ba0e49b64232adac35a2bc892f1710c5b0fb7f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 May 2024 09:33:57 -0700 Subject: [PATCH 0084/1578] bpf: fix multi-uprobe PID filtering logic Current implementation of PID filtering logic for multi-uprobes in uprobe_prog_run() is filtering down to exact *thread*, while the intent for PID filtering it to filter by *process* instead. The check in uprobe_prog_run() also differs from the analogous one in uprobe_multi_link_filter() for some reason. The latter is correct, checking task->mm, not the task itself. Fix the check in uprobe_prog_run() to perform the same task->mm check. While doing this, we also update get_pid_task() use to use PIDTYPE_TGID type of lookup, given the intent is to get a representative task of an entire process. This doesn't change behavior, but seems more logical. It would hold task group leader task now, not any random thread task. Last but not least, given multi-uprobe support is half-broken due to this PID filtering logic (depending on whether PID filtering is important or not), we need to make it easy for user space consumers (including libbpf) to easily detect whether PID filtering logic was already fixed. We do it here by adding an early check on passed pid parameter. If it's negative (and so has no chance of being a valid PID), we return -EINVAL. Previous behavior would eventually return -ESRCH ("No process found"), given there can't be any process with negative PID. This subtle change won't make any practical change in behavior, but will allow applications to detect PID filtering fixes easily. Libbpf fixes take advantage of this in the next patch. Cc: stable@vger.kernel.org Acked-by: Jiri Olsa Fixes: b733eeade420 ("bpf: Add pid filter support for uprobe_multi link") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240521163401.3005045-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 8 ++++---- .../testing/selftests/bpf/prog_tests/uprobe_multi_test.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f5154c051d2c10..1baaeb9ca20556 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3295,7 +3295,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe, struct bpf_run_ctx *old_run_ctx; int err = 0; - if (link->task && current != link->task) + if (link->task && current->mm != link->task->mm) return 0; if (sleepable) @@ -3396,8 +3396,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path); uoffsets = u64_to_user_ptr(attr->link_create.uprobe_multi.offsets); cnt = attr->link_create.uprobe_multi.cnt; + pid = attr->link_create.uprobe_multi.pid; - if (!upath || !uoffsets || !cnt) + if (!upath || !uoffsets || !cnt || pid < 0) return -EINVAL; if (cnt > MAX_UPROBE_MULTI_CNT) return -E2BIG; @@ -3421,10 +3422,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr goto error_path_put; } - pid = attr->link_create.uprobe_multi.pid; if (pid) { rcu_read_lock(); - task = get_pid_task(find_vpid(pid), PIDTYPE_PID); + task = get_pid_task(find_vpid(pid), PIDTYPE_TGID); rcu_read_unlock(); if (!task) { err = -ESRCH; diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index 8269cdee33ae97..38fda42fd70f9c 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -397,7 +397,7 @@ static void test_attach_api_fails(void) link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_UPROBE_MULTI, &opts); if (!ASSERT_ERR(link_fd, "link_fd")) goto cleanup; - ASSERT_EQ(link_fd, -ESRCH, "pid_is_wrong"); + ASSERT_EQ(link_fd, -EINVAL, "pid_is_wrong"); cleanup: if (link_fd >= 0) From 4a8f635a60540888dab3804992e86410360339c8 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 May 2024 09:33:58 -0700 Subject: [PATCH 0085/1578] bpf: remove unnecessary rcu_read_{lock,unlock}() in multi-uprobe attach logic get_pid_task() internally already calls rcu_read_lock() and rcu_read_unlock(), so there is no point to do this one extra time. This is a drive-by improvement and has no correctness implications. Acked-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240521163401.3005045-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1baaeb9ca20556..6249dac6170183 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3423,9 +3423,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr } if (pid) { - rcu_read_lock(); task = get_pid_task(find_vpid(pid), PIDTYPE_TGID); - rcu_read_unlock(); if (!task) { err = -ESRCH; goto error_path_put; From 04d939a2ab229a3821f04fc81f7c027842f501f1 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 May 2024 09:33:59 -0700 Subject: [PATCH 0086/1578] libbpf: detect broken PID filtering logic for multi-uprobe Libbpf is automatically (and transparently to user) detecting multi-uprobe support in the kernel, and, if supported, uses multi-uprobes to improve USDT attachment speed. USDTs can be attached system-wide or for the specific process by PID. In the latter case, we rely on correct kernel logic of not triggering USDT for unrelated processes. As such, on older kernels that do support multi-uprobes, but still have broken PID filtering logic, we need to fall back to singular uprobes. Unfortunately, whether user is using PID filtering or not is known at the attachment time, which happens after relevant BPF programs were loaded into the kernel. Also unfortunately, we need to make a call whether to use multi-uprobes or singular uprobe for SEC("usdt") programs during BPF object load time, at which point we have no information about possible PID filtering. The distinction between single and multi-uprobes is small, but important for the kernel. Multi-uprobes get BPF_TRACE_UPROBE_MULTI attach type, and kernel internally substitiute different implementation of some of BPF helpers (e.g., bpf_get_attach_cookie()) depending on whether uprobe is multi or singular. So, multi-uprobes and singular uprobes cannot be intermixed. All the above implies that we have to make an early and conservative call about the use of multi-uprobes. And so this patch modifies libbpf's existing feature detector for multi-uprobe support to also check correct PID filtering. If PID filtering is not yet fixed, we fall back to singular uprobes for USDTs. This extension to feature detection is simple thanks to kernel's -EINVAL addition for pid < 0. Acked-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240521163401.3005045-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/features.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/features.c b/tools/lib/bpf/features.c index a336786a22a38d..3df0125ed5fa7a 100644 --- a/tools/lib/bpf/features.c +++ b/tools/lib/bpf/features.c @@ -392,11 +392,40 @@ static int probe_uprobe_multi_link(int token_fd) link_fd = bpf_link_create(prog_fd, -1, BPF_TRACE_UPROBE_MULTI, &link_opts); err = -errno; /* close() can clobber errno */ + if (link_fd >= 0 || err != -EBADF) { + close(link_fd); + close(prog_fd); + return 0; + } + + /* Initial multi-uprobe support in kernel didn't handle PID filtering + * correctly (it was doing thread filtering, not process filtering). + * So now we'll detect if PID filtering logic was fixed, and, if not, + * we'll pretend multi-uprobes are not supported, if not. + * Multi-uprobes are used in USDT attachment logic, and we need to be + * conservative here, because multi-uprobe selection happens early at + * load time, while the use of PID filtering is known late at + * attachment time, at which point it's too late to undo multi-uprobe + * selection. + * + * Creating uprobe with pid == -1 for (invalid) '/' binary will fail + * early with -EINVAL on kernels with fixed PID filtering logic; + * otherwise -ESRCH would be returned if passed correct binary path + * (but we'll just get -BADF, of course). + */ + link_opts.uprobe_multi.pid = -1; /* invalid PID */ + link_opts.uprobe_multi.path = "/"; /* invalid path */ + link_opts.uprobe_multi.offsets = &offset; + link_opts.uprobe_multi.cnt = 1; + + link_fd = bpf_link_create(prog_fd, -1, BPF_TRACE_UPROBE_MULTI, &link_opts); + err = -errno; /* close() can clobber errno */ + if (link_fd >= 0) close(link_fd); close(prog_fd); - return link_fd < 0 && err == -EBADF; + return link_fd < 0 && err == -EINVAL; } static int probe_kern_bpf_cookie(int token_fd) From 70342420a1cf1173bdec456e5fa574a804e422db Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 May 2024 09:34:00 -0700 Subject: [PATCH 0087/1578] selftests/bpf: extend multi-uprobe tests with child thread case Extend existing multi-uprobe tests to test that PID filtering works correctly. We already have child *process* tests, but we need also child *thread* tests. This patch adds spawn_thread() helper to start child thread, wait for it to be ready, and then instruct it to trigger desired uprobes. Additionally, we extend BPF-side code to track thread ID, not just process ID. Also we detect whether extraneous triggerings with unexpected process IDs happened, and validate that none of that happened in practice. These changes prove that fixed PID filtering logic for multi-uprobe works as expected. These tests fail on old kernels. Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20240521163401.3005045-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/uprobe_multi_test.c | 107 ++++++++++++++++-- .../selftests/bpf/progs/uprobe_multi.c | 17 ++- 2 files changed, 115 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index 38fda42fd70f9c..677232d314322b 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include "uprobe_multi.skel.h" #include "uprobe_multi_bench.skel.h" @@ -27,7 +28,10 @@ noinline void uprobe_multi_func_3(void) struct child { int go[2]; + int c2p[2]; /* child -> parent channel */ int pid; + int tid; + pthread_t thread; }; static void release_child(struct child *child) @@ -38,6 +42,10 @@ static void release_child(struct child *child) return; close(child->go[1]); close(child->go[0]); + if (child->thread) + pthread_join(child->thread, NULL); + close(child->c2p[0]); + close(child->c2p[1]); if (child->pid > 0) waitpid(child->pid, &child_status, 0); } @@ -63,7 +71,7 @@ static struct child *spawn_child(void) if (pipe(child.go)) return NULL; - child.pid = fork(); + child.pid = child.tid = fork(); if (child.pid < 0) { release_child(&child); errno = EINVAL; @@ -89,6 +97,66 @@ static struct child *spawn_child(void) return &child; } +static void *child_thread(void *ctx) +{ + struct child *child = ctx; + int c = 0, err; + + child->tid = syscall(SYS_gettid); + + /* let parent know we are ready */ + err = write(child->c2p[1], &c, 1); + if (err != 1) + pthread_exit(&err); + + /* wait for parent's kick */ + err = read(child->go[0], &c, 1); + if (err != 1) + pthread_exit(&err); + + uprobe_multi_func_1(); + uprobe_multi_func_2(); + uprobe_multi_func_3(); + + err = 0; + pthread_exit(&err); +} + +static struct child *spawn_thread(void) +{ + static struct child child; + int c, err; + + /* pipe to notify child to execute the trigger functions */ + if (pipe(child.go)) + return NULL; + /* pipe to notify parent that child thread is ready */ + if (pipe(child.c2p)) { + close(child.go[0]); + close(child.go[1]); + return NULL; + } + + child.pid = getpid(); + + err = pthread_create(&child.thread, NULL, child_thread, &child); + if (err) { + err = -errno; + close(child.go[0]); + close(child.go[1]); + close(child.c2p[0]); + close(child.c2p[1]); + errno = -err; + return NULL; + } + + err = read(child.c2p[0], &c, 1); + if (!ASSERT_EQ(err, 1, "child_thread_ready")) + return NULL; + + return &child; +} + static void uprobe_multi_test_run(struct uprobe_multi *skel, struct child *child) { skel->bss->uprobe_multi_func_1_addr = (__u64) uprobe_multi_func_1; @@ -103,15 +171,22 @@ static void uprobe_multi_test_run(struct uprobe_multi *skel, struct child *child * passed at the probe attach. */ skel->bss->pid = child ? 0 : getpid(); + skel->bss->expect_pid = child ? child->pid : 0; + + /* trigger all probes, if we are testing child *process*, just to make + * sure that PID filtering doesn't let through activations from wrong + * PIDs; when we test child *thread*, we don't want to do this to + * avoid double counting number of triggering events + */ + if (!child || !child->thread) { + uprobe_multi_func_1(); + uprobe_multi_func_2(); + uprobe_multi_func_3(); + } if (child) kick_child(child); - /* trigger all probes */ - uprobe_multi_func_1(); - uprobe_multi_func_2(); - uprobe_multi_func_3(); - /* * There are 2 entry and 2 exit probe called for each uprobe_multi_func_[123] * function and each slepable probe (6) increments uprobe_multi_sleep_result. @@ -126,8 +201,12 @@ static void uprobe_multi_test_run(struct uprobe_multi *skel, struct child *child ASSERT_EQ(skel->bss->uprobe_multi_sleep_result, 6, "uprobe_multi_sleep_result"); - if (child) + ASSERT_FALSE(skel->bss->bad_pid_seen, "bad_pid_seen"); + + if (child) { ASSERT_EQ(skel->bss->child_pid, child->pid, "uprobe_multi_child_pid"); + ASSERT_EQ(skel->bss->child_tid, child->tid, "uprobe_multi_child_tid"); + } } static void test_skel_api(void) @@ -210,6 +289,13 @@ test_attach_api(const char *binary, const char *pattern, struct bpf_uprobe_multi return; __test_attach_api(binary, pattern, opts, child); + + /* pid filter (thread) */ + child = spawn_thread(); + if (!ASSERT_OK_PTR(child, "spawn_thread")) + return; + + __test_attach_api(binary, pattern, opts, child); } static void test_attach_api_pattern(void) @@ -495,6 +581,13 @@ static void test_link_api(void) return; __test_link_api(child); + + /* pid filter (thread) */ + child = spawn_thread(); + if (!ASSERT_OK_PTR(child, "spawn_thread")) + return; + + __test_link_api(child); } static void test_bench_attach_uprobe(void) diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi.c b/tools/testing/selftests/bpf/progs/uprobe_multi.c index 419d9aa28fce5c..86a7ff5d3726d0 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_multi.c +++ b/tools/testing/selftests/bpf/progs/uprobe_multi.c @@ -22,6 +22,10 @@ __u64 uprobe_multi_sleep_result = 0; int pid = 0; int child_pid = 0; +int child_tid = 0; + +int expect_pid = 0; +bool bad_pid_seen = false; bool test_cookie = false; void *user_ptr = 0; @@ -36,11 +40,19 @@ static __always_inline bool verify_sleepable_user_copy(void) static void uprobe_multi_check(void *ctx, bool is_return, bool is_sleep) { - child_pid = bpf_get_current_pid_tgid() >> 32; + __u64 cur_pid_tgid = bpf_get_current_pid_tgid(); + __u32 cur_pid; - if (pid && child_pid != pid) + cur_pid = cur_pid_tgid >> 32; + if (pid && cur_pid != pid) return; + if (expect_pid && cur_pid != expect_pid) + bad_pid_seen = true; + + child_pid = cur_pid_tgid >> 32; + child_tid = (__u32)cur_pid_tgid; + __u64 cookie = test_cookie ? bpf_get_attach_cookie(ctx) : 0; __u64 addr = bpf_get_func_ip(ctx); @@ -97,5 +109,6 @@ int uretprobe_sleep(struct pt_regs *ctx) SEC("uprobe.multi//proc/self/exe:uprobe_multi_func_*") int uprobe_extra(struct pt_regs *ctx) { + /* we need this one just to mix PID-filtered and global uprobes */ return 0; } From 198034a87dfeb64d5a8359a5089022c6b923646e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 May 2024 09:34:01 -0700 Subject: [PATCH 0088/1578] selftests/bpf: extend multi-uprobe tests with USDTs Validate libbpf's USDT-over-multi-uprobe logic by adding USDTs to existing multi-uprobe tests. This checks correct libbpf fallback to singular uprobes (when run on older kernels with buggy PID filtering). We reuse already established child process and child thread testing infrastructure, so additions are minimal. These test fail on either older kernels or older version of libbpf that doesn't detect PID filtering problems. Acked-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240521163401.3005045-6-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/uprobe_multi_test.c | 25 ++++++++++++++ .../selftests/bpf/progs/uprobe_multi.c | 33 +++++++++++++++++-- 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index 677232d314322b..bf6ca8e3eb1314 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -8,6 +8,7 @@ #include "uprobe_multi_usdt.skel.h" #include "bpf/libbpf_internal.h" #include "testing_helpers.h" +#include "../sdt.h" static char test_data[] = "test_data"; @@ -26,6 +27,11 @@ noinline void uprobe_multi_func_3(void) asm volatile (""); } +noinline void usdt_trigger(void) +{ + STAP_PROBE(test, pid_filter_usdt); +} + struct child { int go[2]; int c2p[2]; /* child -> parent channel */ @@ -90,6 +96,7 @@ static struct child *spawn_child(void) uprobe_multi_func_1(); uprobe_multi_func_2(); uprobe_multi_func_3(); + usdt_trigger(); exit(errno); } @@ -117,6 +124,7 @@ static void *child_thread(void *ctx) uprobe_multi_func_1(); uprobe_multi_func_2(); uprobe_multi_func_3(); + usdt_trigger(); err = 0; pthread_exit(&err); @@ -182,6 +190,7 @@ static void uprobe_multi_test_run(struct uprobe_multi *skel, struct child *child uprobe_multi_func_1(); uprobe_multi_func_2(); uprobe_multi_func_3(); + usdt_trigger(); } if (child) @@ -269,8 +278,24 @@ __test_attach_api(const char *binary, const char *pattern, struct bpf_uprobe_mul if (!ASSERT_OK_PTR(skel->links.uprobe_extra, "bpf_program__attach_uprobe_multi")) goto cleanup; + /* Attach (uprobe-backed) USDTs */ + skel->links.usdt_pid = bpf_program__attach_usdt(skel->progs.usdt_pid, pid, binary, + "test", "pid_filter_usdt", NULL); + if (!ASSERT_OK_PTR(skel->links.usdt_pid, "attach_usdt_pid")) + goto cleanup; + + skel->links.usdt_extra = bpf_program__attach_usdt(skel->progs.usdt_extra, -1, binary, + "test", "pid_filter_usdt", NULL); + if (!ASSERT_OK_PTR(skel->links.usdt_extra, "attach_usdt_extra")) + goto cleanup; + uprobe_multi_test_run(skel, child); + ASSERT_FALSE(skel->bss->bad_pid_seen_usdt, "bad_pid_seen_usdt"); + if (child) { + ASSERT_EQ(skel->bss->child_pid_usdt, child->pid, "usdt_multi_child_pid"); + ASSERT_EQ(skel->bss->child_tid_usdt, child->tid, "usdt_multi_child_tid"); + } cleanup: uprobe_multi__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi.c b/tools/testing/selftests/bpf/progs/uprobe_multi.c index 86a7ff5d3726d0..44190efcdba217 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_multi.c +++ b/tools/testing/selftests/bpf/progs/uprobe_multi.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include "vmlinux.h" #include #include -#include +#include char _license[] SEC("license") = "GPL"; @@ -23,9 +23,12 @@ __u64 uprobe_multi_sleep_result = 0; int pid = 0; int child_pid = 0; int child_tid = 0; +int child_pid_usdt = 0; +int child_tid_usdt = 0; int expect_pid = 0; bool bad_pid_seen = false; +bool bad_pid_seen_usdt = false; bool test_cookie = false; void *user_ptr = 0; @@ -112,3 +115,29 @@ int uprobe_extra(struct pt_regs *ctx) /* we need this one just to mix PID-filtered and global uprobes */ return 0; } + +SEC("usdt") +int usdt_pid(struct pt_regs *ctx) +{ + __u64 cur_pid_tgid = bpf_get_current_pid_tgid(); + __u32 cur_pid; + + cur_pid = cur_pid_tgid >> 32; + if (pid && cur_pid != pid) + return 0; + + if (expect_pid && cur_pid != expect_pid) + bad_pid_seen_usdt = true; + + child_pid_usdt = cur_pid_tgid >> 32; + child_tid_usdt = (__u32)cur_pid_tgid; + + return 0; +} + +SEC("usdt") +int usdt_extra(struct pt_regs *ctx) +{ + /* we need this one just to mix PID-filtered and global USDT probes */ + return 0; +} From dd6a403795f0c7b5c566f86f2ee6b687278d3c1c Mon Sep 17 00:00:00 2001 From: Shahab Vahedi Date: Sat, 25 May 2024 05:56:28 +0200 Subject: [PATCH 0089/1578] ARC, bpf: Fix issues reported by the static analyzers Also updated couple of comments along the way. One of the issues reported was indeed a bug in the code: memset(ctx, 0, sizeof(ctx)) // original line memset(ctx, 0, sizeof(*ctx)) // fixed line That was a nice catch. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202405222314.UG5F2NHn-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202405232036.Xqoc3b0J-lkp@intel.com/ Signed-off-by: Shahab Vahedi Link: https://lore.kernel.org/r/20240525035628.1026-1-list+bpf@vahedi.org Signed-off-by: Alexei Starovoitov --- arch/arc/net/bpf_jit.h | 2 +- arch/arc/net/bpf_jit_arcv2.c | 10 ++++++---- arch/arc/net/bpf_jit_core.c | 22 +++++++++++----------- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/arch/arc/net/bpf_jit.h b/arch/arc/net/bpf_jit.h index ec44873c42d154..495f3023e4c182 100644 --- a/arch/arc/net/bpf_jit.h +++ b/arch/arc/net/bpf_jit.h @@ -39,7 +39,7 @@ /************** Functions that the back-end must provide **************/ /* Extension for 32-bit operations. */ -inline u8 zext(u8 *buf, u8 rd); +u8 zext(u8 *buf, u8 rd); /***** Moves *****/ u8 mov_r32(u8 *buf, u8 rd, u8 rs, u8 sign_ext); u8 mov_r32_i32(u8 *buf, u8 reg, s32 imm); diff --git a/arch/arc/net/bpf_jit_arcv2.c b/arch/arc/net/bpf_jit_arcv2.c index 31bfb6e9ce00fb..4458e409ca0a84 100644 --- a/arch/arc/net/bpf_jit_arcv2.c +++ b/arch/arc/net/bpf_jit_arcv2.c @@ -62,7 +62,7 @@ enum { * If/when we decide to add ARCv2 instructions that do use register pairs, * the mapping, hopefully, doesn't need to be revisited. */ -const u8 bpf2arc[][2] = { +static const u8 bpf2arc[][2] = { /* Return value from in-kernel function, and exit value from eBPF */ [BPF_REG_0] = {ARC_R_8, ARC_R_9}, /* Arguments from eBPF program to in-kernel function */ @@ -1302,7 +1302,7 @@ static u8 arc_b(u8 *buf, s32 offset) /************* Packers (Deal with BPF_REGs) **************/ -inline u8 zext(u8 *buf, u8 rd) +u8 zext(u8 *buf, u8 rd) { if (rd != BPF_REG_FP) return arc_movi_r(buf, REG_HI(rd), 0); @@ -2235,6 +2235,7 @@ u8 gen_swap(u8 *buf, u8 rd, u8 size, u8 endian, bool force, bool do_zext) break; default: /* The caller must have handled this. */ + break; } } else { /* @@ -2253,6 +2254,7 @@ u8 gen_swap(u8 *buf, u8 rd, u8 size, u8 endian, bool force, bool do_zext) break; default: /* The caller must have handled this. */ + break; } } @@ -2517,7 +2519,7 @@ u8 arc_epilogue(u8 *buf, u32 usage, u16 frame_size) #define JCC64_NR_OF_JMPS 3 /* Number of jumps in jcc64 template. */ #define JCC64_INSNS_TO_END 3 /* Number of insn. inclusive the 2nd jmp to end. */ #define JCC64_SKIP_JMP 1 /* Index of the "skip" jump to "end". */ -const struct { +static const struct { /* * "jit_off" is common between all "jmp[]" and is coupled with * "cond" of each "jmp[]" instance. e.g.: @@ -2883,7 +2885,7 @@ u8 gen_jmp_64(u8 *buf, u8 rd, u8 rs, u8 cond, u32 curr_off, u32 targ_off) * The "ARC_CC_SET" becomes "CC_unequal" because of the "tst" * instruction that precedes the conditional branch. */ -const u8 arcv2_32_jmps[ARC_CC_LAST] = { +static const u8 arcv2_32_jmps[ARC_CC_LAST] = { [ARC_CC_UGT] = CC_great_u, [ARC_CC_UGE] = CC_great_eq_u, [ARC_CC_ULT] = CC_less_u, diff --git a/arch/arc/net/bpf_jit_core.c b/arch/arc/net/bpf_jit_core.c index 6f6b4ffccf2c29..e3628922c24a0c 100644 --- a/arch/arc/net/bpf_jit_core.c +++ b/arch/arc/net/bpf_jit_core.c @@ -159,7 +159,7 @@ static void jit_dump(const struct jit_context *ctx) /* Initialise the context so there's no garbage. */ static int jit_ctx_init(struct jit_context *ctx, struct bpf_prog *prog) { - memset(ctx, 0, sizeof(ctx)); + memset(ctx, 0, sizeof(*ctx)); ctx->orig_prog = prog; @@ -167,7 +167,7 @@ static int jit_ctx_init(struct jit_context *ctx, struct bpf_prog *prog) ctx->prog = bpf_jit_blind_constants(prog); if (IS_ERR(ctx->prog)) return PTR_ERR(ctx->prog); - ctx->blinded = (ctx->prog == ctx->orig_prog ? false : true); + ctx->blinded = (ctx->prog != ctx->orig_prog); /* If the verifier doesn't zero-extend, then we have to do it. */ ctx->do_zext = !ctx->prog->aux->verifier_zext; @@ -1182,12 +1182,12 @@ static int jit_prepare(struct jit_context *ctx) } /* - * All the "handle_*()" functions have been called before by the - * "jit_prepare()". If there was an error, we would know by now. - * Therefore, no extra error checking at this point, other than - * a sanity check at the end that expects the calculated length - * (jit.len) to be equal to the length of generated instructions - * (jit.index). + * jit_compile() is the real compilation phase. jit_prepare() is + * invoked before jit_compile() as a dry-run to make sure everything + * will go OK and allocate the necessary memory. + * + * In the end, jit_compile() checks if it has produced the same number + * of instructions as jit_prepare() would. */ static int jit_compile(struct jit_context *ctx) { @@ -1407,9 +1407,9 @@ static struct bpf_prog *do_extra_pass(struct bpf_prog *prog) /* * This function may be invoked twice for the same stream of BPF - * instructions. The "extra pass" happens, when there are "call"s - * involved that their addresses are not known during the first - * invocation. + * instructions. The "extra pass" happens, when there are + * (re)locations involved that their addresses are not known + * during the first run. */ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { From d6fe532b7499e4575f9647879b7a34625817fe7f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 24 May 2024 18:36:16 +0200 Subject: [PATCH 0090/1578] netkit: Fix setting mac address in l2 mode When running Cilium connectivity test suite with netkit in L2 mode, we found that it is expected to be able to specify a custom MAC address for the devices, in particular, cilium-cni obtains the specified MAC address by querying the endpoint and sets the MAC address of the interface inside the Pod. Thus, fix the missing support in netkit for L2 mode. Fixes: 35dfaad7188c ("netkit, bpf: Add bpf programmable net device") Signed-off-by: Daniel Borkmann Acked-by: Nikolay Aleksandrov Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20240524163619.26001-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- drivers/net/netkit.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index a4d2e76a8d587c..272894053e2c45 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -155,6 +155,16 @@ static void netkit_set_multicast(struct net_device *dev) /* Nothing to do, we receive whatever gets pushed to us! */ } +static int netkit_set_macaddr(struct net_device *dev, void *sa) +{ + struct netkit *nk = netkit_priv(dev); + + if (nk->mode != NETKIT_L2) + return -EOPNOTSUPP; + + return eth_mac_addr(dev, sa); +} + static void netkit_set_headroom(struct net_device *dev, int headroom) { struct netkit *nk = netkit_priv(dev), *nk2; @@ -198,6 +208,7 @@ static const struct net_device_ops netkit_netdev_ops = { .ndo_start_xmit = netkit_xmit, .ndo_set_rx_mode = netkit_set_multicast, .ndo_set_rx_headroom = netkit_set_headroom, + .ndo_set_mac_address = netkit_set_macaddr, .ndo_get_iflink = netkit_get_iflink, .ndo_get_peer_dev = netkit_peer_dev, .ndo_get_stats64 = netkit_get_stats, @@ -300,9 +311,11 @@ static int netkit_validate(struct nlattr *tb[], struct nlattr *data[], if (!attr) return 0; - NL_SET_ERR_MSG_ATTR(extack, attr, - "Setting Ethernet address is not supported"); - return -EOPNOTSUPP; + if (nla_len(attr) != ETH_ALEN) + return -EINVAL; + if (!is_valid_ether_addr(nla_data(attr))) + return -EADDRNOTAVAIL; + return 0; } static struct rtnl_link_ops netkit_link_ops; @@ -365,6 +378,9 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev, strscpy(ifname, "nk%d", IFNAMSIZ); ifname_assign_type = NET_NAME_ENUM; } + if (mode != NETKIT_L2 && + (tb[IFLA_ADDRESS] || tbp[IFLA_ADDRESS])) + return -EOPNOTSUPP; net = rtnl_link_get_net(src_net, tbp); if (IS_ERR(net)) @@ -379,7 +395,7 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev, netif_inherit_tso_max(peer, dev); - if (mode == NETKIT_L2) + if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS])) eth_hw_addr_random(peer); if (ifmp && dev->ifindex) peer->ifindex = ifmp->ifi_index; @@ -402,7 +418,7 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev, if (err < 0) goto err_configure_peer; - if (mode == NETKIT_L2) + if (mode == NETKIT_L2 && !tb[IFLA_ADDRESS]) eth_hw_addr_random(dev); if (tb[IFLA_IFNAME]) nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); From 3998d184267dfcff858aaa84d3de17429253629d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 24 May 2024 18:36:17 +0200 Subject: [PATCH 0091/1578] netkit: Fix pkt_type override upon netkit pass verdict When running Cilium connectivity test suite with netkit in L2 mode, we found that compared to tcx a few tests were failing which pushed traffic into an L7 proxy sitting in host namespace. The problem in particular is around the invocation of eth_type_trans() in netkit. In case of tcx, this is run before the tcx ingress is triggered inside host namespace and thus if the BPF program uses the bpf_skb_change_type() helper the newly set type is retained. However, in case of netkit, the late eth_type_trans() invocation overrides the earlier decision from the BPF program which eventually leads to the test failure. Instead of eth_type_trans(), split out the relevant parts, meaning, reset of mac header and call to eth_skb_pkt_type() before the BPF program is run in order to have the same behavior as with tcx, and refactor a small helper called eth_skb_pull_mac() which is run in case it's passed up the stack where the mac header must be pulled. With this all connectivity tests pass. Fixes: 35dfaad7188c ("netkit, bpf: Add bpf programmable net device") Signed-off-by: Daniel Borkmann Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20240524163619.26001-2-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- drivers/net/netkit.c | 4 +++- include/linux/etherdevice.h | 8 ++++++++ net/ethernet/eth.c | 4 +--- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index 272894053e2c45..16789cd446e9e4 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -55,6 +55,7 @@ static void netkit_prep_forward(struct sk_buff *skb, bool xnet) skb_scrub_packet(skb, xnet); skb->priority = 0; nf_skip_egress(skb, true); + skb_reset_mac_header(skb); } static struct netkit *netkit_priv(const struct net_device *dev) @@ -78,6 +79,7 @@ static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev) skb_orphan_frags(skb, GFP_ATOMIC))) goto drop; netkit_prep_forward(skb, !net_eq(dev_net(dev), dev_net(peer))); + eth_skb_pkt_type(skb, peer); skb->dev = peer; entry = rcu_dereference(nk->active); if (entry) @@ -85,7 +87,7 @@ static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev) switch (ret) { case NETKIT_NEXT: case NETKIT_PASS: - skb->protocol = eth_type_trans(skb, skb->dev); + eth_skb_pull_mac(skb); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) { dev_sw_netstats_tx_add(dev, 1, len); diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 2ad1ffa4ccb9fe..0ed47d00549ba7 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -636,6 +636,14 @@ static inline void eth_skb_pkt_type(struct sk_buff *skb, } } +static inline struct ethhdr *eth_skb_pull_mac(struct sk_buff *skb) +{ + struct ethhdr *eth = (struct ethhdr *)skb->data; + + skb_pull_inline(skb, ETH_HLEN); + return eth; +} + /** * eth_skb_pad - Pad buffer to mininum number of octets for Ethernet frame * @skb: Buffer to pad diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 049c3adeb85044..4e3651101b8660 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -161,9 +161,7 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) skb->dev = dev; skb_reset_mac_header(skb); - eth = (struct ethhdr *)skb->data; - skb_pull_inline(skb, ETH_HLEN); - + eth = eth_skb_pull_mac(skb); eth_skb_pkt_type(skb, dev); /* From 998ffeb2738e26f134dc8e63b5dcaece22573957 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 24 May 2024 18:36:18 +0200 Subject: [PATCH 0092/1578] selftests/bpf: Add netkit tests for mac address This adds simple tests around setting MAC addresses in the different netkit modes. Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20240524163619.26001-3-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/tc_netkit.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/tc_netkit.c b/tools/testing/selftests/bpf/prog_tests/tc_netkit.c index 15ee7b2fc41064..18b2e969a45673 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_netkit.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_netkit.c @@ -73,6 +73,16 @@ static int create_netkit(int mode, int policy, int peer_policy, int *ifindex, "up primary"); ASSERT_OK(system("ip addr add dev " netkit_name " 10.0.0.1/24"), "addr primary"); + + if (mode == NETKIT_L3) { + ASSERT_EQ(system("ip link set dev " netkit_name + " addr ee:ff:bb:cc:aa:dd 2> /dev/null"), 512, + "set hwaddress"); + } else { + ASSERT_OK(system("ip link set dev " netkit_name + " addr ee:ff:bb:cc:aa:dd"), + "set hwaddress"); + } if (same_netns) { ASSERT_OK(system("ip link set dev " netkit_peer " up"), "up peer"); From 95348e463eabc803341c67d562f9e0a5f0a48fe6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 24 May 2024 18:36:19 +0200 Subject: [PATCH 0093/1578] selftests/bpf: Add netkit test for pkt_type Add a test case to assert that the skb->pkt_type which was set from the BPF program is retained from the netkit xmit side to the peer's device at tcx ingress location. # ./vmtest.sh -- ./test_progs -t netkit [...] ./test_progs -t netkit [ 1.140780] bpf_testmod: loading out-of-tree module taints kernel. [ 1.141127] bpf_testmod: module verification failed: signature and/or required key missing - tainting kernel [ 1.284601] tsc: Refined TSC clocksource calibration: 3408.006 MHz [ 1.286672] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x311fd9b189d, max_idle_ns: 440795225691 ns [ 1.290384] clocksource: Switched to clocksource tsc #345 tc_netkit_basic:OK #346 tc_netkit_device:OK #347 tc_netkit_multi_links:OK #348 tc_netkit_multi_opts:OK #349 tc_netkit_neigh_links:OK #350 tc_netkit_pkt_type:OK Summary: 6/0 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20240524163619.26001-4-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/tc_netkit.c | 84 +++++++++++++++++++ .../selftests/bpf/progs/test_tc_link.c | 35 +++++++- 2 files changed, 118 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/tc_netkit.c b/tools/testing/selftests/bpf/prog_tests/tc_netkit.c index 18b2e969a45673..b9135720024ccf 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_netkit.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_netkit.c @@ -99,6 +99,16 @@ static int create_netkit(int mode, int policy, int peer_policy, int *ifindex, return err; } +static void move_netkit(void) +{ + ASSERT_OK(system("ip link set " netkit_peer " netns foo"), + "move peer"); + ASSERT_OK(system("ip netns exec foo ip link set dev " + netkit_peer " up"), "up peer"); + ASSERT_OK(system("ip netns exec foo ip addr add dev " + netkit_peer " 10.0.0.2/24"), "addr peer"); +} + static void destroy_netkit(void) { ASSERT_OK(system("ip link del dev " netkit_name), "del primary"); @@ -695,3 +705,77 @@ void serial_test_tc_netkit_neigh_links(void) serial_test_tc_netkit_neigh_links_target(NETKIT_L2, BPF_NETKIT_PRIMARY); serial_test_tc_netkit_neigh_links_target(NETKIT_L3, BPF_NETKIT_PRIMARY); } + +static void serial_test_tc_netkit_pkt_type_mode(int mode) +{ + LIBBPF_OPTS(bpf_netkit_opts, optl_nk); + LIBBPF_OPTS(bpf_tcx_opts, optl_tcx); + int err, ifindex, ifindex2; + struct test_tc_link *skel; + struct bpf_link *link; + + err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS, + &ifindex, true); + if (err) + return; + + ifindex2 = if_nametoindex(netkit_peer); + ASSERT_NEQ(ifindex, ifindex2, "ifindex_1_2"); + + skel = test_tc_link__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + goto cleanup; + + ASSERT_EQ(bpf_program__set_expected_attach_type(skel->progs.tc1, + BPF_NETKIT_PRIMARY), 0, "tc1_attach_type"); + ASSERT_EQ(bpf_program__set_expected_attach_type(skel->progs.tc7, + BPF_TCX_INGRESS), 0, "tc7_attach_type"); + + err = test_tc_link__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PRIMARY, 0); + assert_mprog_count_ifindex(ifindex2, BPF_TCX_INGRESS, 0); + + link = bpf_program__attach_netkit(skel->progs.tc1, ifindex, &optl_nk); + if (!ASSERT_OK_PTR(link, "link_attach")) + goto cleanup; + + skel->links.tc1 = link; + + assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PRIMARY, 1); + assert_mprog_count_ifindex(ifindex2, BPF_TCX_INGRESS, 0); + + link = bpf_program__attach_tcx(skel->progs.tc7, ifindex2, &optl_tcx); + if (!ASSERT_OK_PTR(link, "link_attach")) + goto cleanup; + + skel->links.tc7 = link; + + assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PRIMARY, 1); + assert_mprog_count_ifindex(ifindex2, BPF_TCX_INGRESS, 1); + + move_netkit(); + + tc_skel_reset_all_seen(skel); + skel->bss->set_type = true; + ASSERT_EQ(send_icmp(), 0, "icmp_pkt"); + + ASSERT_EQ(skel->bss->seen_tc1, true, "seen_tc1"); + ASSERT_EQ(skel->bss->seen_tc7, true, "seen_tc7"); + + ASSERT_EQ(skel->bss->seen_host, true, "seen_host"); + ASSERT_EQ(skel->bss->seen_mcast, true, "seen_mcast"); +cleanup: + test_tc_link__destroy(skel); + + assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PRIMARY, 0); + destroy_netkit(); +} + +void serial_test_tc_netkit_pkt_type(void) +{ + serial_test_tc_netkit_pkt_type_mode(NETKIT_L2); + serial_test_tc_netkit_pkt_type_mode(NETKIT_L3); +} diff --git a/tools/testing/selftests/bpf/progs/test_tc_link.c b/tools/testing/selftests/bpf/progs/test_tc_link.c index 992400acb9572a..ab3eae3d6af879 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_link.c +++ b/tools/testing/selftests/bpf/progs/test_tc_link.c @@ -4,7 +4,8 @@ #include #include - +#include +#include #include #include @@ -16,7 +17,13 @@ bool seen_tc3; bool seen_tc4; bool seen_tc5; bool seen_tc6; +bool seen_tc7; + +bool set_type; + bool seen_eth; +bool seen_host; +bool seen_mcast; SEC("tc/ingress") int tc1(struct __sk_buff *skb) @@ -28,8 +35,16 @@ int tc1(struct __sk_buff *skb) if (bpf_skb_load_bytes(skb, 0, ð, sizeof(eth))) goto out; seen_eth = eth.h_proto == bpf_htons(ETH_P_IP); + seen_host = skb->pkt_type == PACKET_HOST; + if (seen_host && set_type) { + eth.h_dest[0] = 4; + if (bpf_skb_store_bytes(skb, 0, ð, sizeof(eth), 0)) + goto fail; + bpf_skb_change_type(skb, PACKET_MULTICAST); + } out: seen_tc1 = true; +fail: return TCX_NEXT; } @@ -67,3 +82,21 @@ int tc6(struct __sk_buff *skb) seen_tc6 = true; return TCX_PASS; } + +SEC("tc/ingress") +int tc7(struct __sk_buff *skb) +{ + struct ethhdr eth = {}; + + if (skb->protocol != __bpf_constant_htons(ETH_P_IP)) + goto out; + if (bpf_skb_load_bytes(skb, 0, ð, sizeof(eth))) + goto out; + if (eth.h_dest[0] == 4 && set_type) { + seen_mcast = skb->pkt_type == PACKET_MULTICAST; + bpf_skb_change_type(skb, PACKET_HOST); + } +out: + seen_tc7 = true; + return TCX_PASS; +} From 67ec8cdf29971677b2fb4b6d92871eb5d5e95597 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 22 May 2024 13:37:54 +0800 Subject: [PATCH 0094/1578] hwrng: core - Remove add_early_randomness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A potential deadlock was reported with the config file at https://web.archive.org/web/20240522052129/https://0x0.st/XPN_.txt In this particular configuration, the deadlock doesn't exist because the warning triggered at a point before modules were even available. However, the deadlock can be real because any module loaded would invoke async_synchronize_full. The issue is spurious for software crypto algorithms which aren't themselves involved in async probing. However, it would be hard to avoid for a PCI crypto driver using async probing. In this particular call trace, the problem is easily avoided because the only reason the module is being requested during probing is the add_early_randomness call in the hwrng core. This feature is vestigial since there is now a kernel thread dedicated to doing exactly this. So remove add_early_randomness as it is no longer needed. Reported-by: Nícolas F. R. A. Prado Reported-by: Eric Biggers Fixes: 1b6d7f9eb150 ("tpm: add session encryption protection to tpm2_get_random()") Link: https://lore.kernel.org/r/119dc5ed-f159-41be-9dda-1a056f29888d@notapiano/ Signed-off-by: Herbert Xu Reviewed-by: Jarkko Sakkinen Tested-by: Nícolas F. R. A. Prado Signed-off-by: Herbert Xu --- drivers/char/hw_random/core.c | 47 +++-------------------------------- 1 file changed, 4 insertions(+), 43 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index f5c71a617a9931..4084df65c9fa33 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -64,19 +64,6 @@ static size_t rng_buffer_size(void) return RNG_BUFFER_SIZE; } -static void add_early_randomness(struct hwrng *rng) -{ - int bytes_read; - - mutex_lock(&reading_mutex); - bytes_read = rng_get_data(rng, rng_fillbuf, 32, 0); - mutex_unlock(&reading_mutex); - if (bytes_read > 0) { - size_t entropy = bytes_read * 8 * rng->quality / 1024; - add_hwgenerator_randomness(rng_fillbuf, bytes_read, entropy, false); - } -} - static inline void cleanup_rng(struct kref *kref) { struct hwrng *rng = container_of(kref, struct hwrng, ref); @@ -340,13 +327,12 @@ static ssize_t rng_current_store(struct device *dev, const char *buf, size_t len) { int err; - struct hwrng *rng, *old_rng, *new_rng; + struct hwrng *rng, *new_rng; err = mutex_lock_interruptible(&rng_mutex); if (err) return -ERESTARTSYS; - old_rng = current_rng; if (sysfs_streq(buf, "")) { err = enable_best_rng(); } else { @@ -362,11 +348,8 @@ static ssize_t rng_current_store(struct device *dev, new_rng = get_current_rng_nolock(); mutex_unlock(&rng_mutex); - if (new_rng) { - if (new_rng != old_rng) - add_early_randomness(new_rng); + if (new_rng) put_rng(new_rng); - } return err ? : len; } @@ -544,7 +527,6 @@ int hwrng_register(struct hwrng *rng) { int err = -EINVAL; struct hwrng *tmp; - bool is_new_current = false; if (!rng->name || (!rng->data_read && !rng->read)) goto out; @@ -573,25 +555,8 @@ int hwrng_register(struct hwrng *rng) err = set_current_rng(rng); if (err) goto out_unlock; - /* to use current_rng in add_early_randomness() we need - * to take a ref - */ - is_new_current = true; - kref_get(&rng->ref); } mutex_unlock(&rng_mutex); - if (is_new_current || !rng->init) { - /* - * Use a new device's input to add some randomness to - * the system. If this rng device isn't going to be - * used right away, its init function hasn't been - * called yet by set_current_rng(); so only use the - * randomness from devices that don't need an init callback - */ - add_early_randomness(rng); - } - if (is_new_current) - put_rng(rng); return 0; out_unlock: mutex_unlock(&rng_mutex); @@ -602,12 +567,11 @@ EXPORT_SYMBOL_GPL(hwrng_register); void hwrng_unregister(struct hwrng *rng) { - struct hwrng *old_rng, *new_rng; + struct hwrng *new_rng; int err; mutex_lock(&rng_mutex); - old_rng = current_rng; list_del(&rng->list); complete_all(&rng->dying); if (current_rng == rng) { @@ -626,11 +590,8 @@ void hwrng_unregister(struct hwrng *rng) } else mutex_unlock(&rng_mutex); - if (new_rng) { - if (old_rng != new_rng) - add_early_randomness(new_rng); + if (new_rng) put_rng(new_rng); - } wait_for_completion(&rng->cleanup_done); } From d509cadc3a48fee394d68757e4b685f7c143ed64 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 May 2024 00:53:25 -0400 Subject: [PATCH 0095/1578] bcachefs: Fix debug assert Reported-by: syzbot+a8074a75b8d73328751e@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 2206a8dee693ab..df2bea38e83f0f 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -564,7 +564,7 @@ static void __bch2_fs_free(struct bch_fs *c) BUG_ON(atomic_read(&c->journal_keys.ref)); bch2_fs_btree_write_buffer_exit(c); percpu_free_rwsem(&c->mark_lock); - EBUG_ON(percpu_u64_get(c->online_reserved)); + EBUG_ON(c->online_reserved && percpu_u64_get(c->online_reserved)); free_percpu(c->online_reserved); darray_exit(&c->btree_roots_extra); From 9242a34b760648b722f4958749ad83ef7d0f7525 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 May 2024 12:38:53 -0400 Subject: [PATCH 0096/1578] bcachefs: Fix sb-downgrade validation Superblock downgrade entries are only two byte aligned, but section sizes are 8 byte aligned, which means we have to be careful about overrun checks; an entry that crosses the end of the section is allowed (and ignored) as long as it has zero errors. Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-downgrade.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 390a1bbd2567ac..3fb23e399ffb3e 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -146,10 +146,17 @@ static int bch2_sb_downgrade_validate(struct bch_sb *sb, struct bch_sb_field *f, for (const struct bch_sb_field_downgrade_entry *i = e->entries; (void *) i < vstruct_end(&e->field); i = downgrade_entry_next_c(i)) { + /* + * Careful: sb_field_downgrade_entry is only 2 byte aligned, but + * section sizes are 8 byte aligned - an empty entry spanning + * the end of the section is allowed (and ignored): + */ + if ((void *) &i->errors[0] > vstruct_end(&e->field)) + break; + if (flags & BCH_VALIDATE_write && - ((void *) &i->errors[0] > vstruct_end(&e->field) || - (void *) downgrade_entry_next_c(i) > vstruct_end(&e->field))) { - prt_printf(err, "downgrade entry overruns end of superblock section)"); + (void *) downgrade_entry_next_c(i) > vstruct_end(&e->field)) { + prt_printf(err, "downgrade entry overruns end of superblock section"); return -BCH_ERR_invalid_sb_downgrade; } From f94b77709e82242c1101e59a90a7807455c4ab2a Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Thu, 23 May 2024 16:22:34 -0700 Subject: [PATCH 0097/1578] firewire: add missing MODULE_DESCRIPTION() to test modules Fix the 'make W=1' warnings: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/firewire/uapi-test.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/firewire/packet-serdes-test.o Signed-off-by: Jeff Johnson Link: https://lore.kernel.org/r/20240523-md-firewire-uapi-test-v1-1-6be5adcc3aed@quicinc.com Signed-off-by: Takashi Sakamoto --- drivers/firewire/packet-serdes-test.c | 1 + drivers/firewire/uapi-test.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/firewire/packet-serdes-test.c b/drivers/firewire/packet-serdes-test.c index f93c966e794d97..e83b1fece780ee 100644 --- a/drivers/firewire/packet-serdes-test.c +++ b/drivers/firewire/packet-serdes-test.c @@ -579,4 +579,5 @@ static struct kunit_suite packet_serdes_test_suite = { }; kunit_test_suite(packet_serdes_test_suite); +MODULE_DESCRIPTION("FireWire packet serialization/deserialization unit test suite"); MODULE_LICENSE("GPL"); diff --git a/drivers/firewire/uapi-test.c b/drivers/firewire/uapi-test.c index 2fcbede4fab1f5..bc3f10a2e516e0 100644 --- a/drivers/firewire/uapi-test.c +++ b/drivers/firewire/uapi-test.c @@ -86,4 +86,5 @@ static struct kunit_suite structure_layout_test_suite = { }; kunit_test_suite(structure_layout_test_suite); +MODULE_DESCRIPTION("FireWire UAPI unit test suite"); MODULE_LICENSE("GPL"); From 611b7eb19d0a305d4de00280e4a71a1b15c507fc Mon Sep 17 00:00:00 2001 From: Jim Wylder Date: Thu, 23 May 2024 16:14:36 -0500 Subject: [PATCH 0098/1578] regmap-i2c: Subtract reg size from max_write Currently, when an adapter defines a max_write_len quirk, the data will be chunked into data sizes equal to the max_write_len quirk value. But the payload will be increased by the size of the register address before transmission. The resulting value always ends up larger than the limit set by the quirk. Avoid this error by setting regmap's max_write to the quirk's max_write_len minus the number of bytes for the register and padding. This allows the chunking to work correctly for this limited case without impacting other use-cases. Signed-off-by: Jim Wylder Link: https://msgid.link/r/20240523211437.2839942-1-jwylder@google.com Signed-off-by: Mark Brown --- drivers/base/regmap/regmap-i2c.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/base/regmap/regmap-i2c.c b/drivers/base/regmap/regmap-i2c.c index 3ec611dc0c09fb..a905e955bbfc78 100644 --- a/drivers/base/regmap/regmap-i2c.c +++ b/drivers/base/regmap/regmap-i2c.c @@ -350,7 +350,8 @@ static const struct regmap_bus *regmap_get_i2c_bus(struct i2c_client *i2c, if (quirks->max_write_len && (bus->max_raw_write == 0 || bus->max_raw_write > quirks->max_write_len)) - max_write = quirks->max_write_len; + max_write = quirks->max_write_len - + (config->reg_bits + config->pad_bits) / BITS_PER_BYTE; if (max_read || max_write) { ret_bus = kmemdup(bus, sizeof(*bus), GFP_KERNEL); From b82b6eeefd30f1ff049bff54da419a30ad9354c0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 24 May 2024 20:46:25 +0100 Subject: [PATCH 0099/1578] bcachefs: Use copy_folio_from_iter_atomic() copy_page_from_iter_atomic() will be removed at some point. Also fixup a comment for folios. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-buffered.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 6b69e5cd68dd51..54873ecc635cb0 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -437,8 +437,8 @@ static void bch2_writepage_io_done(struct bch_write_op *op) */ /* - * PageWriteback is effectively our ref on the inode - fixup i_blocks - * before calling end_page_writeback: + * The writeback flag is effectively our ref on the inode - + * fixup i_blocks before calling folio_end_writeback: */ bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); @@ -898,7 +898,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, darray_for_each(fs, fi) { f = *fi; f_len = min(end, folio_end_pos(f)) - f_pos; - f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter); + f_copied = copy_folio_from_iter_atomic(f, f_offset, f_len, iter); if (!f_copied) { folios_trunc(&fs, fi); break; From 9ee267a29309233b9ef8f58ee61e0b1c9b5879e8 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Sun, 26 May 2024 09:52:48 -0700 Subject: [PATCH 0100/1578] fs: smb: common: add missing MODULE_DESCRIPTION() macros Fix the 'make W=1' warnings: WARNING: modpost: missing MODULE_DESCRIPTION() in fs/smb/common/cifs_arc4.o WARNING: modpost: missing MODULE_DESCRIPTION() in fs/smb/common/cifs_md4.o Signed-off-by: Jeff Johnson Signed-off-by: Steve French --- fs/smb/common/cifs_arc4.c | 1 + fs/smb/common/cifs_md4.c | 1 + 2 files changed, 2 insertions(+) diff --git a/fs/smb/common/cifs_arc4.c b/fs/smb/common/cifs_arc4.c index 043e4cb839fa23..df360ca47826ab 100644 --- a/fs/smb/common/cifs_arc4.c +++ b/fs/smb/common/cifs_arc4.c @@ -10,6 +10,7 @@ #include #include "arc4.h" +MODULE_DESCRIPTION("ARC4 Cipher Algorithm"); MODULE_LICENSE("GPL"); int cifs_arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len) diff --git a/fs/smb/common/cifs_md4.c b/fs/smb/common/cifs_md4.c index 50f78cfc6ce921..7ee7f4dad90c00 100644 --- a/fs/smb/common/cifs_md4.c +++ b/fs/smb/common/cifs_md4.c @@ -24,6 +24,7 @@ #include #include "md4.h" +MODULE_DESCRIPTION("MD4 Message Digest Algorithm (RFC1320)"); MODULE_LICENSE("GPL"); static inline u32 lshift(u32 x, unsigned int s) From 495000a38634e640e2fd02f7e4f1512ccc92d770 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 24 May 2024 17:11:46 +0200 Subject: [PATCH 0101/1578] ALSA: core: Remove debugfs at disconnection The card-specific debugfs entries are removed at the last stage of card free phase, and it's performed after synchronization of the closes of all opened fds. This works fine for most cases, but it can be potentially problematic for a hotplug device like USB-audio. Due to the nature of snd_card_free_when_closed(), the card free isn't called immediately after the driver removal for a hotplug device, but it's left until the last fd is closed. It implies that the card debugfs entries also remain. Meanwhile, when a new device is inserted before the last close and the very same card slot is assigned, the driver tries to create the card debugfs root again on the very same path. This conflicts with the remaining entry, and results in the kernel warning such as: debugfs: Directory 'card0' with parent 'sound' already present! with the missing debugfs entry afterwards. For avoiding such conflicts, remove debugfs entries at the device disconnection phase instead. The jack kctl debugfs entries get removed in snd_jack_dev_disconnect() instead of each kctl private_free. Fixes: 2d670ea2bd53 ("ALSA: jack: implement software jack injection via debugfs") Link: https://lore.kernel.org/r/20240524151256.32521-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/init.c | 9 +++++---- sound/core/jack.c | 21 ++++++++++++++------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/sound/core/init.c b/sound/core/init.c index 4e52bbe32786ba..b9b708cf980d6d 100644 --- a/sound/core/init.c +++ b/sound/core/init.c @@ -537,6 +537,11 @@ void snd_card_disconnect(struct snd_card *card) synchronize_irq(card->sync_irq); snd_info_card_disconnect(card); +#ifdef CONFIG_SND_DEBUG + debugfs_remove(card->debugfs_root); + card->debugfs_root = NULL; +#endif + if (card->registered) { device_del(&card->card_dev); card->registered = false; @@ -586,10 +591,6 @@ static int snd_card_do_free(struct snd_card *card) dev_warn(card->dev, "unable to free card info\n"); /* Not fatal error */ } -#ifdef CONFIG_SND_DEBUG - debugfs_remove(card->debugfs_root); - card->debugfs_root = NULL; -#endif if (card->release_completion) complete(card->release_completion); if (!card->managed) diff --git a/sound/core/jack.c b/sound/core/jack.c index e08b2c4fbd1a57..e4bcecdf89b7ec 100644 --- a/sound/core/jack.c +++ b/sound/core/jack.c @@ -37,11 +37,15 @@ static const int jack_switch_types[SND_JACK_SWITCH_TYPES] = { }; #endif /* CONFIG_SND_JACK_INPUT_DEV */ +static void snd_jack_remove_debugfs(struct snd_jack *jack); + static int snd_jack_dev_disconnect(struct snd_device *device) { -#ifdef CONFIG_SND_JACK_INPUT_DEV struct snd_jack *jack = device->device_data; + snd_jack_remove_debugfs(jack); + +#ifdef CONFIG_SND_JACK_INPUT_DEV guard(mutex)(&jack->input_dev_lock); if (!jack->input_dev) return 0; @@ -381,10 +385,14 @@ static int snd_jack_debugfs_add_inject_node(struct snd_jack *jack, return 0; } -static void snd_jack_debugfs_clear_inject_node(struct snd_jack_kctl *jack_kctl) +static void snd_jack_remove_debugfs(struct snd_jack *jack) { - debugfs_remove(jack_kctl->jack_debugfs_root); - jack_kctl->jack_debugfs_root = NULL; + struct snd_jack_kctl *jack_kctl; + + list_for_each_entry(jack_kctl, &jack->kctl_list, list) { + debugfs_remove(jack_kctl->jack_debugfs_root); + jack_kctl->jack_debugfs_root = NULL; + } } #else /* CONFIG_SND_JACK_INJECTION_DEBUG */ static int snd_jack_debugfs_add_inject_node(struct snd_jack *jack, @@ -393,7 +401,7 @@ static int snd_jack_debugfs_add_inject_node(struct snd_jack *jack, return 0; } -static void snd_jack_debugfs_clear_inject_node(struct snd_jack_kctl *jack_kctl) +static void snd_jack_remove_debugfs(struct snd_jack *jack) { } #endif /* CONFIG_SND_JACK_INJECTION_DEBUG */ @@ -404,7 +412,6 @@ static void snd_jack_kctl_private_free(struct snd_kcontrol *kctl) jack_kctl = kctl->private_data; if (jack_kctl) { - snd_jack_debugfs_clear_inject_node(jack_kctl); list_del(&jack_kctl->list); kfree(jack_kctl); } @@ -497,8 +504,8 @@ int snd_jack_new(struct snd_card *card, const char *id, int type, .dev_free = snd_jack_dev_free, #ifdef CONFIG_SND_JACK_INPUT_DEV .dev_register = snd_jack_dev_register, - .dev_disconnect = snd_jack_dev_disconnect, #endif /* CONFIG_SND_JACK_INPUT_DEV */ + .dev_disconnect = snd_jack_dev_disconnect, }; if (initial_kctl) { From 2be46155d792d629e8fe3188c2cde176833afe36 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sun, 26 May 2024 21:10:32 +1200 Subject: [PATCH 0102/1578] ALSA: hda/realtek: Adjust G814JZR to use SPI init for amp The 2024 ASUS ROG G814J model is much the same as the 2023 model and the 2023 16" version. We can use the same Cirrus Amp quirk. Fixes: 811dd426a9b1 ("ALSA: hda/realtek: Add quirks for Asus ROG 2024 laptops using CS35L41") Signed-off-by: Luke D. Jones Link: https://lore.kernel.org/r/20240526091032.114545-1-luke@ljones.dev Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index e3c0b9d5552d95..aa76d1c885895a 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10310,7 +10310,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x3030, "ASUS ZN270IE", ALC256_FIXUP_ASUS_AIO_GPIO2), SND_PCI_QUIRK(0x1043, 0x3a20, "ASUS G614JZR", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x3a30, "ASUS G814JVR/JIR", ALC245_FIXUP_CS35L41_SPI_2), - SND_PCI_QUIRK(0x1043, 0x3a40, "ASUS G814JZR", ALC245_FIXUP_CS35L41_SPI_2), + SND_PCI_QUIRK(0x1043, 0x3a40, "ASUS G814JZR", ALC285_FIXUP_ASUS_SPI_REAR_SPEAKERS), SND_PCI_QUIRK(0x1043, 0x3a50, "ASUS G834JYR/JZR", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x3a60, "ASUS G634JYR/JZR", ALC285_FIXUP_ASUS_SPI_REAR_SPEAKERS), SND_PCI_QUIRK(0x1043, 0x831a, "ASUS P901", ALC269_FIXUP_STEREO_DMIC), From 9e2f46cd87473c70d01fcaf8a559809e6d18dd50 Mon Sep 17 00:00:00 2001 From: Jason Nader Date: Tue, 21 May 2024 22:36:24 +0900 Subject: [PATCH 0103/1578] ata: ahci: Do not apply Intel PCS quirk on Intel Alder Lake Commit b8b8b4e0c052 ("ata: ahci: Add Intel Alder Lake-P AHCI controller to low power chipsets list") added Intel Alder Lake to the ahci_pci_tbl. Because of the way that the Intel PCS quirk was implemented, having an explicit entry in the ahci_pci_tbl caused the Intel PCS quirk to be applied. (The quirk was not being applied if there was no explict entry.) Thus, entries that were added to the ahci_pci_tbl also got the Intel PCS quirk applied. The quirk was cleaned up in commit 7edbb6059274 ("ahci: clean up intel_pcs_quirk"), such that it is clear which entries that actually applies the Intel PCS quirk. Newer Intel AHCI controllers do not need the Intel PCS quirk, and applying it when not needed actually breaks some platforms. Do not apply the Intel PCS quirk for Intel Alder Lake. This is in line with how things worked before commit b8b8b4e0c052 ("ata: ahci: Add Intel Alder Lake-P AHCI controller to low power chipsets list"), such that certain platforms using Intel Alder Lake will work once again. Cc: stable@vger.kernel.org # 6.7 Fixes: b8b8b4e0c052 ("ata: ahci: Add Intel Alder Lake-P AHCI controller to low power chipsets list") Signed-off-by: Jason Nader Signed-off-by: Niklas Cassel --- drivers/ata/ahci.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 6548f10e61d9c7..07d66d2c5f0dd8 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -429,7 +429,6 @@ static const struct pci_device_id ahci_pci_tbl[] = { { PCI_VDEVICE(INTEL, 0x02d7), board_ahci_pcs_quirk }, /* Comet Lake PCH RAID */ /* Elkhart Lake IDs 0x4b60 & 0x4b62 https://sata-io.org/product/8803 not tested yet */ { PCI_VDEVICE(INTEL, 0x4b63), board_ahci_pcs_quirk }, /* Elkhart Lake AHCI */ - { PCI_VDEVICE(INTEL, 0x7ae2), board_ahci_pcs_quirk }, /* Alder Lake-P AHCI */ /* JMicron 360/1/3/5/6, match class to avoid IDE function */ { PCI_VENDOR_ID_JMICRON, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, From 8a01ef749b0a632f0e1f4ead0f08b3310d99fcb1 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 3 May 2024 14:45:05 -0500 Subject: [PATCH 0104/1578] iio: adc: ad9467: fix scan type sign According to the IIO documentation, the sign in the scan type should be lower case. The ad9467 driver was incorrectly using upper case. Fix by changing to lower case. Fixes: 4606d0f4b05f ("iio: adc: ad9467: add support for AD9434 high-speed ADC") Fixes: ad6797120238 ("iio: adc: ad9467: add support AD9467 ADC") Signed-off-by: David Lechner Link: https://lore.kernel.org/r/20240503-ad9467-fix-scan-type-sign-v1-1-c7a1a066ebb9@baylibre.com Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ad9467.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iio/adc/ad9467.c b/drivers/iio/adc/ad9467.c index e85b763b9ffcb1..8f5b9c3f6e3d6a 100644 --- a/drivers/iio/adc/ad9467.c +++ b/drivers/iio/adc/ad9467.c @@ -243,11 +243,11 @@ static void __ad9467_get_scale(struct ad9467_state *st, int index, } static const struct iio_chan_spec ad9434_channels[] = { - AD9467_CHAN(0, 0, 12, 'S'), + AD9467_CHAN(0, 0, 12, 's'), }; static const struct iio_chan_spec ad9467_channels[] = { - AD9467_CHAN(0, 0, 16, 'S'), + AD9467_CHAN(0, 0, 16, 's'), }; static const struct ad9467_chip_info ad9467_chip_tbl = { From 72d0a20fabcf231c9b4b17b0cabdcde0949d05eb Mon Sep 17 00:00:00 2001 From: Angelo Dureghello Date: Fri, 3 May 2024 20:55:28 +0200 Subject: [PATCH 0105/1578] dt-bindings: iio: dac: fix ad354xr output range Fix output range, as per datasheet must be -2.5 to 7.5. Signed-off-by: Angelo Dureghello Fixes: b0a96c5f599e ("dt-bindings: iio: dac: Add adi,ad3552r.yaml") Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240503185528.2043127-1-adureghello@baylibre.org Signed-off-by: Jonathan Cameron --- Documentation/devicetree/bindings/iio/dac/adi,ad3552r.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/iio/dac/adi,ad3552r.yaml b/Documentation/devicetree/bindings/iio/dac/adi,ad3552r.yaml index 96340a05754ce6..8265d709094dcb 100644 --- a/Documentation/devicetree/bindings/iio/dac/adi,ad3552r.yaml +++ b/Documentation/devicetree/bindings/iio/dac/adi,ad3552r.yaml @@ -139,7 +139,7 @@ allOf: Voltage output range of the channel as Required connections: Rfb1x for: 0 to 2.5 V; 0 to 3V; 0 to 5 V; - Rfb2x for: 0 to 10 V; 2.5 to 7.5V; -5 to 5 V; + Rfb2x for: 0 to 10 V; -2.5 to 7.5V; -5 to 5 V; oneOf: - items: - const: 0 From 0f0f6306617cb4b6231fc9d4ec68ab9a56dba7c0 Mon Sep 17 00:00:00 2001 From: Adam Rizkalla Date: Thu, 25 Apr 2024 01:22:49 -0500 Subject: [PATCH 0106/1578] iio: pressure: bmp280: Fix BMP580 temperature reading Fix overflow issue when storing BMP580 temperature reading and properly preserve sign of 24-bit data. Signed-off-by: Adam Rizkalla Tested-By: Vasileios Amoiridis Acked-by: Angel Iglesias Link: https://lore.kernel.org/r/Zin2udkXRD0+GrML@adam-asahi.lan Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/pressure/bmp280-core.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/iio/pressure/bmp280-core.c b/drivers/iio/pressure/bmp280-core.c index 09f53d987c7df5..221fa2c552ae2c 100644 --- a/drivers/iio/pressure/bmp280-core.c +++ b/drivers/iio/pressure/bmp280-core.c @@ -1394,12 +1394,12 @@ static int bmp580_read_temp(struct bmp280_data *data, int *val, int *val2) /* * Temperature is returned in Celsius degrees in fractional - * form down 2^16. We rescale by x1000 to return milli Celsius - * to respect IIO ABI. + * form down 2^16. We rescale by x1000 to return millidegrees + * Celsius to respect IIO ABI. */ - *val = raw_temp * 1000; - *val2 = 16; - return IIO_VAL_FRACTIONAL_LOG2; + raw_temp = sign_extend32(raw_temp, 23); + *val = ((s64)raw_temp * 1000) / (1 << 16); + return IIO_VAL_INT; } static int bmp580_read_press(struct bmp280_data *data, int *val, int *val2) From 279428df888319bf68f2686934897301a250bb84 Mon Sep 17 00:00:00 2001 From: Marc Ferland Date: Wed, 1 May 2024 11:05:54 -0400 Subject: [PATCH 0107/1578] iio: dac: ad5592r: fix temperature channel scaling value The scale value for the temperature channel is (assuming Vref=2.5 and the datasheet): 376.7897513 When calculating both val and val2 for the temperature scale we use (3767897513/25) and multiply it by Vref (here I assume 2500mV) to obtain: 2500 * (3767897513/25) ==> 376789751300 Finally we divide with remainder by 10^9 to get: val = 376 val2 = 789751300 However, we return IIO_VAL_INT_PLUS_MICRO (should have been NANO) as the scale type. So when converting the raw temperature value to the 'processed' temperature value we will get (assuming raw=810, offset=-753): processed = (raw + offset) * scale_val = (810 + -753) * 376 = 21432 processed += div((raw + offset) * scale_val2, 10^6) += div((810 + -753) * 789751300, 10^6) += 45015 ==> 66447 ==> 66.4 Celcius instead of the expected 21.5 Celsius. Fix this issue by changing IIO_VAL_INT_PLUS_MICRO to IIO_VAL_INT_PLUS_NANO. Fixes: 56ca9db862bf ("iio: dac: Add support for the AD5592R/AD5593R ADCs/DACs") Signed-off-by: Marc Ferland Link: https://lore.kernel.org/r/20240501150554.1871390-1-marc.ferland@sonatest.com Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/dac/ad5592r-base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/dac/ad5592r-base.c b/drivers/iio/dac/ad5592r-base.c index 076bc9ecfb4994..4763402dbcd66d 100644 --- a/drivers/iio/dac/ad5592r-base.c +++ b/drivers/iio/dac/ad5592r-base.c @@ -415,7 +415,7 @@ static int ad5592r_read_raw(struct iio_dev *iio_dev, s64 tmp = *val * (3767897513LL / 25LL); *val = div_s64_rem(tmp, 1000000000LL, val2); - return IIO_VAL_INT_PLUS_MICRO; + return IIO_VAL_INT_PLUS_NANO; } mutex_lock(&st->lock); From bedb2ccb566de5ca0c336ca3fd3588cea6d50414 Mon Sep 17 00:00:00 2001 From: Vasileios Amoiridis Date: Wed, 8 May 2024 17:54:07 +0200 Subject: [PATCH 0108/1578] iio: imu: bmi323: Fix trigger notification in case of error In case of error in the bmi323_trigger_handler() function, the function exits without calling the iio_trigger_notify_done() which is responsible for informing the attached trigger that the process is done and in case there is a .reenable(), to call it. Fixes: 8a636db3aa57 ("iio: imu: Add driver for BMI323 IMU") Signed-off-by: Vasileios Amoiridis Link: https://lore.kernel.org/r/20240508155407.139805-1-vassilisamir@gmail.com Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/imu/bmi323/bmi323_core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/iio/imu/bmi323/bmi323_core.c b/drivers/iio/imu/bmi323/bmi323_core.c index 5d42ab9b176a38..67d74a1a1b26df 100644 --- a/drivers/iio/imu/bmi323/bmi323_core.c +++ b/drivers/iio/imu/bmi323/bmi323_core.c @@ -1391,7 +1391,7 @@ static irqreturn_t bmi323_trigger_handler(int irq, void *p) &data->buffer.channels, ARRAY_SIZE(data->buffer.channels)); if (ret) - return IRQ_NONE; + goto out; } else { for_each_set_bit(bit, indio_dev->active_scan_mask, BMI323_CHAN_MAX) { @@ -1400,13 +1400,14 @@ static irqreturn_t bmi323_trigger_handler(int irq, void *p) &data->buffer.channels[index++], BMI323_BYTES_PER_SAMPLE); if (ret) - return IRQ_NONE; + goto out; } } iio_push_to_buffers_with_timestamp(indio_dev, &data->buffer, iio_get_time_ns(indio_dev)); +out: iio_trigger_notify_done(indio_dev->trig); return IRQ_HANDLED; From a23c14b062d8800a2192077d83273bbfe6c7552d Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Mon, 13 May 2024 13:34:27 -0700 Subject: [PATCH 0109/1578] iio: temperature: mlx90635: Fix ERR_PTR dereference in mlx90635_probe() When devm_regmap_init_i2c() fails, regmap_ee could be error pointer, instead of checking for IS_ERR(regmap_ee), regmap is checked which looks like a copy paste error. Fixes: a1d1ba5e1c28 ("iio: temperature: mlx90635 MLX90635 IR Temperature sensor") Reviewed-by: Crt Mori Signed-off-by: Harshit Mogalapalli Link: https://lore.kernel.org/r/20240513203427.3208696-1-harshit.m.mogalapalli@oracle.com Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/temperature/mlx90635.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iio/temperature/mlx90635.c b/drivers/iio/temperature/mlx90635.c index 1f5c962c1818ff..f7f88498ba0e8d 100644 --- a/drivers/iio/temperature/mlx90635.c +++ b/drivers/iio/temperature/mlx90635.c @@ -947,9 +947,9 @@ static int mlx90635_probe(struct i2c_client *client) "failed to allocate regmap\n"); regmap_ee = devm_regmap_init_i2c(client, &mlx90635_regmap_ee); - if (IS_ERR(regmap)) - return dev_err_probe(&client->dev, PTR_ERR(regmap), - "failed to allocate regmap\n"); + if (IS_ERR(regmap_ee)) + return dev_err_probe(&client->dev, PTR_ERR(regmap_ee), + "failed to allocate EEPROM regmap\n"); mlx90635 = iio_priv(indio_dev); i2c_set_clientdata(client, indio_dev); From ab6f0ab178137170a6b40f8f3d7a3806708a202c Mon Sep 17 00:00:00 2001 From: Dumitru Ceclan Date: Tue, 21 May 2024 11:45:39 +0300 Subject: [PATCH 0110/1578] iio: adc: ad7173: fix buffers enablement for ad7176-2 AD7176-2 does not feature input buffers and marks corespondent register bits as read only. Enable buffers only on supported models. Fixes: 76a1e6a42802 ("iio: adc: ad7173: add AD7173 driver") Reviewed-by: David Lechner Signed-off-by: Dumitru Ceclan Link: https://lore.kernel.org/r/20240521-ad7173-fixes-v1-1-8161cc7f3ad1@analog.com Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ad7173.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/iio/adc/ad7173.c b/drivers/iio/adc/ad7173.c index a7826bba0852af..91acedaddb576f 100644 --- a/drivers/iio/adc/ad7173.c +++ b/drivers/iio/adc/ad7173.c @@ -145,6 +145,7 @@ struct ad7173_device_info { unsigned int id; char *name; bool has_temp; + bool has_input_buf; bool has_int_ref; bool has_ref2; u8 num_gpios; @@ -212,6 +213,7 @@ static const struct ad7173_device_info ad7173_device_info[] = { .num_configs = 4, .num_gpios = 2, .has_temp = true, + .has_input_buf = true, .has_int_ref = true, .clock = 2 * HZ_PER_MHZ, .sinc5_data_rates = ad7173_sinc5_data_rates, @@ -224,6 +226,7 @@ static const struct ad7173_device_info ad7173_device_info[] = { .num_configs = 8, .num_gpios = 4, .has_temp = false, + .has_input_buf = true, .has_ref2 = true, .clock = 2 * HZ_PER_MHZ, .sinc5_data_rates = ad7173_sinc5_data_rates, @@ -237,6 +240,7 @@ static const struct ad7173_device_info ad7173_device_info[] = { .num_configs = 8, .num_gpios = 4, .has_temp = true, + .has_input_buf = true, .has_int_ref = true, .has_ref2 = true, .clock = 2 * HZ_PER_MHZ, @@ -251,6 +255,7 @@ static const struct ad7173_device_info ad7173_device_info[] = { .num_configs = 4, .num_gpios = 2, .has_temp = true, + .has_input_buf = true, .has_int_ref = true, .clock = 16 * HZ_PER_MHZ, .sinc5_data_rates = ad7175_sinc5_data_rates, @@ -263,6 +268,7 @@ static const struct ad7173_device_info ad7173_device_info[] = { .num_configs = 8, .num_gpios = 4, .has_temp = true, + .has_input_buf = true, .has_int_ref = true, .has_ref2 = true, .clock = 16 * HZ_PER_MHZ, @@ -277,6 +283,7 @@ static const struct ad7173_device_info ad7173_device_info[] = { .num_configs = 4, .num_gpios = 2, .has_temp = false, + .has_input_buf = false, .has_int_ref = true, .clock = 16 * HZ_PER_MHZ, .sinc5_data_rates = ad7175_sinc5_data_rates, @@ -289,6 +296,7 @@ static const struct ad7173_device_info ad7173_device_info[] = { .num_configs = 4, .num_gpios = 2, .has_temp = true, + .has_input_buf = true, .has_int_ref = true, .clock = 16 * HZ_PER_MHZ, .odr_start_value = AD7177_ODR_START_VALUE, @@ -932,7 +940,7 @@ static int ad7173_fw_parse_channel_config(struct iio_dev *indio_dev) AD7173_CH_ADDRESS(chan_arr[chan_index].channel, chan_arr[chan_index].channel2); chan_st_priv->cfg.bipolar = false; - chan_st_priv->cfg.input_buf = true; + chan_st_priv->cfg.input_buf = st->info->has_input_buf; chan_st_priv->cfg.ref_sel = AD7173_SETUP_REF_SEL_INT_REF; st->adc_mode |= AD7173_ADC_MODE_REF_EN; @@ -989,7 +997,7 @@ static int ad7173_fw_parse_channel_config(struct iio_dev *indio_dev) chan_st_priv->ain = AD7173_CH_ADDRESS(ain[0], ain[1]); chan_st_priv->chan_reg = chan_index; - chan_st_priv->cfg.input_buf = true; + chan_st_priv->cfg.input_buf = st->info->has_input_buf; chan_st_priv->cfg.odr = 0; chan_st_priv->cfg.bipolar = fwnode_property_read_bool(child, "bipolar"); From 3450ee7e800a8dc83290780c1b6ef66898e709d3 Mon Sep 17 00:00:00 2001 From: Dumitru Ceclan Date: Tue, 21 May 2024 11:45:40 +0300 Subject: [PATCH 0111/1578] iio: adc: ad7173: Add ad7173_device_info names Add missing names from the device info struct for 3 models to ensure consistency with the rest of the models. Fixes: 76a1e6a42802 ("iio: adc: ad7173: add AD7173 driver") Signed-off-by: Dumitru Ceclan Link: https://lore.kernel.org/r/20240521-ad7173-fixes-v1-2-8161cc7f3ad1@analog.com Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ad7173.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iio/adc/ad7173.c b/drivers/iio/adc/ad7173.c index 91acedaddb576f..4649af5f9d7681 100644 --- a/drivers/iio/adc/ad7173.c +++ b/drivers/iio/adc/ad7173.c @@ -220,6 +220,7 @@ static const struct ad7173_device_info ad7173_device_info[] = { .num_sinc5_data_rates = ARRAY_SIZE(ad7173_sinc5_data_rates), }, [ID_AD7172_4] = { + .name = "ad7172-4", .id = AD7172_4_ID, .num_inputs = 9, .num_channels = 8, @@ -262,6 +263,7 @@ static const struct ad7173_device_info ad7173_device_info[] = { .num_sinc5_data_rates = ARRAY_SIZE(ad7175_sinc5_data_rates), }, [ID_AD7175_8] = { + .name = "ad7175-8", .id = AD7175_8_ID, .num_inputs = 17, .num_channels = 16, @@ -290,6 +292,7 @@ static const struct ad7173_device_info ad7173_device_info[] = { .num_sinc5_data_rates = ARRAY_SIZE(ad7175_sinc5_data_rates), }, [ID_AD7177_2] = { + .name = "ad7177-2", .id = AD7177_ID, .num_inputs = 5, .num_channels = 4, From f00dd8953094091a8e9c8cc00661d01c33b93615 Mon Sep 17 00:00:00 2001 From: Dumitru Ceclan Date: Tue, 21 May 2024 11:45:41 +0300 Subject: [PATCH 0112/1578] iio: adc: ad7173: Remove index from temp channel Temperature channel is unique per device, index is not needed. This is breaking userspace: Include fixes tag to be released within the same rc cycle. Fixes: 76a1e6a42802 ("iio: adc: ad7173: add AD7173 driver") Signed-off-by: Dumitru Ceclan Link: https://lore.kernel.org/r/20240521-ad7173-fixes-v1-3-8161cc7f3ad1@analog.com Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ad7173.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/iio/adc/ad7173.c b/drivers/iio/adc/ad7173.c index 4649af5f9d7681..048127bf979a96 100644 --- a/drivers/iio/adc/ad7173.c +++ b/drivers/iio/adc/ad7173.c @@ -815,7 +815,6 @@ static const struct iio_chan_spec ad7173_channel_template = { static const struct iio_chan_spec ad7173_temp_iio_channel_template = { .type = IIO_TEMP, - .indexed = 1, .channel = AD7173_AIN_TEMP_POS, .channel2 = AD7173_AIN_TEMP_NEG, .info_mask_separate = BIT(IIO_CHAN_INFO_RAW) | From 95444b9eeb8c5c0330563931d70c61ca3b101548 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Maneyrol Date: Fri, 24 May 2024 12:48:51 +0000 Subject: [PATCH 0113/1578] iio: invensense: fix odr switching to same value ODR switching happens in 2 steps, update to store the new value and then apply when the ODR change flag is received in the data. When switching to the same ODR value, the ODR change flag is never happening, and frequency switching is blocked waiting for the never coming apply. Fix the issue by preventing update to happen when switching to same ODR value. Fixes: 0ecc363ccea7 ("iio: make invensense timestamp module generic") Cc: stable@vger.kernel.org Signed-off-by: Jean-Baptiste Maneyrol Link: https://lore.kernel.org/r/20240524124851.567485-1-inv.git-commit@tdk.com Signed-off-by: Jonathan Cameron --- drivers/iio/common/inv_sensors/inv_sensors_timestamp.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/iio/common/inv_sensors/inv_sensors_timestamp.c b/drivers/iio/common/inv_sensors/inv_sensors_timestamp.c index fa205f17bd905f..f44458c380d928 100644 --- a/drivers/iio/common/inv_sensors/inv_sensors_timestamp.c +++ b/drivers/iio/common/inv_sensors/inv_sensors_timestamp.c @@ -60,11 +60,15 @@ EXPORT_SYMBOL_NS_GPL(inv_sensors_timestamp_init, IIO_INV_SENSORS_TIMESTAMP); int inv_sensors_timestamp_update_odr(struct inv_sensors_timestamp *ts, uint32_t period, bool fifo) { + uint32_t mult; + /* when FIFO is on, prevent odr change if one is already pending */ if (fifo && ts->new_mult != 0) return -EAGAIN; - ts->new_mult = period / ts->chip.clock_period; + mult = period / ts->chip.clock_period; + if (mult != ts->mult) + ts->new_mult = mult; return 0; } From e8021b94b0412c37bcc79027c2e382086b6ce449 Mon Sep 17 00:00:00 2001 From: Roded Zats Date: Wed, 22 May 2024 10:30:44 +0300 Subject: [PATCH 0114/1578] enic: Validate length of nl attributes in enic_set_vf_port enic_set_vf_port assumes that the nl attribute IFLA_PORT_PROFILE is of length PORT_PROFILE_MAX and that the nl attributes IFLA_PORT_INSTANCE_UUID, IFLA_PORT_HOST_UUID are of length PORT_UUID_MAX. These attributes are validated (in the function do_setlink in rtnetlink.c) using the nla_policy ifla_port_policy. The policy defines IFLA_PORT_PROFILE as NLA_STRING, IFLA_PORT_INSTANCE_UUID as NLA_BINARY and IFLA_PORT_HOST_UUID as NLA_STRING. That means that the length validation using the policy is for the max size of the attributes and not on exact size so the length of these attributes might be less than the sizes that enic_set_vf_port expects. This might cause an out of bands read access in the memcpys of the data of these attributes in enic_set_vf_port. Fixes: f8bd909183ac ("net: Add ndo_{set|get}_vf_port support for enic dynamic vnics") Signed-off-by: Roded Zats Link: https://lore.kernel.org/r/20240522073044.33519-1-rzats@paloaltonetworks.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/cisco/enic/enic_main.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c index f604119efc8098..5f26fc3ad65556 100644 --- a/drivers/net/ethernet/cisco/enic/enic_main.c +++ b/drivers/net/ethernet/cisco/enic/enic_main.c @@ -1117,18 +1117,30 @@ static int enic_set_vf_port(struct net_device *netdev, int vf, pp->request = nla_get_u8(port[IFLA_PORT_REQUEST]); if (port[IFLA_PORT_PROFILE]) { + if (nla_len(port[IFLA_PORT_PROFILE]) != PORT_PROFILE_MAX) { + memcpy(pp, &prev_pp, sizeof(*pp)); + return -EINVAL; + } pp->set |= ENIC_SET_NAME; memcpy(pp->name, nla_data(port[IFLA_PORT_PROFILE]), PORT_PROFILE_MAX); } if (port[IFLA_PORT_INSTANCE_UUID]) { + if (nla_len(port[IFLA_PORT_INSTANCE_UUID]) != PORT_UUID_MAX) { + memcpy(pp, &prev_pp, sizeof(*pp)); + return -EINVAL; + } pp->set |= ENIC_SET_INSTANCE; memcpy(pp->instance_uuid, nla_data(port[IFLA_PORT_INSTANCE_UUID]), PORT_UUID_MAX); } if (port[IFLA_PORT_HOST_UUID]) { + if (nla_len(port[IFLA_PORT_HOST_UUID]) != PORT_UUID_MAX) { + memcpy(pp, &prev_pp, sizeof(*pp)); + return -EINVAL; + } pp->set |= ENIC_SET_HOST; memcpy(pp->host_uuid, nla_data(port[IFLA_PORT_HOST_UUID]), PORT_UUID_MAX); From a4edf675ba3357f60e2ee310acc15eb9cd5a8ae0 Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Fri, 17 May 2024 07:49:46 -0700 Subject: [PATCH 0115/1578] platform/x86: ISST: fix use-after-free in tpmi_sst_dev_remove() In tpmi_sst_dev_remove(), tpmi_sst is dereferenced after being freed. Fix this by reordering the kfree() post the dereference. Fixes: 9d1d36268f3d ("platform/x86: ISST: Support partitioned systems") Signed-off-by: Harshit Mogalapalli Reviewed-by: Hans de Goede Acked-by: Srinivas Pandruvada Link: https://lore.kernel.org/r/20240517144946.289615-1-harshit.m.mogalapalli@oracle.com Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c b/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c index 7bac7841ff0aba..7fa360073f6ef4 100644 --- a/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c +++ b/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c @@ -1610,8 +1610,8 @@ void tpmi_sst_dev_remove(struct auxiliary_device *auxdev) tpmi_sst->partition_mask_current &= ~BIT(plat_info->partition); /* Free the package instance when the all partitions are removed */ if (!tpmi_sst->partition_mask_current) { - kfree(tpmi_sst); isst_common.sst_inst[tpmi_sst->package_id] = NULL; + kfree(tpmi_sst); } mutex_unlock(&isst_tpmi_dev_lock); } From 4d6ef1be2492a6789ba2b711933625cd72ced39d Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 21 May 2024 11:47:41 +0200 Subject: [PATCH 0116/1578] platform/x86: x86-android-tablets: Add "select LEDS_CLASS" Since the x86-android-tablets now calls devm_led_classdev_register_ext() it needs to select LEDS_CLASS as well as LEDS_CLASS' NEW_LEDS dependency. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202405182256.FsKBjIzG-lkp@intel.com/ Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240521094741.273397-1-hdegoede@redhat.com --- drivers/platform/x86/x86-android-tablets/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/platform/x86/x86-android-tablets/Kconfig b/drivers/platform/x86/x86-android-tablets/Kconfig index 6603461d427300..b591419de80c30 100644 --- a/drivers/platform/x86/x86-android-tablets/Kconfig +++ b/drivers/platform/x86/x86-android-tablets/Kconfig @@ -6,6 +6,8 @@ config X86_ANDROID_TABLETS tristate "X86 Android tablet support" depends on I2C && SPI && SERIAL_DEV_BUS && ACPI && EFI && GPIOLIB && PMIC_OPREGION + select NEW_LEDS + select LEDS_CLASS help X86 tablets which ship with Android as (part of) the factory image typically have various problems with their DSDTs. The factory kernels From 5d059bf2b1c4d5779a4c09ec418e40eded44a187 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 22 May 2024 07:48:13 -0400 Subject: [PATCH 0117/1578] platform/x86: thinkpad_acpi: Select INPUT_SPARSEKMAP in Kconfig Now that drivers/platform/x86/thinkpad_acpi.c uses sparse_keymap_report_event(), it must select INPUT_SPARSEKMAP in its Kconfig option otherwise the build fails with: ld: vmlinux.o: in function `tpacpi_input_send_key': thinkpad_acpi.c:(.text+0xd4d27f): undefined reference to `sparse_keymap_report_event' ld: vmlinux.o: in function `hotkey_init': thinkpad_acpi.c:(.init.text+0x66cb6): undefined reference to `sparse_keymap_setup' Fixes: 42f7b965de9d ("platform/x86: thinkpad_acpi: Switch to using sparse-keymap helpers") Signed-off-by: Steven Rostedt (Google) Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20240522074813.379b9fc2@gandalf.local.home Signed-off-by: Hans de Goede --- drivers/platform/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 0ec952b5d03e77..1953317541eab8 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -515,6 +515,7 @@ config THINKPAD_ACPI select NVRAM select NEW_LEDS select LEDS_CLASS + select INPUT_SPARSEKMAP help This is a driver for the IBM and Lenovo ThinkPad laptops. It adds support for Fn-Fx key combinations, Bluetooth control, video From 825fc49497957310e421454fe3fb8b8d8d8e2dd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20T=C5=AFma?= Date: Tue, 21 May 2024 18:22:54 +0200 Subject: [PATCH 0118/1578] media: mgb4: Fix double debugfs remove MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes an error where debugfs_remove_recursive() is called first on a parent directory and then again on a child which causes a kernel panic. Signed-off-by: Martin Tůma Signed-off-by: Hans Verkuil Fixes: 0ab13674a9bd ("media: pci: mgb4: Added Digiteq Automotive MGB4 driver") Cc: [hverkuil: added Fixes/Cc tags] --- drivers/media/pci/mgb4/mgb4_core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/media/pci/mgb4/mgb4_core.c b/drivers/media/pci/mgb4/mgb4_core.c index 60498a5abebf3b..ab4f07e2e56030 100644 --- a/drivers/media/pci/mgb4/mgb4_core.c +++ b/drivers/media/pci/mgb4/mgb4_core.c @@ -642,9 +642,6 @@ static void mgb4_remove(struct pci_dev *pdev) struct mgb4_dev *mgbdev = pci_get_drvdata(pdev); int i; -#ifdef CONFIG_DEBUG_FS - debugfs_remove_recursive(mgbdev->debugfs); -#endif #if IS_REACHABLE(CONFIG_HWMON) hwmon_device_unregister(mgbdev->hwmon_dev); #endif @@ -659,6 +656,10 @@ static void mgb4_remove(struct pci_dev *pdev) if (mgbdev->vin[i]) mgb4_vin_free(mgbdev->vin[i]); +#ifdef CONFIG_DEBUG_FS + debugfs_remove_recursive(mgbdev->debugfs); +#endif + device_remove_groups(&mgbdev->pdev->dev, mgb4_pci_groups); free_spi(mgbdev); free_i2c(mgbdev); From 0b178b02673998f5acca5a0365a8858ca45beedb Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 23 May 2024 16:36:01 +0200 Subject: [PATCH 0119/1578] platform/x86: touchscreen_dmi: Add support for setting touchscreen properties from cmdline On x86/ACPI platforms touchscreens mostly just work without needing any device/model specific configuration. But in some cases (mostly with Silead and Goodix touchscreens) it is still necessary to manually specify various touchscreen-properties on a per model basis. touchscreen_dmi is a special place for DMI quirks for this, but it can be challenging for users to figure out the right property values, especially for Silead touchscreens where non of these can be read back from the touchscreen-controller. ATM users can only test touchscreen properties by editing touchscreen_dmi.c and then building a completely new kernel which makes it unnecessary difficult for users to test and submit properties when necessary for their laptop / tablet model. Add support for specifying properties on the kernel commandline to allow users to easily figure out the right settings. See the added documentation in kernel-parameters.txt for the commandline syntax. Cc: Gregor Riepl Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240523143601.47555-1-hdegoede@redhat.com --- .../admin-guide/kernel-parameters.txt | 22 ++++++ drivers/platform/x86/touchscreen_dmi.c | 79 ++++++++++++++++++- 2 files changed, 98 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 500cfa7762257c..b600df82669db0 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1921,6 +1921,28 @@ Format: , + i2c_touchscreen_props= [HW,ACPI,X86] + Set device-properties for ACPI-enumerated I2C-attached + touchscreen, to e.g. fix coordinates of upside-down + mounted touchscreens. If you need this option please + submit a drivers/platform/x86/touchscreen_dmi.c patch + adding a DMI quirk for this. + + Format: + :=[:prop_name=val][:...] + Where is one of: + Omit "=" entirely Set a boolean device-property + Unsigned number Set a u32 device-property + Anything else Set a string device-property + + Examples (split over multiple lines): + i2c_touchscreen_props=GDIX1001:touchscreen-inverted-x: + touchscreen-inverted-y + + i2c_touchscreen_props=MSSL1680:touchscreen-size-x=1920: + touchscreen-size-y=1080:touchscreen-inverted-y: + firmware-name=gsl1680-vendor-model.fw:silead,home-button + i8042.debug [HW] Toggle i8042 debug mode i8042.unmask_kbd_data [HW] Enable printing of interrupt data from the KBD port diff --git a/drivers/platform/x86/touchscreen_dmi.c b/drivers/platform/x86/touchscreen_dmi.c index c6a10ec2c83f66..b021fb9e579e84 100644 --- a/drivers/platform/x86/touchscreen_dmi.c +++ b/drivers/platform/x86/touchscreen_dmi.c @@ -9,10 +9,13 @@ */ #include +#include #include #include #include #include +#include +#include #include #include #include @@ -1817,7 +1820,7 @@ const struct dmi_system_id touchscreen_dmi_table[] = { { } }; -static const struct ts_dmi_data *ts_data; +static struct ts_dmi_data *ts_data; static void ts_dmi_add_props(struct i2c_client *client) { @@ -1852,6 +1855,64 @@ static int ts_dmi_notifier_call(struct notifier_block *nb, return 0; } +#define MAX_CMDLINE_PROPS 16 + +static struct property_entry ts_cmdline_props[MAX_CMDLINE_PROPS + 1]; + +static struct ts_dmi_data ts_cmdline_data = { + .properties = ts_cmdline_props, +}; + +static int __init ts_parse_props(char *str) +{ + /* Save the original str to show it on syntax errors */ + char orig_str[256]; + char *name, *value; + u32 u32val; + int i, ret; + + strscpy(orig_str, str, sizeof(orig_str)); + + /* + * str is part of the static_command_line from init/main.c and poking + * holes in that by writing 0 to it is allowed, as is taking long + * lasting references to it. + */ + ts_cmdline_data.acpi_name = strsep(&str, ":"); + + for (i = 0; i < MAX_CMDLINE_PROPS; i++) { + name = strsep(&str, ":"); + if (!name || !name[0]) + break; + + /* Replace '=' with 0 and make value point past '=' or NULL */ + value = name; + strsep(&value, "="); + if (!value) { + ts_cmdline_props[i] = PROPERTY_ENTRY_BOOL(name); + } else if (isdigit(value[0])) { + ret = kstrtou32(value, 0, &u32val); + if (ret) + goto syntax_error; + + ts_cmdline_props[i] = PROPERTY_ENTRY_U32(name, u32val); + } else { + ts_cmdline_props[i] = PROPERTY_ENTRY_STRING(name, value); + } + } + + if (!i || str) + goto syntax_error; + + ts_data = &ts_cmdline_data; + return 1; + +syntax_error: + pr_err("Invalid '%s' value for 'i2c_touchscreen_props='\n", orig_str); + return 1; /* "i2c_touchscreen_props=" is still a known parameter */ +} +__setup("i2c_touchscreen_props=", ts_parse_props); + static struct notifier_block ts_dmi_notifier = { .notifier_call = ts_dmi_notifier_call, }; @@ -1859,13 +1920,25 @@ static struct notifier_block ts_dmi_notifier = { static int __init ts_dmi_init(void) { const struct dmi_system_id *dmi_id; + struct ts_dmi_data *ts_data_dmi; int error; dmi_id = dmi_first_match(touchscreen_dmi_table); - if (!dmi_id) + ts_data_dmi = dmi_id ? dmi_id->driver_data : NULL; + + if (ts_data) { + /* + * Kernel cmdline provided data takes precedence, copy over + * DMI efi_embedded_fw info if available. + */ + if (ts_data_dmi) + ts_data->embedded_fw = ts_data_dmi->embedded_fw; + } else if (ts_data_dmi) { + ts_data = ts_data_dmi; + } else { return 0; /* Not an error */ + } - ts_data = dmi_id->driver_data; /* Some dmi table entries only provide an efi_embedded_fw_desc */ if (!ts_data->properties) return 0; From 7c8639aa41343fd7b3dbe09baf6b0791fcc407a1 Mon Sep 17 00:00:00 2001 From: hmtheboy154 Date: Mon, 27 May 2024 11:14:46 +0200 Subject: [PATCH 0120/1578] platform/x86: touchscreen_dmi: Add info for GlobalSpace SolT IVW 11.6" tablet This is a tablet created by GlobalSpace Technologies Limited which uses an Intel Atom x5-Z8300, 4GB of RAM & 64GB of storage. Link: https://web.archive.org/web/20171102141952/http://globalspace.in/11.6-device.html Signed-off-by: hmtheboy154 Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240527091447.248849-2-hdegoede@redhat.com --- drivers/platform/x86/touchscreen_dmi.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/drivers/platform/x86/touchscreen_dmi.c b/drivers/platform/x86/touchscreen_dmi.c index b021fb9e579e84..6c03e7daadd4da 100644 --- a/drivers/platform/x86/touchscreen_dmi.c +++ b/drivers/platform/x86/touchscreen_dmi.c @@ -900,6 +900,22 @@ static const struct ts_dmi_data schneider_sct101ctm_data = { .properties = schneider_sct101ctm_props, }; +static const struct property_entry globalspace_solt_ivw116_props[] = { + PROPERTY_ENTRY_U32("touchscreen-min-x", 7), + PROPERTY_ENTRY_U32("touchscreen-min-y", 22), + PROPERTY_ENTRY_U32("touchscreen-size-x", 1723), + PROPERTY_ENTRY_U32("touchscreen-size-y", 1077), + PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-globalspace-solt-ivw116.fw"), + PROPERTY_ENTRY_U32("silead,max-fingers", 10), + PROPERTY_ENTRY_BOOL("silead,home-button"), + { } +}; + +static const struct ts_dmi_data globalspace_solt_ivw116_data = { + .acpi_name = "MSSL1680:00", + .properties = globalspace_solt_ivw116_props, +}; + static const struct property_entry techbite_arc_11_6_props[] = { PROPERTY_ENTRY_U32("touchscreen-min-x", 5), PROPERTY_ENTRY_U32("touchscreen-min-y", 7), @@ -1627,6 +1643,15 @@ const struct dmi_system_id touchscreen_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "SCT101CTM"), }, }, + { + /* GlobalSpace SoLT IVW 11.6" */ + .driver_data = (void *)&globalspace_solt_ivw116_data, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Globalspace Tech Pvt Ltd"), + DMI_MATCH(DMI_PRODUCT_NAME, "SolTIVW"), + DMI_MATCH(DMI_PRODUCT_SKU, "PN20170413488"), + }, + }, { /* Techbite Arc 11.6 */ .driver_data = (void *)&techbite_arc_11_6_data, From 3050052613790e75b5e4a8536930426b0a8b0774 Mon Sep 17 00:00:00 2001 From: hmtheboy154 Date: Mon, 27 May 2024 11:14:47 +0200 Subject: [PATCH 0121/1578] platform/x86: touchscreen_dmi: Add info for the EZpad 6s Pro The "EZpad 6s Pro" uses the same touchscreen as the "EZpad 6 Pro B", unlike the "Ezpad 6 Pro" which has its own touchscreen. Signed-off-by: hmtheboy154 Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240527091447.248849-3-hdegoede@redhat.com --- drivers/platform/x86/touchscreen_dmi.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/platform/x86/touchscreen_dmi.c b/drivers/platform/x86/touchscreen_dmi.c index 6c03e7daadd4da..2d9ca2292ea19d 100644 --- a/drivers/platform/x86/touchscreen_dmi.c +++ b/drivers/platform/x86/touchscreen_dmi.c @@ -1404,6 +1404,17 @@ const struct dmi_system_id touchscreen_dmi_table[] = { DMI_MATCH(DMI_BIOS_DATE, "04/24/2018"), }, }, + { + /* Jumper EZpad 6s Pro */ + .driver_data = (void *)&jumper_ezpad_6_pro_b_data, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Jumper"), + DMI_MATCH(DMI_PRODUCT_NAME, "Ezpad"), + /* Above matches are too generic, add bios match */ + DMI_MATCH(DMI_BIOS_VERSION, "E.WSA116_8.E1.042.bin"), + DMI_MATCH(DMI_BIOS_DATE, "01/08/2020"), + }, + }, { /* Jumper EZpad 6 m4 */ .driver_data = (void *)&jumper_ezpad_6_m4_data, From 21a22ed618d072a47597e63ee591973c18524880 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 22 May 2024 18:45:04 +0800 Subject: [PATCH 0122/1578] selftests: hsr: Fix "File exists" errors for hsr_ping The hsr_ping test reports the following errors: INFO: preparing interfaces for HSRv0. INFO: Initial validation ping. INFO: Longer ping test. INFO: Cutting one link. INFO: Delay the link and drop a few packages. INFO: All good. INFO: preparing interfaces for HSRv1. RTNETLINK answers: File exists RTNETLINK answers: File exists RTNETLINK answers: File exists RTNETLINK answers: File exists RTNETLINK answers: File exists RTNETLINK answers: File exists Error: ipv4: Address already assigned. Error: ipv6: address already assigned. Error: ipv4: Address already assigned. Error: ipv6: address already assigned. Error: ipv4: Address already assigned. Error: ipv6: address already assigned. INFO: Initial validation ping. That is because the cleanup code for the 2nd round test before "setup_hsr_interfaces 1" is removed incorrectly in commit 680fda4f6714 ("test: hsr: Remove script code already implemented in lib.sh"). This patch fixes it by re-setup the namespaces using setup_ns ns1 ns2 ns3 command before "setup_hsr_interfaces 1". It deletes previous namespaces and create new ones. Fixes: 680fda4f6714 ("test: hsr: Remove script code already implemented in lib.sh") Reviewed-by: Hangbin Liu Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/6485d3005f467758d49f0f313c8c009759ba6b05.1716374462.git.tanggeliang@kylinos.cn Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/hsr/hsr_ping.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/net/hsr/hsr_ping.sh b/tools/testing/selftests/net/hsr/hsr_ping.sh index 790294c8af8327..3684b813b0f67e 100755 --- a/tools/testing/selftests/net/hsr/hsr_ping.sh +++ b/tools/testing/selftests/net/hsr/hsr_ping.sh @@ -174,6 +174,8 @@ trap cleanup_all_ns EXIT setup_hsr_interfaces 0 do_complete_ping_test +setup_ns ns1 ns2 ns3 + setup_hsr_interfaces 1 do_complete_ping_test From 97e1db06c7bb948da10ba85acad8030b56886593 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 23 May 2024 00:40:02 +0900 Subject: [PATCH 0123/1578] af_unix: Annotate data-race around unix_sk(sk)->addr. Once unix_sk(sk)->addr is assigned under net->unx.table.locks and unix_sk(sk)->bindlock, *(unix_sk(sk)->addr) and unix_sk(sk)->path are fully set up, and unix_sk(sk)->addr is never changed. unix_getname() and unix_copy_addr() access the two fields locklessly, and commit ae3b564179bf ("missing barriers in some of unix_sock ->addr and ->path accesses") added smp_store_release() and smp_load_acquire() pairs. In other functions, we still read unix_sk(sk)->addr locklessly to check if the socket is bound, and KCSAN complains about it. [0] Given these functions have no dependency for *(unix_sk(sk)->addr) and unix_sk(sk)->path, READ_ONCE() is enough to annotate the data-race. Note that it is safe to access unix_sk(sk)->addr locklessly if the socket is found in the hash table. For example, the lockless read of otheru->addr in unix_stream_connect() is safe. Note also that newu->addr there is of the child socket that is still not accessible from userspace, and smp_store_release() publishes the address in case the socket is accept()ed and unix_getname() / unix_copy_addr() is called. [0]: BUG: KCSAN: data-race in unix_bind / unix_listen write (marked) to 0xffff88805f8d1840 of 8 bytes by task 13723 on cpu 0: __unix_set_addr_hash net/unix/af_unix.c:329 [inline] unix_bind_bsd net/unix/af_unix.c:1241 [inline] unix_bind+0x881/0x1000 net/unix/af_unix.c:1319 __sys_bind+0x194/0x1e0 net/socket.c:1847 __do_sys_bind net/socket.c:1858 [inline] __se_sys_bind net/socket.c:1856 [inline] __x64_sys_bind+0x40/0x50 net/socket.c:1856 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x4f/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x46/0x4e read to 0xffff88805f8d1840 of 8 bytes by task 13724 on cpu 1: unix_listen+0x72/0x180 net/unix/af_unix.c:734 __sys_listen+0xdc/0x160 net/socket.c:1881 __do_sys_listen net/socket.c:1890 [inline] __se_sys_listen net/socket.c:1888 [inline] __x64_sys_listen+0x2e/0x40 net/socket.c:1888 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x4f/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x46/0x4e value changed: 0x0000000000000000 -> 0xffff88807b5b1b40 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 13724 Comm: syz-executor.4 Not tainted 6.8.0-12822-gcd51db110a7e #12 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzkaller Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240522154002.77857-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e4af6616e1dff1..fe631212a345a2 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -731,7 +731,7 @@ static int unix_listen(struct socket *sock, int backlog) if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) goto out; /* Only stream/seqpacket sockets accept */ err = -EINVAL; - if (!u->addr) + if (!READ_ONCE(u->addr)) goto out; /* No listens on an unbound socket */ unix_state_lock(sk); if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) @@ -1369,7 +1369,7 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, if ((test_bit(SOCK_PASSCRED, &sock->flags) || test_bit(SOCK_PASSPIDFD, &sock->flags)) && - !unix_sk(sk)->addr) { + !READ_ONCE(unix_sk(sk)->addr)) { err = unix_autobind(sk); if (err) goto out; @@ -1481,7 +1481,8 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, goto out; if ((test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) { + test_bit(SOCK_PASSPIDFD, &sock->flags)) && + !READ_ONCE(u->addr)) { err = unix_autobind(sk); if (err) goto out; @@ -1950,7 +1951,8 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, } if ((test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) { + test_bit(SOCK_PASSPIDFD, &sock->flags)) && + !READ_ONCE(u->addr)) { err = unix_autobind(sk); if (err) goto out; From 51d1b25a720982324871338b1a36b197ec9bd6f0 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 23 May 2024 00:42:18 +0900 Subject: [PATCH 0124/1578] af_unix: Read sk->sk_hash under bindlock during bind(). syzkaller reported data-race of sk->sk_hash in unix_autobind() [0], and the same ones exist in unix_bind_bsd() and unix_bind_abstract(). The three bind() functions prefetch sk->sk_hash locklessly and use it later after validating that unix_sk(sk)->addr is NULL under unix_sk(sk)->bindlock. The prefetched sk->sk_hash is the hash value of unbound socket set in unix_create1() and does not change until bind() completes. There could be a chance that sk->sk_hash changes after the lockless read. However, in such a case, non-NULL unix_sk(sk)->addr is visible under unix_sk(sk)->bindlock, and bind() returns -EINVAL without using the prefetched value. The KCSAN splat is false-positive, but let's silence it by reading sk->sk_hash under unix_sk(sk)->bindlock. [0]: BUG: KCSAN: data-race in unix_autobind / unix_autobind write to 0xffff888034a9fb88 of 4 bytes by task 4468 on cpu 0: __unix_set_addr_hash net/unix/af_unix.c:331 [inline] unix_autobind+0x47a/0x7d0 net/unix/af_unix.c:1185 unix_dgram_connect+0x7e3/0x890 net/unix/af_unix.c:1373 __sys_connect_file+0xd7/0xe0 net/socket.c:2048 __sys_connect+0x114/0x140 net/socket.c:2065 __do_sys_connect net/socket.c:2075 [inline] __se_sys_connect net/socket.c:2072 [inline] __x64_sys_connect+0x40/0x50 net/socket.c:2072 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x4f/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x46/0x4e read to 0xffff888034a9fb88 of 4 bytes by task 4465 on cpu 1: unix_autobind+0x28/0x7d0 net/unix/af_unix.c:1134 unix_dgram_connect+0x7e3/0x890 net/unix/af_unix.c:1373 __sys_connect_file+0xd7/0xe0 net/socket.c:2048 __sys_connect+0x114/0x140 net/socket.c:2065 __do_sys_connect net/socket.c:2075 [inline] __se_sys_connect net/socket.c:2072 [inline] __x64_sys_connect+0x40/0x50 net/socket.c:2072 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x4f/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x46/0x4e value changed: 0x000000e4 -> 0x000001e3 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 4465 Comm: syz-executor.0 Not tainted 6.8.0-12822-gcd51db110a7e #12 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 Fixes: afd20b9290e1 ("af_unix: Replace the big lock with small locks.") Reported-by: syzkaller Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240522154218.78088-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index fe631212a345a2..25b49efc0926b0 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1131,8 +1131,8 @@ static struct sock *unix_find_other(struct net *net, static int unix_autobind(struct sock *sk) { - unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); + unsigned int new_hash, old_hash; struct net *net = sock_net(sk); struct unix_address *addr; u32 lastnum, ordernum; @@ -1155,6 +1155,7 @@ static int unix_autobind(struct sock *sk) addr->name->sun_family = AF_UNIX; refcount_set(&addr->refcnt, 1); + old_hash = sk->sk_hash; ordernum = get_random_u32(); lastnum = ordernum & 0xFFFFF; retry: @@ -1195,8 +1196,8 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, { umode_t mode = S_IFSOCK | (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); - unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); + unsigned int new_hash, old_hash; struct net *net = sock_net(sk); struct mnt_idmap *idmap; struct unix_address *addr; @@ -1234,6 +1235,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, if (u->addr) goto out_unlock; + old_hash = sk->sk_hash; new_hash = unix_bsd_hash(d_backing_inode(dentry)); unix_table_double_lock(net, old_hash, new_hash); u->path.mnt = mntget(parent.mnt); @@ -1261,8 +1263,8 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, int addr_len) { - unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); + unsigned int new_hash, old_hash; struct net *net = sock_net(sk); struct unix_address *addr; int err; @@ -1280,6 +1282,7 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, goto out_mutex; } + old_hash = sk->sk_hash; new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); unix_table_double_lock(net, old_hash, new_hash); From 1684842147677a1279bcff95f8adb6de9a656e30 Mon Sep 17 00:00:00 2001 From: Hariprasad Kelam Date: Thu, 23 May 2024 13:06:26 +0530 Subject: [PATCH 0125/1578] Octeontx2-pf: Free send queue buffers incase of leaf to inner There are two type of classes. "Leaf classes" that are the bottom of the class hierarchy. "Inner classes" that are neither the root class nor leaf classes. QoS rules can only specify leaf classes as targets for traffic. Root / \ / \ 1 2 /\ / \ 4 5 classes 1,4 and 5 are leaf classes. class 2 is a inner class. When a leaf class made as inner, or vice versa, resources associated with send queue (send queue buffers and transmit schedulers) are not getting freed. Fixes: 5e6808b4c68d ("octeontx2-pf: Add support for HTB offload") Signed-off-by: Hariprasad Kelam Link: https://lore.kernel.org/r/20240523073626.4114-1-hkelam@marvell.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/marvell/octeontx2/nic/qos.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/qos.c b/drivers/net/ethernet/marvell/octeontx2/nic/qos.c index 070711df612ece..edac008099c0c4 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/qos.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/qos.c @@ -1422,7 +1422,10 @@ static int otx2_qos_leaf_to_inner(struct otx2_nic *pfvf, u16 classid, otx2_qos_read_txschq_cfg(pfvf, node, old_cfg); /* delete the txschq nodes allocated for this node */ + otx2_qos_disable_sq(pfvf, qid); + otx2_qos_free_hw_node_schq(pfvf, node); otx2_qos_free_sw_node_schq(pfvf, node); + pfvf->qos.qid_to_sqmap[qid] = OTX2_QOS_INVALID_SQ; /* mark this node as htb inner node */ WRITE_ONCE(node->qid, OTX2_QOS_QID_INNER); @@ -1632,6 +1635,7 @@ static int otx2_qos_leaf_del_last(struct otx2_nic *pfvf, u16 classid, bool force dwrr_del_node = true; /* destroy the leaf node */ + otx2_qos_disable_sq(pfvf, qid); otx2_qos_destroy_node(pfvf, node); pfvf->qos.qid_to_sqmap[qid] = OTX2_QOS_INVALID_SQ; From d7ba701da636afae17d8e1243b3d12eed149abcb Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 2 May 2024 10:08:25 +0000 Subject: [PATCH 0126/1578] xfs: Clear W=1 warning in xfs_iwalk_run_callbacks() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For CONFIG_XFS_DEBUG unset, xfs_iwalk_run_callbacks() generates the following warning for when building with W=1: fs/xfs/xfs_iwalk.c: In function ‘xfs_iwalk_run_callbacks’: fs/xfs/xfs_iwalk.c:354:42: error: variable ‘irec’ set but not used [-Werror=unused-but-set-variable] 354 | struct xfs_inobt_rec_incore *irec; | ^~~~ cc1: all warnings being treated as errors Drop @irec, as it is only an intermediate variable. Suggested-by: Christoph Hellwig Signed-off-by: John Garry Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/xfs_iwalk.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 730c8d48da2827..86f14ec7c31fed 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -351,7 +351,6 @@ xfs_iwalk_run_callbacks( int *has_more) { struct xfs_mount *mp = iwag->mp; - struct xfs_inobt_rec_incore *irec; xfs_agino_t next_agino; int error; @@ -361,8 +360,8 @@ xfs_iwalk_run_callbacks( /* Delete cursor but remember the last record we cached... */ xfs_iwalk_del_inobt(iwag->tp, curpp, agi_bpp, 0); - irec = &iwag->recs[iwag->nr_recs - 1]; - ASSERT(next_agino >= irec->ir_startino + XFS_INODES_PER_CHUNK); + ASSERT(next_agino >= iwag->recs[iwag->nr_recs - 1].ir_startino + + XFS_INODES_PER_CHUNK); if (iwag->drop_trans) { xfs_trans_cancel(iwag->tp); From b33874fb7f28326380562f208d948bab785fbd6f Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 2 May 2024 10:08:26 +0000 Subject: [PATCH 0127/1578] xfs: Stop using __maybe_unused in xfs_alloc.c In both xfs_alloc_cur_finish() and xfs_alloc_ag_vextent_exact(), local variable @afg is tagged as __maybe_unused. Otherwise an unused variable warning would be generated for when building with W=1 and CONFIG_XFS_DEBUG unset. In both cases, the variable is unused as it is only referenced in an ASSERT() call, which is compiled out (in this config). It is generally a poor programming style to use __maybe_unused for variables. The ASSERT() call is to verify that agbno of the end of the extent is within bounds for both functions. @afg is used as an intermediate variable to find the AG length. However xfs_verify_agbext() already exists to verify a valid extent range. The arguments for calling xfs_verify_agbext() are already available, so use that instead. An advantage of using xfs_verify_agbext() is that it verifies that both the start and the end of the extent are within the bounds of the AG and catches overflows. Suggested-by: Dave Chinner Signed-off-by: John Garry Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_alloc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 6cb8b2ddc541b4..6c55a6e88eba3f 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1008,13 +1008,12 @@ xfs_alloc_cur_finish( struct xfs_alloc_arg *args, struct xfs_alloc_cur *acur) { - struct xfs_agf __maybe_unused *agf = args->agbp->b_addr; int error; ASSERT(acur->cnt && acur->bnolt); ASSERT(acur->bno >= acur->rec_bno); ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len); - ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length)); + ASSERT(xfs_verify_agbext(args->pag, acur->rec_bno, acur->rec_len)); error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno, acur->rec_len, acur->bno, acur->len, 0); @@ -1217,7 +1216,6 @@ STATIC int /* error */ xfs_alloc_ag_vextent_exact( xfs_alloc_arg_t *args) /* allocation argument structure */ { - struct xfs_agf __maybe_unused *agf = args->agbp->b_addr; struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */ struct xfs_btree_cur *cnt_cur;/* by count btree cursor */ int error; @@ -1297,7 +1295,7 @@ xfs_alloc_ag_vextent_exact( */ cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp, args->pag); - ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length)); + ASSERT(xfs_verify_agbext(args->pag, args->agbno, args->len)); error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, args->len, XFSA_FIXUP_BNO_OK); if (error) { From 2b3f004d3d518ec7a392066d935fd85c81412e33 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 21 May 2024 23:01:45 -0700 Subject: [PATCH 0128/1578] xfs: drop xfarray sortinfo folio on error Chandan Babu reports the following livelock in xfs/708: run fstests xfs/708 at 2024-05-04 15:35:29 XFS (loop16): EXPERIMENTAL online scrub feature in use. Use at your own risk! XFS (loop5): Mounting V5 Filesystem e96086f0-a2f9-4424-a1d5-c75d53d823be XFS (loop5): Ending clean mount XFS (loop5): Quotacheck needed: Please wait. XFS (loop5): Quotacheck: Done. XFS (loop5): EXPERIMENTAL online scrub feature in use. Use at your own risk! INFO: task xfs_io:143725 blocked for more than 122 seconds. Not tainted 6.9.0-rc4+ #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:xfs_io state:D stack:0 pid:143725 tgid:143725 ppid:117661 flags:0x00004006 Call Trace: __schedule+0x69c/0x17a0 schedule+0x74/0x1b0 io_schedule+0xc4/0x140 folio_wait_bit_common+0x254/0x650 shmem_undo_range+0x9d5/0xb40 shmem_evict_inode+0x322/0x8f0 evict+0x24e/0x560 __dentry_kill+0x17d/0x4d0 dput+0x263/0x430 __fput+0x2fc/0xaa0 task_work_run+0x132/0x210 get_signal+0x1a8/0x1910 arch_do_signal_or_restart+0x7b/0x2f0 syscall_exit_to_user_mode+0x1c2/0x200 do_syscall_64+0x72/0x170 entry_SYSCALL_64_after_hwframe+0x76/0x7e The shmem code is trying to drop all the folios attached to a shmem file and gets stuck on a locked folio after a bnobt repair. It looks like the process has a signal pending, so I started looking for places where we lock an xfile folio and then deal with a fatal signal. I found a bug in xfarray_sort_scan via code inspection. This function is called to set up the scanning phase of a quicksort operation, which may involve grabbing a locked xfile folio. If we exit the function with an error code, the caller does not call xfarray_sort_scan_done to put the xfile folio. If _sort_scan returns an error code while si->folio is set, we leak the reference and never unlock the folio. Therefore, change xfarray_sort to call _scan_done on exit. This is safe to call multiple times because it sets si->folio to NULL and ignores a NULL si->folio. Also change _sort_scan to use an intermediate variable so that we never pollute si->folio with an errptr. Fixes: 232ea052775f9 ("xfs: enable sorting of xfile-backed arrays") Reported-by: Chandan Babu R Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/scrub/xfarray.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c index 9185ae7088d49a..cdd13ed9c569a7 100644 --- a/fs/xfs/scrub/xfarray.c +++ b/fs/xfs/scrub/xfarray.c @@ -822,12 +822,14 @@ xfarray_sort_scan( /* Grab the first folio that backs this array element. */ if (!si->folio) { + struct folio *folio; loff_t next_pos; - si->folio = xfile_get_folio(si->array->xfile, idx_pos, + folio = xfile_get_folio(si->array->xfile, idx_pos, si->array->obj_size, XFILE_ALLOC); - if (IS_ERR(si->folio)) - return PTR_ERR(si->folio); + if (IS_ERR(folio)) + return PTR_ERR(folio); + si->folio = folio; si->first_folio_idx = xfarray_idx(si->array, folio_pos(si->folio) + si->array->obj_size - 1); @@ -1048,6 +1050,7 @@ xfarray_sort( out_free: trace_xfarray_sort_stats(si, error); + xfarray_sort_scan_done(si); kvfree(si); return error; } From 97835e6866796874571646a1a8ff44f24c0b39f7 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 21 May 2024 23:02:01 -0700 Subject: [PATCH 0129/1578] xfs: fix xfs_init_attr_trans not handling explicit operation codes When we were converting the attr code to use an explicit operation code instead of keying off of attr->value being null, we forgot to change the code that initializes the transaction reservation. Split the function into two helpers that handle the !remove and remove cases, then fix both callsites to handle this correctly. Fixes: c27411d4c640 ("xfs: make attr removal an explicit operation") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_attr.c | 40 +++++++++++++++++++--------------------- fs/xfs/libxfs/xfs_attr.h | 3 +-- fs/xfs/xfs_attr_item.c | 17 +++++++++++++++-- 3 files changed, 35 insertions(+), 25 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 430cd3244c143d..f30bcc64100d56 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -329,26 +329,20 @@ xfs_attr_calc_size( return nblks; } -/* Initialize transaction reservation for attr operations */ -void -xfs_init_attr_trans( - struct xfs_da_args *args, - struct xfs_trans_res *tres, - unsigned int *total) +/* Initialize transaction reservation for an xattr set/replace/upsert */ +inline struct xfs_trans_res +xfs_attr_set_resv( + const struct xfs_da_args *args) { - struct xfs_mount *mp = args->dp->i_mount; - - if (args->value) { - tres->tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + - M_RES(mp)->tr_attrsetrt.tr_logres * - args->total; - tres->tr_logcount = XFS_ATTRSET_LOG_COUNT; - tres->tr_logflags = XFS_TRANS_PERM_LOG_RES; - *total = args->total; - } else { - *tres = M_RES(mp)->tr_attrrm; - *total = XFS_ATTRRM_SPACE_RES(mp); - } + struct xfs_mount *mp = args->dp->i_mount; + struct xfs_trans_res ret = { + .tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * args->total, + .tr_logcount = XFS_ATTRSET_LOG_COUNT, + .tr_logflags = XFS_TRANS_PERM_LOG_RES, + }; + + return ret; } /* @@ -1006,7 +1000,7 @@ xfs_attr_set( struct xfs_trans_res tres; int error, local; int rmt_blks = 0; - unsigned int total; + unsigned int total = 0; ASSERT(!args->trans); @@ -1033,10 +1027,15 @@ xfs_attr_set( if (!local) rmt_blks = xfs_attr3_rmt_blocks(mp, args->valuelen); + + tres = xfs_attr_set_resv(args); + total = args->total; break; case XFS_ATTRUPDATE_REMOVE: XFS_STATS_INC(mp, xs_attr_remove); rmt_blks = xfs_attr3_max_rmt_blocks(mp); + tres = M_RES(mp)->tr_attrrm; + total = XFS_ATTRRM_SPACE_RES(mp); break; } @@ -1044,7 +1043,6 @@ xfs_attr_set( * Root fork attributes can use reserved data blocks for this * operation if necessary */ - xfs_init_attr_trans(args, &tres, &total); error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 088cb7b301680c..0e51d0723f9aa3 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -565,8 +565,7 @@ bool xfs_attr_check_namespace(unsigned int attr_flags); bool xfs_attr_namecheck(unsigned int attr_flags, const void *name, size_t length); int xfs_attr_calc_size(struct xfs_da_args *args, int *local); -void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres, - unsigned int *total); +struct xfs_trans_res xfs_attr_set_resv(const struct xfs_da_args *args); /* * Check to see if the attr should be upgraded from non-existent or shortform to diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 2b10ac4c5fce24..f683b7a9323f16 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -746,7 +746,7 @@ xfs_attr_recover_work( struct xfs_attri_log_format *attrp; struct xfs_attri_log_nameval *nv = attrip->attri_nameval; int error; - int total; + unsigned int total = 0; /* * First check the validity of the attr described by the ATTRI. If any @@ -763,7 +763,20 @@ xfs_attr_recover_work( return PTR_ERR(attr); args = attr->xattri_da_args; - xfs_init_attr_trans(args, &resv, &total); + switch (xfs_attr_intent_op(attr)) { + case XFS_ATTRI_OP_FLAGS_PPTR_SET: + case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: + case XFS_ATTRI_OP_FLAGS_SET: + case XFS_ATTRI_OP_FLAGS_REPLACE: + resv = xfs_attr_set_resv(args); + total = args->total; + break; + case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE: + case XFS_ATTRI_OP_FLAGS_REMOVE: + resv = M_RES(mp)->tr_attrrm; + total = XFS_ATTRRM_SPACE_RES(mp); + break; + } resv = xlog_recover_resv(&resv); error = xfs_trans_alloc(mp, &resv, total, 0, XFS_TRANS_RESERVE, &tp); if (error) From 38de567906d95c397d87f292b892686b7ec6fbc3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 21 May 2024 23:02:16 -0700 Subject: [PATCH 0130/1578] xfs: allow symlinks with short remote targets An internal user complained about log recovery failing on a symlink ("Bad dinode after recovery") with the following (excerpted) format: core.magic = 0x494e core.mode = 0120777 core.version = 3 core.format = 2 (extents) core.nlinkv2 = 1 core.nextents = 1 core.size = 297 core.nblocks = 1 core.naextents = 0 core.forkoff = 0 core.aformat = 2 (extents) u3.bmx[0] = [startoff,startblock,blockcount,extentflag] 0:[0,12,1,0] This is a symbolic link with a 297-byte target stored in a disk block, which is to say this is a symlink with a remote target. The forkoff is 0, which is to say that there's 512 - 176 == 336 bytes in the inode core to store the data fork. Eventually, testing of generic/388 failed with the same inode corruption message during inode recovery. In writing a debugging patch to call xfs_dinode_verify on dirty inode log items when we're committing transactions, I observed that xfs/298 can reproduce the problem quite quickly. xfs/298 creates a symbolic link, adds some extended attributes, then deletes them all. The test failure occurs when the final removexattr also deletes the attr fork because that does not convert the remote symlink back into a shortform symlink. That is how we trip this test. The only reason why xfs/298 only triggers with the debug patch added is that it deletes the symlink, so the final iflush shows the inode as free. I wrote a quick fstest to emulate the behavior of xfs/298, except that it leaves the symlinks on the filesystem after inducing the "corrupt" state. Kernels going back at least as far as 4.18 have written out symlink inodes in this manner and prior to 1eb70f54c445f they did not object to reading them back in. Because we've been writing out inodes this way for quite some time, the only way to fix this is to relax the check for symbolic links. Directories don't have this problem because di_size is bumped to blocksize during the sf->data conversion. Fixes: 1eb70f54c445f ("xfs: validate inode fork size against fork format") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_inode_buf.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index d79002343d0b63..e7a7bfbe75b46a 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -374,17 +374,37 @@ xfs_dinode_verify_fork( /* * For fork types that can contain local data, check that the fork * format matches the size of local data contained within the fork. - * - * For all types, check that when the size says the should be in extent - * or btree format, the inode isn't claiming it is in local format. */ if (whichfork == XFS_DATA_FORK) { - if (S_ISDIR(mode) || S_ISLNK(mode)) { + /* + * A directory small enough to fit in the inode must be stored + * in local format. The directory sf <-> extents conversion + * code updates the directory size accordingly. + */ + if (S_ISDIR(mode)) { + if (be64_to_cpu(dip->di_size) <= fork_size && + fork_format != XFS_DINODE_FMT_LOCAL) + return __this_address; + } + + /* + * A symlink with a target small enough to fit in the inode can + * be stored in extents format if xattrs were added (thus + * converting the data fork from shortform to remote format) + * and then removed. + */ + if (S_ISLNK(mode)) { if (be64_to_cpu(dip->di_size) <= fork_size && + fork_format != XFS_DINODE_FMT_EXTENTS && fork_format != XFS_DINODE_FMT_LOCAL) return __this_address; } + /* + * For all types, check that when the size says the fork should + * be in extent or btree format, the inode isn't claiming to be + * in local format. + */ if (be64_to_cpu(dip->di_size) > fork_size && fork_format == XFS_DINODE_FMT_LOCAL) return __this_address; From 95b19e2f4e0f730c83910e7f5e4d62ec68c6d862 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 21 May 2024 23:02:32 -0700 Subject: [PATCH 0131/1578] xfs: don't open-code u64_to_user_ptr Don't open-code what the kernel already provides. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/scrub/scrub.c | 2 +- fs/xfs/xfs_handle.c | 7 +------ 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index c013f0ba4f36b2..4cbcf7a86dbec5 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -856,7 +856,7 @@ xfs_ioc_scrubv_metadata( if (vec_bytes > PAGE_SIZE) return -ENOMEM; - uvectors = (void __user *)(uintptr_t)head.svh_vectors; + uvectors = u64_to_user_ptr(head.svh_vectors); vectors = memdup_user(uvectors, vec_bytes); if (IS_ERR(vectors)) return PTR_ERR(vectors); diff --git a/fs/xfs/xfs_handle.c b/fs/xfs/xfs_handle.c index c8785ed5954342..a3f16e9b6fe5b5 100644 --- a/fs/xfs/xfs_handle.c +++ b/fs/xfs/xfs_handle.c @@ -773,11 +773,6 @@ xfs_getparents_expand_lastrec( trace_xfs_getparents_expand_lastrec(gpx->ip, gp, &gpx->context, gpr); } -static inline void __user *u64_to_uptr(u64 val) -{ - return (void __user *)(uintptr_t)val; -} - /* Retrieve the parent pointers for a given inode. */ STATIC int xfs_getparents( @@ -862,7 +857,7 @@ xfs_getparents( ASSERT(gpx->context.firstu <= gpx->gph.gph_request.gp_bufsize); /* Copy the records to userspace. */ - if (copy_to_user(u64_to_uptr(gpx->gph.gph_request.gp_buffer), + if (copy_to_user(u64_to_user_ptr(gpx->gph.gph_request.gp_buffer), gpx->krecords, gpx->context.firstu)) error = -EFAULT; From 7ff6c798eca05e4a9dcb80163cb454d7787a4bc3 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 21 May 2024 15:46:32 -0700 Subject: [PATCH 0132/1578] ACPI: APEI: EINJ: Fix einj_dev release leak The platform driver conversion of EINJ mistakenly used platform_device_del() to unwind platform_device_register_full() at module exit. This leads to a small leak of one 'struct platform_device' instance per module load/unload cycle. Switch to platform_device_unregister() which performs both device_del() and final put_device(). Fixes: 5621fafaac00 ("EINJ: Migrate to a platform driver") Cc: 6.9+ # 6.9+ Signed-off-by: Dan Williams Reviewed-by: Ben Cheatham Signed-off-by: Rafael J. Wysocki --- drivers/acpi/apei/einj-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/apei/einj-core.c b/drivers/acpi/apei/einj-core.c index 9515bcfe5e9732..73903a497d73f7 100644 --- a/drivers/acpi/apei/einj-core.c +++ b/drivers/acpi/apei/einj-core.c @@ -909,7 +909,7 @@ static void __exit einj_exit(void) if (einj_initialized) platform_driver_unregister(&einj_driver); - platform_device_del(einj_dev); + platform_device_unregister(einj_dev); } module_init(einj_init); From f6f172dc6a6d7775b2df6adfd1350700e9a847ec Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Wed, 22 May 2024 23:36:48 +0200 Subject: [PATCH 0133/1578] ACPI: EC: Abort address space access upon error When a multi-byte address space access is requested, acpi_ec_read()/ acpi_ec_write() is being called multiple times. Abort such operations if a single call to acpi_ec_read() / acpi_ec_write() fails, as the data read from / written to the EC might be incomplete. Signed-off-by: Armin Wolf Signed-off-by: Rafael J. Wysocki --- drivers/acpi/ec.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index e7793ee9e6498e..a68dce2147a4db 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -1333,10 +1333,13 @@ acpi_ec_space_handler(u32 function, acpi_physical_address address, if (ec->busy_polling || bits > 8) acpi_ec_burst_enable(ec); - for (i = 0; i < bytes; ++i, ++address, ++value) + for (i = 0; i < bytes; ++i, ++address, ++value) { result = (function == ACPI_READ) ? acpi_ec_read(ec, address, value) : acpi_ec_write(ec, address, *value); + if (result < 0) + break; + } if (ec->busy_polling || bits > 8) acpi_ec_burst_disable(ec); From c4bd7f1d78340e63de4d073fd3dbe5391e2996e5 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Wed, 22 May 2024 23:36:49 +0200 Subject: [PATCH 0134/1578] ACPI: EC: Avoid returning AE_OK on errors in address space handler If an error code other than EINVAL, ENODEV or ETIME is returned by acpi_ec_read() / acpi_ec_write(), then AE_OK is incorrectly returned by acpi_ec_space_handler(). Fix this by only returning AE_OK on success, and return AE_ERROR otherwise. Signed-off-by: Armin Wolf [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/ec.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index a68dce2147a4db..68dd17f96f636f 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -1351,8 +1351,10 @@ acpi_ec_space_handler(u32 function, acpi_physical_address address, return AE_NOT_FOUND; case -ETIME: return AE_TIME; - default: + case 0: return AE_OK; + default: + return AE_ERROR; } } From 52a2f0608366a629d43dacd3191039c95fef74ba Mon Sep 17 00:00:00 2001 From: Parthiban Veerasooran Date: Thu, 23 May 2024 14:23:14 +0530 Subject: [PATCH 0135/1578] net: usb: smsc95xx: fix changing LED_SEL bit value updated from EEPROM LED Select (LED_SEL) bit in the LED General Purpose IO Configuration register is used to determine the functionality of external LED pins (Speed Indicator, Link and Activity Indicator, Full Duplex Link Indicator). The default value for this bit is 0 when no EEPROM is present. If a EEPROM is present, the default value is the value of the LED Select bit in the Configuration Flags of the EEPROM. A USB Reset or Lite Reset (LRST) will cause this bit to be restored to the image value last loaded from EEPROM, or to be set to 0 if no EEPROM is present. While configuring the dual purpose GPIO/LED pins to LED outputs in the LED General Purpose IO Configuration register, the LED_SEL bit is changed as 0 and resulting the configured value from the EEPROM is cleared. The issue is fixed by using read-modify-write approach. Fixes: f293501c61c5 ("smsc95xx: configure LED outputs") Signed-off-by: Parthiban Veerasooran Reviewed-by: Simon Horman Reviewed-by: Woojung Huh Link: https://lore.kernel.org/r/20240523085314.167650-1-Parthiban.Veerasooran@microchip.com Signed-off-by: Paolo Abeni --- drivers/net/usb/smsc95xx.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c index cbea246664795f..8e82184be5e7d9 100644 --- a/drivers/net/usb/smsc95xx.c +++ b/drivers/net/usb/smsc95xx.c @@ -879,7 +879,7 @@ static int smsc95xx_start_rx_path(struct usbnet *dev) static int smsc95xx_reset(struct usbnet *dev) { struct smsc95xx_priv *pdata = dev->driver_priv; - u32 read_buf, write_buf, burst_cap; + u32 read_buf, burst_cap; int ret = 0, timeout; netif_dbg(dev, ifup, dev->net, "entering smsc95xx_reset\n"); @@ -1003,10 +1003,13 @@ static int smsc95xx_reset(struct usbnet *dev) return ret; netif_dbg(dev, ifup, dev->net, "ID_REV = 0x%08x\n", read_buf); + ret = smsc95xx_read_reg(dev, LED_GPIO_CFG, &read_buf); + if (ret < 0) + return ret; /* Configure GPIO pins as LED outputs */ - write_buf = LED_GPIO_CFG_SPD_LED | LED_GPIO_CFG_LNK_LED | - LED_GPIO_CFG_FDX_LED; - ret = smsc95xx_write_reg(dev, LED_GPIO_CFG, write_buf); + read_buf |= LED_GPIO_CFG_SPD_LED | LED_GPIO_CFG_LNK_LED | + LED_GPIO_CFG_FDX_LED; + ret = smsc95xx_write_reg(dev, LED_GPIO_CFG, read_buf); if (ret < 0) return ret; From 9e69acc1de306b5243ca2ef8a54242e56d01b8e5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 23 May 2024 17:57:21 +0200 Subject: [PATCH 0136/1578] thermal/debugfs: Print initial trip temperature and hysteresis in tze_seq_show() The temperature and hysteresis of a trip point may change during a mitigation episode it is involved in (it may even become invalid altogether), so in order to avoid possible confusion related to that, store the temperature and hysteresis of trip points at the time they are crossed on the way up and print those values instead of their current temperature and hysteresis. Fixes: 7ef01f228c9f ("thermal/debugfs: Add thermal debugfs information for mitigation episodes") Signed-off-by: Rafael J. Wysocki --- drivers/thermal/thermal_debugfs.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/thermal/thermal_debugfs.c b/drivers/thermal/thermal_debugfs.c index 91f9c21235a8a9..e0e4eb6fad3975 100644 --- a/drivers/thermal/thermal_debugfs.c +++ b/drivers/thermal/thermal_debugfs.c @@ -91,6 +91,8 @@ struct cdev_record { * * @timestamp: the trip crossing timestamp * @duration: total time when the zone temperature was above the trip point + * @trip_temp: trip temperature at mitigation start + * @trip_hyst: trip hysteresis at mitigation start * @count: the number of times the zone temperature was above the trip point * @max: maximum recorded temperature above the trip point * @min: minimum recorded temperature above the trip point @@ -99,6 +101,8 @@ struct cdev_record { struct trip_stats { ktime_t timestamp; ktime_t duration; + int trip_temp; + int trip_hyst; int count; int max; int min; @@ -574,6 +578,7 @@ void thermal_debug_tz_trip_up(struct thermal_zone_device *tz, struct thermal_debugfs *thermal_dbg = tz->debugfs; int trip_id = thermal_zone_trip_id(tz, trip); ktime_t now = ktime_get(); + struct trip_stats *trip_stats; if (!thermal_dbg) return; @@ -639,7 +644,10 @@ void thermal_debug_tz_trip_up(struct thermal_zone_device *tz, tz_dbg->trips_crossed[tz_dbg->nr_trips++] = trip_id; tze = list_first_entry(&tz_dbg->tz_episodes, struct tz_episode, node); - tze->trip_stats[trip_id].timestamp = now; + trip_stats = &tze->trip_stats[trip_id]; + trip_stats->trip_temp = trip->temperature; + trip_stats->trip_hyst = trip->hysteresis; + trip_stats->timestamp = now; unlock: mutex_unlock(&thermal_dbg->lock); @@ -836,8 +844,8 @@ static int tze_seq_show(struct seq_file *s, void *v) seq_printf(s, "| %*d | %*s | %*d | %*d | %c%*lld | %*d | %*d | %*d |\n", 4 , trip_id, 8, type, - 9, trip->temperature, - 9, trip->hysteresis, + 9, trip_stats->trip_temp, + 9, trip_stats->trip_hyst, c, 10, duration_ms, 9, trip_stats->avg, 9, trip_stats->min, From 5a599e10e53d1914adf0032c730bc0f604a5d947 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 23 May 2024 17:58:42 +0200 Subject: [PATCH 0137/1578] thermal/debugfs: Allow tze_seq_show() to print statistics for invalid trips Commit a6258fde8de3 ("thermal/debugfs: Make tze_seq_show() skip invalid trips and trips with no stats") modified tze_seq_show() to skip invalid trips, but it overlooked the fact that a trip may become invalid during a mitigation eposide involving it, in which case its statistics should still be reported. For this reason, remove the invalid trip temperature check from the main loop in tze_seq_show(). The trips that have never been valid will still be skipped after this change because there are no statistics to report for them. Fixes: a6258fde8de3 ("thermal/debugfs: Make tze_seq_show() skip invalid trips and trips with no stats") Signed-off-by: Rafael J. Wysocki --- drivers/thermal/thermal_debugfs.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/thermal/thermal_debugfs.c b/drivers/thermal/thermal_debugfs.c index e0e4eb6fad3975..94244722915708 100644 --- a/drivers/thermal/thermal_debugfs.c +++ b/drivers/thermal/thermal_debugfs.c @@ -802,10 +802,6 @@ static int tze_seq_show(struct seq_file *s, void *v) const struct thermal_trip *trip = &td->trip; struct trip_stats *trip_stats; - /* Skip invalid trips. */ - if (trip->temperature == THERMAL_TEMP_INVALID) - continue; - /* * There is no possible mitigation happening at the * critical trip point, so the stats will be always From cb573eec609c44f44b27c2fa07afd926eddc9f89 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 23 May 2024 18:00:14 +0200 Subject: [PATCH 0138/1578] thermal: core: Introduce thermal_trip_crossed() Add a helper function called thermal_trip_crossed() to be invoked by __thermal_zone_device_update() in order to notify user space, the thermal debug code and the zone governor about trip crossing. Subsequently, this will also be used in the case when a trip point becomes invalid after being crossed on the way up. No intentional functional impact. Signed-off-by: Rafael J. Wysocki --- drivers/thermal/thermal_core.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 54cce4e523bc51..c119ef920793d3 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -467,6 +467,21 @@ static void thermal_governor_trip_crossed(struct thermal_governor *governor, governor->trip_crossed(tz, trip, crossed_up); } +static void thermal_trip_crossed(struct thermal_zone_device *tz, + const struct thermal_trip *trip, + struct thermal_governor *governor, + bool crossed_up) +{ + if (crossed_up) { + thermal_notify_tz_trip_up(tz, trip); + thermal_debug_tz_trip_up(tz, trip); + } else { + thermal_notify_tz_trip_down(tz, trip); + thermal_debug_tz_trip_down(tz, trip); + } + thermal_governor_trip_crossed(governor, tz, trip, crossed_up); +} + static int thermal_trip_notify_cmp(void *ascending, const struct list_head *a, const struct list_head *b) { @@ -506,18 +521,12 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz, handle_thermal_trip(tz, td, &way_up_list, &way_down_list); list_sort(&way_up_list, &way_up_list, thermal_trip_notify_cmp); - list_for_each_entry(td, &way_up_list, notify_list_node) { - thermal_notify_tz_trip_up(tz, &td->trip); - thermal_debug_tz_trip_up(tz, &td->trip); - thermal_governor_trip_crossed(governor, tz, &td->trip, true); - } + list_for_each_entry(td, &way_up_list, notify_list_node) + thermal_trip_crossed(tz, &td->trip, governor, true); list_sort(NULL, &way_down_list, thermal_trip_notify_cmp); - list_for_each_entry(td, &way_down_list, notify_list_node) { - thermal_notify_tz_trip_down(tz, &td->trip); - thermal_debug_tz_trip_down(tz, &td->trip); - thermal_governor_trip_crossed(governor, tz, &td->trip, false); - } + list_for_each_entry(td, &way_down_list, notify_list_node) + thermal_trip_crossed(tz, &td->trip, governor, false); if (governor->manage) governor->manage(tz); From ae2170d6ea96e652c7fb5689f1980986bf48b7b8 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 23 May 2024 18:05:03 +0200 Subject: [PATCH 0139/1578] thermal: trip: Trigger trip down notifications when trips involved in mitigation become invalid When a trip point becomes invalid after being crossed on the way up, it is involved in a mitigation episode that needs to be adjusted to compensate for the trip going away. For this reason, introduce thermal_zone_trip_down() as a wrapper around thermal_trip_crossed() and make thermal_zone_set_trip_temp() call it if the new temperature of the trip at hand is equal to THERMAL_TEMP_INVALID and it has been crossed on the way up to trigger all of the necessary adjustments in user space, the thermal debug code and the zone governor. Fixes: 8c69a777e480 ("thermal: core: Fix the handling of invalid trip points") Signed-off-by: Rafael J. Wysocki --- drivers/thermal/thermal_core.c | 6 ++++++ drivers/thermal/thermal_core.h | 2 ++ drivers/thermal/thermal_trip.c | 20 ++++++++++++-------- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index c119ef920793d3..30567b4994551b 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -602,6 +602,12 @@ void thermal_zone_device_update(struct thermal_zone_device *tz, } EXPORT_SYMBOL_GPL(thermal_zone_device_update); +void thermal_zone_trip_down(struct thermal_zone_device *tz, + const struct thermal_trip *trip) +{ + thermal_trip_crossed(tz, trip, thermal_get_tz_governor(tz), false); +} + int for_each_thermal_governor(int (*cb)(struct thermal_governor *, void *), void *data) { diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index d9785e5bbb08cd..20e7b45673d683 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -246,6 +246,8 @@ int thermal_zone_trip_id(const struct thermal_zone_device *tz, void thermal_zone_trip_updated(struct thermal_zone_device *tz, const struct thermal_trip *trip); int __thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp); +void thermal_zone_trip_down(struct thermal_zone_device *tz, + const struct thermal_trip *trip); /* sysfs I/F */ int thermal_zone_create_device_groups(struct thermal_zone_device *tz); diff --git a/drivers/thermal/thermal_trip.c b/drivers/thermal/thermal_trip.c index d6a6acc78ddb6c..49e63db685172e 100644 --- a/drivers/thermal/thermal_trip.c +++ b/drivers/thermal/thermal_trip.c @@ -152,17 +152,23 @@ void thermal_zone_set_trip_temp(struct thermal_zone_device *tz, if (trip->temperature == temp) return; + trip->temperature = temp; + thermal_notify_tz_trip_change(tz, trip); + if (temp == THERMAL_TEMP_INVALID) { struct thermal_trip_desc *td = trip_to_trip_desc(trip); - if (trip->type == THERMAL_TRIP_PASSIVE && - tz->temperature >= td->threshold) { + if (tz->temperature >= td->threshold) { /* - * The trip has been crossed, so the thermal zone's - * passive count needs to be adjusted. + * The trip has been crossed on the way up, so some + * adjustments are needed to compensate for the lack + * of it going forward. */ - tz->passive--; - WARN_ON_ONCE(tz->passive < 0); + if (trip->type == THERMAL_TRIP_PASSIVE) { + tz->passive--; + WARN_ON_ONCE(tz->passive < 0); + } + thermal_zone_trip_down(tz, trip); } /* * Invalidate the threshold to avoid triggering a spurious @@ -170,7 +176,5 @@ void thermal_zone_set_trip_temp(struct thermal_zone_device *tz, */ td->threshold = INT_MAX; } - trip->temperature = temp; - thermal_notify_tz_trip_change(tz, trip); } EXPORT_SYMBOL_GPL(thermal_zone_set_trip_temp); From 779aa4d74785078575ee20d05d49e6942d1f2844 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 24 May 2024 06:48:17 -0700 Subject: [PATCH 0140/1578] drm/nouveau/nvif: Avoid build error due to potential integer overflows Trying to build parisc:allmodconfig with gcc 12.x or later results in the following build error. drivers/gpu/drm/nouveau/nvif/object.c: In function 'nvif_object_mthd': drivers/gpu/drm/nouveau/nvif/object.c:161:9: error: 'memcpy' accessing 4294967264 or more bytes at offsets 0 and 32 overlaps 6442450881 bytes at offset -2147483617 [-Werror=restrict] 161 | memcpy(data, args->mthd.data, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/gpu/drm/nouveau/nvif/object.c: In function 'nvif_object_ctor': drivers/gpu/drm/nouveau/nvif/object.c:298:17: error: 'memcpy' accessing 4294967240 or more bytes at offsets 0 and 56 overlaps 6442450833 bytes at offset -2147483593 [-Werror=restrict] 298 | memcpy(data, args->new.data, size); gcc assumes that 'sizeof(*args) + size' can overflow, which would result in the problem. The problem is not new, only it is now no longer a warning but an error since W=1 has been enabled for the drm subsystem and since Werror is enabled for test builds. Rearrange arithmetic and use check_add_overflow() for validating the allocation size to avoid the overflow. While at it, split assignments out of if conditions. Fixes: a61ddb4393ad ("drm: enable (most) W=1 warnings by default across the subsystem") Cc: Javier Martinez Canillas Cc: Jani Nikula Cc: Thomas Zimmermann Cc: Danilo Krummrich Cc: Maxime Ripard Cc: Kees Cook Cc: Christophe JAILLET Cc: Joe Perches Reviewed-by: Kees Cook Signed-off-by: Guenter Roeck Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240524134817.1369993-1-linux@roeck-us.net --- drivers/gpu/drm/nouveau/nvif/object.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvif/object.c b/drivers/gpu/drm/nouveau/nvif/object.c index 4d1aaee8fe15fc..1d19c87eaec18e 100644 --- a/drivers/gpu/drm/nouveau/nvif/object.c +++ b/drivers/gpu/drm/nouveau/nvif/object.c @@ -142,11 +142,16 @@ nvif_object_mthd(struct nvif_object *object, u32 mthd, void *data, u32 size) struct nvif_ioctl_v0 ioctl; struct nvif_ioctl_mthd_v0 mthd; } *args; + u32 args_size; u8 stack[128]; int ret; - if (sizeof(*args) + size > sizeof(stack)) { - if (!(args = kmalloc(sizeof(*args) + size, GFP_KERNEL))) + if (check_add_overflow(sizeof(*args), size, &args_size)) + return -ENOMEM; + + if (args_size > sizeof(stack)) { + args = kmalloc(args_size, GFP_KERNEL); + if (!args) return -ENOMEM; } else { args = (void *)stack; @@ -157,7 +162,7 @@ nvif_object_mthd(struct nvif_object *object, u32 mthd, void *data, u32 size) args->mthd.method = mthd; memcpy(args->mthd.data, data, size); - ret = nvif_object_ioctl(object, args, sizeof(*args) + size, NULL); + ret = nvif_object_ioctl(object, args, args_size, NULL); memcpy(data, args->mthd.data, size); if (args != (void *)stack) kfree(args); @@ -276,7 +281,15 @@ nvif_object_ctor(struct nvif_object *parent, const char *name, u32 handle, object->map.size = 0; if (parent) { - if (!(args = kmalloc(sizeof(*args) + size, GFP_KERNEL))) { + u32 args_size; + + if (check_add_overflow(sizeof(*args), size, &args_size)) { + nvif_object_dtor(object); + return -ENOMEM; + } + + args = kmalloc(args_size, GFP_KERNEL); + if (!args) { nvif_object_dtor(object); return -ENOMEM; } @@ -293,8 +306,7 @@ nvif_object_ctor(struct nvif_object *parent, const char *name, u32 handle, args->new.oclass = oclass; memcpy(args->new.data, data, size); - ret = nvif_object_ioctl(parent, args, sizeof(*args) + size, - &object->priv); + ret = nvif_object_ioctl(parent, args, args_size, &object->priv); memcpy(data, args->new.data, size); kfree(args); if (ret == 0) From f89ea63f1c65d3e93b255f14f9d9e05df87955fa Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 24 May 2024 15:26:11 +0100 Subject: [PATCH 0141/1578] netfs, 9p: Fix race between umount and async request completion There's a problem in 9p's interaction with netfslib whereby a crash occurs because the 9p_fid structs get forcibly destroyed during client teardown (without paying attention to their refcounts) before netfslib has finished with them. However, it's not a simple case of deferring the clunking that p9_fid_put() does as that requires the p9_client record to still be present. The problem is that netfslib has to unlock pages and clear the IN_PROGRESS flag before destroying the objects involved - including the fid - and, in any case, nothing checks to see if writeback completed barring looking at the page flags. Fix this by keeping a count of outstanding I/O requests (of any type) and waiting for it to quiesce during inode eviction. Reported-by: syzbot+df038d463cca332e8414@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/0000000000005be0aa061846f8d6@google.com/ Reported-by: syzbot+d7c7a495a5e466c031b6@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/000000000000b86c5e06130da9c6@google.com/ Reported-by: syzbot+1527696d41a634cc1819@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/000000000000041f960618206d7e@google.com/ Signed-off-by: David Howells Link: https://lore.kernel.org/r/755891.1716560771@warthog.procyon.org.uk Tested-by: syzbot+d7c7a495a5e466c031b6@syzkaller.appspotmail.com Reviewed-by: Dominique Martinet cc: Eric Van Hensbergen cc: Latchesar Ionkov cc: Christian Schoenebeck cc: Jeff Layton cc: Steve French cc: Hillf Danton cc: v9fs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Reported-and-tested-by: syzbot+d7c7a495a5e466c031b6@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- fs/9p/vfs_inode.c | 1 + fs/afs/inode.c | 1 + fs/netfs/objects.c | 5 +++++ fs/smb/client/cifsfs.c | 1 + include/linux/netfs.h | 18 ++++++++++++++++++ 5 files changed, 26 insertions(+) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 7a3308d776060e..fd72fc38c8f5b1 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -348,6 +348,7 @@ void v9fs_evict_inode(struct inode *inode) __le32 __maybe_unused version; if (!is_bad_inode(inode)) { + netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); version = cpu_to_le32(v9inode->qid.version); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 94fc049aff584f..15bb7989c387ae 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -648,6 +648,7 @@ void afs_evict_inode(struct inode *inode) ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode); + netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); afs_set_cache_aux(vnode, &aux); diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index c90d482b16505d..f4a64272747925 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -72,6 +72,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, } } + atomic_inc(&ctx->io_count); trace_netfs_rreq_ref(rreq->debug_id, 1, netfs_rreq_trace_new); netfs_proc_add_rreq(rreq); netfs_stat(&netfs_n_rh_rreq); @@ -124,6 +125,7 @@ static void netfs_free_request(struct work_struct *work) { struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work); + struct netfs_inode *ictx = netfs_inode(rreq->inode); unsigned int i; trace_netfs_rreq(rreq, netfs_rreq_trace_free); @@ -142,6 +144,9 @@ static void netfs_free_request(struct work_struct *work) } kvfree(rreq->direct_bv); } + + if (atomic_dec_and_test(&ictx->io_count)) + wake_up_var(&ictx->io_count); call_rcu(&rreq->rcu, netfs_free_request_rcu); } diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index ec5b639f421a27..14810ffd15c89a 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -431,6 +431,7 @@ cifs_free_inode(struct inode *inode) static void cifs_evict_inode(struct inode *inode) { + netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); if (inode->i_state & I_PINNING_NETFS_WB) cifs_fscache_unuse_inode_cookie(inode, true); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index ca56a4428043d5..3b22ce0d064cc4 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -68,6 +68,7 @@ struct netfs_inode { loff_t remote_i_size; /* Size of the remote file */ loff_t zero_point; /* Size after which we assume there's no data * on the server */ + atomic_t io_count; /* Number of outstanding reqs */ unsigned long flags; #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ @@ -472,6 +473,7 @@ static inline void netfs_inode_init(struct netfs_inode *ctx, ctx->remote_i_size = i_size_read(&ctx->inode); ctx->zero_point = LLONG_MAX; ctx->flags = 0; + atomic_set(&ctx->io_count, 0); #if IS_ENABLED(CONFIG_FSCACHE) ctx->cache = NULL; #endif @@ -515,4 +517,20 @@ static inline struct fscache_cookie *netfs_i_cookie(struct netfs_inode *ctx) #endif } +/** + * netfs_wait_for_outstanding_io - Wait for outstanding I/O to complete + * @ctx: The netfs inode to wait on + * + * Wait for outstanding I/O requests of any type to complete. This is intended + * to be called from inode eviction routines. This makes sure that any + * resources held by those requests are cleaned up before we let the inode get + * cleaned up. + */ +static inline void netfs_wait_for_outstanding_io(struct inode *inode) +{ + struct netfs_inode *ictx = netfs_inode(inode); + + wait_var_event(&ictx->io_count, atomic_read(&ictx->io_count) == 0); +} + #endif /* _LINUX_NETFS_H */ From e569eb34970281438e2b48a3ef11c87459fcfbcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20L=C3=B3pez?= Date: Mon, 27 May 2024 11:43:52 +0200 Subject: [PATCH 0142/1578] tracing/probes: fix error check in parse_btf_field() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit btf_find_struct_member() might return NULL or an error via the ERR_PTR() macro. However, its caller in parse_btf_field() only checks for the NULL condition. Fix this by using IS_ERR() and returning the error up the stack. Link: https://lore.kernel.org/all/20240527094351.15687-1-clopez@suse.de/ Fixes: c440adfbe3025 ("tracing/probes: Support BTF based data structure field access") Signed-off-by: Carlos López Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_probe.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 5e263c14157403..39877c80d6cb9a 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -554,6 +554,10 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type, anon_offs = 0; field = btf_find_struct_member(ctx->btf, type, fieldname, &anon_offs); + if (IS_ERR(field)) { + trace_probe_log_err(ctx->offset, BAD_BTF_TID); + return PTR_ERR(field); + } if (!field) { trace_probe_log_err(ctx->offset, NO_BTF_FIELD); return -ENOENT; From 797c525e85d1e44cf0e6f338890e8e0c661f524a Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 27 May 2024 11:08:40 +0100 Subject: [PATCH 0143/1578] ASoC: cs42l43: Only restrict 44.1kHz for the ASP The SoundWire interface can always support 44.1kHz using flow controlled mode, and whether the ASP is in master mode should obviously only affect the ASP. Update cs42l43_startup() to only restrict the rates for the ASP DAI. Fixes: fc918cbe874e ("ASoC: cs42l43: Add support for the cs42l43") Signed-off-by: Charles Keepax Link: https://msgid.link/r/20240527100840.439832-1-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs42l43.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/cs42l43.c b/sound/soc/codecs/cs42l43.c index 94685449f0f48c..92674314227c46 100644 --- a/sound/soc/codecs/cs42l43.c +++ b/sound/soc/codecs/cs42l43.c @@ -310,8 +310,9 @@ static int cs42l43_startup(struct snd_pcm_substream *substream, struct snd_soc_d struct snd_soc_component *component = dai->component; struct cs42l43_codec *priv = snd_soc_component_get_drvdata(component); struct cs42l43 *cs42l43 = priv->core; - int provider = !!regmap_test_bits(cs42l43->regmap, CS42L43_ASP_CLK_CONFIG2, - CS42L43_ASP_MASTER_MODE_MASK); + int provider = !dai->id || !!regmap_test_bits(cs42l43->regmap, + CS42L43_ASP_CLK_CONFIG2, + CS42L43_ASP_MASTER_MODE_MASK); if (provider) priv->constraint.mask = CS42L43_PROVIDER_RATE_MASK; From 8d34c12e8751fe180157fbb34a758ed4eede3806 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 27 May 2024 11:02:37 +0100 Subject: [PATCH 0144/1578] ASoC: wm_adsp: Add missing MODULE_DESCRIPTION() wm_adsp is built as a separate module and as such should include a MODULE_DESCRIPTION() macro. Signed-off-by: Charles Keepax Link: https://msgid.link/r/20240527100237.430240-1-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/wm_adsp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/codecs/wm_adsp.c b/sound/soc/codecs/wm_adsp.c index c9d9a7b28efb08..68d2d6444533f5 100644 --- a/sound/soc/codecs/wm_adsp.c +++ b/sound/soc/codecs/wm_adsp.c @@ -2085,5 +2085,6 @@ static const struct cs_dsp_client_ops wm_adsp2_client_ops = { .watchdog_expired = wm_adsp_fatal_error, }; +MODULE_DESCRIPTION("Cirrus Logic ASoC DSP Support"); MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS(FW_CS_DSP); From d5d2a5dacbc8ea4386071ce243c43ea0dac23cb8 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 27 May 2024 11:13:26 +0100 Subject: [PATCH 0145/1578] MAINTAINERS: Remove James Schulman from Cirrus audio maintainers James no longer works for Cirrus Logic, remove him from the list of maintainers for the Cirrus audio CODEC drivers. Signed-off-by: Charles Keepax Link: https://msgid.link/r/20240527101326.440345-1-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index d6c90161c7bfe3..a6a011792167f6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5187,7 +5187,6 @@ F: Documentation/devicetree/bindings/media/i2c/chrontel,ch7322.yaml F: drivers/media/cec/i2c/ch7322.c CIRRUS LOGIC AUDIO CODEC DRIVERS -M: James Schulman M: David Rhodes M: Richard Fitzgerald L: alsa-devel@alsa-project.org (moderated for non-subscribers) From 6cb05d89fd62a76a9b74bd16211fb0930e89fea8 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Wed, 22 May 2024 21:13:08 +0300 Subject: [PATCH 0146/1578] dma-buf: handle testing kthreads creation failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kthread creation may possibly fail inside race_signal_callback(). In such a case stop the already started threads, put the already taken references to them and return with error code. Found by Linux Verification Center (linuxtesting.org). Fixes: 2989f6451084 ("dma-buf: Add selftests for dma-fence") Cc: stable@vger.kernel.org Signed-off-by: Fedor Pchelkin Reviewed-by: T.J. Mercier Link: https://patchwork.freedesktop.org/patch/msgid/20240522181308.841686-1-pchelkin@ispras.ru Signed-off-by: Christian König --- drivers/dma-buf/st-dma-fence.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/dma-buf/st-dma-fence.c b/drivers/dma-buf/st-dma-fence.c index b7c6f7ea9e0c83..6a1bfcd0cc2108 100644 --- a/drivers/dma-buf/st-dma-fence.c +++ b/drivers/dma-buf/st-dma-fence.c @@ -540,6 +540,12 @@ static int race_signal_callback(void *arg) t[i].before = pass; t[i].task = kthread_run(thread_signal_callback, &t[i], "dma-fence:%d", i); + if (IS_ERR(t[i].task)) { + ret = PTR_ERR(t[i].task); + while (--i >= 0) + kthread_stop_put(t[i].task); + return ret; + } get_task_struct(t[i].task); } From 82d71b53d7e732ede6028591342bdc80fabfa29f Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 27 May 2024 15:13:14 +0200 Subject: [PATCH 0147/1578] Documentation/core-api: correct reference to SWIOTLB_DYNAMIC Commit c93f261dfc39 ("Documentation/core-api: add swiotlb documentation") accidentally refers to CONFIG_DYNAMIC_SWIOTLB in one place, while the config is actually called CONFIG_SWIOTLB_DYNAMIC. Correct the reference to the intended config option. Signed-off-by: Lukas Bulwahn Reviewed-by: Petr Tesarik Signed-off-by: Christoph Hellwig --- Documentation/core-api/swiotlb.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/core-api/swiotlb.rst b/Documentation/core-api/swiotlb.rst index 5ad2c9ca85bcbb..cf06bae44ff85b 100644 --- a/Documentation/core-api/swiotlb.rst +++ b/Documentation/core-api/swiotlb.rst @@ -192,7 +192,7 @@ alignment larger than PAGE_SIZE. Dynamic swiotlb --------------- -When CONFIG_DYNAMIC_SWIOTLB is enabled, swiotlb can do on-demand expansion of +When CONFIG_SWIOTLB_DYNAMIC is enabled, swiotlb can do on-demand expansion of the amount of memory available for allocation as bounce buffers. If a bounce buffer request fails due to lack of available space, an asynchronous background task is kicked off to allocate memory from general system memory and turn it From a39741d38c048a48ae0d65226d9548005a088f5f Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 22 May 2024 08:54:21 +0300 Subject: [PATCH 0148/1578] pinctrl: renesas: rzg2l: Use spin_{lock,unlock}_irq{save,restore} On PREEMPT_RT kernels the spinlock_t maps to an rtmutex. Using raw_spin_lock_irqsave()/raw_spin_unlock_irqrestore() on &pctrl->lock.rlock breaks the PREEMPT_RT builds. To fix this use spin_lock_irqsave()/spin_unlock_irqrestore() on &pctrl->lock. Fixes: 02cd2d3be1c3 ("pinctrl: renesas: rzg2l: Configure the interrupt type on resume") Reported-by: Diederik de Haas Closes: https://lore.kernel.org/all/131999629.KQPSlr0Zke@bagend Signed-off-by: Claudiu Beznea Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20240522055421.2842689-1-claudiu.beznea.uj@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- drivers/pinctrl/renesas/pinctrl-rzg2l.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/renesas/pinctrl-rzg2l.c b/drivers/pinctrl/renesas/pinctrl-rzg2l.c index c3256bfde50233..60be78da9f529b 100644 --- a/drivers/pinctrl/renesas/pinctrl-rzg2l.c +++ b/drivers/pinctrl/renesas/pinctrl-rzg2l.c @@ -2071,11 +2071,11 @@ static void rzg2l_gpio_irq_restore(struct rzg2l_pinctrl *pctrl) * This has to be atomically executed to protect against a concurrent * interrupt. */ - raw_spin_lock_irqsave(&pctrl->lock.rlock, flags); + spin_lock_irqsave(&pctrl->lock, flags); ret = rzg2l_gpio_irq_set_type(data, irqd_get_trigger_type(data)); if (!ret && !irqd_irq_disabled(data)) rzg2l_gpio_irq_enable(data); - raw_spin_unlock_irqrestore(&pctrl->lock.rlock, flags); + spin_unlock_irqrestore(&pctrl->lock, flags); if (ret) dev_crit(pctrl->dev, "Failed to set IRQ type for virq=%u\n", virq); From e5c7bd4e5ca0f549108a7013ba77885926c6a56b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 22 May 2024 13:40:19 -0300 Subject: [PATCH 0149/1578] tools include UAPI: Sync linux/stat.h with the kernel sources To get the changes in: 2a82bb02941fb53d ("statx: stx_subvol") To pick up this change and support it: $ tools/perf/trace/beauty/statx_mask.sh > before $ cp include/uapi/linux/stat.h tools/perf/trace/beauty/include/uapi/linux/stat.h $ tools/perf/trace/beauty/statx_mask.sh > after $ diff -u before after --- before 2024-05-22 13:39:49.742470571 -0300 +++ after 2024-05-22 13:39:59.157883101 -0300 @@ -14,4 +14,5 @@ [ilog2(0x00001000) + 1] = "MNT_ID", [ilog2(0x00002000) + 1] = "DIOALIGN", [ilog2(0x00004000) + 1] = "MNT_ID_UNIQUE", + [ilog2(0x00008000) + 1] = "SUBVOL", }; $ Now we'll see it like we see these: # perf trace -e statx 0.000 ( 0.015 ms): systemd-userwo/3982299 statx(dfd: 6, filename: ".", mask: TYPE|INO|MNT_ID, buffer: 0x7ffd8945e850) = 0 180.559 ( 0.007 ms): (ostnamed)/3982957 statx(dfd: 4, filename: "sys", flags: SYMLINK_NOFOLLOW|NO_AUTOMOUNT|STATX_DONT_SYNC, mask: TYPE, buffer: 0x7fff13161190) = 0 180.918 ( 0.011 ms): (ostnamed)/3982957 statx(dfd: CWD, filename: "/run/systemd/mount-rootfs/sys/kernel/security", flags: SYMLINK_NOFOLLOW|NO_AUTOMOUNT|STATX_DONT_SYNC, mask: MNT_ID, buffer: 0x7fff13161120) = 0 180.956 ( 0.010 ms): (ostnamed)/3982957 statx(dfd: CWD, filename: "/run/systemd/mount-rootfs/sys/fs/cgroup", flags: SYMLINK_NOFOLLOW|NO_AUTOMOUNT|STATX_DONT_SYNC, mask: MNT_ID, buffer: 0x7fff13161120) = 0 Cc: Adrian Hunter Cc: Christian Brauner Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Kent Overstreet Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/Zk5nO9yT0oPezUoo@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/trace/beauty/include/uapi/linux/stat.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/perf/trace/beauty/include/uapi/linux/stat.h b/tools/perf/trace/beauty/include/uapi/linux/stat.h index 2f2ee82d55175d..67626d53531664 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/stat.h +++ b/tools/perf/trace/beauty/include/uapi/linux/stat.h @@ -126,8 +126,9 @@ struct statx { __u64 stx_mnt_id; __u32 stx_dio_mem_align; /* Memory buffer alignment for direct I/O */ __u32 stx_dio_offset_align; /* File offset alignment for direct I/O */ + __u64 stx_subvol; /* Subvolume identifier */ /* 0xa0 */ - __u64 __spare3[12]; /* Spare space for future expansion */ + __u64 __spare3[11]; /* Spare space for future expansion */ /* 0x100 */ }; @@ -155,6 +156,7 @@ struct statx { #define STATX_MNT_ID 0x00001000U /* Got stx_mnt_id */ #define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */ #define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */ +#define STATX_SUBVOL 0x00008000U /* Want/got stx_subvol */ #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ From 80e4e17ac9e05adba3f1f0e1793398086e6d4007 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 May 2024 15:16:06 -0700 Subject: [PATCH 0150/1578] block: remove blk_queue_max_integrity_segments This is unused now that all the atomic queue limit conversions are merged. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240521221606.393040-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-integrity.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index e253e7bd0d1793..7428cb43952da0 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -66,12 +66,6 @@ blk_integrity_queue_supports_integrity(struct request_queue *q) return q->integrity.profile; } -static inline void blk_queue_max_integrity_segments(struct request_queue *q, - unsigned int segs) -{ - q->limits.max_integrity_segments = segs; -} - static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) { @@ -151,10 +145,6 @@ static inline void blk_integrity_register(struct gendisk *d, static inline void blk_integrity_unregister(struct gendisk *d) { } -static inline void blk_queue_max_integrity_segments(struct request_queue *q, - unsigned int segs) -{ -} static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) { From d9780064b163b91c28e4d44ec3115599db65b7fa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 27 May 2024 14:36:18 +0200 Subject: [PATCH 0151/1578] dm: move setting zoned_enabled to dm_table_set_restrictions Keep it together with the rest of the zoned code. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240527123634.1116952-2-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-table.c | 3 --- drivers/md/dm-zone.c | 8 +++++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index cc66a27c363a65..e291b78b307b13 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -2040,9 +2040,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, r = dm_set_zones_restrictions(t, q); if (r) return r; - if (blk_queue_is_zoned(q) && - !static_key_enabled(&zoned_enabled.key)) - static_branch_enable(&zoned_enabled); } dm_update_crypto_profile(q, t); diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 8e6bcb0d786a1a..3103360ce7f040 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -287,7 +287,13 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) queue_emulates_zone_append(q) ? "emulated" : "native"); } - return dm_revalidate_zones(md, t); + ret = dm_revalidate_zones(md, t); + if (ret < 0) + return ret; + + if (!static_key_enabled(&zoned_enabled.key)) + static_branch_enable(&zoned_enabled); + return 0; } /* From 5e7a4bbcc33d7df6bcc8565a8938c196285e5423 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 27 May 2024 14:36:19 +0200 Subject: [PATCH 0152/1578] dm: remove dm_check_zoned Fold it into the only caller in preparation to changes in the queue limits setup. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240527123634.1116952-3-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-zone.c | 59 +++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 36 deletions(-) diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 3103360ce7f040..0ee22494857d07 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -160,37 +160,6 @@ static int dm_check_zoned_cb(struct blk_zone *zone, unsigned int idx, return 0; } -static int dm_check_zoned(struct mapped_device *md, struct dm_table *t) -{ - struct gendisk *disk = md->disk; - unsigned int nr_conv_zones = 0; - int ret; - - /* Count conventional zones */ - md->zone_revalidate_map = t; - ret = dm_blk_report_zones(disk, 0, UINT_MAX, - dm_check_zoned_cb, &nr_conv_zones); - md->zone_revalidate_map = NULL; - if (ret < 0) { - DMERR("Check zoned failed %d", ret); - return ret; - } - - /* - * If we only have conventional zones, expose the mapped device as - * a regular device. - */ - if (nr_conv_zones >= ret) { - disk->queue->limits.max_open_zones = 0; - disk->queue->limits.max_active_zones = 0; - disk->queue->limits.zoned = false; - clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); - disk->nr_zones = 0; - } - - return 0; -} - /* * Revalidate the zones of a mapped device to initialize resource necessary * for zone append emulation. Note that we cannot simply use the block layer @@ -254,6 +223,8 @@ static bool dm_table_supports_zone_append(struct dm_table *t) int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) { struct mapped_device *md = t->md; + struct gendisk *disk = md->disk; + unsigned int nr_conv_zones = 0; int ret; /* @@ -272,14 +243,30 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) return 0; /* - * Check that the mapped device will indeed be zoned, that is, that it - * has sequential write required zones. + * Count conventional zones to check that the mapped device will indeed + * have sequential write required zones. */ - ret = dm_check_zoned(md, t); - if (ret) + md->zone_revalidate_map = t; + ret = dm_blk_report_zones(disk, 0, UINT_MAX, + dm_check_zoned_cb, &nr_conv_zones); + md->zone_revalidate_map = NULL; + if (ret < 0) { + DMERR("Check zoned failed %d", ret); return ret; - if (!blk_queue_is_zoned(q)) + } + + /* + * If we only have conventional zones, expose the mapped device as + * a regular device. + */ + if (nr_conv_zones >= ret) { + disk->queue->limits.max_open_zones = 0; + disk->queue->limits.max_active_zones = 0; + disk->queue->limits.zoned = false; + clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + disk->nr_zones = 0; return 0; + } if (!md->disk->nr_zones) { DMINFO("%s using %s zone append", From c8c1f7012b807ca4da0136eacab96961b56f25d5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 27 May 2024 14:36:20 +0200 Subject: [PATCH 0153/1578] dm: make dm_set_zones_restrictions work on the queue limits Don't stuff the values directly into the queue without any synchronization, but instead delay applying the queue limits in the caller and let dm_set_zones_restrictions work on the limit structure. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240527123634.1116952-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-table.c | 12 ++++++------ drivers/md/dm-zone.c | 11 ++++++----- drivers/md/dm.h | 3 ++- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index e291b78b307b13..b2d5246cff2102 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1981,10 +1981,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (!dm_table_supports_secure_erase(t)) limits->max_secure_erase_sectors = 0; - r = queue_limits_set(q, limits); - if (r) - return r; - if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) { wc = true; if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_FUA))) @@ -2036,12 +2032,16 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, * For a zoned target, setup the zones related queue attributes * and resources necessary for zone append emulation if necessary. */ - if (blk_queue_is_zoned(q)) { - r = dm_set_zones_restrictions(t, q); + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && limits->zoned) { + r = dm_set_zones_restrictions(t, q, limits); if (r) return r; } + r = queue_limits_set(q, limits); + if (r) + return r; + dm_update_crypto_profile(q, t); /* diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 0ee22494857d07..5d66d916730efa 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -220,7 +220,8 @@ static bool dm_table_supports_zone_append(struct dm_table *t) return true; } -int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) +int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, + struct queue_limits *lim) { struct mapped_device *md = t->md; struct gendisk *disk = md->disk; @@ -236,7 +237,7 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); } else { set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); - blk_queue_max_zone_append_sectors(q, 0); + lim->max_zone_append_sectors = 0; } if (!get_capacity(md->disk)) @@ -260,9 +261,9 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) * a regular device. */ if (nr_conv_zones >= ret) { - disk->queue->limits.max_open_zones = 0; - disk->queue->limits.max_active_zones = 0; - disk->queue->limits.zoned = false; + lim->max_open_zones = 0; + lim->max_active_zones = 0; + lim->zoned = false; clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); disk->nr_zones = 0; return 0; diff --git a/drivers/md/dm.h b/drivers/md/dm.h index e0c57f19839b29..53ef8207fe2c15 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -101,7 +101,8 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); /* * Zoned targets related functions. */ -int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q); +int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, + struct queue_limits *lim); void dm_zone_endio(struct dm_io *io, struct bio *clone); #ifdef CONFIG_BLK_DEV_ZONED int dm_blk_report_zones(struct gendisk *disk, sector_t sector, From 0efc88e444d9d74f220f60a3e37143c8decf1bea Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 27 May 2024 12:15:27 -0300 Subject: [PATCH 0154/1578] tools headers UAPI: Sync linux/prctl.h with the kernel sources To pick the changes in: 628d701f2de5b9a1 ("powerpc/dexcr: Add DEXCR prctl interface") 6b9391b581fddd85 ("riscv: Include riscv_set_icache_flush_ctx prctl") That adds some PowerPC and a RISC-V specific prctl options: $ tools/perf/trace/beauty/prctl_option.sh > before $ cp include/uapi/linux/prctl.h tools/perf/trace/beauty/include/uapi/linux/prctl.h $ tools/perf/trace/beauty/prctl_option.sh > after $ diff -u before after --- before 2024-05-27 12:14:21.358032781 -0300 +++ after 2024-05-27 12:14:32.364530185 -0300 @@ -65,6 +65,9 @@ [68] = "GET_MEMORY_MERGE", [69] = "RISCV_V_SET_CONTROL", [70] = "RISCV_V_GET_CONTROL", + [71] = "RISCV_SET_ICACHE_FLUSH_CTX", + [72] = "PPC_GET_DEXCR", + [73] = "PPC_SET_DEXCR", }; static const char *prctl_set_mm_options[] = { [1] = "START_CODE", $ That now will be used to decode the syscall option and also to compose filters, for instance: [root@five ~]# perf trace -e syscalls:sys_enter_prctl --filter option==SET_NAME 0.000 Isolated Servi/3474327 syscalls:sys_enter_prctl(option: SET_NAME, arg2: 0x7f23f13b7aee) 0.032 DOM Worker/3474327 syscalls:sys_enter_prctl(option: SET_NAME, arg2: 0x7f23deb25670) 7.920 :3474328/3474328 syscalls:sys_enter_prctl(option: SET_NAME, arg2: 0x7f23e24fbb10) 7.935 StreamT~s #374/3474328 syscalls:sys_enter_prctl(option: SET_NAME, arg2: 0x7f23e24fb970) 8.400 Isolated Servi/3474329 syscalls:sys_enter_prctl(option: SET_NAME, arg2: 0x7f23e24bab10) 8.418 StreamT~s #374/3474329 syscalls:sys_enter_prctl(option: SET_NAME, arg2: 0x7f23e24ba970) ^C[root@five ~]# This addresses this perf build warning: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/prctl.h include/uapi/linux/prctl.h Cc: Adrian Hunter Cc: Benjamin Gray Cc: Charlie Jenkins Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Michael Ellerman Cc: Namhyung Kim Cc: Palmer Dabbelt Link: https://lore.kernel.org/lkml/ZlSklGWp--v_Ije7@x1 Signed-off-by: Arnaldo Carvalho de Melo --- .../trace/beauty/include/uapi/linux/prctl.h | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tools/perf/trace/beauty/include/uapi/linux/prctl.h b/tools/perf/trace/beauty/include/uapi/linux/prctl.h index 370ed14b1ae092..35791791a879b2 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/prctl.h +++ b/tools/perf/trace/beauty/include/uapi/linux/prctl.h @@ -306,4 +306,26 @@ struct prctl_mm_map { # define PR_RISCV_V_VSTATE_CTRL_NEXT_MASK 0xc # define PR_RISCV_V_VSTATE_CTRL_MASK 0x1f +#define PR_RISCV_SET_ICACHE_FLUSH_CTX 71 +# define PR_RISCV_CTX_SW_FENCEI_ON 0 +# define PR_RISCV_CTX_SW_FENCEI_OFF 1 +# define PR_RISCV_SCOPE_PER_PROCESS 0 +# define PR_RISCV_SCOPE_PER_THREAD 1 + +/* PowerPC Dynamic Execution Control Register (DEXCR) controls */ +#define PR_PPC_GET_DEXCR 72 +#define PR_PPC_SET_DEXCR 73 +/* DEXCR aspect to act on */ +# define PR_PPC_DEXCR_SBHE 0 /* Speculative branch hint enable */ +# define PR_PPC_DEXCR_IBRTPD 1 /* Indirect branch recurrent target prediction disable */ +# define PR_PPC_DEXCR_SRAPD 2 /* Subroutine return address prediction disable */ +# define PR_PPC_DEXCR_NPHIE 3 /* Non-privileged hash instruction enable */ +/* Action to apply / return */ +# define PR_PPC_DEXCR_CTRL_EDITABLE 0x1 /* Aspect can be modified with PR_PPC_SET_DEXCR */ +# define PR_PPC_DEXCR_CTRL_SET 0x2 /* Set the aspect for this process */ +# define PR_PPC_DEXCR_CTRL_CLEAR 0x4 /* Clear the aspect for this process */ +# define PR_PPC_DEXCR_CTRL_SET_ONEXEC 0x8 /* Set the aspect on exec */ +# define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC 0x10 /* Clear the aspect on exec */ +# define PR_PPC_DEXCR_CTRL_MASK 0x1f + #endif /* _LINUX_PRCTL_H */ From b0c6bcd58d44b1b843d1b7218db5a1efe917d27e Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Tue, 7 May 2024 15:06:55 +0530 Subject: [PATCH 0155/1578] xfs: Add cond_resched to block unmap range and reflink remap path An async dio write to a sparse file can generate a lot of extents and when we unlink this file (using rm), the kernel can be busy in umapping and freeing those extents as part of transaction processing. Similarly xfs reflink remapping path can also iterate over a million extent entries in xfs_reflink_remap_blocks(). Since we can busy loop in these two functions, so let's add cond_resched() to avoid softlockup messages like these. watchdog: BUG: soft lockup - CPU#1 stuck for 22s! [kworker/1:0:82435] CPU: 1 PID: 82435 Comm: kworker/1:0 Tainted: G S L 6.9.0-rc5-0-default #1 Workqueue: xfs-inodegc/sda2 xfs_inodegc_worker NIP [c000000000beea10] xfs_extent_busy_trim+0x100/0x290 LR [c000000000bee958] xfs_extent_busy_trim+0x48/0x290 Call Trace: xfs_alloc_get_rec+0x54/0x1b0 (unreliable) xfs_alloc_compute_aligned+0x5c/0x144 xfs_alloc_ag_vextent_size+0x238/0x8d4 xfs_alloc_fix_freelist+0x540/0x694 xfs_free_extent_fix_freelist+0x84/0xe0 __xfs_free_extent+0x74/0x1ec xfs_extent_free_finish_item+0xcc/0x214 xfs_defer_finish_one+0x194/0x388 xfs_defer_finish_noroll+0x1b4/0x5c8 xfs_defer_finish+0x2c/0xc4 xfs_bunmapi_range+0xa4/0x100 xfs_itruncate_extents_flags+0x1b8/0x2f4 xfs_inactive_truncate+0xe0/0x124 xfs_inactive+0x30c/0x3e0 xfs_inodegc_worker+0x140/0x234 process_scheduled_works+0x240/0x57c worker_thread+0x198/0x468 kthread+0x138/0x140 start_kernel_thread+0x14/0x18 run fstests generic/175 at 2024-02-02 04:40:21 [ C17] watchdog: BUG: soft lockup - CPU#17 stuck for 23s! [xfs_io:7679] watchdog: BUG: soft lockup - CPU#17 stuck for 23s! [xfs_io:7679] CPU: 17 PID: 7679 Comm: xfs_io Kdump: loaded Tainted: G X 6.4.0 NIP [c008000005e3ec94] xfs_rmapbt_diff_two_keys+0x54/0xe0 [xfs] LR [c008000005e08798] xfs_btree_get_leaf_keys+0x110/0x1e0 [xfs] Call Trace: 0xc000000014107c00 (unreliable) __xfs_btree_updkeys+0x8c/0x2c0 [xfs] xfs_btree_update_keys+0x150/0x170 [xfs] xfs_btree_lshift+0x534/0x660 [xfs] xfs_btree_make_block_unfull+0x19c/0x240 [xfs] xfs_btree_insrec+0x4e4/0x630 [xfs] xfs_btree_insert+0x104/0x2d0 [xfs] xfs_rmap_insert+0xc4/0x260 [xfs] xfs_rmap_map_shared+0x228/0x630 [xfs] xfs_rmap_finish_one+0x2d4/0x350 [xfs] xfs_rmap_update_finish_item+0x44/0xc0 [xfs] xfs_defer_finish_noroll+0x2e4/0x740 [xfs] __xfs_trans_commit+0x1f4/0x400 [xfs] xfs_reflink_remap_extent+0x2d8/0x650 [xfs] xfs_reflink_remap_blocks+0x154/0x320 [xfs] xfs_file_remap_range+0x138/0x3a0 [xfs] do_clone_file_range+0x11c/0x2f0 vfs_clone_file_range+0x60/0x1c0 ioctl_file_clone+0x78/0x140 sys_ioctl+0x934/0x1270 system_call_exception+0x158/0x320 system_call_vectored_common+0x15c/0x2ec Cc: Ojaswin Mujoo Signed-off-by: Ritesh Harjani (IBM) Reviewed-by: Darrick J. Wong Tested-by: Disha Goel Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_bmap.c | 1 + fs/xfs/xfs_reflink.c | 1 + 2 files changed, 2 insertions(+) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 3b3206d312d6fe..c101cf266bc4db 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -6383,6 +6383,7 @@ xfs_bunmapi_range( error = xfs_defer_finish(tpp); if (error) goto out; + cond_resched(); } out: return error; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 063a2e00d16915..265a2a418bc7b0 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1387,6 +1387,7 @@ xfs_reflink_remap_blocks( destoff += imap.br_blockcount; len -= imap.br_blockcount; remapped_len += imap.br_blockcount; + cond_resched(); } if (error) From 1437a9f06f740cc2950d9b6ac23fad838ff023a3 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 27 May 2024 12:34:20 -0300 Subject: [PATCH 0156/1578] tools headers UAPI: Sync fcntl.h with the kernel sources to pick F_DUPFD_QUERY There is no scrape script yet for those, but the warning pointed out we need to update the array with the F_LINUX_SPECIFIC_BASE entries, do it. Now 'perf trace' can decode that cmd and also use it in filter, as in: root@number:~# perf trace -e syscalls:*enter_fcntl --filter 'cmd != SETFL && cmd != GETFL' 0.000 sssd_kcm/303828 syscalls:sys_enter_fcntl(fd: 13, cmd: SETLK, arg: 0x7fffdc6a8a50) 0.013 sssd_kcm/303828 syscalls:sys_enter_fcntl(fd: 13, cmd: SETLKW, arg: 0x7fffdc6a8aa0) 0.090 sssd_kcm/303828 syscalls:sys_enter_fcntl(fd: 13, cmd: SETLKW, arg: 0x7fffdc6a88e0) ^Croot@number:~# This picks up the changes in: c62b758bae6af16f ("fcntl: add F_DUPFD_QUERY fcntl()") Addressing this perf tools build warning: Warning: Kernel ABI header differences: diff -u tools/perf/trace/beauty/include/uapi/linux/fcntl.h include/uapi/linux/fcntl.h Cc: Adrian Hunter Cc: Christian Brauner Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/ZlSqNQH9mFw2bmjq@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 2 +- tools/perf/trace/beauty/include/uapi/linux/fcntl.h | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 51eca671c79760..08a3a6effac18e 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -765,7 +765,7 @@ static const char *fcntl_cmds[] = { static DEFINE_STRARRAY(fcntl_cmds, "F_"); static const char *fcntl_linux_specific_cmds[] = { - "SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC", + "SETLEASE", "GETLEASE", "NOTIFY", "DUPFD_QUERY", [5] = "CANCELLK", "DUPFD_CLOEXEC", "SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS", "GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT", }; diff --git a/tools/perf/trace/beauty/include/uapi/linux/fcntl.h b/tools/perf/trace/beauty/include/uapi/linux/fcntl.h index 282e90aeb163c0..c0bcc185fa48f8 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/fcntl.h +++ b/tools/perf/trace/beauty/include/uapi/linux/fcntl.h @@ -8,6 +8,14 @@ #define F_SETLEASE (F_LINUX_SPECIFIC_BASE + 0) #define F_GETLEASE (F_LINUX_SPECIFIC_BASE + 1) +/* + * Request nofications on a directory. + * See below for events that may be notified. + */ +#define F_NOTIFY (F_LINUX_SPECIFIC_BASE + 2) + +#define F_DUPFD_QUERY (F_LINUX_SPECIFIC_BASE + 3) + /* * Cancel a blocking posix lock; internal use only until we expose an * asynchronous lock api to userspace: @@ -17,12 +25,6 @@ /* Create a file descriptor with FD_CLOEXEC set. */ #define F_DUPFD_CLOEXEC (F_LINUX_SPECIFIC_BASE + 6) -/* - * Request nofications on a directory. - * See below for events that may be notified. - */ -#define F_NOTIFY (F_LINUX_SPECIFIC_BASE+2) - /* * Set and get of pipe page size array */ From a3eed53beec4a7bac8acd109abcb187df2577781 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 27 May 2024 12:45:33 -0300 Subject: [PATCH 0157/1578] perf beauty: Update copy of linux/socket.h with the kernel sources To pick up the fixes in: 0645fbe760afcc53 ("net: have do_accept() take a struct proto_accept_arg argument") That just changes a function prototype, not touching things used by the perf scrape scripts such as: $ tools/perf/trace/beauty/sockaddr.sh | head -5 static const char *socket_families[] = { [0] = "UNSPEC", [1] = "LOCAL", [2] = "INET", [3] = "AX25", $ This addresses this perf tools build warning: Warning: Kernel ABI header differences: diff -u tools/perf/trace/beauty/include/linux/socket.h include/linux/socket.h Cc: Adrian Hunter Cc: Ian Rogers Cc: Jens Axboe Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/ZlSrceExgjrUiDb5@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/trace/beauty/include/linux/socket.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/trace/beauty/include/linux/socket.h b/tools/perf/trace/beauty/include/linux/socket.h index 139c330ccf2c3b..89d16b90370bd4 100644 --- a/tools/perf/trace/beauty/include/linux/socket.h +++ b/tools/perf/trace/beauty/include/linux/socket.h @@ -16,6 +16,7 @@ struct cred; struct socket; struct sock; struct sk_buff; +struct proto_accept_arg; #define __sockaddr_check_size(size) \ BUILD_BUG_ON(((size) > sizeof(struct __kernel_sockaddr_storage))) @@ -433,7 +434,7 @@ extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size, extern int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, struct sockaddr __user *addr, int addr_len); -extern struct file *do_accept(struct file *file, unsigned file_flags, +extern struct file *do_accept(struct file *file, struct proto_accept_arg *arg, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags); extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, From 001821b0e79716c4e17c71d8e053a23599a7a508 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 27 May 2024 12:51:30 -0300 Subject: [PATCH 0158/1578] perf trace beauty: Update the arch/x86/include/asm/irq_vectors.h copy with the kernel sources to pick POSTED_MSI_NOTIFICATION To pick up the change in: f5a3562ec9dd29e6 ("x86/irq: Reserve a per CPU IDT vector for posted MSIs") That picks up this new vector: $ cp arch/x86/include/asm/irq_vectors.h tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h $ tools/perf/trace/beauty/tracepoints/x86_irq_vectors.sh > after $ diff -u before after --- before 2024-05-27 12:50:47.708863932 -0300 +++ after 2024-05-27 12:51:15.335113123 -0300 @@ -1,6 +1,7 @@ static const char *x86_irq_vectors[] = { [0x02] = "NMI", [0x80] = "IA32_SYSCALL", + [0xeb] = "POSTED_MSI_NOTIFICATION", [0xec] = "LOCAL_TIMER", [0xed] = "HYPERV_STIMER0", [0xee] = "HYPERV_REENLIGHTENMENT", $ Now those will be known when pretty printing the irq_vectors:* tracepoints. Cc: Adrian Hunter Cc: Ian Rogers Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Thomas Gleixner Link: https://lore.kernel.org/lkml/ZlS34M0x30EFVhbg@x1 Signed-off-by: Arnaldo Carvalho de Melo --- .../perf/trace/beauty/arch/x86/include/asm/irq_vectors.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h b/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h index d18bfb238f660f..13aea8fc3d45fc 100644 --- a/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h +++ b/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h @@ -97,10 +97,16 @@ #define LOCAL_TIMER_VECTOR 0xec +/* + * Posted interrupt notification vector for all device MSIs delivered to + * the host kernel. + */ +#define POSTED_MSI_NOTIFICATION_VECTOR 0xeb + #define NR_VECTORS 256 #ifdef CONFIG_X86_LOCAL_APIC -#define FIRST_SYSTEM_VECTOR LOCAL_TIMER_VECTOR +#define FIRST_SYSTEM_VECTOR POSTED_MSI_NOTIFICATION_VECTOR #else #define FIRST_SYSTEM_VECTOR NR_VECTORS #endif From 947051e361d551e0590777080ffc4926190f62f2 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 24 May 2024 15:19:54 +0100 Subject: [PATCH 0159/1578] KVM: arm64: Fix AArch32 register narrowing on userspace write When userspace writes to one of the core registers, we make sure to narrow the corresponding GPRs if PSTATE indicates an AArch32 context. The code tries to check whether the context is EL0 or EL1 so that it narrows the correct registers. But it does so by checking the full PSTATE instead of PSTATE.M. As a consequence, and if we are restoring an AArch32 EL0 context in a 64bit guest, and that PSTATE has *any* bit set outside of PSTATE.M, we narrow *all* registers instead of only the first 15, destroying the 64bit state. Obviously, this is not something the guest is likely to enjoy. Correctly masking PSTATE to only evaluate PSTATE.M fixes it. Fixes: 90c1f934ed71 ("KVM: arm64: Get rid of the AArch32 register mapping code") Reported-by: Nina Schoetterl-Glausch Cc: stable@vger.kernel.org Reviewed-by: Nina Schoetterl-Glausch Acked-by: Oliver Upton Link: https://lore.kernel.org/r/20240524141956.1450304-2-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/guest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index e2f762d959bb33..d9617b11f7a87d 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -276,7 +276,7 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) if (*vcpu_cpsr(vcpu) & PSR_MODE32_BIT) { int i, nr_reg; - switch (*vcpu_cpsr(vcpu)) { + switch (*vcpu_cpsr(vcpu) & PSR_AA32_MODE_MASK) { /* * Either we are dealing with user mode, and only the * first 15 registers (+ PC) must be narrowed to 32bit. From dfe6d190f38fc5df5ff2614b463a5195a399c885 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 24 May 2024 15:19:55 +0100 Subject: [PATCH 0160/1578] KVM: arm64: Allow AArch32 PSTATE.M to be restored as System mode It appears that we don't allow a vcpu to be restored in AArch32 System mode, as we *never* included it in the list of valid modes. Just add it to the list of allowed modes. Fixes: 0d854a60b1d7 ("arm64: KVM: enable initialization of a 32bit vcpu") Cc: stable@vger.kernel.org Acked-by: Oliver Upton Link: https://lore.kernel.org/r/20240524141956.1450304-3-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/guest.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index d9617b11f7a87d..11098eb7eb44af 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -251,6 +251,7 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) case PSR_AA32_MODE_SVC: case PSR_AA32_MODE_ABT: case PSR_AA32_MODE_UND: + case PSR_AA32_MODE_SYS: if (!vcpu_el1_is_32bit(vcpu)) return -EINVAL; break; From c92e8b9eacebb4060634ebd9395bba1b29aadc68 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 24 May 2024 15:19:56 +0100 Subject: [PATCH 0161/1578] KVM: arm64: AArch32: Fix spurious trapping of conditional instructions We recently upgraded the view of ESR_EL2 to 64bit, in keeping with the requirements of the architecture. However, the AArch32 emulation code was left unaudited, and the (already dodgy) code that triages whether a trap is spurious or not (because the condition code failed) broke in a subtle way: If ESR_EL2.ISS2 is ever non-zero (unlikely, but hey, this is the ARM architecture we're talking about), the hack that tests the top bits of ESR_EL2.EC will break in an interesting way. Instead, use kvm_vcpu_trap_get_class() to obtain the EC, and list all the possible ECs that can fail a condition code check. While we're at it, add SMC32 to the list, as it is explicitly listed as being allowed to trap despite failing a condition code check (as described in the HCR_EL2.TSC documentation). Fixes: 0b12620fddb8 ("KVM: arm64: Treat ESR_EL2 as a 64-bit register") Cc: stable@vger.kernel.org Acked-by: Oliver Upton Link: https://lore.kernel.org/r/20240524141956.1450304-4-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/aarch32.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/aarch32.c b/arch/arm64/kvm/hyp/aarch32.c index 8d9670e6615dc8..449fa58cf3b63b 100644 --- a/arch/arm64/kvm/hyp/aarch32.c +++ b/arch/arm64/kvm/hyp/aarch32.c @@ -50,9 +50,23 @@ bool kvm_condition_valid32(const struct kvm_vcpu *vcpu) u32 cpsr_cond; int cond; - /* Top two bits non-zero? Unconditional. */ - if (kvm_vcpu_get_esr(vcpu) >> 30) + /* + * These are the exception classes that could fire with a + * conditional instruction. + */ + switch (kvm_vcpu_trap_get_class(vcpu)) { + case ESR_ELx_EC_CP15_32: + case ESR_ELx_EC_CP15_64: + case ESR_ELx_EC_CP14_MR: + case ESR_ELx_EC_CP14_LS: + case ESR_ELx_EC_FP_ASIMD: + case ESR_ELx_EC_CP10_ID: + case ESR_ELx_EC_CP14_64: + case ESR_ELx_EC_SVC32: + break; + default: return true; + } /* Is condition field valid? */ cond = kvm_vcpu_get_condition(vcpu); From 98e948fb60d41447fd8d2d0c3b8637fc6b6dc26d Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Mon, 27 May 2024 13:20:07 +0200 Subject: [PATCH 0162/1578] bpf: Allow delete from sockmap/sockhash only if update is allowed We have seen an influx of syzkaller reports where a BPF program attached to a tracepoint triggers a locking rule violation by performing a map_delete on a sockmap/sockhash. We don't intend to support this artificial use scenario. Extend the existing verifier allowed-program-type check for updating sockmap/sockhash to also cover deleting from a map. From now on only BPF programs which were previously allowed to update sockmap/sockhash can delete from these map types. Fixes: ff9105993240 ("bpf, sockmap: Prevent lock inversion deadlock in map delete elem") Reported-by: Tetsuo Handa Reported-by: syzbot+ec941d6e24f633a59172@syzkaller.appspotmail.com Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Tested-by: syzbot+ec941d6e24f633a59172@syzkaller.appspotmail.com Acked-by: John Fastabend Closes: https://syzkaller.appspot.com/bug?extid=ec941d6e24f633a59172 Link: https://lore.kernel.org/bpf/20240527-sockmap-verify-deletes-v1-1-944b372f2101@cloudflare.com --- kernel/bpf/verifier.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 77da1f438becce..48f3a9acdef3d4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8882,7 +8882,8 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id) enum bpf_attach_type eatype = env->prog->expected_attach_type; enum bpf_prog_type type = resolve_prog_type(env->prog); - if (func_id != BPF_FUNC_map_update_elem) + if (func_id != BPF_FUNC_map_update_elem && + func_id != BPF_FUNC_map_delete_elem) return false; /* It's not possible to get access to a locked struct sock in these @@ -8893,6 +8894,11 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id) if (eatype == BPF_TRACE_ITER) return true; break; + case BPF_PROG_TYPE_SOCK_OPS: + /* map_update allowed only via dedicated helpers with event type checks */ + if (func_id == BPF_FUNC_map_delete_elem) + return true; + break; case BPF_PROG_TYPE_SOCKET_FILTER: case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: @@ -8988,7 +8994,6 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_SOCKMAP: if (func_id != BPF_FUNC_sk_redirect_map && func_id != BPF_FUNC_sock_map_update && - func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_msg_redirect_map && func_id != BPF_FUNC_sk_select_reuseport && func_id != BPF_FUNC_map_lookup_elem && @@ -8998,7 +9003,6 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_SOCKHASH: if (func_id != BPF_FUNC_sk_redirect_hash && func_id != BPF_FUNC_sock_hash_update && - func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_msg_redirect_hash && func_id != BPF_FUNC_sk_select_reuseport && func_id != BPF_FUNC_map_lookup_elem && From 3b9ce0491a43e9af7f108b2f1bced7cd35931660 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Mon, 27 May 2024 13:20:08 +0200 Subject: [PATCH 0163/1578] Revert "bpf, sockmap: Prevent lock inversion deadlock in map delete elem" This reverts commit ff91059932401894e6c86341915615c5eb0eca48. This check is no longer needed. BPF programs attached to tracepoints are now rejected by the verifier when they attempt to delete from a sockmap/sockhash maps. Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20240527-sockmap-verify-deletes-v1-2-944b372f2101@cloudflare.com --- net/core/sock_map.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 9402889840bf7e..63c016b4c1696a 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -423,9 +423,6 @@ static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, struct sock *sk; int err = 0; - if (irqs_disabled()) - return -EOPNOTSUPP; /* locks here are hardirq-unsafe */ - spin_lock_bh(&stab->lock); sk = *psk; if (!sk_test || sk_test == sk) @@ -948,9 +945,6 @@ static long sock_hash_delete_elem(struct bpf_map *map, void *key) struct bpf_shtab_elem *elem; int ret = -ENOENT; - if (irqs_disabled()) - return -EOPNOTSUPP; /* locks here are hardirq-unsafe */ - hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); From a63bf556160fb19591183383da6757f52119981d Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Mon, 27 May 2024 13:20:09 +0200 Subject: [PATCH 0164/1578] selftests/bpf: Cover verifier checks for mutating sockmap/sockhash Verifier enforces that only certain program types can mutate sock{map,hash} maps, that is update it or delete from it. Add test coverage for these checks so we don't regress. Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20240527-sockmap-verify-deletes-v1-3-944b372f2101@cloudflare.com --- .../selftests/bpf/prog_tests/verifier.c | 2 + .../bpf/progs/verifier_sockmap_mutate.c | 187 ++++++++++++++++++ 2 files changed, 189 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_sockmap_mutate.c diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index c60db8beeb7342..1c9c4ec1be11ed 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -67,6 +67,7 @@ #include "verifier_search_pruning.skel.h" #include "verifier_sock.skel.h" #include "verifier_sock_addr.skel.h" +#include "verifier_sockmap_mutate.skel.h" #include "verifier_spill_fill.skel.h" #include "verifier_spin_lock.skel.h" #include "verifier_stack_ptr.skel.h" @@ -183,6 +184,7 @@ void test_verifier_sdiv(void) { RUN(verifier_sdiv); } void test_verifier_search_pruning(void) { RUN(verifier_search_pruning); } void test_verifier_sock(void) { RUN(verifier_sock); } void test_verifier_sock_addr(void) { RUN(verifier_sock_addr); } +void test_verifier_sockmap_mutate(void) { RUN(verifier_sockmap_mutate); } void test_verifier_spill_fill(void) { RUN(verifier_spill_fill); } void test_verifier_spin_lock(void) { RUN(verifier_spin_lock); } void test_verifier_stack_ptr(void) { RUN(verifier_stack_ptr); } diff --git a/tools/testing/selftests/bpf/progs/verifier_sockmap_mutate.c b/tools/testing/selftests/bpf/progs/verifier_sockmap_mutate.c new file mode 100644 index 00000000000000..fe4b123187b86e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_sockmap_mutate.c @@ -0,0 +1,187 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include + +#include "bpf_misc.h" + +#define __always_unused __attribute__((unused)) + +char _license[] SEC("license") = "GPL"; + +struct sock { +} __attribute__((preserve_access_index)); + +struct bpf_iter__sockmap { + union { + struct sock *sk; + }; +} __attribute__((preserve_access_index)); + +struct { + __uint(type, BPF_MAP_TYPE_SOCKHASH); + __uint(max_entries, 1); + __type(key, int); + __type(value, int); +} sockhash SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 1); + __type(key, int); + __type(value, int); +} sockmap SEC(".maps"); + +enum { CG_OK = 1 }; + +int zero = 0; + +static __always_inline void test_sockmap_delete(void) +{ + bpf_map_delete_elem(&sockmap, &zero); + bpf_map_delete_elem(&sockhash, &zero); +} + +static __always_inline void test_sockmap_update(void *sk) +{ + if (sk) { + bpf_map_update_elem(&sockmap, &zero, sk, BPF_ANY); + bpf_map_update_elem(&sockhash, &zero, sk, BPF_ANY); + } +} + +static __always_inline void test_sockmap_lookup_and_update(void) +{ + struct bpf_sock *sk = bpf_map_lookup_elem(&sockmap, &zero); + + if (sk) { + test_sockmap_update(sk); + bpf_sk_release(sk); + } +} + +static __always_inline void test_sockmap_mutate(void *sk) +{ + test_sockmap_delete(); + test_sockmap_update(sk); +} + +static __always_inline void test_sockmap_lookup_and_mutate(void) +{ + test_sockmap_delete(); + test_sockmap_lookup_and_update(); +} + +SEC("action") +__success +int test_sched_act(struct __sk_buff *skb) +{ + test_sockmap_mutate(skb->sk); + return 0; +} + +SEC("classifier") +__success +int test_sched_cls(struct __sk_buff *skb) +{ + test_sockmap_mutate(skb->sk); + return 0; +} + +SEC("flow_dissector") +__success +int test_flow_dissector_delete(struct __sk_buff *skb __always_unused) +{ + test_sockmap_delete(); + return 0; +} + +SEC("flow_dissector") +__failure __msg("program of this type cannot use helper bpf_sk_release") +int test_flow_dissector_update(struct __sk_buff *skb __always_unused) +{ + test_sockmap_lookup_and_update(); /* no access to skb->sk */ + return 0; +} + +SEC("iter/sockmap") +__success +int test_trace_iter(struct bpf_iter__sockmap *ctx) +{ + test_sockmap_mutate(ctx->sk); + return 0; +} + +SEC("raw_tp/kfree") +__failure __msg("cannot update sockmap in this context") +int test_raw_tp_delete(const void *ctx __always_unused) +{ + test_sockmap_delete(); + return 0; +} + +SEC("raw_tp/kfree") +__failure __msg("cannot update sockmap in this context") +int test_raw_tp_update(const void *ctx __always_unused) +{ + test_sockmap_lookup_and_update(); + return 0; +} + +SEC("sk_lookup") +__success +int test_sk_lookup(struct bpf_sk_lookup *ctx) +{ + test_sockmap_mutate(ctx->sk); + return 0; +} + +SEC("sk_reuseport") +__success +int test_sk_reuseport(struct sk_reuseport_md *ctx) +{ + test_sockmap_mutate(ctx->sk); + return 0; +} + +SEC("socket") +__success +int test_socket_filter(struct __sk_buff *skb) +{ + test_sockmap_mutate(skb->sk); + return 0; +} + +SEC("sockops") +__success +int test_sockops_delete(struct bpf_sock_ops *ctx __always_unused) +{ + test_sockmap_delete(); + return CG_OK; +} + +SEC("sockops") +__failure __msg("cannot update sockmap in this context") +int test_sockops_update(struct bpf_sock_ops *ctx) +{ + test_sockmap_update(ctx->sk); + return CG_OK; +} + +SEC("sockops") +__success +int test_sockops_update_dedicated(struct bpf_sock_ops *ctx) +{ + bpf_sock_map_update(ctx, &sockmap, &zero, BPF_ANY); + bpf_sock_hash_update(ctx, &sockhash, &zero, BPF_ANY); + return CG_OK; +} + +SEC("xdp") +__success +int test_xdp(struct xdp_md *ctx __always_unused) +{ + test_sockmap_lookup_and_mutate(); + return XDP_PASS; +} From d9ff882b54f99f96787fa3df7cd938966843c418 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 27 May 2024 13:34:45 +0900 Subject: [PATCH 0165/1578] null_blk: Fix return value of nullb_device_power_store() When powering on a null_blk device that is not already on, the return value ret that is initialized to be count is reused to check the return value of null_add_dev(), leading to nullb_device_power_store() to return null_add_dev() return value (0 on success) instead of "count". So make sure to set ret to be equal to count when there are no errors. Fixes: a2db328b0839 ("null_blk: fix null-ptr-dereference while configuring 'power' and 'submit_queues'") Signed-off-by: Damien Le Moal Reviewed-by: Yu Kuai Reviewed-by: Kanchan Joshi Link: https://lore.kernel.org/r/20240527043445.235267-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index eb023d26736934..631dca2e4e8442 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -494,6 +494,7 @@ static ssize_t nullb_device_power_store(struct config_item *item, set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); dev->power = newp; + ret = count; } else if (dev->power && !newp) { if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { dev->power = newp; From 30a0e3135f9aa14ba745f767375183b3112d7440 Mon Sep 17 00:00:00 2001 From: hexue Date: Mon, 27 May 2024 16:45:33 +0800 Subject: [PATCH 0166/1578] block: delete redundant function declaration blk_stats_alloc_enable was used for block hybrid poll, the related function definition was removed by patch: commit 54bdd67d0f88 ("blk-mq: remove hybrid polling") but the function declaration was not deleted. Signed-off-by: hexue Link: https://lore.kernel.org/r/20240527084533.1485210-1-xue01.he@samsung.com Signed-off-by: Jens Axboe --- block/blk-stat.h | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk-stat.h b/block/blk-stat.h index 17e1eb4ec7e25d..5d7f18ba436d00 100644 --- a/block/blk-stat.h +++ b/block/blk-stat.h @@ -64,7 +64,6 @@ struct blk_stat_callback { struct blk_queue_stats *blk_alloc_queue_stats(void); void blk_free_queue_stats(struct blk_queue_stats *); -bool blk_stats_alloc_enable(struct request_queue *q); void blk_stat_add(struct request *rq, u64 now); From e30a942861b540e056425a8e31ba801de1ed4f25 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 27 May 2024 14:44:11 -0500 Subject: [PATCH 0167/1578] ASoC: SOF: stream-ipc: remove unnecessary MODULE_LICENSE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This file is part of the snd-sof module, there's no reason to re-add the MODULE_LICENSE here. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Bard Liao Reviewed-by: Daniel Baluta Reviewed-by: Péter Ujfalusi Link: https://msgid.link/r/20240527194414.166156-2-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/stream-ipc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/sound/soc/sof/stream-ipc.c b/sound/soc/sof/stream-ipc.c index eb71303aa24c6a..794c7bbccbaf92 100644 --- a/sound/soc/sof/stream-ipc.c +++ b/sound/soc/sof/stream-ipc.c @@ -125,5 +125,3 @@ int sof_stream_pcm_close(struct snd_sof_dev *sdev, return 0; } EXPORT_SYMBOL(sof_stream_pcm_close); - -MODULE_LICENSE("Dual BSD/GPL"); From b88056df4fcb7b5930d6ee3fef494e8729dcf2b2 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 27 May 2024 14:44:12 -0500 Subject: [PATCH 0168/1578] ASoC: SOF: AMD: group all module related information MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The module information is spread across files, group in a single location. For maintenability and alignment, the arbitrary Intel convention is used with the following order: MODULE_LICENSE MODULE_DESCRIPTION MODULE_IMPORT Signed-off-by: Pierre-Louis Bossart Reviewed-by: Bard Liao Reviewed-by: Daniel Baluta Reviewed-by: Péter Ujfalusi Link: https://msgid.link/r/20240527194414.166156-3-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/amd/acp63.c | 4 ---- sound/soc/sof/amd/pci-acp63.c | 1 + sound/soc/sof/amd/pci-rmb.c | 1 + sound/soc/sof/amd/pci-rn.c | 1 + sound/soc/sof/amd/pci-vangogh.c | 1 + sound/soc/sof/amd/rembrandt.c | 4 ---- sound/soc/sof/amd/renoir.c | 4 ---- sound/soc/sof/amd/vangogh.c | 4 ---- 8 files changed, 4 insertions(+), 16 deletions(-) diff --git a/sound/soc/sof/amd/acp63.c b/sound/soc/sof/amd/acp63.c index 9fb645079c3ad3..9e6eb4bfc805b2 100644 --- a/sound/soc/sof/amd/acp63.c +++ b/sound/soc/sof/amd/acp63.c @@ -140,7 +140,3 @@ int sof_acp63_ops_init(struct snd_sof_dev *sdev) return 0; } - -MODULE_IMPORT_NS(SND_SOC_SOF_AMD_COMMON); -MODULE_DESCRIPTION("ACP63 SOF Driver"); -MODULE_LICENSE("Dual BSD/GPL"); diff --git a/sound/soc/sof/amd/pci-acp63.c b/sound/soc/sof/amd/pci-acp63.c index eeaa12cceb2331..fc898444736571 100644 --- a/sound/soc/sof/amd/pci-acp63.c +++ b/sound/soc/sof/amd/pci-acp63.c @@ -109,5 +109,6 @@ static struct pci_driver snd_sof_pci_amd_acp63_driver = { module_pci_driver(snd_sof_pci_amd_acp63_driver); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("ACP63 SOF Driver"); MODULE_IMPORT_NS(SND_SOC_SOF_AMD_COMMON); MODULE_IMPORT_NS(SND_SOC_SOF_PCI_DEV); diff --git a/sound/soc/sof/amd/pci-rmb.c b/sound/soc/sof/amd/pci-rmb.c index 2f288545c42605..4bc30951f8b0d7 100644 --- a/sound/soc/sof/amd/pci-rmb.c +++ b/sound/soc/sof/amd/pci-rmb.c @@ -99,5 +99,6 @@ static struct pci_driver snd_sof_pci_amd_rmb_driver = { module_pci_driver(snd_sof_pci_amd_rmb_driver); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("REMBRANDT SOF Driver"); MODULE_IMPORT_NS(SND_SOC_SOF_AMD_COMMON); MODULE_IMPORT_NS(SND_SOC_SOF_PCI_DEV); diff --git a/sound/soc/sof/amd/pci-rn.c b/sound/soc/sof/amd/pci-rn.c index a0195e9b400c89..e08875bdfa8b16 100644 --- a/sound/soc/sof/amd/pci-rn.c +++ b/sound/soc/sof/amd/pci-rn.c @@ -103,5 +103,6 @@ static struct pci_driver snd_sof_pci_amd_rn_driver = { module_pci_driver(snd_sof_pci_amd_rn_driver); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("RENOIR SOF Driver"); MODULE_IMPORT_NS(SND_SOC_SOF_AMD_COMMON); MODULE_IMPORT_NS(SND_SOC_SOF_PCI_DEV); diff --git a/sound/soc/sof/amd/pci-vangogh.c b/sound/soc/sof/amd/pci-vangogh.c index 5cd3ac84752fb8..16eb2994fbab90 100644 --- a/sound/soc/sof/amd/pci-vangogh.c +++ b/sound/soc/sof/amd/pci-vangogh.c @@ -101,5 +101,6 @@ static struct pci_driver snd_sof_pci_amd_vgh_driver = { module_pci_driver(snd_sof_pci_amd_vgh_driver); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("VANGOGH SOF Driver"); MODULE_IMPORT_NS(SND_SOC_SOF_AMD_COMMON); MODULE_IMPORT_NS(SND_SOC_SOF_PCI_DEV); diff --git a/sound/soc/sof/amd/rembrandt.c b/sound/soc/sof/amd/rembrandt.c index f1d1ba57ab3a0d..076f2f05a95c20 100644 --- a/sound/soc/sof/amd/rembrandt.c +++ b/sound/soc/sof/amd/rembrandt.c @@ -140,7 +140,3 @@ int sof_rembrandt_ops_init(struct snd_sof_dev *sdev) return 0; } - -MODULE_IMPORT_NS(SND_SOC_SOF_AMD_COMMON); -MODULE_DESCRIPTION("REMBRANDT SOF Driver"); -MODULE_LICENSE("Dual BSD/GPL"); diff --git a/sound/soc/sof/amd/renoir.c b/sound/soc/sof/amd/renoir.c index 47b863f6258c6d..aa2d24dac6f5d5 100644 --- a/sound/soc/sof/amd/renoir.c +++ b/sound/soc/sof/amd/renoir.c @@ -115,7 +115,3 @@ int sof_renoir_ops_init(struct snd_sof_dev *sdev) return 0; } - -MODULE_IMPORT_NS(SND_SOC_SOF_AMD_COMMON); -MODULE_DESCRIPTION("RENOIR SOF Driver"); -MODULE_LICENSE("Dual BSD/GPL"); diff --git a/sound/soc/sof/amd/vangogh.c b/sound/soc/sof/amd/vangogh.c index bc6ffdb5471a58..61372958c09dc8 100644 --- a/sound/soc/sof/amd/vangogh.c +++ b/sound/soc/sof/amd/vangogh.c @@ -161,7 +161,3 @@ int sof_vangogh_ops_init(struct snd_sof_dev *sdev) return 0; } - -MODULE_IMPORT_NS(SND_SOC_SOF_AMD_COMMON); -MODULE_DESCRIPTION("VANGOGH SOF Driver"); -MODULE_LICENSE("Dual BSD/GPL"); From 06a2315da0b02db4f2115bc9253daa270571e389 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 27 May 2024 14:44:13 -0500 Subject: [PATCH 0169/1578] ASoC: SOF: reorder MODULE_ definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow the arbitrary Intel convention order to allow for easier grep. MODULE_LICENSE MODULE_DESCRIPTION MODULE_IMPORT Signed-off-by: Pierre-Louis Bossart Reviewed-by: Bard Liao Reviewed-by: Daniel Baluta Reviewed-by: Péter Ujfalusi Link: https://msgid.link/r/20240527194414.166156-4-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/amd/acp-common.c | 4 ++-- sound/soc/sof/amd/acp.c | 2 +- sound/soc/sof/core.c | 2 +- sound/soc/sof/nocodec.c | 2 +- sound/soc/sof/sof-client-ipc-flood-test.c | 2 +- sound/soc/sof/sof-client-ipc-kernel-injector.c | 2 +- sound/soc/sof/sof-client-ipc-msg-injector.c | 2 +- sound/soc/sof/sof-client-probes.c | 2 +- sound/soc/sof/xtensa/core.c | 2 +- 9 files changed, 10 insertions(+), 10 deletions(-) diff --git a/sound/soc/sof/amd/acp-common.c b/sound/soc/sof/amd/acp-common.c index b26fa471b431ff..81bb93e9835864 100644 --- a/sound/soc/sof/amd/acp-common.c +++ b/sound/soc/sof/amd/acp-common.c @@ -258,8 +258,8 @@ const struct snd_sof_dsp_ops sof_acp_common_ops = { }; EXPORT_SYMBOL_NS(sof_acp_common_ops, SND_SOC_SOF_AMD_COMMON); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("ACP SOF COMMON Driver"); MODULE_IMPORT_NS(SND_SOC_SOF_AMD_COMMON); MODULE_IMPORT_NS(SND_SOC_SOF_XTENSA); MODULE_IMPORT_NS(SOUNDWIRE_AMD_INIT); -MODULE_DESCRIPTION("ACP SOF COMMON Driver"); -MODULE_LICENSE("Dual BSD/GPL"); diff --git a/sound/soc/sof/amd/acp.c b/sound/soc/sof/amd/acp.c index c12c7f82052947..74fd5f2b148b85 100644 --- a/sound/soc/sof/amd/acp.c +++ b/sound/soc/sof/amd/acp.c @@ -801,7 +801,7 @@ void amd_sof_acp_remove(struct snd_sof_dev *sdev) } EXPORT_SYMBOL_NS(amd_sof_acp_remove, SND_SOC_SOF_AMD_COMMON); +MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("AMD ACP sof driver"); MODULE_IMPORT_NS(SOUNDWIRE_AMD_INIT); MODULE_IMPORT_NS(SND_AMD_SOUNDWIRE_ACPI); -MODULE_LICENSE("Dual BSD/GPL"); diff --git a/sound/soc/sof/core.c b/sound/soc/sof/core.c index 0a4917136ff971..83fe0401baf867 100644 --- a/sound/soc/sof/core.c +++ b/sound/soc/sof/core.c @@ -769,7 +769,7 @@ void sof_machine_unregister(struct snd_sof_dev *sdev, void *pdata) EXPORT_SYMBOL(sof_machine_unregister); MODULE_AUTHOR("Liam Girdwood"); -MODULE_DESCRIPTION("Sound Open Firmware (SOF) Core"); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("Sound Open Firmware (SOF) Core"); MODULE_ALIAS("platform:sof-audio"); MODULE_IMPORT_NS(SND_SOC_SOF_CLIENT); diff --git a/sound/soc/sof/nocodec.c b/sound/soc/sof/nocodec.c index fdcbe33d3dcfb8..b12b3d865ae30c 100644 --- a/sound/soc/sof/nocodec.c +++ b/sound/soc/sof/nocodec.c @@ -110,7 +110,7 @@ static struct platform_driver sof_nocodec_audio = { }; module_platform_driver(sof_nocodec_audio) +MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("ASoC sof nocodec"); MODULE_AUTHOR("Liam Girdwood"); -MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS("platform:sof-nocodec"); diff --git a/sound/soc/sof/sof-client-ipc-flood-test.c b/sound/soc/sof/sof-client-ipc-flood-test.c index 43561492609252..e7d2001140e81a 100644 --- a/sound/soc/sof/sof-client-ipc-flood-test.c +++ b/sound/soc/sof/sof-client-ipc-flood-test.c @@ -394,6 +394,6 @@ static struct auxiliary_driver sof_ipc_flood_client_drv = { module_auxiliary_driver(sof_ipc_flood_client_drv); -MODULE_DESCRIPTION("SOF IPC Flood Test Client Driver"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SOF IPC Flood Test Client Driver"); MODULE_IMPORT_NS(SND_SOC_SOF_CLIENT); diff --git a/sound/soc/sof/sof-client-ipc-kernel-injector.c b/sound/soc/sof/sof-client-ipc-kernel-injector.c index 6973b6690df4b8..d3f541069b24b8 100644 --- a/sound/soc/sof/sof-client-ipc-kernel-injector.c +++ b/sound/soc/sof/sof-client-ipc-kernel-injector.c @@ -157,6 +157,6 @@ static struct auxiliary_driver sof_msg_inject_client_drv = { module_auxiliary_driver(sof_msg_inject_client_drv); -MODULE_DESCRIPTION("SOF IPC Kernel Injector Client Driver"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SOF IPC Kernel Injector Client Driver"); MODULE_IMPORT_NS(SND_SOC_SOF_CLIENT); diff --git a/sound/soc/sof/sof-client-ipc-msg-injector.c b/sound/soc/sof/sof-client-ipc-msg-injector.c index af22e6421029c6..d0f8beb9d0008d 100644 --- a/sound/soc/sof/sof-client-ipc-msg-injector.c +++ b/sound/soc/sof/sof-client-ipc-msg-injector.c @@ -335,6 +335,6 @@ static struct auxiliary_driver sof_msg_inject_client_drv = { module_auxiliary_driver(sof_msg_inject_client_drv); -MODULE_DESCRIPTION("SOF IPC Message Injector Client Driver"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SOF IPC Message Injector Client Driver"); MODULE_IMPORT_NS(SND_SOC_SOF_CLIENT); diff --git a/sound/soc/sof/sof-client-probes.c b/sound/soc/sof/sof-client-probes.c index b8f297307565cc..ccc7d38ddc3834 100644 --- a/sound/soc/sof/sof-client-probes.c +++ b/sound/soc/sof/sof-client-probes.c @@ -540,6 +540,6 @@ static struct auxiliary_driver sof_probes_client_drv = { module_auxiliary_driver(sof_probes_client_drv); -MODULE_DESCRIPTION("SOF Probes Client Driver"); MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("SOF Probes Client Driver"); MODULE_IMPORT_NS(SND_SOC_SOF_CLIENT); diff --git a/sound/soc/sof/xtensa/core.c b/sound/soc/sof/xtensa/core.c index ccbc3fcdadd5ec..3cf8c84beff957 100644 --- a/sound/soc/sof/xtensa/core.c +++ b/sound/soc/sof/xtensa/core.c @@ -151,5 +151,5 @@ const struct dsp_arch_ops sof_xtensa_arch_ops = { }; EXPORT_SYMBOL_NS(sof_xtensa_arch_ops, SND_SOC_SOF_XTENSA); -MODULE_DESCRIPTION("SOF Xtensa DSP support"); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("SOF Xtensa DSP support"); From 3ff78451b8e446e9a548b98a0d4dd8d24dc5780b Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 27 May 2024 14:44:14 -0500 Subject: [PATCH 0170/1578] ASoC: SOF: add missing MODULE_DESCRIPTION() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MODULE_DESCRIPTION() was optional until it became mandatory and flagged as an error by 'make W=1'. Reported-by: Andy Shevchenko Signed-off-by: Pierre-Louis Bossart Reviewed-by: Bard Liao Reviewed-by: Daniel Baluta Reviewed-by: Péter Ujfalusi Link: https://msgid.link/r/20240527194414.166156-5-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/imx/imx-common.c | 1 + sound/soc/sof/imx/imx8.c | 3 ++- sound/soc/sof/imx/imx8m.c | 3 ++- sound/soc/sof/imx/imx8ulp.c | 3 ++- sound/soc/sof/intel/atom.c | 1 + sound/soc/sof/intel/bdw.c | 1 + sound/soc/sof/intel/byt.c | 1 + sound/soc/sof/intel/hda-codec.c | 1 + sound/soc/sof/intel/hda-ctrl.c | 1 + sound/soc/sof/intel/hda-mlink.c | 1 + sound/soc/sof/intel/hda.c | 1 + sound/soc/sof/intel/pci-apl.c | 1 + sound/soc/sof/intel/pci-cnl.c | 1 + sound/soc/sof/intel/pci-icl.c | 1 + sound/soc/sof/intel/pci-lnl.c | 1 + sound/soc/sof/intel/pci-mtl.c | 1 + sound/soc/sof/intel/pci-skl.c | 1 + sound/soc/sof/intel/pci-tgl.c | 1 + sound/soc/sof/intel/pci-tng.c | 1 + sound/soc/sof/mediatek/mt8186/mt8186.c | 3 ++- sound/soc/sof/mediatek/mt8195/mt8195.c | 3 ++- sound/soc/sof/mediatek/mtk-adsp-common.c | 1 + sound/soc/sof/sof-acpi-dev.c | 1 + sound/soc/sof/sof-of-dev.c | 1 + sound/soc/sof/sof-pci-dev.c | 1 + sound/soc/sof/sof-utils.c | 1 + 26 files changed, 31 insertions(+), 5 deletions(-) diff --git a/sound/soc/sof/imx/imx-common.c b/sound/soc/sof/imx/imx-common.c index 2981aea123d976..fce6d9cf6a6be8 100644 --- a/sound/soc/sof/imx/imx-common.c +++ b/sound/soc/sof/imx/imx-common.c @@ -75,3 +75,4 @@ void imx8_dump(struct snd_sof_dev *sdev, u32 flags) EXPORT_SYMBOL(imx8_dump); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("SOF helpers for IMX platforms"); diff --git a/sound/soc/sof/imx/imx8.c b/sound/soc/sof/imx/imx8.c index 3021dc87ab5a23..9f24e3c283dd41 100644 --- a/sound/soc/sof/imx/imx8.c +++ b/sound/soc/sof/imx/imx8.c @@ -667,5 +667,6 @@ static struct platform_driver snd_sof_of_imx8_driver = { }; module_platform_driver(snd_sof_of_imx8_driver); -MODULE_IMPORT_NS(SND_SOC_SOF_XTENSA); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("SOF support for IMX8 platforms"); +MODULE_IMPORT_NS(SND_SOC_SOF_XTENSA); diff --git a/sound/soc/sof/imx/imx8m.c b/sound/soc/sof/imx/imx8m.c index 4ed415f0434544..1c7019c3cbd38c 100644 --- a/sound/soc/sof/imx/imx8m.c +++ b/sound/soc/sof/imx/imx8m.c @@ -514,5 +514,6 @@ static struct platform_driver snd_sof_of_imx8m_driver = { }; module_platform_driver(snd_sof_of_imx8m_driver); -MODULE_IMPORT_NS(SND_SOC_SOF_XTENSA); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("SOF support for IMX8M platforms"); +MODULE_IMPORT_NS(SND_SOC_SOF_XTENSA); diff --git a/sound/soc/sof/imx/imx8ulp.c b/sound/soc/sof/imx/imx8ulp.c index 8adfdd00413aa1..2585b1beef23f8 100644 --- a/sound/soc/sof/imx/imx8ulp.c +++ b/sound/soc/sof/imx/imx8ulp.c @@ -516,5 +516,6 @@ static struct platform_driver snd_sof_of_imx8ulp_driver = { }; module_platform_driver(snd_sof_of_imx8ulp_driver); -MODULE_IMPORT_NS(SND_SOC_SOF_XTENSA); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("SOF support for IMX8ULP platforms"); +MODULE_IMPORT_NS(SND_SOC_SOF_XTENSA); diff --git a/sound/soc/sof/intel/atom.c b/sound/soc/sof/intel/atom.c index 86af4e9a716ef7..3505ac3a1b143f 100644 --- a/sound/soc/sof/intel/atom.c +++ b/sound/soc/sof/intel/atom.c @@ -418,3 +418,4 @@ void atom_set_mach_params(struct snd_soc_acpi_mach *mach, EXPORT_SYMBOL_NS(atom_set_mach_params, SND_SOC_SOF_INTEL_ATOM_HIFI_EP); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("SOF support for Atom platforms"); diff --git a/sound/soc/sof/intel/bdw.c b/sound/soc/sof/intel/bdw.c index 3262286a9a9d74..7f18080e4e1918 100644 --- a/sound/soc/sof/intel/bdw.c +++ b/sound/soc/sof/intel/bdw.c @@ -694,6 +694,7 @@ static struct platform_driver snd_sof_acpi_intel_bdw_driver = { module_platform_driver(snd_sof_acpi_intel_bdw_driver); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("SOF support for Broadwell platforms"); MODULE_IMPORT_NS(SND_SOC_SOF_INTEL_HIFI_EP_IPC); MODULE_IMPORT_NS(SND_SOC_SOF_XTENSA); MODULE_IMPORT_NS(SND_SOC_SOF_ACPI_DEV); diff --git a/sound/soc/sof/intel/byt.c b/sound/soc/sof/intel/byt.c index d78d11d4cfbf6c..7a57e162fb1c2b 100644 --- a/sound/soc/sof/intel/byt.c +++ b/sound/soc/sof/intel/byt.c @@ -475,6 +475,7 @@ static struct platform_driver snd_sof_acpi_intel_byt_driver = { module_platform_driver(snd_sof_acpi_intel_byt_driver); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("SOF support for Baytrail/Cherrytrail"); MODULE_IMPORT_NS(SND_SOC_SOF_INTEL_HIFI_EP_IPC); MODULE_IMPORT_NS(SND_SOC_SOF_XTENSA); MODULE_IMPORT_NS(SND_SOC_SOF_ACPI_DEV); diff --git a/sound/soc/sof/intel/hda-codec.c b/sound/soc/sof/intel/hda-codec.c index da3db3ed379ecb..dc46888faa0dc9 100644 --- a/sound/soc/sof/intel/hda-codec.c +++ b/sound/soc/sof/intel/hda-codec.c @@ -457,3 +457,4 @@ EXPORT_SYMBOL_NS_GPL(hda_codec_i915_exit, SND_SOC_SOF_HDA_AUDIO_CODEC_I915); #endif MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("SOF support for HDaudio codecs"); diff --git a/sound/soc/sof/intel/hda-ctrl.c b/sound/soc/sof/intel/hda-ctrl.c index 262b482dc0a800..b9a02750ce6131 100644 --- a/sound/soc/sof/intel/hda-ctrl.c +++ b/sound/soc/sof/intel/hda-ctrl.c @@ -328,6 +328,7 @@ void hda_dsp_ctrl_stop_chip(struct snd_sof_dev *sdev) } MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("SOF helpers for HDaudio platforms"); MODULE_IMPORT_NS(SND_SOC_SOF_HDA_MLINK); MODULE_IMPORT_NS(SND_SOC_SOF_HDA_AUDIO_CODEC); MODULE_IMPORT_NS(SND_SOC_SOF_HDA_AUDIO_CODEC_I915); diff --git a/sound/soc/sof/intel/hda-mlink.c b/sound/soc/sof/intel/hda-mlink.c index 04bbc5c9904ce6..9a3559c78b6279 100644 --- a/sound/soc/sof/intel/hda-mlink.c +++ b/sound/soc/sof/intel/hda-mlink.c @@ -972,3 +972,4 @@ EXPORT_SYMBOL_NS(hdac_bus_eml_enable_offload, SND_SOC_SOF_HDA_MLINK); #endif MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("SOF support for HDaudio multi-link"); diff --git a/sound/soc/sof/intel/hda.c b/sound/soc/sof/intel/hda.c index e6a38de0a0aa10..dead1c19558bb0 100644 --- a/sound/soc/sof/intel/hda.c +++ b/sound/soc/sof/intel/hda.c @@ -1522,6 +1522,7 @@ void hda_unregister_clients(struct snd_sof_dev *sdev) } MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("SOF support for HDaudio platforms"); MODULE_IMPORT_NS(SND_SOC_SOF_PCI_DEV); MODULE_IMPORT_NS(SND_SOC_SOF_HDA_AUDIO_CODEC); MODULE_IMPORT_NS(SND_SOC_SOF_HDA_AUDIO_CODEC_I915); diff --git a/sound/soc/sof/intel/pci-apl.c b/sound/soc/sof/intel/pci-apl.c index df6d897da290b6..f006dcf5458aee 100644 --- a/sound/soc/sof/intel/pci-apl.c +++ b/sound/soc/sof/intel/pci-apl.c @@ -105,6 +105,7 @@ static struct pci_driver snd_sof_pci_intel_apl_driver = { module_pci_driver(snd_sof_pci_intel_apl_driver); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("SOF support for ApolloLake platforms"); MODULE_IMPORT_NS(SND_SOC_SOF_INTEL_HDA_GENERIC); MODULE_IMPORT_NS(SND_SOC_SOF_INTEL_HDA_COMMON); MODULE_IMPORT_NS(SND_SOC_SOF_PCI_DEV); diff --git a/sound/soc/sof/intel/pci-cnl.c b/sound/soc/sof/intel/pci-cnl.c index a39fa3657d5556..a8406342f08bdd 100644 --- a/sound/soc/sof/intel/pci-cnl.c +++ b/sound/soc/sof/intel/pci-cnl.c @@ -143,6 +143,7 @@ static struct pci_driver snd_sof_pci_intel_cnl_driver = { module_pci_driver(snd_sof_pci_intel_cnl_driver); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("SOF support for CannonLake platforms"); MODULE_IMPORT_NS(SND_SOC_SOF_INTEL_HDA_GENERIC); MODULE_IMPORT_NS(SND_SOC_SOF_INTEL_HDA_COMMON); MODULE_IMPORT_NS(SND_SOC_SOF_PCI_DEV); diff --git a/sound/soc/sof/intel/pci-icl.c b/sound/soc/sof/intel/pci-icl.c index 9f1fe47475fb17..25effca50d9fed 100644 --- a/sound/soc/sof/intel/pci-icl.c +++ b/sound/soc/sof/intel/pci-icl.c @@ -108,6 +108,7 @@ static struct pci_driver snd_sof_pci_intel_icl_driver = { module_pci_driver(snd_sof_pci_intel_icl_driver); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("SOF support for IceLake platforms"); MODULE_IMPORT_NS(SND_SOC_SOF_INTEL_HDA_GENERIC); MODULE_IMPORT_NS(SND_SOC_SOF_INTEL_HDA_COMMON); MODULE_IMPORT_NS(SND_SOC_SOF_INTEL_CNL); diff --git a/sound/soc/sof/intel/pci-lnl.c b/sound/soc/sof/intel/pci-lnl.c index 68e5c90151b272..602c574064eb5c 100644 --- a/sound/soc/sof/intel/pci-lnl.c +++ b/sound/soc/sof/intel/pci-lnl.c @@ -70,6 +70,7 @@ static struct pci_driver snd_sof_pci_intel_lnl_driver = { module_pci_driver(snd_sof_pci_intel_lnl_driver); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("SOF support for LunarLake platforms"); MODULE_IMPORT_NS(SND_SOC_SOF_INTEL_HDA_GENERIC); MODULE_IMPORT_NS(SND_SOC_SOF_INTEL_HDA_COMMON); MODULE_IMPORT_NS(SND_SOC_SOF_INTEL_MTL); diff --git a/sound/soc/sof/intel/pci-mtl.c b/sound/soc/sof/intel/pci-mtl.c index c685cb8d617123..8cb0333c033ef1 100644 --- a/sound/soc/sof/intel/pci-mtl.c +++ b/sound/soc/sof/intel/pci-mtl.c @@ -133,6 +133,7 @@ static struct pci_driver snd_sof_pci_intel_mtl_driver = { module_pci_driver(snd_sof_pci_intel_mtl_driver); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("SOF support for MeteorLake platforms"); MODULE_IMPORT_NS(SND_SOC_SOF_INTEL_HDA_GENERIC); MODULE_IMPORT_NS(SND_SOC_SOF_INTEL_HDA_COMMON); MODULE_IMPORT_NS(SND_SOC_SOF_PCI_DEV); diff --git a/sound/soc/sof/intel/pci-skl.c b/sound/soc/sof/intel/pci-skl.c index 862da8009543bb..8ca0231d7e4f6e 100644 --- a/sound/soc/sof/intel/pci-skl.c +++ b/sound/soc/sof/intel/pci-skl.c @@ -89,6 +89,7 @@ static struct pci_driver snd_sof_pci_intel_skl_driver = { module_pci_driver(snd_sof_pci_intel_skl_driver); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("SOF support for SkyLake platforms"); MODULE_IMPORT_NS(SND_SOC_SOF_INTEL_HDA_GENERIC); MODULE_IMPORT_NS(SND_SOC_SOF_INTEL_HDA_COMMON); MODULE_IMPORT_NS(SND_SOC_SOF_PCI_DEV); diff --git a/sound/soc/sof/intel/pci-tgl.c b/sound/soc/sof/intel/pci-tgl.c index f73bb47cd79e7b..ebe1a7d1668946 100644 --- a/sound/soc/sof/intel/pci-tgl.c +++ b/sound/soc/sof/intel/pci-tgl.c @@ -317,6 +317,7 @@ static struct pci_driver snd_sof_pci_intel_tgl_driver = { module_pci_driver(snd_sof_pci_intel_tgl_driver); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("SOF support for TigerLake platforms"); MODULE_IMPORT_NS(SND_SOC_SOF_INTEL_HDA_GENERIC); MODULE_IMPORT_NS(SND_SOC_SOF_INTEL_HDA_COMMON); MODULE_IMPORT_NS(SND_SOC_SOF_INTEL_CNL); diff --git a/sound/soc/sof/intel/pci-tng.c b/sound/soc/sof/intel/pci-tng.c index 5c3069588bb771..1375c393827e11 100644 --- a/sound/soc/sof/intel/pci-tng.c +++ b/sound/soc/sof/intel/pci-tng.c @@ -244,6 +244,7 @@ static struct pci_driver snd_sof_pci_intel_tng_driver = { module_pci_driver(snd_sof_pci_intel_tng_driver); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("SOF support for Tangier platforms"); MODULE_IMPORT_NS(SND_SOC_SOF_INTEL_HIFI_EP_IPC); MODULE_IMPORT_NS(SND_SOC_SOF_XTENSA); MODULE_IMPORT_NS(SND_SOC_SOF_PCI_DEV); diff --git a/sound/soc/sof/mediatek/mt8186/mt8186.c b/sound/soc/sof/mediatek/mt8186/mt8186.c index c63e0d2f4b9686..bea1b9d9ca2886 100644 --- a/sound/soc/sof/mediatek/mt8186/mt8186.c +++ b/sound/soc/sof/mediatek/mt8186/mt8186.c @@ -666,6 +666,7 @@ static struct platform_driver snd_sof_of_mt8186_driver = { }; module_platform_driver(snd_sof_of_mt8186_driver); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("SOF support for MT8186/MT8188 platforms"); MODULE_IMPORT_NS(SND_SOC_SOF_XTENSA); MODULE_IMPORT_NS(SND_SOC_SOF_MTK_COMMON); -MODULE_LICENSE("Dual BSD/GPL"); diff --git a/sound/soc/sof/mediatek/mt8195/mt8195.c b/sound/soc/sof/mediatek/mt8195/mt8195.c index fc1c016104aee3..31dc98d1b1d8bd 100644 --- a/sound/soc/sof/mediatek/mt8195/mt8195.c +++ b/sound/soc/sof/mediatek/mt8195/mt8195.c @@ -619,6 +619,7 @@ static struct platform_driver snd_sof_of_mt8195_driver = { }; module_platform_driver(snd_sof_of_mt8195_driver); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("SOF support for MTL 8195 platforms"); MODULE_IMPORT_NS(SND_SOC_SOF_XTENSA); MODULE_IMPORT_NS(SND_SOC_SOF_MTK_COMMON); -MODULE_LICENSE("Dual BSD/GPL"); diff --git a/sound/soc/sof/mediatek/mtk-adsp-common.c b/sound/soc/sof/mediatek/mtk-adsp-common.c index de8dbe27cd0def..20bcf5590eb885 100644 --- a/sound/soc/sof/mediatek/mtk-adsp-common.c +++ b/sound/soc/sof/mediatek/mtk-adsp-common.c @@ -82,3 +82,4 @@ void mtk_adsp_dump(struct snd_sof_dev *sdev, u32 flags) EXPORT_SYMBOL(mtk_adsp_dump); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("SOF helpers for MTK ADSP platforms"); diff --git a/sound/soc/sof/sof-acpi-dev.c b/sound/soc/sof/sof-acpi-dev.c index 2d96d00f1c449a..b196b2b74c2647 100644 --- a/sound/soc/sof/sof-acpi-dev.c +++ b/sound/soc/sof/sof-acpi-dev.c @@ -100,3 +100,4 @@ void sof_acpi_remove(struct platform_device *pdev) EXPORT_SYMBOL_NS(sof_acpi_remove, SND_SOC_SOF_ACPI_DEV); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("SOF support for ACPI platforms"); diff --git a/sound/soc/sof/sof-of-dev.c b/sound/soc/sof/sof-of-dev.c index b9a499e92b9a57..71f7153cf79c69 100644 --- a/sound/soc/sof/sof-of-dev.c +++ b/sound/soc/sof/sof-of-dev.c @@ -93,3 +93,4 @@ void sof_of_shutdown(struct platform_device *pdev) EXPORT_SYMBOL(sof_of_shutdown); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("SOF support for OF/DT platforms"); diff --git a/sound/soc/sof/sof-pci-dev.c b/sound/soc/sof/sof-pci-dev.c index 4365405783e61d..38f2187da5de18 100644 --- a/sound/soc/sof/sof-pci-dev.c +++ b/sound/soc/sof/sof-pci-dev.c @@ -304,3 +304,4 @@ void sof_pci_shutdown(struct pci_dev *pci) EXPORT_SYMBOL_NS(sof_pci_shutdown, SND_SOC_SOF_PCI_DEV); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("SOF support for PCI platforms"); diff --git a/sound/soc/sof/sof-utils.c b/sound/soc/sof/sof-utils.c index cad041bf56ccf6..44608682e9f8f6 100644 --- a/sound/soc/sof/sof-utils.c +++ b/sound/soc/sof/sof-utils.c @@ -73,3 +73,4 @@ int snd_sof_create_page_table(struct device *dev, EXPORT_SYMBOL(snd_sof_create_page_table); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("SOF utils"); From 7b05ab85e28f615e70520d24c075249b4512044e Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 23 May 2024 14:02:57 +0300 Subject: [PATCH 0171/1578] ipv4: Fix address dump when IPv4 is disabled on an interface Cited commit started returning an error when user space requests to dump the interface's IPv4 addresses and IPv4 is disabled on the interface. Restore the previous behavior and do not return an error. Before cited commit: # ip address show dev dummy1 10: dummy1: mtu 1500 qdisc noqueue state UNKNOWN group default qlen 1000 link/ether e2:40:68:98:d0:18 brd ff:ff:ff:ff:ff:ff inet6 fe80::e040:68ff:fe98:d018/64 scope link proto kernel_ll valid_lft forever preferred_lft forever # ip link set dev dummy1 mtu 67 # ip address show dev dummy1 10: dummy1: mtu 67 qdisc noqueue state UNKNOWN group default qlen 1000 link/ether e2:40:68:98:d0:18 brd ff:ff:ff:ff:ff:ff After cited commit: # ip address show dev dummy1 10: dummy1: mtu 1500 qdisc noqueue state UNKNOWN group default qlen 1000 link/ether 32:2d:69:f2:9c:99 brd ff:ff:ff:ff:ff:ff inet6 fe80::302d:69ff:fef2:9c99/64 scope link proto kernel_ll valid_lft forever preferred_lft forever # ip link set dev dummy1 mtu 67 # ip address show dev dummy1 RTNETLINK answers: No such device Dump terminated With this patch: # ip address show dev dummy1 10: dummy1: mtu 1500 qdisc noqueue state UNKNOWN group default qlen 1000 link/ether de:17:56:bb:57:c0 brd ff:ff:ff:ff:ff:ff inet6 fe80::dc17:56ff:febb:57c0/64 scope link proto kernel_ll valid_lft forever preferred_lft forever # ip link set dev dummy1 mtu 67 # ip address show dev dummy1 10: dummy1: mtu 67 qdisc noqueue state UNKNOWN group default qlen 1000 link/ether de:17:56:bb:57:c0 brd ff:ff:ff:ff:ff:ff I fixed the exact same issue for IPv6 in commit c04f7dfe6ec2 ("ipv6: Fix address dump when IPv6 is disabled on an interface"), but noted [1] that I am not doing the change for IPv4 because I am not aware of a way to disable IPv4 on an interface other than unregistering it. I clearly missed the above case. [1] https://lore.kernel.org/netdev/20240321173042.2151756-1-idosch@nvidia.com/ Fixes: cdb2f80f1c10 ("inet: use xa_array iterator to implement inet_dump_ifaddr()") Reported-by: Carolina Jubran Reported-by: Yamen Safadi Tested-by: Carolina Jubran Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240523110257.334315-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/ipv4/devinet.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 96accde527da24..e827da128c5f55 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1887,10 +1887,11 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) goto done; if (fillargs.ifindex) { - err = -ENODEV; dev = dev_get_by_index_rcu(tgt_net, fillargs.ifindex); - if (!dev) + if (!dev) { + err = -ENODEV; goto done; + } in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto done; From be008726d0ac338a6bb19c2da2853e3e2112b055 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 23 May 2024 10:13:45 -0400 Subject: [PATCH 0172/1578] net: gro: initialize network_offset in network layer Syzkaller was able to trigger kernel BUG at net/core/gro.c:424 ! RIP: 0010:gro_pull_from_frag0 net/core/gro.c:424 [inline] RIP: 0010:gro_try_pull_from_frag0 net/core/gro.c:446 [inline] RIP: 0010:dev_gro_receive+0x242f/0x24b0 net/core/gro.c:571 Due to using an incorrect NAPI_GRO_CB(skb)->network_offset. The referenced commit sets this offset to 0 in skb_gro_reset_offset. That matches the expected case in dev_gro_receive: pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive, ipv6_gro_receive, inet_gro_receive, &gro_list->list, skb); But syzkaller injected an skb with protocol ETH_P_TEB into an ip6gre device (by writing the IP6GRE encapsulated version to a TAP device). The result was a first call to eth_gro_receive, and thus an extra ETH_HLEN in network_offset that should not be there. First issue hit is when computing offset from network header in ipv6_gro_pull_exthdrs. Initialize both offsets in the network layer gro_receive. This pairs with all reads in gro_receive, which use skb_gro_receive_network_offset(). Fixes: 186b1ea73ad8 ("net: gro: use cb instead of skb->network_header") Reported-by: syzkaller Signed-off-by: Willem de Bruijn CC: Richard Gobert Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240523141434.1752483-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/af_inet.c | 2 +- net/ipv6/ip6_offload.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e03ba4a21c3904..b24d74616637a0 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1532,7 +1532,7 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) } NAPI_GRO_CB(skb)->flush |= flush; - NAPI_GRO_CB(skb)->inner_network_offset = off; + NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = off; /* Note : No need to call skb_gro_postpull_rcsum() here, * as we already checked checksum over ipv4 header was 0 diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index bd5aff97d8b1d8..9822163428b028 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -236,7 +236,7 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head, if (unlikely(!iph)) goto out; - NAPI_GRO_CB(skb)->inner_network_offset = off; + NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = off; flush += ntohs(iph->payload_len) != skb->len - hlen; From f4dca95fc0f6350918f2e6727e35b41f7f86fcce Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 23 May 2024 13:05:27 +0000 Subject: [PATCH 0173/1578] tcp: reduce accepted window in NEW_SYN_RECV state Jason commit made checks against ACK sequence less strict and can be exploited by attackers to establish spoofed flows with less probes. Innocent users might use tcp_rmem[1] == 1,000,000,000, or something more reasonable. An attacker can use a regular TCP connection to learn the server initial tp->rcv_wnd, and use it to optimize the attack. If we make sure that only the announced window (smaller than 65535) is used for ACK validation, we force an attacker to use 65537 packets to complete the 3WHS (assuming server ISN is unknown) Fixes: 378979e94e95 ("tcp: remove 64 KByte limit for initial tp->rcv_wnd value") Link: https://datatracker.ietf.org/meeting/119/materials/slides-119-tcpm-ghost-acks-00 Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Reviewed-by: Jason Xing Link: https://lore.kernel.org/r/20240523130528.60376-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/request_sock.h | 12 ++++++++++++ net/ipv4/tcp_ipv4.c | 7 +------ net/ipv4/tcp_minisocks.c | 7 +++++-- net/ipv6/tcp_ipv6.c | 7 +------ 4 files changed, 19 insertions(+), 14 deletions(-) diff --git a/include/net/request_sock.h b/include/net/request_sock.h index d88c0dfc2d4683..ebcb8896bffc1f 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -285,4 +285,16 @@ static inline int reqsk_queue_len_young(const struct request_sock_queue *queue) return atomic_read(&queue->young); } +/* RFC 7323 2.3 Using the Window Scale Option + * The window field (SEG.WND) of every outgoing segment, with the + * exception of segments, MUST be right-shifted by + * Rcv.Wind.Shift bits. + * + * This means the SEG.WND carried in SYNACK can not exceed 65535. + * We use this property to harden TCP stack while in NEW_SYN_RECV state. + */ +static inline u32 tcp_synack_window(const struct request_sock *req) +{ + return min(req->rsk_rcv_wnd, 65535U); +} #endif /* _REQUEST_SOCK_H */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 30ef0c8f5e92d3..b710958393e64e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1144,14 +1144,9 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, #endif } - /* RFC 7323 2.3 - * The window field (SEG.WND) of every outgoing segment, with the - * exception of segments, MUST be right-shifted by - * Rcv.Wind.Shift bits: - */ tcp_v4_send_ack(sk, skb, seq, tcp_rsk(req)->rcv_nxt, - req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, + tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, tcp_rsk_tsval(tcp_rsk(req)), READ_ONCE(req->ts_recent), 0, &key, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b93619b2384b37..538c06f95918de 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -783,8 +783,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, /* RFC793: "first check sequence number". */ - if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, - tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) { + if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq, + tcp_rsk(req)->rcv_nxt, + tcp_rsk(req)->rcv_nxt + + tcp_synack_window(req))) { /* Out of window: send ACK and drop. */ if (!(flg & TCP_FLAG_RST) && !tcp_oow_rate_limited(sock_net(sk), skb, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4c3605485b68e7..8c577b651bfcd2 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1272,15 +1272,10 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ - /* RFC 7323 2.3 - * The window field (SEG.WND) of every outgoing segment, with the - * exception of segments, MUST be right-shifted by - * Rcv.Wind.Shift bits: - */ tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, tcp_rsk(req)->rcv_nxt, - req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, + tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, tcp_rsk_tsval(tcp_rsk(req)), READ_ONCE(req->ts_recent), sk->sk_bound_dev_if, &key, ipv6_get_dsfield(ipv6_hdr(skb)), 0, From 0fe53c0ab018b3399b8d4be95f32fd017c9719e1 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Thu, 23 May 2024 12:17:31 -0500 Subject: [PATCH 0174/1578] dt-bindings: net: pse-pd: microchip,pd692x0: Fix missing "additionalProperties" constraints The child nodes are missing "additionalProperties" constraints which means any undocumented properties or child nodes are allowed. Add the constraints, and fix the fallout of wrong manager node regex and missing properties. Fixes: 9c1de033afad ("dt-bindings: net: pse-pd: Add bindings for PD692x0 PSE controller") Signed-off-by: Rob Herring (Arm) Acked-by: Kory Maincent Link: https://lore.kernel.org/r/20240523171732.2836880-1-robh@kernel.org Signed-off-by: Jakub Kicinski --- .../bindings/net/pse-pd/microchip,pd692x0.yaml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/net/pse-pd/microchip,pd692x0.yaml b/Documentation/devicetree/bindings/net/pse-pd/microchip,pd692x0.yaml index 828439398fdf98..fd4244fceced9f 100644 --- a/Documentation/devicetree/bindings/net/pse-pd/microchip,pd692x0.yaml +++ b/Documentation/devicetree/bindings/net/pse-pd/microchip,pd692x0.yaml @@ -24,6 +24,7 @@ properties: managers: type: object + additionalProperties: false description: List of the PD69208T4/PD69204T4/PD69208M PSE managers. Each manager have 4 or 8 physical ports according to the chip version. No need to @@ -47,8 +48,9 @@ properties: - "#size-cells" patternProperties: - "^manager@0[0-9a-b]$": + "^manager@[0-9a-b]$": type: object + additionalProperties: false description: PD69208T4/PD69204T4/PD69208M PSE manager exposing 4 or 8 physical ports. @@ -69,9 +71,14 @@ properties: patternProperties: '^port@[0-7]$': type: object + additionalProperties: false + + properties: + reg: + maxItems: 1 + required: - reg - additionalProperties: false required: - reg From 12f86b9af96a8b09969e4392311602f787b40834 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Thu, 23 May 2024 12:17:50 -0500 Subject: [PATCH 0175/1578] dt-bindings: net: pse-pd: ti,tps23881: Fix missing "additionalProperties" constraints The child nodes are missing "additionalProperties" constraints which means any undocumented properties or child nodes are allowed. Add the constraints and all the undocumented properties exposed by the fix. Fixes: f562202fedad ("dt-bindings: net: pse-pd: Add bindings for TPS23881 PSE controller") Signed-off-by: Rob Herring (Arm) Acked-by: Kory Maincent Link: https://lore.kernel.org/r/20240523171750.2837331-1-robh@kernel.org Signed-off-by: Jakub Kicinski --- .../bindings/net/pse-pd/ti,tps23881.yaml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Documentation/devicetree/bindings/net/pse-pd/ti,tps23881.yaml b/Documentation/devicetree/bindings/net/pse-pd/ti,tps23881.yaml index 4147adb11e1019..6992d56832bf95 100644 --- a/Documentation/devicetree/bindings/net/pse-pd/ti,tps23881.yaml +++ b/Documentation/devicetree/bindings/net/pse-pd/ti,tps23881.yaml @@ -29,13 +29,31 @@ properties: of the ports conversion matrix that establishes relationship between the logical ports and the physical channels. type: object + additionalProperties: false + + properties: + "#address-cells": + const: 1 + + "#size-cells": + const: 0 patternProperties: '^channel@[0-7]$': type: object + additionalProperties: false + + properties: + reg: + maxItems: 1 + required: - reg + required: + - "#address-cells" + - "#size-cells" + unevaluatedProperties: false required: From bf0497f53c8535f99b72041529d3f7708a6e2c0d Mon Sep 17 00:00:00 2001 From: Xiaolei Wang Date: Fri, 24 May 2024 13:05:28 +0800 Subject: [PATCH 0176/1578] net:fec: Add fec_enet_deinit() When fec_probe() fails or fec_drv_remove() needs to release the fec queue and remove a NAPI context, therefore add a function corresponding to fec_enet_init() and call fec_enet_deinit() which does the opposite to release memory and remove a NAPI context. Fixes: 59d0f7465644 ("net: fec: init multi queue date structure") Signed-off-by: Xiaolei Wang Reviewed-by: Wei Fang Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20240524050528.4115581-1-xiaolei.wang@windriver.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/fec_main.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index a72d8a2eb0b315..881ece735dcf14 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -4130,6 +4130,14 @@ static int fec_enet_init(struct net_device *ndev) return ret; } +static void fec_enet_deinit(struct net_device *ndev) +{ + struct fec_enet_private *fep = netdev_priv(ndev); + + netif_napi_del(&fep->napi); + fec_enet_free_queue(ndev); +} + #ifdef CONFIG_OF static int fec_reset_phy(struct platform_device *pdev) { @@ -4524,6 +4532,7 @@ fec_probe(struct platform_device *pdev) fec_enet_mii_remove(fep); failed_mii_init: failed_irq: + fec_enet_deinit(ndev); failed_init: fec_ptp_stop(pdev); failed_reset: @@ -4587,6 +4596,7 @@ fec_drv_remove(struct platform_device *pdev) pm_runtime_put_noidle(&pdev->dev); pm_runtime_disable(&pdev->dev); + fec_enet_deinit(ndev); free_netdev(ndev); } From 4fb679040d9f758eeb3b4d01bbde6405bf20e64e Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Fri, 24 May 2024 10:53:50 +0200 Subject: [PATCH 0177/1578] net: micrel: Fix lan8841_config_intr after getting out of sleep mode When the interrupt is enabled, the function lan8841_config_intr tries to clear any pending interrupts by reading the interrupt status, then checks the return value for errors and then continue to enable the interrupt. It has been seen that once the system gets out of sleep mode, the interrupt status has the value 0x400 meaning that the PHY detected that the link was in low power. That is correct value but the problem is that the check is wrong. We try to check for errors but we return an error also in this case which is not an error. Therefore fix this by returning only when there is an error. Fixes: a8f1a19d27ef ("net: micrel: Add support for lan8841 PHY") Signed-off-by: Horatiu Vultur Reviewed-by: Suman Ghosh Reviewed-by: Andrew Lunn Reviewed-by: Russell King (Oracle) Link: https://lore.kernel.org/r/20240524085350.359812-1-horatiu.vultur@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/micrel.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 1d769322b05938..2b8f8b7f1517cc 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -4029,7 +4029,7 @@ static int lan8841_config_intr(struct phy_device *phydev) if (phydev->interrupts == PHY_INTERRUPT_ENABLED) { err = phy_read(phydev, LAN8814_INTS); - if (err) + if (err < 0) return err; /* Enable / disable interrupts. It is OK to enable PTP interrupt @@ -4045,6 +4045,14 @@ static int lan8841_config_intr(struct phy_device *phydev) return err; err = phy_read(phydev, LAN8814_INTS); + if (err < 0) + return err; + + /* Getting a positive value doesn't mean that is an error, it + * just indicates what was the status. Therefore make sure to + * clear the value and say that there is no error. + */ + err = 0; } return err; From 266aa3b4812e97942a8ce5c7aafa7da059f7b5b8 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 24 May 2024 13:28:59 +0200 Subject: [PATCH 0178/1578] page_pool: fix &page_pool_params kdoc issues After the tagged commit, @netdev got documented twice and the kdoc script didn't notice that. Remove the second description added later and move the initial one according to the field position. After merging commit 5f8e4007c10d ("kernel-doc: fix struct_group_tagged() parsing"), kdoc requires to describe struct groups as well. &page_pool_params has 2 struct groups which generated new warnings, describe them to resolve this. Fixes: 403f11ac9ab7 ("page_pool: don't use driver-set flags field directly") Signed-off-by: Alexander Lobakin Link: https://lore.kernel.org/r/20240524112859.2757403-1-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/net/page_pool/types.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index b088d131aeb0d0..7e8477057f3d14 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -45,16 +45,17 @@ struct pp_alloc_cache { /** * struct page_pool_params - page pool parameters + * @fast: params accessed frequently on hotpath * @order: 2^order pages on allocation * @pool_size: size of the ptr_ring * @nid: NUMA node id to allocate from pages from * @dev: device, for DMA pre-mapping purposes - * @netdev: netdev this pool will serve (leave as NULL if none or multiple) * @napi: NAPI which is the sole consumer of pages, otherwise NULL * @dma_dir: DMA mapping direction * @max_len: max DMA sync memory size for PP_FLAG_DMA_SYNC_DEV * @offset: DMA sync address offset for PP_FLAG_DMA_SYNC_DEV - * @netdev: corresponding &net_device for Netlink introspection + * @slow: params with slowpath access only (initialization and Netlink) + * @netdev: netdev this pool will serve (leave as NULL if none or multiple) * @flags: PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV, PP_FLAG_SYSTEM_POOL */ struct page_pool_params { From d514c8b54209de7a95ab37259fe32c7406976bd9 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 23 May 2024 10:45:29 -0700 Subject: [PATCH 0179/1578] idpf: don't enable NAPI and interrupts prior to allocating Rx buffers Currently, idpf enables NAPI and interrupts prior to allocating Rx buffers. This may lead to frame loss (there are no buffers to place incoming frames) and even crashes on quick ifup-ifdown. Interrupts must be enabled only after all the resources are here and available. Split interrupt init into two phases: initialization and enabling, and perform the second only after the queues are fully initialized. Note that we can't just move interrupt initialization down the init process, as the queues must have correct a ::q_vector pointer set and NAPI already added in order to allocate buffers correctly. Also, during the deinit process, disable HW interrupts first and only then disable NAPI. Otherwise, there can be a HW event leading to napi_schedule(), but the NAPI will already be unavailable. Fixes: d4d558718266 ("idpf: initialize interrupts and enable vport") Reported-by: Michal Kubiak Reviewed-by: Wojciech Drewek Signed-off-by: Alexander Lobakin Reviewed-by: Simon Horman Tested-by: Krishneil Singh Signed-off-by: Jacob Keller Link: https://lore.kernel.org/r/20240523-net-2024-05-23-intel-net-fixes-v1-1-17a923e0bb5f@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/idpf/idpf_lib.c | 1 + drivers/net/ethernet/intel/idpf/idpf_txrx.c | 12 +++++++----- drivers/net/ethernet/intel/idpf/idpf_txrx.h | 1 + 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c index 52ceda6306a3d0..f1ee5584e8fa20 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_lib.c +++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c @@ -1394,6 +1394,7 @@ static int idpf_vport_open(struct idpf_vport *vport, bool alloc_res) } idpf_rx_init_buf_tail(vport); + idpf_vport_intr_ena(vport); err = idpf_send_config_queues_msg(vport); if (err) { diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 285da2177ee446..b023704bbbdab8 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -3746,9 +3746,9 @@ static void idpf_vport_intr_ena_irq_all(struct idpf_vport *vport) */ void idpf_vport_intr_deinit(struct idpf_vport *vport) { + idpf_vport_intr_dis_irq_all(vport); idpf_vport_intr_napi_dis_all(vport); idpf_vport_intr_napi_del_all(vport); - idpf_vport_intr_dis_irq_all(vport); idpf_vport_intr_rel_irq(vport); } @@ -4179,7 +4179,6 @@ int idpf_vport_intr_init(struct idpf_vport *vport) idpf_vport_intr_map_vector_to_qs(vport); idpf_vport_intr_napi_add_all(vport); - idpf_vport_intr_napi_ena_all(vport); err = vport->adapter->dev_ops.reg_ops.intr_reg_init(vport); if (err) @@ -4193,17 +4192,20 @@ int idpf_vport_intr_init(struct idpf_vport *vport) if (err) goto unroll_vectors_alloc; - idpf_vport_intr_ena_irq_all(vport); - return 0; unroll_vectors_alloc: - idpf_vport_intr_napi_dis_all(vport); idpf_vport_intr_napi_del_all(vport); return err; } +void idpf_vport_intr_ena(struct idpf_vport *vport) +{ + idpf_vport_intr_napi_ena_all(vport); + idpf_vport_intr_ena_irq_all(vport); +} + /** * idpf_config_rss - Send virtchnl messages to configure RSS * @vport: virtual port diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index 3d046b81e507a1..551391e2046470 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -990,6 +990,7 @@ int idpf_vport_intr_alloc(struct idpf_vport *vport); void idpf_vport_intr_update_itr_ena_irq(struct idpf_q_vector *q_vector); void idpf_vport_intr_deinit(struct idpf_vport *vport); int idpf_vport_intr_init(struct idpf_vport *vport); +void idpf_vport_intr_ena(struct idpf_vport *vport); enum pkt_hash_types idpf_ptype_to_htype(const struct idpf_rx_ptype_decoded *decoded); int idpf_config_rss(struct idpf_vport *vport); int idpf_init_rss(struct idpf_vport *vport); From 82617b9a04649e83ee8731918aeadbb6e6d7cbc7 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Thu, 23 May 2024 10:45:30 -0700 Subject: [PATCH 0180/1578] ice: fix accounting if a VLAN already exists The ice_vsi_add_vlan() function is used to add a VLAN filter for the target VSI. This function prepares a filter in the switch table for the given VSI. If it succeeds, the vsi->num_vlan counter is incremented. It is not considered an error to add a VLAN which already exists in the switch table, so the function explicitly checks and ignores -EEXIST. The vsi->num_vlan counter is still incremented. This seems incorrect, as it means we can double-count in the case where the same VLAN is added twice by the caller. The actual table will have one less filter than the count. The ice_vsi_del_vlan() function similarly checks and handles the -ENOENT condition for when deleting a filter that doesn't exist. This flow only decrements the vsi->num_vlan if it actually deleted a filter. The vsi->num_vlan counter is used only in a few places, primarily related to tracking the number of non-zero VLANs. If the vsi->num_vlans gets out of sync, then ice_vsi_num_non_zero_vlans() will incorrectly report more VLANs than are present, and ice_vsi_has_non_zero_vlans() could return true potentially in cases where there are only VLAN 0 filters left. Fix this by only incrementing the vsi->num_vlan in the case where we actually added an entry, and not in the case where the entry already existed. Fixes: a1ffafb0b4a4 ("ice: Support configuring the device to Double VLAN Mode") Signed-off-by: Jacob Keller Tested-by: Pucha Himasekhar Reddy Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240523-net-2024-05-23-intel-net-fixes-v1-2-17a923e0bb5f@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.c b/drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.c index 2e9ad27cb9d13e..6e8f2aab608015 100644 --- a/drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.c @@ -45,14 +45,15 @@ int ice_vsi_add_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan) return -EINVAL; err = ice_fltr_add_vlan(vsi, vlan); - if (err && err != -EEXIST) { + if (!err) + vsi->num_vlan++; + else if (err == -EEXIST) + err = 0; + else dev_err(ice_pf_to_dev(vsi->back), "Failure Adding VLAN %d on VSI %i, status %d\n", vlan->vid, vsi->vsi_num, err); - return err; - } - vsi->num_vlan++; - return 0; + return err; } /** From 5597613fb3cf0e36d26cfd8fb2a63196da249333 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 24 May 2024 18:30:56 +0200 Subject: [PATCH 0181/1578] selftests: mptcp: lib: support flaky subtests Some subtests can be unstable, failing once every X runs. Fixing them can take time: there could be an issue in the kernel or in the subtest, and it is then important to do a proper analysis, not to hide real bugs. To avoid creating noises on the different CIs, it is important to have a simple way to mark subtests as flaky, and ignore the errors. This is what this patch introduces: subtests can be marked as flaky by setting MPTCP_LIB_SUBTEST_FLAKY env var to 1, e.g. MPTCP_LIB_SUBTEST_FLAKY=1 The subtest will be executed, and errors (if any) will be ignored. It is still good to run these subtests, as it exercises code, and the results can still be useful for the on-going investigations. Note that the MPTCP CI will continue to track these flaky subtests by setting SELFTESTS_MPTCP_LIB_OVERRIDE_FLAKY env var to 1, and a ticket has to be created before marking subtests as flaky. Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240524-upstream-net-20240524-selftests-mptcp-flaky-v1-1-a352362f3f8e@kernel.org Signed-off-by: Jakub Kicinski --- .../testing/selftests/net/mptcp/mptcp_lib.sh | 30 +++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index ad2ebda5cb64ba..6ffa9b7a3260df 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -21,6 +21,7 @@ declare -rx MPTCP_LIB_AF_INET6=10 MPTCP_LIB_SUBTESTS=() MPTCP_LIB_SUBTESTS_DUPLICATED=0 +MPTCP_LIB_SUBTEST_FLAKY=0 MPTCP_LIB_TEST_COUNTER=0 MPTCP_LIB_TEST_FORMAT="%02u %-50s" MPTCP_LIB_IP_MPTCP=0 @@ -41,6 +42,16 @@ else readonly MPTCP_LIB_COLOR_RESET= fi +# SELFTESTS_MPTCP_LIB_OVERRIDE_FLAKY env var can be set not to ignore errors +# from subtests marked as flaky +mptcp_lib_override_flaky() { + [ "${SELFTESTS_MPTCP_LIB_OVERRIDE_FLAKY:-}" = 1 ] +} + +mptcp_lib_subtest_is_flaky() { + [ "${MPTCP_LIB_SUBTEST_FLAKY}" = 1 ] && ! mptcp_lib_override_flaky +} + # $1: color, $2: text mptcp_lib_print_color() { echo -e "${MPTCP_LIB_START_PRINT:-}${*}${MPTCP_LIB_COLOR_RESET}" @@ -72,7 +83,16 @@ mptcp_lib_pr_skip() { } mptcp_lib_pr_fail() { - mptcp_lib_print_err "[FAIL]${1:+ ${*}}" + local title cmt + + if mptcp_lib_subtest_is_flaky; then + title="IGNO" + cmt=" (flaky)" + else + title="FAIL" + fi + + mptcp_lib_print_err "[${title}]${cmt}${1:+ ${*}}" } mptcp_lib_pr_info() { @@ -208,7 +228,13 @@ mptcp_lib_result_pass() { # $1: test name mptcp_lib_result_fail() { - __mptcp_lib_result_add "not ok" "${1}" + if mptcp_lib_subtest_is_flaky; then + # It might sound better to use 'not ok # TODO' or 'ok # SKIP', + # but some CIs don't understand 'TODO' and treat SKIP as errors. + __mptcp_lib_result_add "ok" "${1} # IGNORE Flaky" + else + __mptcp_lib_result_add "not ok" "${1}" + fi } # $1: test name From cc73a6577ae64247898269d138dee6b73ff710cc Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 24 May 2024 18:30:57 +0200 Subject: [PATCH 0182/1578] selftests: mptcp: simult flows: mark 'unbalanced' tests as flaky These tests are flaky since their introduction. This might be less or not visible depending on the CI running the tests, especially if it is also busy doing other tasks in parallel. A first analysis shown that the transfer can be slowed down when there are some re-injections at the MPTCP level. Such re-injections can of course happen, and disturb the transfer, but it looks strange to have them in this lab. That could be caused by the kernel having access to less CPU cycles -- e.g. when other activities are executed in parallel -- or by a misinterpretation on the MPTCP packet scheduler side. While this is being investigated, the tests are marked as flaky not to create noises in other CIs. Fixes: 219d04992b68 ("mptcp: push pending frames when subflow has free space") Link: https://github.com/multipath-tcp/mptcp_net-next/issues/475 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240524-upstream-net-20240524-selftests-mptcp-flaky-v1-2-a352362f3f8e@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/simult_flows.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index 4b14b4412166b5..f74e1c3c126d18 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -244,7 +244,7 @@ run_test() do_transfer $small $large $time lret=$? mptcp_lib_result_code "${lret}" "${msg}" - if [ $lret -ne 0 ]; then + if [ $lret -ne 0 ] && ! mptcp_lib_subtest_is_flaky; then ret=$lret [ $bail -eq 0 ] || exit $ret fi @@ -254,7 +254,7 @@ run_test() do_transfer $large $small $time lret=$? mptcp_lib_result_code "${lret}" "${msg}" - if [ $lret -ne 0 ]; then + if [ $lret -ne 0 ] && ! mptcp_lib_subtest_is_flaky; then ret=$lret [ $bail -eq 0 ] || exit $ret fi @@ -290,7 +290,7 @@ run_test 10 10 0 0 "balanced bwidth" run_test 10 10 1 25 "balanced bwidth with unbalanced delay" # we still need some additional infrastructure to pass the following test-cases -run_test 10 3 0 0 "unbalanced bwidth" +MPTCP_LIB_SUBTEST_FLAKY=1 run_test 10 3 0 0 "unbalanced bwidth" run_test 10 3 1 25 "unbalanced bwidth with unbalanced delay" run_test 10 3 25 1 "unbalanced bwidth with opposed, unbalanced delay" From 8c06ac2178a9dee887929232226e35a5cdda1793 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 24 May 2024 18:30:58 +0200 Subject: [PATCH 0183/1578] selftests: mptcp: join: mark 'fastclose' tests as flaky These tests are flaky since their introduction. This might be less or not visible depending on the CI running the tests, especially if it is also busy doing other tasks in parallel, and if a debug kernel config is being used. It looks like this issue is often present with the NetDev CI. While this is being investigated, the tests are marked as flaky not to create noises on such CIs. Fixes: 01542c9bf9ab ("selftests: mptcp: add fastclose testcase") Link: https://github.com/multipath-tcp/mptcp_net-next/issues/324 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240524-upstream-net-20240524-selftests-mptcp-flaky-v1-3-a352362f3f8e@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index fefa9173bdaaab..b869b46823d7ab 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -261,6 +261,8 @@ reset() TEST_NAME="${1}" + MPTCP_LIB_SUBTEST_FLAKY=0 # reset if modified + if skip_test; then MPTCP_LIB_TEST_COUNTER=$((MPTCP_LIB_TEST_COUNTER+1)) last_test_ignored=1 @@ -448,7 +450,9 @@ reset_with_tcp_filter() # $1: err msg fail_test() { - ret=${KSFT_FAIL} + if ! mptcp_lib_subtest_is_flaky; then + ret=${KSFT_FAIL} + fi if [ ${#} -gt 0 ]; then print_fail "${@}" @@ -3069,6 +3073,7 @@ fullmesh_tests() fastclose_tests() { if reset_check_counter "fastclose test" "MPTcpExtMPFastcloseTx"; then + MPTCP_LIB_SUBTEST_FLAKY=1 test_linkfail=1024 fastclose=client \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 @@ -3077,6 +3082,7 @@ fastclose_tests() fi if reset_check_counter "fastclose server test" "MPTcpExtMPFastcloseRx"; then + MPTCP_LIB_SUBTEST_FLAKY=1 test_linkfail=1024 fastclose=server \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 0 0 0 1 From 38af56e6668b455f7dd0a8e2d9afe74100068e17 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 24 May 2024 18:30:59 +0200 Subject: [PATCH 0184/1578] selftests: mptcp: join: mark 'fail' tests as flaky These tests are rarely unstable. It depends on the CI running the tests, especially if it is also busy doing other tasks in parallel, and if a debug kernel config is being used. It looks like this issue is sometimes present with the NetDev CI. While this is being investigated, the tests are marked as flaky not to create noises on such CIs. Fixes: b6e074e171bc ("selftests: mptcp: add infinite map testcase") Link: https://github.com/multipath-tcp/mptcp_net-next/issues/491 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240524-upstream-net-20240524-selftests-mptcp-flaky-v1-4-a352362f3f8e@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index b869b46823d7ab..2b66c5fa71ebff 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3101,6 +3101,7 @@ fail_tests() { # single subflow if reset_with_fail "Infinite map" 1; then + MPTCP_LIB_SUBTEST_FLAKY=1 test_linkfail=128 \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 +1 +0 1 0 1 "$(pedit_action_pkts)" @@ -3109,6 +3110,7 @@ fail_tests() # multiple subflows if reset_with_fail "MP_FAIL MP_RST" 2; then + MPTCP_LIB_SUBTEST_FLAKY=1 tc -n $ns2 qdisc add dev ns2eth1 root netem rate 1mbit delay 5ms pm_nl_set_limits $ns1 0 1 pm_nl_set_limits $ns2 0 1 From c519cf9b7434183bb56ed1e200ac577a5fd34d9b Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 27 May 2024 12:36:19 +0200 Subject: [PATCH 0185/1578] docs: netdev: Fix typo in Signed-off-by tag s/of/off/ Signed-off-by: Thorsten Blum Fixes: e110ba659271 ("docs: netdev: add note about Changes Requested and revising commit messages") Link: https://lore.kernel.org/r/20240527103618.265801-2-thorsten.blum@toblux.com Signed-off-by: Jakub Kicinski --- Documentation/process/maintainer-netdev.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/process/maintainer-netdev.rst b/Documentation/process/maintainer-netdev.rst index fd96e4a3cef9c0..5e1fcfad1c4c3e 100644 --- a/Documentation/process/maintainer-netdev.rst +++ b/Documentation/process/maintainer-netdev.rst @@ -227,7 +227,7 @@ preferably including links to previous postings, for example:: The amount of mooing will depend on packet rate so should match the diurnal cycle quite well. - Signed-of-by: Joe Defarmer + Signed-off-by: Joe Defarmer --- v3: - add a note about time-of-day mooing fluctuation to the commit message From bb3ca38ef7aa56dcfa7f6e81675c7a39d5ee9bf1 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Fri, 3 May 2024 08:43:11 -0700 Subject: [PATCH 0186/1578] hv_balloon: Use kernel macros to simplify open coded sequences Code sequences equivalent to ALIGN(), ALIGN_DOWN(), and umin() are currently open coded. Change these to use the kernel macro to improve code clarity. ALIGN() and ALIGN_DOWN() require the alignment value to be a power of 2, which is the case here. Reviewed-by: David Hildenbrand Signed-off-by: Michael Kelley Link: https://lore.kernel.org/r/20240503154312.142466-1-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20240503154312.142466-1-mhklinux@outlook.com> --- drivers/hv/hv_balloon.c | 40 ++++++++-------------------------------- 1 file changed, 8 insertions(+), 32 deletions(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index e000fa3b9f978c..9f45b8a6762c87 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -729,15 +729,8 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, scoped_guard(spinlock_irqsave, &dm_device.ha_lock) { has->ha_end_pfn += HA_CHUNK; - - if (total_pfn > HA_CHUNK) { - processed_pfn = HA_CHUNK; - total_pfn -= HA_CHUNK; - } else { - processed_pfn = total_pfn; - total_pfn = 0; - } - + processed_pfn = umin(total_pfn, HA_CHUNK); + total_pfn -= processed_pfn; has->covered_end_pfn += processed_pfn; } @@ -800,7 +793,7 @@ static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) { struct hv_hotadd_state *has; struct hv_hotadd_gap *gap; - unsigned long residual, new_inc; + unsigned long residual; int ret = 0; guard(spinlock_irqsave)(&dm_device.ha_lock); @@ -836,15 +829,9 @@ static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) * our current limit; extend it. */ if ((start_pfn + pfn_cnt) > has->end_pfn) { + /* Extend the region by multiples of HA_CHUNK */ residual = (start_pfn + pfn_cnt - has->end_pfn); - /* - * Extend the region by multiples of HA_CHUNK. - */ - new_inc = (residual / HA_CHUNK) * HA_CHUNK; - if (residual % HA_CHUNK) - new_inc += HA_CHUNK; - - has->end_pfn += new_inc; + has->end_pfn += ALIGN(residual, HA_CHUNK); } ret = 1; @@ -915,9 +902,7 @@ static unsigned long handle_pg_range(unsigned long pg_start, */ size = (has->end_pfn - has->ha_end_pfn); if (pfn_cnt <= size) { - size = ((pfn_cnt / HA_CHUNK) * HA_CHUNK); - if (pfn_cnt % HA_CHUNK) - size += HA_CHUNK; + size = ALIGN(pfn_cnt, HA_CHUNK); } else { pfn_cnt = size; } @@ -1011,9 +996,6 @@ static void hot_add_req(struct work_struct *dummy) rg_sz = dm->ha_wrk.ha_region_range.finfo.page_cnt; if ((rg_start == 0) && (!dm->host_specified_ha_region)) { - unsigned long region_size; - unsigned long region_start; - /* * The host has not specified the hot-add region. * Based on the hot-add page range being specified, @@ -1021,14 +1003,8 @@ static void hot_add_req(struct work_struct *dummy) * that need to be hot-added while ensuring the alignment * and size requirements of Linux as it relates to hot-add. */ - region_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK; - if (pfn_cnt % HA_CHUNK) - region_size += HA_CHUNK; - - region_start = (pg_start / HA_CHUNK) * HA_CHUNK; - - rg_start = region_start; - rg_sz = region_size; + rg_start = ALIGN_DOWN(pg_start, HA_CHUNK); + rg_sz = ALIGN(pfn_cnt, HA_CHUNK); } if (do_hot_add) From 8852ebf1948d94ecaf4d1113032dda7e58e72b84 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Fri, 3 May 2024 08:43:12 -0700 Subject: [PATCH 0187/1578] hv_balloon: Enable hot-add for memblock sizes > 128 MiB The Hyper-V balloon driver supports hot-add of memory in addition to ballooning. Current code hot-adds in fixed size chunks of 128 MiB (fixed constant HA_CHUNK in the code). While this works in Hyper-V VMs with 64 GiB or less or memory where the Linux memblock size is 128 MiB, the hot-add fails for larger memblock sizes because add_memory() expects memory to be added in chunks that match the memblock size. Messages like the following are reported when Linux has a 256 MiB memblock size: [ 312.668859] Block size [0x10000000] unaligned hotplug range: start 0x310000000, size 0x8000000 [ 312.668880] hv_balloon: hot_add memory failed error is -22 [ 312.668984] hv_balloon: Memory hot add failed Larger memblock sizes are usually used in VMs with more than 64 GiB of memory, depending on the alignment of the VM's physical address space. Fix this problem by having the Hyper-V balloon driver determine the Linux memblock size, and process hot-add requests in that chunk size instead of a fixed 128 MiB. Also update the hot-add alignment requested of the Hyper-V host to match the memblock size. The code changes look significant, but in fact are just a simple text substitution of a new global variable for the previous HA_CHUNK constant. No algorithms are changed except to initialize the new global variable and to calculate the alignment value to pass to Hyper-V. Testing with memblock sizes of 256 MiB and 2 GiB shows correct operation. Reviewed-by: David Hildenbrand Signed-off-by: Michael Kelley Link: https://lore.kernel.org/r/20240503154312.142466-2-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20240503154312.142466-2-mhklinux@outlook.com> --- drivers/hv/hv_balloon.c | 64 +++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 21 deletions(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 9f45b8a6762c87..4370ad31b5b381 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -425,11 +426,11 @@ struct dm_info_msg { * The range start_pfn : end_pfn specifies the range * that the host has asked us to hot add. The range * start_pfn : ha_end_pfn specifies the range that we have - * currently hot added. We hot add in multiples of 128M - * chunks; it is possible that we may not be able to bring - * online all the pages in the region. The range + * currently hot added. We hot add in chunks equal to the + * memory block size; it is possible that we may not be able + * to bring online all the pages in the region. The range * covered_start_pfn:covered_end_pfn defines the pages that can - * be brough online. + * be brought online. */ struct hv_hotadd_state { @@ -505,8 +506,11 @@ enum hv_dm_state { static __u8 recv_buffer[HV_HYP_PAGE_SIZE]; static __u8 balloon_up_send_buffer[HV_HYP_PAGE_SIZE]; + +static unsigned long ha_pages_in_chunk; +#define HA_BYTES_IN_CHUNK (ha_pages_in_chunk << PAGE_SHIFT) + #define PAGES_IN_2M (2 * 1024 * 1024 / PAGE_SIZE) -#define HA_CHUNK (128 * 1024 * 1024 / PAGE_SIZE) struct hv_dynmem_device { struct hv_device *dev; @@ -724,21 +728,21 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, unsigned long processed_pfn; unsigned long total_pfn = pfn_count; - for (i = 0; i < (size/HA_CHUNK); i++) { - start_pfn = start + (i * HA_CHUNK); + for (i = 0; i < (size/ha_pages_in_chunk); i++) { + start_pfn = start + (i * ha_pages_in_chunk); scoped_guard(spinlock_irqsave, &dm_device.ha_lock) { - has->ha_end_pfn += HA_CHUNK; - processed_pfn = umin(total_pfn, HA_CHUNK); + has->ha_end_pfn += ha_pages_in_chunk; + processed_pfn = umin(total_pfn, ha_pages_in_chunk); total_pfn -= processed_pfn; - has->covered_end_pfn += processed_pfn; + has->covered_end_pfn += processed_pfn; } reinit_completion(&dm_device.ol_waitevent); nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn)); ret = add_memory(nid, PFN_PHYS((start_pfn)), - (HA_CHUNK << PAGE_SHIFT), MHP_MERGE_RESOURCE); + HA_BYTES_IN_CHUNK, MHP_MERGE_RESOURCE); if (ret) { pr_err("hot_add memory failed error is %d\n", ret); @@ -753,7 +757,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, do_hot_add = false; } scoped_guard(spinlock_irqsave, &dm_device.ha_lock) { - has->ha_end_pfn -= HA_CHUNK; + has->ha_end_pfn -= ha_pages_in_chunk; has->covered_end_pfn -= processed_pfn; } break; @@ -829,9 +833,9 @@ static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) * our current limit; extend it. */ if ((start_pfn + pfn_cnt) > has->end_pfn) { - /* Extend the region by multiples of HA_CHUNK */ + /* Extend the region by multiples of ha_pages_in_chunk */ residual = (start_pfn + pfn_cnt - has->end_pfn); - has->end_pfn += ALIGN(residual, HA_CHUNK); + has->end_pfn += ALIGN(residual, ha_pages_in_chunk); } ret = 1; @@ -897,12 +901,12 @@ static unsigned long handle_pg_range(unsigned long pg_start, * We have some residual hot add range * that needs to be hot added; hot add * it now. Hot add a multiple of - * HA_CHUNK that fully covers the pages + * ha_pages_in_chunk that fully covers the pages * we have. */ size = (has->end_pfn - has->ha_end_pfn); if (pfn_cnt <= size) { - size = ALIGN(pfn_cnt, HA_CHUNK); + size = ALIGN(pfn_cnt, ha_pages_in_chunk); } else { pfn_cnt = size; } @@ -1003,8 +1007,8 @@ static void hot_add_req(struct work_struct *dummy) * that need to be hot-added while ensuring the alignment * and size requirements of Linux as it relates to hot-add. */ - rg_start = ALIGN_DOWN(pg_start, HA_CHUNK); - rg_sz = ALIGN(pfn_cnt, HA_CHUNK); + rg_start = ALIGN_DOWN(pg_start, ha_pages_in_chunk); + rg_sz = ALIGN(pfn_cnt, ha_pages_in_chunk); } if (do_hot_add) @@ -1807,10 +1811,13 @@ static int balloon_connect_vsp(struct hv_device *dev) cap_msg.caps.cap_bits.hot_add = hot_add_enabled(); /* - * Specify our alignment requirements as it relates - * memory hot-add. Specify 128MB alignment. + * Specify our alignment requirements for memory hot-add. The value is + * the log base 2 of the number of megabytes in a chunk. For example, + * with 256 MiB chunks, the value is 8. The number of MiB in a chunk + * must be a power of 2. */ - cap_msg.caps.cap_bits.hot_add_alignment = 7; + cap_msg.caps.cap_bits.hot_add_alignment = + ilog2(HA_BYTES_IN_CHUNK / SZ_1M); /* * Currently the host does not use these @@ -1960,8 +1967,23 @@ static int balloon_probe(struct hv_device *dev, hot_add = false; #ifdef CONFIG_MEMORY_HOTPLUG + /* + * Hot-add must operate in chunks that are of size equal to the + * memory block size because that's what the core add_memory() + * interface requires. The Hyper-V interface requires that the memory + * block size be a power of 2, which is guaranteed by the check in + * memory_dev_init(). + */ + ha_pages_in_chunk = memory_block_size_bytes() / PAGE_SIZE; do_hot_add = hot_add; #else + /* + * Without MEMORY_HOTPLUG, the guest returns a failure status for all + * hot add requests from Hyper-V, and the chunk size is used only to + * specify alignment to Hyper-V as required by the host/guest protocol. + * Somewhat arbitrarily, use 128 MiB. + */ + ha_pages_in_chunk = SZ_128M / PAGE_SIZE; do_hot_add = false; #endif dm_device.dev = dev; From 207e03b00b47ccbd692941b183510026e1bd6ce9 Mon Sep 17 00:00:00 2001 From: Saurabh Sengar Date: Sun, 5 May 2024 22:38:58 -0700 Subject: [PATCH 0188/1578] tools: hv: suppress the invalid warning for packed member alignment Packed struct vmbus_bufring is 4096 byte aligned and the reporting warning is for the first member of that struct which shouldn't add any offset to create alignment issue. Suppress the warning by adding -Wno-address-of-packed-member flag to gcc. Fixes: 45bab4d74651 ("tools: hv: Add vmbus_bufring") Reported-by: kernel test robot Closes: https://lore.kernel.org/all/202404121913.GhtSoKbW-lkp@intel.com/ Signed-off-by: Saurabh Sengar Link: https://lore.kernel.org/r/1714973938-4063-1-git-send-email-ssengar@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1714973938-4063-1-git-send-email-ssengar@linux.microsoft.com> --- tools/hv/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/hv/Makefile b/tools/hv/Makefile index bb52871da34123..2e60e2c212cd9e 100644 --- a/tools/hv/Makefile +++ b/tools/hv/Makefile @@ -17,6 +17,7 @@ endif MAKEFLAGS += -r override CFLAGS += -O2 -Wall -g -D_GNU_SOURCE -I$(OUTPUT)include +override CFLAGS += -Wno-address-of-packed-member ALL_TARGETS := hv_kvp_daemon hv_vss_daemon ifneq ($(ARCH), aarch64) From 4c5a65fd10895708952106652b2ac2ca3b7bb9d9 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Sat, 11 May 2024 06:38:17 -0700 Subject: [PATCH 0189/1578] Documentation: hyperv: Update spelling and fix typo Update spelling from "VMbus" to "VMBus" to match Hyper-V product documentation. Also correct typo: "SNP-SEV" should be "SEV-SNP". Signed-off-by: Michael Kelley Reviewed-by: Easwar Hariharan Reviewed-by: Bagas Sanjaya Link: https://lore.kernel.org/r/20240511133818.19649-1-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20240511133818.19649-1-mhklinux@outlook.com> --- Documentation/virt/hyperv/overview.rst | 22 +++---- Documentation/virt/hyperv/vmbus.rst | 82 +++++++++++++------------- 2 files changed, 52 insertions(+), 52 deletions(-) diff --git a/Documentation/virt/hyperv/overview.rst b/Documentation/virt/hyperv/overview.rst index cd493332c88a63..77408a89d1a407 100644 --- a/Documentation/virt/hyperv/overview.rst +++ b/Documentation/virt/hyperv/overview.rst @@ -40,7 +40,7 @@ Linux guests communicate with Hyper-V in four different ways: arm64, these synthetic registers must be accessed using explicit hypercalls. -* VMbus: VMbus is a higher-level software construct that is built on +* VMBus: VMBus is a higher-level software construct that is built on the other 3 mechanisms. It is a message passing interface between the Hyper-V host and the Linux guest. It uses memory that is shared between Hyper-V and the guest, along with various signaling @@ -54,8 +54,8 @@ x86/x64 architecture only. .. _Hyper-V Top Level Functional Spec (TLFS): https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/tlfs/tlfs -VMbus is not documented. This documentation provides a high-level -overview of VMbus and how it works, but the details can be discerned +VMBus is not documented. This documentation provides a high-level +overview of VMBus and how it works, but the details can be discerned only from the code. Sharing Memory @@ -74,7 +74,7 @@ follows: physical address space. How Hyper-V is told about the GPA or list of GPAs varies. In some cases, a single GPA is written to a synthetic register. In other cases, a GPA or list of GPAs is sent - in a VMbus message. + in a VMBus message. * Hyper-V translates the GPAs into "real" physical memory addresses, and creates a virtual mapping that it can use to access the memory. @@ -133,9 +133,9 @@ only the CPUs actually present in the VM, so Linux does not report any hot-add CPUs. A Linux guest CPU may be taken offline using the normal Linux -mechanisms, provided no VMbus channel interrupts are assigned to -the CPU. See the section on VMbus Interrupts for more details -on how VMbus channel interrupts can be re-assigned to permit +mechanisms, provided no VMBus channel interrupts are assigned to +the CPU. See the section on VMBus Interrupts for more details +on how VMBus channel interrupts can be re-assigned to permit taking a CPU offline. 32-bit and 64-bit @@ -169,14 +169,14 @@ and functionality. Hyper-V indicates feature/function availability via flags in synthetic MSRs that Hyper-V provides to the guest, and the guest code tests these flags. -VMbus has its own protocol version that is negotiated during the -initial VMbus connection from the guest to Hyper-V. This version +VMBus has its own protocol version that is negotiated during the +initial VMBus connection from the guest to Hyper-V. This version number is also output to dmesg during boot. This version number is checked in a few places in the code to determine if specific functionality is present. -Furthermore, each synthetic device on VMbus also has a protocol -version that is separate from the VMbus protocol version. Device +Furthermore, each synthetic device on VMBus also has a protocol +version that is separate from the VMBus protocol version. Device drivers for these synthetic devices typically negotiate the device protocol version, and may test that protocol version to determine if specific device functionality is present. diff --git a/Documentation/virt/hyperv/vmbus.rst b/Documentation/virt/hyperv/vmbus.rst index d2012d9022c5e1..f0d83ebda626a4 100644 --- a/Documentation/virt/hyperv/vmbus.rst +++ b/Documentation/virt/hyperv/vmbus.rst @@ -1,8 +1,8 @@ .. SPDX-License-Identifier: GPL-2.0 -VMbus +VMBus ===== -VMbus is a software construct provided by Hyper-V to guest VMs. It +VMBus is a software construct provided by Hyper-V to guest VMs. It consists of a control path and common facilities used by synthetic devices that Hyper-V presents to guest VMs. The control path is used to offer synthetic devices to the guest VM and, in some cases, @@ -12,9 +12,9 @@ and the synthetic device implementation that is part of Hyper-V, and signaling primitives to allow Hyper-V and the guest to interrupt each other. -VMbus is modeled in Linux as a bus, with the expected /sys/bus/vmbus -entry in a running Linux guest. The VMbus driver (drivers/hv/vmbus_drv.c) -establishes the VMbus control path with the Hyper-V host, then +VMBus is modeled in Linux as a bus, with the expected /sys/bus/vmbus +entry in a running Linux guest. The VMBus driver (drivers/hv/vmbus_drv.c) +establishes the VMBus control path with the Hyper-V host, then registers itself as a Linux bus driver. It implements the standard bus functions for adding and removing devices to/from the bus. @@ -49,9 +49,9 @@ synthetic NIC is referred to as "netvsc" and the Linux driver for the synthetic SCSI controller is "storvsc". These drivers contain functions with names like "storvsc_connect_to_vsp". -VMbus channels +VMBus channels -------------- -An instance of a synthetic device uses VMbus channels to communicate +An instance of a synthetic device uses VMBus channels to communicate between the VSP and the VSC. Channels are bi-directional and used for passing messages. Most synthetic devices use a single channel, but the synthetic SCSI controller and synthetic NIC may use multiple @@ -73,7 +73,7 @@ write indices and some control flags, followed by the memory for the actual ring. The size of the ring is determined by the VSC in the guest and is specific to each synthetic device. The list of GPAs making up the ring is communicated to the Hyper-V host over the -VMbus control path as a GPA Descriptor List (GPADL). See function +VMBus control path as a GPA Descriptor List (GPADL). See function vmbus_establish_gpadl(). Each ring buffer is mapped into contiguous Linux kernel virtual @@ -102,9 +102,9 @@ resources. For Windows Server 2019 and later, this limit is approximately 1280 Mbytes. For versions prior to Windows Server 2019, the limit is approximately 384 Mbytes. -VMbus messages +VMBus messages -------------- -All VMbus messages have a standard header that includes the message +All VMBus messages have a standard header that includes the message length, the offset of the message payload, some flags, and a transactionID. The portion of the message after the header is unique to each VSP/VSC pair. @@ -137,7 +137,7 @@ control message contains a list of GPAs that describe the data buffer. For example, the storvsc driver uses this approach to specify the data buffers to/from which disk I/O is done. -Three functions exist to send VMbus messages: +Three functions exist to send VMBus messages: 1. vmbus_sendpacket(): Control-only messages and messages with embedded data -- no GPAs @@ -154,20 +154,20 @@ Historically, Linux guests have trusted Hyper-V to send well-formed and valid messages, and Linux drivers for synthetic devices did not fully validate messages. With the introduction of processor technologies that fully encrypt guest memory and that allow the -guest to not trust the hypervisor (AMD SNP-SEV, Intel TDX), trusting +guest to not trust the hypervisor (AMD SEV-SNP, Intel TDX), trusting the Hyper-V host is no longer a valid assumption. The drivers for -VMbus synthetic devices are being updated to fully validate any +VMBus synthetic devices are being updated to fully validate any values read from memory that is shared with Hyper-V, which includes -messages from VMbus devices. To facilitate such validation, +messages from VMBus devices. To facilitate such validation, messages read by the guest from the "in" ring buffer are copied to a temporary buffer that is not shared with Hyper-V. Validation is performed in this temporary buffer without the risk of Hyper-V maliciously modifying the message after it is validated but before it is used. -VMbus interrupts +VMBus interrupts ---------------- -VMbus provides a mechanism for the guest to interrupt the host when +VMBus provides a mechanism for the guest to interrupt the host when the guest has queued new messages in a ring buffer. The host expects that the guest will send an interrupt only when an "out" ring buffer transitions from empty to non-empty. If the guest sends @@ -177,62 +177,62 @@ interrupts, the host may throttle that guest by suspending its execution for a few seconds to prevent a denial-of-service attack. Similarly, the host will interrupt the guest when it sends a new -message on the VMbus control path, or when a VMbus channel "in" ring +message on the VMBus control path, or when a VMBus channel "in" ring buffer transitions from empty to non-empty. Each CPU in the guest -may receive VMbus interrupts, so they are best modeled as per-CPU +may receive VMBus interrupts, so they are best modeled as per-CPU interrupts in Linux. This model works well on arm64 where a single -per-CPU IRQ is allocated for VMbus. Since x86/x64 lacks support for +per-CPU IRQ is allocated for VMBus. Since x86/x64 lacks support for per-CPU IRQs, an x86 interrupt vector is statically allocated (see HYPERVISOR_CALLBACK_VECTOR) across all CPUs and explicitly coded to -call the VMbus interrupt service routine. These interrupts are +call the VMBus interrupt service routine. These interrupts are visible in /proc/interrupts on the "HYP" line. -The guest CPU that a VMbus channel will interrupt is selected by the +The guest CPU that a VMBus channel will interrupt is selected by the guest when the channel is created, and the host is informed of that -selection. VMbus devices are broadly grouped into two categories: +selection. VMBus devices are broadly grouped into two categories: -1. "Slow" devices that need only one VMbus channel. The devices +1. "Slow" devices that need only one VMBus channel. The devices (such as keyboard, mouse, heartbeat, and timesync) generate - relatively few interrupts. Their VMbus channels are all + relatively few interrupts. Their VMBus channels are all assigned to interrupt the VMBUS_CONNECT_CPU, which is always CPU 0. -2. "High speed" devices that may use multiple VMbus channels for +2. "High speed" devices that may use multiple VMBus channels for higher parallelism and performance. These devices include the - synthetic SCSI controller and synthetic NIC. Their VMbus + synthetic SCSI controller and synthetic NIC. Their VMBus channels interrupts are assigned to CPUs that are spread out among the available CPUs in the VM so that interrupts on multiple channels can be processed in parallel. -The assignment of VMbus channel interrupts to CPUs is done in the +The assignment of VMBus channel interrupts to CPUs is done in the function init_vp_index(). This assignment is done outside of the normal Linux interrupt affinity mechanism, so the interrupts are neither "unmanaged" nor "managed" interrupts. -The CPU that a VMbus channel will interrupt can be seen in +The CPU that a VMBus channel will interrupt can be seen in /sys/bus/vmbus/devices// channels//cpu. When running on later versions of Hyper-V, the CPU can be changed by writing a new value to this sysfs entry. Because the interrupt assignment is done outside of the normal Linux affinity mechanism, there are no entries in /proc/irq corresponding to individual -VMbus channel interrupts. +VMBus channel interrupts. An online CPU in a Linux guest may not be taken offline if it has -VMbus channel interrupts assigned to it. Any such channel +VMBus channel interrupts assigned to it. Any such channel interrupts must first be manually reassigned to another CPU as described above. When no channel interrupts are assigned to the CPU, it can be taken offline. -When a guest CPU receives a VMbus interrupt from the host, the +When a guest CPU receives a VMBus interrupt from the host, the function vmbus_isr() handles the interrupt. It first checks for channel interrupts by calling vmbus_chan_sched(), which looks at a bitmap setup by the host to determine which channels have pending interrupts on this CPU. If multiple channels have pending interrupts for this CPU, they are processed sequentially. When all channel interrupts have been processed, vmbus_isr() checks for and -processes any message received on the VMbus control path. +processes any message received on the VMBus control path. -The VMbus channel interrupt handling code is designed to work +The VMBus channel interrupt handling code is designed to work correctly even if an interrupt is received on a CPU other than the CPU assigned to the channel. Specifically, the code does not use CPU-based exclusion for correctness. In normal operation, Hyper-V @@ -242,23 +242,23 @@ when Hyper-V will make the transition. The code must work correctly even if there is a time lag before Hyper-V starts interrupting the new CPU. See comments in target_cpu_store(). -VMbus device creation/deletion +VMBus device creation/deletion ------------------------------ Hyper-V and the Linux guest have a separate message-passing path that is used for synthetic device creation and deletion. This -path does not use a VMbus channel. See vmbus_post_msg() and +path does not use a VMBus channel. See vmbus_post_msg() and vmbus_on_msg_dpc(). The first step is for the guest to connect to the generic -Hyper-V VMbus mechanism. As part of establishing this connection, -the guest and Hyper-V agree on a VMbus protocol version they will +Hyper-V VMBus mechanism. As part of establishing this connection, +the guest and Hyper-V agree on a VMBus protocol version they will use. This negotiation allows newer Linux kernels to run on older Hyper-V versions, and vice versa. The guest then tells Hyper-V to "send offers". Hyper-V sends an offer message to the guest for each synthetic device that the VM -is configured to have. Each VMbus device type has a fixed GUID -known as the "class ID", and each VMbus device instance is also +is configured to have. Each VMBus device type has a fixed GUID +known as the "class ID", and each VMBus device instance is also identified by a GUID. The offer message from Hyper-V contains both GUIDs to uniquely (within the VM) identify the device. There is one offer message for each device instance, so a VM with @@ -275,7 +275,7 @@ type based on the class ID, and invokes the correct driver to set up the device. Driver/device matching is performed using the standard Linux mechanism. -The device driver probe function opens the primary VMbus channel to +The device driver probe function opens the primary VMBus channel to the corresponding VSP. It allocates guest memory for the channel ring buffers and shares the ring buffer with the Hyper-V host by giving the host a list of GPAs for the ring buffer memory. See @@ -285,7 +285,7 @@ Once the ring buffer is set up, the device driver and VSP exchange setup messages via the primary channel. These messages may include negotiating the device protocol version to be used between the Linux VSC and the VSP on the Hyper-V host. The setup messages may also -include creating additional VMbus channels, which are somewhat +include creating additional VMBus channels, which are somewhat mis-named as "sub-channels" since they are functionally equivalent to the primary channel once they are created. From a0b134032e6c5552635c7142ad7f181eba2f3256 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Sat, 11 May 2024 06:38:18 -0700 Subject: [PATCH 0190/1578] Documentation: hyperv: Improve synic and interrupt handling description Current documentation does not describe how Linux handles the synthetic interrupt controller (synic) that Hyper-V provides to guest VMs, nor how VMBus or timer interrupts are handled. Add text describing the synic and reorganize existing text to make this more clear. Signed-off-by: Michael Kelley Reviewed-by: Easwar Hariharan Reviewed-by: Bagas Sanjaya Link: https://lore.kernel.org/r/20240511133818.19649-2-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20240511133818.19649-2-mhklinux@outlook.com> --- Documentation/virt/hyperv/clocks.rst | 21 +++++--- Documentation/virt/hyperv/vmbus.rst | 79 ++++++++++++++++++---------- 2 files changed, 66 insertions(+), 34 deletions(-) diff --git a/Documentation/virt/hyperv/clocks.rst b/Documentation/virt/hyperv/clocks.rst index a56f4837d44336..1760432658039c 100644 --- a/Documentation/virt/hyperv/clocks.rst +++ b/Documentation/virt/hyperv/clocks.rst @@ -62,12 +62,21 @@ shared page with scale and offset values into user space. User space code performs the same algorithm of reading the TSC and applying the scale and offset to get the constant 10 MHz clock. -Linux clockevents are based on Hyper-V synthetic timer 0. While -Hyper-V offers 4 synthetic timers for each CPU, Linux only uses -timer 0. Interrupts from stimer0 are recorded on the "HVS" line in -/proc/interrupts. Clockevents based on the virtualized PIT and -local APIC timer also work, but the Hyper-V synthetic timer is -preferred. +Linux clockevents are based on Hyper-V synthetic timer 0 (stimer0). +While Hyper-V offers 4 synthetic timers for each CPU, Linux only uses +timer 0. In older versions of Hyper-V, an interrupt from stimer0 +results in a VMBus control message that is demultiplexed by +vmbus_isr() as described in the Documentation/virt/hyperv/vmbus.rst +documentation. In newer versions of Hyper-V, stimer0 interrupts can +be mapped to an architectural interrupt, which is referred to as +"Direct Mode". Linux prefers to use Direct Mode when available. Since +x86/x64 doesn't support per-CPU interrupts, Direct Mode statically +allocates an x86 interrupt vector (HYPERV_STIMER0_VECTOR) across all CPUs +and explicitly codes it to call the stimer0 interrupt handler. Hence +interrupts from stimer0 are recorded on the "HVS" line in /proc/interrupts +rather than being associated with a Linux IRQ. Clockevents based on the +virtualized PIT and local APIC timer also work, but Hyper-V stimer0 +is preferred. The driver for the Hyper-V synthetic system clock and timers is drivers/clocksource/hyperv_timer.c. diff --git a/Documentation/virt/hyperv/vmbus.rst b/Documentation/virt/hyperv/vmbus.rst index f0d83ebda626a4..1dcef6a7fda3fd 100644 --- a/Documentation/virt/hyperv/vmbus.rst +++ b/Documentation/virt/hyperv/vmbus.rst @@ -102,10 +102,10 @@ resources. For Windows Server 2019 and later, this limit is approximately 1280 Mbytes. For versions prior to Windows Server 2019, the limit is approximately 384 Mbytes. -VMBus messages --------------- -All VMBus messages have a standard header that includes the message -length, the offset of the message payload, some flags, and a +VMBus channel messages +---------------------- +All messages sent in a VMBus channel have a standard header that includes +the message length, the offset of the message payload, some flags, and a transactionID. The portion of the message after the header is unique to each VSP/VSC pair. @@ -137,7 +137,7 @@ control message contains a list of GPAs that describe the data buffer. For example, the storvsc driver uses this approach to specify the data buffers to/from which disk I/O is done. -Three functions exist to send VMBus messages: +Three functions exist to send VMBus channel messages: 1. vmbus_sendpacket(): Control-only messages and messages with embedded data -- no GPAs @@ -165,6 +165,37 @@ performed in this temporary buffer without the risk of Hyper-V maliciously modifying the message after it is validated but before it is used. +Synthetic Interrupt Controller (synic) +-------------------------------------- +Hyper-V provides each guest CPU with a synthetic interrupt controller +that is used by VMBus for host-guest communication. While each synic +defines 16 synthetic interrupts (SINT), Linux uses only one of the 16 +(VMBUS_MESSAGE_SINT). All interrupts related to communication between +the Hyper-V host and a guest CPU use that SINT. + +The SINT is mapped to a single per-CPU architectural interrupt (i.e, +an 8-bit x86/x64 interrupt vector, or an arm64 PPI INTID). Because +each CPU in the guest has a synic and may receive VMBus interrupts, +they are best modeled in Linux as per-CPU interrupts. This model works +well on arm64 where a single per-CPU Linux IRQ is allocated for +VMBUS_MESSAGE_SINT. This IRQ appears in /proc/interrupts as an IRQ labelled +"Hyper-V VMbus". Since x86/x64 lacks support for per-CPU IRQs, an x86 +interrupt vector is statically allocated (HYPERVISOR_CALLBACK_VECTOR) +across all CPUs and explicitly coded to call vmbus_isr(). In this case, +there's no Linux IRQ, and the interrupts are visible in aggregate in +/proc/interrupts on the "HYP" line. + +The synic provides the means to demultiplex the architectural interrupt into +one or more logical interrupts and route the logical interrupt to the proper +VMBus handler in Linux. This demultiplexing is done by vmbus_isr() and +related functions that access synic data structures. + +The synic is not modeled in Linux as an irq chip or irq domain, +and the demultiplexed logical interrupts are not Linux IRQs. As such, +they don't appear in /proc/interrupts or /proc/irq. The CPU +affinity for one of these logical interrupts is controlled via an +entry under /sys/bus/vmbus as described below. + VMBus interrupts ---------------- VMBus provides a mechanism for the guest to interrupt the host when @@ -176,16 +207,18 @@ unnecessary. If a guest sends an excessive number of unnecessary interrupts, the host may throttle that guest by suspending its execution for a few seconds to prevent a denial-of-service attack. -Similarly, the host will interrupt the guest when it sends a new -message on the VMBus control path, or when a VMBus channel "in" ring -buffer transitions from empty to non-empty. Each CPU in the guest -may receive VMBus interrupts, so they are best modeled as per-CPU -interrupts in Linux. This model works well on arm64 where a single -per-CPU IRQ is allocated for VMBus. Since x86/x64 lacks support for -per-CPU IRQs, an x86 interrupt vector is statically allocated (see -HYPERVISOR_CALLBACK_VECTOR) across all CPUs and explicitly coded to -call the VMBus interrupt service routine. These interrupts are -visible in /proc/interrupts on the "HYP" line. +Similarly, the host will interrupt the guest via the synic when +it sends a new message on the VMBus control path, or when a VMBus +channel "in" ring buffer transitions from empty to non-empty due to +the host inserting a new VMBus channel message. The control message stream +and each VMBus channel "in" ring buffer are separate logical interrupts +that are demultiplexed by vmbus_isr(). It demultiplexes by first checking +for channel interrupts by calling vmbus_chan_sched(), which looks at a synic +bitmap to determine which channels have pending interrupts on this CPU. +If multiple channels have pending interrupts for this CPU, they are +processed sequentially. When all channel interrupts have been processed, +vmbus_isr() checks for and processes any messages received on the VMBus +control path. The guest CPU that a VMBus channel will interrupt is selected by the guest when the channel is created, and the host is informed of that @@ -212,10 +245,9 @@ neither "unmanaged" nor "managed" interrupts. The CPU that a VMBus channel will interrupt can be seen in /sys/bus/vmbus/devices// channels//cpu. When running on later versions of Hyper-V, the CPU can be changed -by writing a new value to this sysfs entry. Because the interrupt -assignment is done outside of the normal Linux affinity mechanism, -there are no entries in /proc/irq corresponding to individual -VMBus channel interrupts. +by writing a new value to this sysfs entry. Because VMBus channel +interrupts are not Linux IRQs, there are no entries in /proc/interrupts +or /proc/irq corresponding to individual VMBus channel interrupts. An online CPU in a Linux guest may not be taken offline if it has VMBus channel interrupts assigned to it. Any such channel @@ -223,15 +255,6 @@ interrupts must first be manually reassigned to another CPU as described above. When no channel interrupts are assigned to the CPU, it can be taken offline. -When a guest CPU receives a VMBus interrupt from the host, the -function vmbus_isr() handles the interrupt. It first checks for -channel interrupts by calling vmbus_chan_sched(), which looks at a -bitmap setup by the host to determine which channels have pending -interrupts on this CPU. If multiple channels have pending -interrupts for this CPU, they are processed sequentially. When all -channel interrupts have been processed, vmbus_isr() checks for and -processes any message received on the VMBus control path. - The VMBus channel interrupt handling code is designed to work correctly even if an interrupt is received on a CPU other than the CPU assigned to the channel. Specifically, the code does not use From fd7ccfb112302c6941f03095544c10879ced98be Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 17 May 2024 09:42:04 +0300 Subject: [PATCH 0191/1578] media: Documentation: v4l: Fix ACTIVE route flag The documentation in one occasion mentions the VIDIOC_SUBDEV_STREAM_FL_ACTIVE flag. This was meant to be V4L2_SUBDEV_STREAM_FL_ACTIVE as it's a flag, not an IOCTL. Fix it. Fixes: cd2c75454d74 ("media: Documentation: Document S_ROUTING behaviour") Reported-by: Samuel Wein PhD Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil --- Documentation/userspace-api/media/v4l/dev-subdev.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/userspace-api/media/v4l/dev-subdev.rst b/Documentation/userspace-api/media/v4l/dev-subdev.rst index 0f9eda3187f349..161b43f1ce66ab 100644 --- a/Documentation/userspace-api/media/v4l/dev-subdev.rst +++ b/Documentation/userspace-api/media/v4l/dev-subdev.rst @@ -582,7 +582,7 @@ depending on the hardware. In all cases, however, only routes that have the Devices generating the streams may allow enabling and disabling some of the routes or have a fixed routing configuration. If the routes can be disabled, not declaring the routes (or declaring them without -``VIDIOC_SUBDEV_STREAM_FL_ACTIVE`` flag set) in ``VIDIOC_SUBDEV_S_ROUTING`` will +``V4L2_SUBDEV_STREAM_FL_ACTIVE`` flag set) in ``VIDIOC_SUBDEV_S_ROUTING`` will disable the routes. ``VIDIOC_SUBDEV_S_ROUTING`` will still return such routes back to the user in the routes array, with the ``V4L2_SUBDEV_STREAM_FL_ACTIVE`` flag unset. From fe61b2906bd046535f4ef7dfcd69562f531ccd38 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 20 May 2024 11:55:54 +0200 Subject: [PATCH 0192/1578] media: intel/ipu6: Fix some redundant resources freeing in ipu6_pci_remove() pcim_iomap_regions() and pcim_enable_device() are used in the probe. So the corresponding managed resources don't need to be freed explicitly in the remove function. Remove the incorrect pci_release_regions() and pci_disable_device() calls. Fixes: 25fedc021985 ("media: intel/ipu6: add Intel IPU6 PCI device driver") Signed-off-by: Christophe JAILLET Reviewed-by: Bingbu Cao Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil --- drivers/media/pci/intel/ipu6/ipu6.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/media/pci/intel/ipu6/ipu6.c b/drivers/media/pci/intel/ipu6/ipu6.c index d2bebd20846151..f587f609259d7c 100644 --- a/drivers/media/pci/intel/ipu6/ipu6.c +++ b/drivers/media/pci/intel/ipu6/ipu6.c @@ -727,9 +727,6 @@ static void ipu6_pci_remove(struct pci_dev *pdev) pm_runtime_forbid(&pdev->dev); pm_runtime_get_noresume(&pdev->dev); - pci_release_regions(pdev); - pci_disable_device(pdev); - release_firmware(isp->cpd_fw); ipu6_mmu_cleanup(psys_mmu); From 266b44ec9a26dd2ad5d1dfcbb2f7f4e2c10da3fb Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 26 May 2024 09:59:56 +0200 Subject: [PATCH 0193/1578] media: intel/ipu6: Move isys_remove() close to isys_probe() In preparation to fixing a leak in isys_probe(), move isys_remove(). The fix will introduce a new function that will also be called from isys_remove(). The code needs to be rearranged to avoid a forward declaration. Having the .remove function close to the .probe function is also more standard. Signed-off-by: Christophe JAILLET Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil --- drivers/media/pci/intel/ipu6/ipu6-isys.c | 70 ++++++++++++------------ 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/drivers/media/pci/intel/ipu6/ipu6-isys.c b/drivers/media/pci/intel/ipu6/ipu6-isys.c index 5992138c7290dc..7ce2047a09b597 100644 --- a/drivers/media/pci/intel/ipu6/ipu6-isys.c +++ b/drivers/media/pci/intel/ipu6/ipu6-isys.c @@ -925,41 +925,6 @@ static const struct dev_pm_ops isys_pm_ops = { .resume = isys_resume, }; -static void isys_remove(struct auxiliary_device *auxdev) -{ - struct ipu6_bus_device *adev = auxdev_to_adev(auxdev); - struct ipu6_isys *isys = dev_get_drvdata(&auxdev->dev); - struct ipu6_device *isp = adev->isp; - struct isys_fw_msgs *fwmsg, *safe; - unsigned int i; - - list_for_each_entry_safe(fwmsg, safe, &isys->framebuflist, head) - dma_free_attrs(&auxdev->dev, sizeof(struct isys_fw_msgs), - fwmsg, fwmsg->dma_addr, 0); - - list_for_each_entry_safe(fwmsg, safe, &isys->framebuflist_fw, head) - dma_free_attrs(&auxdev->dev, sizeof(struct isys_fw_msgs), - fwmsg, fwmsg->dma_addr, 0); - - isys_unregister_devices(isys); - isys_notifier_cleanup(isys); - - cpu_latency_qos_remove_request(&isys->pm_qos); - - if (!isp->secure_mode) { - ipu6_cpd_free_pkg_dir(adev); - ipu6_buttress_unmap_fw_image(adev, &adev->fw_sgt); - release_firmware(adev->fw); - } - - for (i = 0; i < IPU6_ISYS_MAX_STREAMS; i++) - mutex_destroy(&isys->streams[i].mutex); - - isys_iwake_watermark_cleanup(isys); - mutex_destroy(&isys->stream_mutex); - mutex_destroy(&isys->mutex); -} - static int alloc_fw_msg_bufs(struct ipu6_isys *isys, int amount) { struct device *dev = &isys->adev->auxdev.dev; @@ -1167,6 +1132,41 @@ static int isys_probe(struct auxiliary_device *auxdev, return ret; } +static void isys_remove(struct auxiliary_device *auxdev) +{ + struct ipu6_bus_device *adev = auxdev_to_adev(auxdev); + struct ipu6_isys *isys = dev_get_drvdata(&auxdev->dev); + struct ipu6_device *isp = adev->isp; + struct isys_fw_msgs *fwmsg, *safe; + unsigned int i; + + list_for_each_entry_safe(fwmsg, safe, &isys->framebuflist, head) + dma_free_attrs(&auxdev->dev, sizeof(struct isys_fw_msgs), + fwmsg, fwmsg->dma_addr, 0); + + list_for_each_entry_safe(fwmsg, safe, &isys->framebuflist_fw, head) + dma_free_attrs(&auxdev->dev, sizeof(struct isys_fw_msgs), + fwmsg, fwmsg->dma_addr, 0); + + isys_unregister_devices(isys); + isys_notifier_cleanup(isys); + + cpu_latency_qos_remove_request(&isys->pm_qos); + + if (!isp->secure_mode) { + ipu6_cpd_free_pkg_dir(adev); + ipu6_buttress_unmap_fw_image(adev, &adev->fw_sgt); + release_firmware(adev->fw); + } + + for (i = 0; i < IPU6_ISYS_MAX_STREAMS; i++) + mutex_destroy(&isys->streams[i].mutex); + + isys_iwake_watermark_cleanup(isys); + mutex_destroy(&isys->stream_mutex); + mutex_destroy(&isys->mutex); +} + struct fwmsg { int type; char *msg; From ab0ed481012811fed052f609b364c118ce8a576e Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 26 May 2024 09:59:57 +0200 Subject: [PATCH 0194/1578] media: intel/ipu6: Fix an error handling path in isys_probe() If an error occurs after a successful alloc_fw_msg_bufs() call, some resources should be released as already done in the remove function. Add a new free_fw_msg_bufs() function that releases what has been allocated by alloc_fw_msg_bufs(). Also use this new function in isys_remove() to avoid some code duplication. Fixes: f50c4ca0a820 ("media: intel/ipu6: add the main input system driver") Signed-off-by: Christophe JAILLET Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil --- drivers/media/pci/intel/ipu6/ipu6-isys.c | 27 ++++++++++++++++-------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/drivers/media/pci/intel/ipu6/ipu6-isys.c b/drivers/media/pci/intel/ipu6/ipu6-isys.c index 7ce2047a09b597..1998b72ac07de9 100644 --- a/drivers/media/pci/intel/ipu6/ipu6-isys.c +++ b/drivers/media/pci/intel/ipu6/ipu6-isys.c @@ -925,6 +925,20 @@ static const struct dev_pm_ops isys_pm_ops = { .resume = isys_resume, }; +static void free_fw_msg_bufs(struct ipu6_isys *isys) +{ + struct device *dev = &isys->adev->auxdev.dev; + struct isys_fw_msgs *fwmsg, *safe; + + list_for_each_entry_safe(fwmsg, safe, &isys->framebuflist, head) + dma_free_attrs(dev, sizeof(struct isys_fw_msgs), fwmsg, + fwmsg->dma_addr, 0); + + list_for_each_entry_safe(fwmsg, safe, &isys->framebuflist_fw, head) + dma_free_attrs(dev, sizeof(struct isys_fw_msgs), fwmsg, + fwmsg->dma_addr, 0); +} + static int alloc_fw_msg_bufs(struct ipu6_isys *isys, int amount) { struct device *dev = &isys->adev->auxdev.dev; @@ -1105,12 +1119,14 @@ static int isys_probe(struct auxiliary_device *auxdev, ret = isys_register_devices(isys); if (ret) - goto out_remove_pkg_dir_shared_buffer; + goto free_fw_msg_bufs; ipu6_mmu_hw_cleanup(adev->mmu); return 0; +free_fw_msg_bufs: + free_fw_msg_bufs(isys); out_remove_pkg_dir_shared_buffer: if (!isp->secure_mode) ipu6_cpd_free_pkg_dir(adev); @@ -1137,16 +1153,9 @@ static void isys_remove(struct auxiliary_device *auxdev) struct ipu6_bus_device *adev = auxdev_to_adev(auxdev); struct ipu6_isys *isys = dev_get_drvdata(&auxdev->dev); struct ipu6_device *isp = adev->isp; - struct isys_fw_msgs *fwmsg, *safe; unsigned int i; - list_for_each_entry_safe(fwmsg, safe, &isys->framebuflist, head) - dma_free_attrs(&auxdev->dev, sizeof(struct isys_fw_msgs), - fwmsg, fwmsg->dma_addr, 0); - - list_for_each_entry_safe(fwmsg, safe, &isys->framebuflist_fw, head) - dma_free_attrs(&auxdev->dev, sizeof(struct isys_fw_msgs), - fwmsg, fwmsg->dma_addr, 0); + free_fw_msg_bufs(isys); isys_unregister_devices(isys); isys_notifier_cleanup(isys); From c19fa08c1414644b0d9275d336bdaff90af57d0b Mon Sep 17 00:00:00 2001 From: Bingbu Cao Date: Mon, 27 May 2024 18:48:57 +0800 Subject: [PATCH 0195/1578] media: intel/ipu6: fix the buffer flags caused by wrong parentheses The buffer flags is set by wrong due to wrong parentheses, the FL_INCOMING flag is never taken an account. Fix it by wrapping the ternary conditional operation with parentheses. Fixes: 3c1dfb5a69cf ("media: intel/ipu6: input system video nodes and buffer queues") Signed-off-by: Bingbu Cao Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil --- drivers/media/pci/intel/ipu6/ipu6-isys-queue.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/media/pci/intel/ipu6/ipu6-isys-queue.c b/drivers/media/pci/intel/ipu6/ipu6-isys-queue.c index 40a8ebfcfce2d2..4bd4e324abc9f6 100644 --- a/drivers/media/pci/intel/ipu6/ipu6-isys-queue.c +++ b/drivers/media/pci/intel/ipu6/ipu6-isys-queue.c @@ -301,10 +301,10 @@ static int ipu6_isys_stream_start(struct ipu6_isys_video *av, out_requeue: if (bl && bl->nbufs) ipu6_isys_buffer_list_queue(bl, - (IPU6_ISYS_BUFFER_LIST_FL_INCOMING | - error) ? + IPU6_ISYS_BUFFER_LIST_FL_INCOMING | + (error ? IPU6_ISYS_BUFFER_LIST_FL_SET_STATE : - 0, error ? VB2_BUF_STATE_ERROR : + 0), error ? VB2_BUF_STATE_ERROR : VB2_BUF_STATE_QUEUED); flush_firmware_streamon_fail(stream); From 77b79df0268bee3ef38fd5e76e86a076ce02995d Mon Sep 17 00:00:00 2001 From: Himal Prasad Ghimiray Date: Wed, 8 May 2024 20:52:15 +0530 Subject: [PATCH 0196/1578] drm/xe: Change pcode timeout to 50msec while polling again MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Polling is initially attempted with timeout_base_ms enabled for preemption, and if it exceeds this timeframe, another attempt is made without preemption, allowing an additional 50 ms before timing out. v2 - Rebase v3 - Move warnings to separate patch (Lucas) Cc: Lucas De Marchi Cc: Rodrigo Vivi Signed-off-by: Himal Prasad Ghimiray Fixes: 7dc9b92dcfef ("drm/xe: Remove i915_utils dependency from xe_pcode.") Reviewed-by: Lucas De Marchi Link: https://patchwork.freedesktop.org/patch/msgid/20240508152216.3263109-2-himal.prasad.ghimiray@intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit c81858eb52266b3d6ba28ca4f62a198231a10cdc) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_pcode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_pcode.c b/drivers/gpu/drm/xe/xe_pcode.c index c010ef16fbf574..a5e7da8cf94416 100644 --- a/drivers/gpu/drm/xe/xe_pcode.c +++ b/drivers/gpu/drm/xe/xe_pcode.c @@ -191,7 +191,7 @@ int xe_pcode_request(struct xe_gt *gt, u32 mbox, u32 request, drm_WARN_ON_ONCE(>_to_xe(gt)->drm, timeout_base_ms > 1); preempt_disable(); ret = pcode_try_request(gt, mbox, request, reply_mask, reply, &status, - true, timeout_base_ms * 1000, true); + true, 50 * 1000, true); preempt_enable(); out: From c8ea2c31f5ea437199b239d76ad5db27343edb0c Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Mon, 15 Apr 2024 12:04:53 -0700 Subject: [PATCH 0197/1578] drm/xe: Only use reserved BCS instances for usm migrate exec queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The GuC context scheduling queue is 2 entires deep, thus it is possible for a migration job to be stuck behind a fault if migration exec queue shares engines with user jobs. This can deadlock as the migrate exec queue is required to service page faults. Avoid deadlock by only using reserved BCS instances for usm migrate exec queue. Fixes: a043fbab7af5 ("drm/xe/pvc: Use fast copy engines as migrate engine on PVC") Cc: Matt Roper Cc: Niranjana Vishwanathapura Signed-off-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20240415190453.696553-2-matthew.brost@intel.com Reviewed-by: Brian Welty (cherry picked from commit 04f4a70a183a688a60fe3882d6e4236ea02cfc67) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_migrate.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c index 9f6e9b7f11c848..65e5a3f4c340f0 100644 --- a/drivers/gpu/drm/xe/xe_migrate.c +++ b/drivers/gpu/drm/xe/xe_migrate.c @@ -34,7 +34,6 @@ #include "xe_sync.h" #include "xe_trace.h" #include "xe_vm.h" -#include "xe_wa.h" /** * struct xe_migrate - migrate context. @@ -300,10 +299,6 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m, } /* - * Due to workaround 16017236439, odd instance hardware copy engines are - * faster than even instance ones. - * This function returns the mask involving all fast copy engines and the - * reserved copy engine to be used as logical mask for migrate engine. * Including the reserved copy engine is required to avoid deadlocks due to * migrate jobs servicing the faults gets stuck behind the job that faulted. */ @@ -317,8 +312,7 @@ static u32 xe_migrate_usm_logical_mask(struct xe_gt *gt) if (hwe->class != XE_ENGINE_CLASS_COPY) continue; - if (!XE_WA(gt, 16017236439) || - xe_gt_is_usm_hwe(gt, hwe) || hwe->instance & 1) + if (xe_gt_is_usm_hwe(gt, hwe)) logical_mask |= BIT(hwe->logical_instance); } @@ -369,6 +363,10 @@ struct xe_migrate *xe_migrate_init(struct xe_tile *tile) if (!hwe || !logical_mask) return ERR_PTR(-EINVAL); + /* + * XXX: Currently only reserving 1 (likely slow) BCS instance on + * PVC, may want to revisit if performance is needed. + */ m->q = xe_exec_queue_create(xe, vm, logical_mask, 1, hwe, EXEC_QUEUE_FLAG_KERNEL | EXEC_QUEUE_FLAG_PERMANENT | From 6c5cd0807c79eb4c0cda70b48f6be668a241d584 Mon Sep 17 00:00:00 2001 From: Niranjana Vishwanathapura Date: Tue, 21 May 2024 13:17:11 -0700 Subject: [PATCH 0198/1578] drm/xe: Properly handle alloc_guc_id() failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Release the submission_state lock if alloc_guc_id() fails. v2: Add Fixes tag and CC stable kernel Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Cc: # v6.8+ Signed-off-by: Niranjana Vishwanathapura Reviewed-by: Nirmoy Das Reviewed-by: Matthew Brost Signed-off-by: José Roberto de Souza Link: https://patchwork.freedesktop.org/patch/msgid/20240521201711.4934-1-niranjana.vishwanathapura@intel.com (cherry picked from commit 40672b792a36894aff3a337b695f6136ee6ac5d4) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_guc_submit.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index c7d38469fb4690..e4e3658e6a1384 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -1240,6 +1240,7 @@ static int guc_exec_queue_init(struct xe_exec_queue *q) return 0; err_entity: + mutex_unlock(&guc->submission_state.lock); xe_sched_entity_fini(&ge->entity); err_sched: xe_sched_fini(&ge->sched); From db03d39053a97d2f2a6baec025ebdacbab5886d2 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 27 May 2024 15:44:48 +0200 Subject: [PATCH 0199/1578] ovl: fix copy-up in tmpfile Move ovl_copy_up() call outside of ovl_want_write()/ovl_drop_write() region, since copy up may also call ovl_want_write() resulting in recursive locking on sb->s_writers. Reported-and-tested-by: syzbot+85e58cdf5b3136471d4b@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/000000000000f6865106191c3e58@google.com/ Fixes: 9a87907de359 ("ovl: implement tmpfile") Signed-off-by: Miklos Szeredi --- fs/overlayfs/dir.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 116f542442ddd6..ab65e98a1defdd 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -1314,10 +1314,6 @@ static int ovl_create_tmpfile(struct file *file, struct dentry *dentry, int flags = file->f_flags | OVL_OPEN_FLAGS; int err; - err = ovl_copy_up(dentry->d_parent); - if (err) - return err; - old_cred = ovl_override_creds(dentry->d_sb); err = ovl_setup_cred_for_create(dentry, inode, mode, old_cred); if (err) @@ -1360,6 +1356,10 @@ static int ovl_tmpfile(struct mnt_idmap *idmap, struct inode *dir, if (!OVL_FS(dentry->d_sb)->tmpfile) return -EOPNOTSUPP; + err = ovl_copy_up(dentry->d_parent); + if (err) + return err; + err = ovl_want_write(dentry); if (err) return err; From 195aba96b854dd664768f382cd1db375d8181f88 Mon Sep 17 00:00:00 2001 From: "Matthew R. Ochs" Date: Wed, 22 May 2024 15:06:40 +0300 Subject: [PATCH 0200/1578] tpm_tis_spi: Account for SPI header when allocating TPM SPI xfer buffer The TPM SPI transfer mechanism uses MAX_SPI_FRAMESIZE for computing the maximum transfer length and the size of the transfer buffer. As such, it does not account for the 4 bytes of header that prepends the SPI data frame. This can result in out-of-bounds accesses and was confirmed with KASAN. Introduce SPI_HDRSIZE to account for the header and use to allocate the transfer buffer. Fixes: a86a42ac2bd6 ("tpm_tis_spi: Add hardware wait polling") Signed-off-by: Matthew R. Ochs Tested-by: Carol Soto Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_tis_spi_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/char/tpm/tpm_tis_spi_main.c b/drivers/char/tpm/tpm_tis_spi_main.c index 3f9eaf27b41b89..c9eca24bbad478 100644 --- a/drivers/char/tpm/tpm_tis_spi_main.c +++ b/drivers/char/tpm/tpm_tis_spi_main.c @@ -37,6 +37,7 @@ #include "tpm_tis_spi.h" #define MAX_SPI_FRAMESIZE 64 +#define SPI_HDRSIZE 4 /* * TCG SPI flow control is documented in section 6.4 of the spec[1]. In short, @@ -247,7 +248,7 @@ static int tpm_tis_spi_write_bytes(struct tpm_tis_data *data, u32 addr, int tpm_tis_spi_init(struct spi_device *spi, struct tpm_tis_spi_phy *phy, int irq, const struct tpm_tis_phy_ops *phy_ops) { - phy->iobuf = devm_kmalloc(&spi->dev, MAX_SPI_FRAMESIZE, GFP_KERNEL); + phy->iobuf = devm_kmalloc(&spi->dev, SPI_HDRSIZE + MAX_SPI_FRAMESIZE, GFP_KERNEL); if (!phy->iobuf) return -ENOMEM; From f3d7ba9e1bc0c9080834f263d4887bd9c9ea491f Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Mon, 27 May 2024 13:56:27 +0300 Subject: [PATCH 0201/1578] tpm: Open code tpm_buf_parameters() With only single call site, this makes no sense (slipped out of the radar during the review). Open code and document the action directly to the site, to make it more readable. Fixes: 1b6d7f9eb150 ("tpm: add session encryption protection to tpm2_get_random()") Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm-buf.c | 26 -------------------------- drivers/char/tpm/tpm2-cmd.c | 10 +++++++++- include/linux/tpm.h | 2 -- 3 files changed, 9 insertions(+), 29 deletions(-) diff --git a/drivers/char/tpm/tpm-buf.c b/drivers/char/tpm/tpm-buf.c index 647c6ca92ac3ca..cad0048bcc3c68 100644 --- a/drivers/char/tpm/tpm-buf.c +++ b/drivers/char/tpm/tpm-buf.c @@ -223,30 +223,4 @@ u32 tpm_buf_read_u32(struct tpm_buf *buf, off_t *offset) } EXPORT_SYMBOL_GPL(tpm_buf_read_u32); -static u16 tpm_buf_tag(struct tpm_buf *buf) -{ - struct tpm_header *head = (struct tpm_header *)buf->data; - - return be16_to_cpu(head->tag); -} - -/** - * tpm_buf_parameters - return the TPM response parameters area of the tpm_buf - * @buf: tpm_buf to use - * - * Where the parameters are located depends on the tag of a TPM - * command (it's immediately after the header for TPM_ST_NO_SESSIONS - * or 4 bytes after for TPM_ST_SESSIONS). Evaluate this and return a - * pointer to the first byte of the parameters area. - * - * @return: pointer to parameters area - */ -u8 *tpm_buf_parameters(struct tpm_buf *buf) -{ - int offset = TPM_HEADER_SIZE; - - if (tpm_buf_tag(buf) == TPM2_ST_SESSIONS) - offset += 4; - return &buf->data[offset]; -} diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c index 0cdf892ec2a7d4..1e856259219e2e 100644 --- a/drivers/char/tpm/tpm2-cmd.c +++ b/drivers/char/tpm/tpm2-cmd.c @@ -281,6 +281,7 @@ struct tpm2_get_random_out { int tpm2_get_random(struct tpm_chip *chip, u8 *dest, size_t max) { struct tpm2_get_random_out *out; + struct tpm_header *head; struct tpm_buf buf; u32 recd; u32 num_bytes = max; @@ -288,6 +289,7 @@ int tpm2_get_random(struct tpm_chip *chip, u8 *dest, size_t max) int total = 0; int retries = 5; u8 *dest_ptr = dest; + off_t offset; if (!num_bytes || max > TPM_MAX_RNG_DATA) return -EINVAL; @@ -320,7 +322,13 @@ int tpm2_get_random(struct tpm_chip *chip, u8 *dest, size_t max) goto out; } - out = (struct tpm2_get_random_out *)tpm_buf_parameters(&buf); + head = (struct tpm_header *)buf.data; + offset = TPM_HEADER_SIZE; + /* Skip the parameter size field: */ + if (be16_to_cpu(head->tag) == TPM2_ST_SESSIONS) + offset += 4; + + out = (struct tpm2_get_random_out *)&buf.data[offset]; recd = min_t(u32, be16_to_cpu(out->size), num_bytes); if (tpm_buf_length(&buf) < TPM_HEADER_SIZE + diff --git a/include/linux/tpm.h b/include/linux/tpm.h index c17e4efbb2e5cd..b3217200df28a5 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -437,8 +437,6 @@ u8 tpm_buf_read_u8(struct tpm_buf *buf, off_t *offset); u16 tpm_buf_read_u16(struct tpm_buf *buf, off_t *offset); u32 tpm_buf_read_u32(struct tpm_buf *buf, off_t *offset); -u8 *tpm_buf_parameters(struct tpm_buf *buf); - /* * Check if TPM device is in the firmware upgrade mode. */ From 4b4647add7d3c8530493f7247d11e257ee425bf0 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Fri, 24 May 2024 11:47:02 -0300 Subject: [PATCH 0202/1578] sock_map: avoid race between sock_map_close and sk_psock_put sk_psock_get will return NULL if the refcount of psock has gone to 0, which will happen when the last call of sk_psock_put is done. However, sk_psock_drop may not have finished yet, so the close callback will still point to sock_map_close despite psock being NULL. This can be reproduced with a thread deleting an element from the sock map, while the second one creates a socket, adds it to the map and closes it. That will trigger the WARN_ON_ONCE: ------------[ cut here ]------------ WARNING: CPU: 1 PID: 7220 at net/core/sock_map.c:1701 sock_map_close+0x2a2/0x2d0 net/core/sock_map.c:1701 Modules linked in: CPU: 1 PID: 7220 Comm: syz-executor380 Not tainted 6.9.0-syzkaller-07726-g3c999d1ae3c7 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/02/2024 RIP: 0010:sock_map_close+0x2a2/0x2d0 net/core/sock_map.c:1701 Code: df e8 92 29 88 f8 48 8b 1b 48 89 d8 48 c1 e8 03 42 80 3c 20 00 74 08 48 89 df e8 79 29 88 f8 4c 8b 23 eb 89 e8 4f 15 23 f8 90 <0f> 0b 90 48 83 c4 08 5b 41 5c 41 5d 41 5e 41 5f 5d e9 13 26 3d 02 RSP: 0018:ffffc9000441fda8 EFLAGS: 00010293 RAX: ffffffff89731ae1 RBX: ffffffff94b87540 RCX: ffff888029470000 RDX: 0000000000000000 RSI: ffffffff8bcab5c0 RDI: ffffffff8c1faba0 RBP: 0000000000000000 R08: ffffffff92f9b61f R09: 1ffffffff25f36c3 R10: dffffc0000000000 R11: fffffbfff25f36c4 R12: ffffffff89731840 R13: ffff88804b587000 R14: ffff88804b587000 R15: ffffffff89731870 FS: 000055555e080380(0000) GS:ffff8880b9500000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000000207d4000 CR4: 0000000000350ef0 Call Trace: unix_release+0x87/0xc0 net/unix/af_unix.c:1048 __sock_release net/socket.c:659 [inline] sock_close+0xbe/0x240 net/socket.c:1421 __fput+0x42b/0x8a0 fs/file_table.c:422 __do_sys_close fs/open.c:1556 [inline] __se_sys_close fs/open.c:1541 [inline] __x64_sys_close+0x7f/0x110 fs/open.c:1541 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf5/0x240 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fb37d618070 Code: 00 00 48 c7 c2 b8 ff ff ff f7 d8 64 89 02 b8 ff ff ff ff eb d4 e8 10 2c 00 00 80 3d 31 f0 07 00 00 74 17 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 48 c3 0f 1f 80 00 00 00 00 48 83 ec 18 89 7c RSP: 002b:00007ffcd4a525d8 EFLAGS: 00000202 ORIG_RAX: 0000000000000003 RAX: ffffffffffffffda RBX: 0000000000000005 RCX: 00007fb37d618070 RDX: 0000000000000010 RSI: 00000000200001c0 RDI: 0000000000000004 RBP: 0000000000000000 R08: 0000000100000000 R09: 0000000100000000 R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Use sk_psock, which will only check that the pointer is not been set to NULL yet, which should only happen after the callbacks are restored. If, then, a reference can still be gotten, we may call sk_psock_stop and cancel psock->work. As suggested by Paolo Abeni, reorder the condition so the control flow is less convoluted. After that change, the reproducer does not trigger the WARN_ON_ONCE anymore. Suggested-by: Paolo Abeni Reported-by: syzbot+07a2e4a1a57118ef7355@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=07a2e4a1a57118ef7355 Fixes: aadb2bb83ff7 ("sock_map: Fix a potential use-after-free in sock_map_close()") Fixes: 5b4a79ba65a1 ("bpf, sockmap: Don't let sock_map_{close,destroy,unhash} call itself") Cc: stable@vger.kernel.org Signed-off-by: Thadeu Lima de Souza Cascardo Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20240524144702.1178377-1-cascardo@igalia.com Signed-off-by: Paolo Abeni --- net/core/sock_map.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 63c016b4c1696a..d3dbb92153f2fe 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1674,19 +1674,23 @@ void sock_map_close(struct sock *sk, long timeout) lock_sock(sk); rcu_read_lock(); - psock = sk_psock_get(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - release_sock(sk); - saved_close = READ_ONCE(sk->sk_prot)->close; - } else { + psock = sk_psock(sk); + if (likely(psock)) { saved_close = psock->saved_close; sock_map_remove_links(sk, psock); + psock = sk_psock_get(sk); + if (unlikely(!psock)) + goto no_psock; rcu_read_unlock(); sk_psock_stop(psock); release_sock(sk); cancel_delayed_work_sync(&psock->work); sk_psock_put(sk, psock); + } else { + saved_close = READ_ONCE(sk->sk_prot)->close; +no_psock: + rcu_read_unlock(); + release_sock(sk); } /* Make sure we do not recurse. This is a bug. From f09fc6cee0dcfc38148ee6b6dd04f93e353d22f2 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 28 May 2024 12:52:21 +0300 Subject: [PATCH 0203/1578] tpm: Rename TPM2_OA_TMPL to TPM2_OA_NULL_KEY and make it local Rename and document TPM2_OA_TMPL, as originally requested in the patch set review, but left unaddressed without any appropriate reasoning. The new name is TPM2_OA_NULL_KEY, has a documentation and is local only to tpm2-sessions.c. Link: https://lore.kernel.org/linux-integrity/ddbeb8111f48a8ddb0b8fca248dff6cc9d7079b2.camel@HansenPartnership.com/ Link: https://lore.kernel.org/linux-integrity/CZCKTWU6ZCC9.2UTEQPEVICYHL@suppilovahvero/ Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm2-sessions.c | 21 +++++++++++++++++++-- include/linux/tpm.h | 15 --------------- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/drivers/char/tpm/tpm2-sessions.c b/drivers/char/tpm/tpm2-sessions.c index ea8860661876ed..907ac9956a78f0 100644 --- a/drivers/char/tpm/tpm2-sessions.c +++ b/drivers/char/tpm/tpm2-sessions.c @@ -80,6 +80,9 @@ /* maximum number of names the TPM must remember for authorization */ #define AUTH_MAX_NAMES 3 +#define AES_KEY_BYTES AES_KEYSIZE_128 +#define AES_KEY_BITS (AES_KEY_BYTES*8) + static int tpm2_create_primary(struct tpm_chip *chip, u32 hierarchy, u32 *handle, u8 *name); @@ -954,6 +957,20 @@ int tpm2_start_auth_session(struct tpm_chip *chip) } EXPORT_SYMBOL(tpm2_start_auth_session); +/* + * A mask containing the object attributes for the kernel held null primary key + * used in HMAC encryption. For more information on specific attributes look up + * to "8.3 TPMA_OBJECT (Object Attributes)". + */ +#define TPM2_OA_NULL_KEY ( \ + TPM2_OA_NO_DA | \ + TPM2_OA_FIXED_TPM | \ + TPM2_OA_FIXED_PARENT | \ + TPM2_OA_SENSITIVE_DATA_ORIGIN | \ + TPM2_OA_USER_WITH_AUTH | \ + TPM2_OA_DECRYPT | \ + TPM2_OA_RESTRICTED) + /** * tpm2_parse_create_primary() - parse the data returned from TPM_CC_CREATE_PRIMARY * @@ -1018,7 +1035,7 @@ static int tpm2_parse_create_primary(struct tpm_chip *chip, struct tpm_buf *buf, val = tpm_buf_read_u32(buf, &offset_t); /* object properties */ - if (val != TPM2_OA_TMPL) + if (val != TPM2_OA_NULL_KEY) return -EINVAL; /* auth policy (empty) */ @@ -1178,7 +1195,7 @@ static int tpm2_create_primary(struct tpm_chip *chip, u32 hierarchy, tpm_buf_append_u16(&template, TPM_ALG_SHA256); /* object properties */ - tpm_buf_append_u32(&template, TPM2_OA_TMPL); + tpm_buf_append_u32(&template, TPM2_OA_NULL_KEY); /* sauth policy (empty) */ tpm_buf_append_u16(&template, 0); diff --git a/include/linux/tpm.h b/include/linux/tpm.h index b3217200df28a5..21a67dc9efe80b 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -394,21 +394,6 @@ enum tpm2_object_attributes { TPM2_OA_SIGN = BIT(18), }; -/* - * definitions for the canonical template. These are mandated - * by the TCG key template documents - */ - -#define AES_KEY_BYTES AES_KEYSIZE_128 -#define AES_KEY_BITS (AES_KEY_BYTES*8) -#define TPM2_OA_TMPL (TPM2_OA_NO_DA | \ - TPM2_OA_FIXED_TPM | \ - TPM2_OA_FIXED_PARENT | \ - TPM2_OA_SENSITIVE_DATA_ORIGIN | \ - TPM2_OA_USER_WITH_AUTH | \ - TPM2_OA_DECRYPT | \ - TPM2_OA_RESTRICTED) - enum tpm2_session_attributes { TPM2_SA_CONTINUE_SESSION = BIT(0), TPM2_SA_AUDIT_EXCLUSIVE = BIT(1), From d3e43a8fa43effdbb62c7edc206df7ac67772205 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 28 May 2024 12:58:41 +0300 Subject: [PATCH 0204/1578] tpm: Enable TCG_TPM2_HMAC by default only for X86_64 Given the not fully root caused performance issues on non-x86 platforms, enable the feature by default only for x86-64. That is the platform it brings the most value and has gone most of the QA. Can be reconsidered later and can be obviously opt-in enabled too on any arch. Link: https://lore.kernel.org/linux-integrity/bf67346ef623ff3c452c4f968b7d900911e250c3.camel@gmail.com/#t Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/tpm/Kconfig b/drivers/char/tpm/Kconfig index e63a6a17793c85..cf0be8a7939de4 100644 --- a/drivers/char/tpm/Kconfig +++ b/drivers/char/tpm/Kconfig @@ -29,7 +29,7 @@ if TCG_TPM config TCG_TPM2_HMAC bool "Use HMAC and encrypted transactions on the TPM bus" - default y + default X86_64 select CRYPTO_ECDH select CRYPTO_LIB_AESCFB select CRYPTO_LIB_SHA256 From 8a42886cae307663f3f999846926bd6e64392000 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 27 May 2024 17:18:49 +0200 Subject: [PATCH 0205/1578] ALSA: seq: Fix missing bank setup between MIDI1/MIDI2 UMP conversion When a UMP packet is converted between MIDI1 and MIDI2 protocols, the bank selection may be lost. The conversion from MIDI1 to MIDI2 needs the encoding of the bank into UMP_MSG_STATUS_PROGRAM bits, while the conversion from MIDI2 to MIDI1 needs the extraction from that instead. This patch implements the missing bank selection mechanism in those conversions. Fixes: e9e02819a98a ("ALSA: seq: Automatic conversion of UMP events") Link: https://lore.kernel.org/r/20240527151852.29036-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/seq/seq_ump_convert.c | 38 ++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/sound/core/seq/seq_ump_convert.c b/sound/core/seq/seq_ump_convert.c index ee6ac649df836d..c21be87f5da9e8 100644 --- a/sound/core/seq/seq_ump_convert.c +++ b/sound/core/seq/seq_ump_convert.c @@ -368,6 +368,7 @@ static int cvt_ump_midi1_to_midi2(struct snd_seq_client *dest, struct snd_seq_ump_event ev_cvt; const union snd_ump_midi1_msg *midi1 = (const union snd_ump_midi1_msg *)event->ump; union snd_ump_midi2_msg *midi2 = (union snd_ump_midi2_msg *)ev_cvt.ump; + struct snd_seq_ump_midi2_bank *cc; ev_cvt = *event; memset(&ev_cvt.ump, 0, sizeof(ev_cvt.ump)); @@ -387,11 +388,29 @@ static int cvt_ump_midi1_to_midi2(struct snd_seq_client *dest, midi2->paf.data = upscale_7_to_32bit(midi1->paf.data); break; case UMP_MSG_STATUS_CC: + cc = &dest_port->midi2_bank[midi1->note.channel]; + switch (midi1->cc.index) { + case UMP_CC_BANK_SELECT: + cc->bank_set = 1; + cc->cc_bank_msb = midi1->cc.data; + return 0; // skip + case UMP_CC_BANK_SELECT_LSB: + cc->bank_set = 1; + cc->cc_bank_lsb = midi1->cc.data; + return 0; // skip + } midi2->cc.index = midi1->cc.index; midi2->cc.data = upscale_7_to_32bit(midi1->cc.data); break; case UMP_MSG_STATUS_PROGRAM: midi2->pg.program = midi1->pg.program; + cc = &dest_port->midi2_bank[midi1->note.channel]; + if (cc->bank_set) { + midi2->pg.bank_valid = 1; + midi2->pg.bank_msb = cc->cc_bank_msb; + midi2->pg.bank_lsb = cc->cc_bank_lsb; + cc->bank_set = 0; + } break; case UMP_MSG_STATUS_CHANNEL_PRESSURE: midi2->caf.data = upscale_7_to_32bit(midi1->caf.data); @@ -419,6 +438,7 @@ static int cvt_ump_midi2_to_midi1(struct snd_seq_client *dest, struct snd_seq_ump_event ev_cvt; union snd_ump_midi1_msg *midi1 = (union snd_ump_midi1_msg *)ev_cvt.ump; const union snd_ump_midi2_msg *midi2 = (const union snd_ump_midi2_msg *)event->ump; + int err; u16 v; ev_cvt = *event; @@ -443,6 +463,24 @@ static int cvt_ump_midi2_to_midi1(struct snd_seq_client *dest, midi1->cc.data = downscale_32_to_7bit(midi2->cc.data); break; case UMP_MSG_STATUS_PROGRAM: + if (midi2->pg.bank_valid) { + midi1->cc.status = UMP_MSG_STATUS_CC; + midi1->cc.index = UMP_CC_BANK_SELECT; + midi1->cc.data = midi2->pg.bank_msb; + err = __snd_seq_deliver_single_event(dest, dest_port, + (struct snd_seq_event *)&ev_cvt, + atomic, hop); + if (err < 0) + return err; + midi1->cc.index = UMP_CC_BANK_SELECT_LSB; + midi1->cc.data = midi2->pg.bank_lsb; + err = __snd_seq_deliver_single_event(dest, dest_port, + (struct snd_seq_event *)&ev_cvt, + atomic, hop); + if (err < 0) + return err; + midi1->note.status = midi2->note.status; + } midi1->pg.program = midi2->pg.program; break; case UMP_MSG_STATUS_CHANNEL_PRESSURE: From a200df7deb3186cd7b55abb77ab96dfefb8a4f09 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 27 May 2024 17:18:50 +0200 Subject: [PATCH 0206/1578] ALSA: seq: Don't clear bank selection at event -> UMP MIDI2 conversion The current code to convert from a legacy sequencer event to UMP MIDI2 clears the bank selection at each time the program change is submitted. This is confusing and may lead to incorrect bank values tranmitted to the destination in the end. Drop the line to clear the bank info and keep the provided values. Fixes: e9e02819a98a ("ALSA: seq: Automatic conversion of UMP events") Link: https://lore.kernel.org/r/20240527151852.29036-2-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/seq/seq_ump_convert.c | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/core/seq/seq_ump_convert.c b/sound/core/seq/seq_ump_convert.c index c21be87f5da9e8..f5d22dd008426f 100644 --- a/sound/core/seq/seq_ump_convert.c +++ b/sound/core/seq/seq_ump_convert.c @@ -892,7 +892,6 @@ static int pgm_ev_to_ump_midi2(const struct snd_seq_event *event, data->pg.bank_msb = cc->cc_bank_msb; data->pg.bank_lsb = cc->cc_bank_lsb; cc->bank_set = 0; - cc->cc_bank_msb = cc->cc_bank_lsb = 0; } return 1; } From e662c90a6debc3bd8d8eff916ad21d9ec458dfcd Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 27 May 2024 14:38:08 -0500 Subject: [PATCH 0207/1578] ALSA/hda: intel-dsp-config: reduce log verbosity The information on PCI class/subclass was interesting in the Skylake timeframe, since the DSP was only enabled on a limited number of platforms. Now most Intel platforms do enable the DSP, so the information is less interesting to log. When a DSP driver is used, the common helper may be called multiple times due to deferred probes, but there's no reason to print the same information multiple times. Using dev_info_once() covers all the existing usages for internal cards with DSPs. External cards don't rely on DSPs so far. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Bard Liao Link: https://lore.kernel.org/r/20240527193808.165652-1-pierre-louis.bossart@linux.intel.com Signed-off-by: Takashi Iwai --- sound/hda/intel-dsp-config.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sound/hda/intel-dsp-config.c b/sound/hda/intel-dsp-config.c index cfdb1b73c88c20..537863447358e0 100644 --- a/sound/hda/intel-dsp-config.c +++ b/sound/hda/intel-dsp-config.c @@ -668,7 +668,7 @@ int snd_intel_dsp_driver_probe(struct pci_dev *pci) return SND_INTEL_DSP_DRIVER_LEGACY; } - dev_info(&pci->dev, "DSP detected with PCI class/subclass/prog-if info 0x%06x\n", pci->class); + dev_dbg(&pci->dev, "DSP detected with PCI class/subclass/prog-if info 0x%06x\n", pci->class); /* find the configuration for the specific device */ cfg = snd_intel_dsp_find_config(pci, config_table, ARRAY_SIZE(config_table)); @@ -678,12 +678,12 @@ int snd_intel_dsp_driver_probe(struct pci_dev *pci) if (cfg->flags & FLAG_SOF) { if (cfg->flags & FLAG_SOF_ONLY_IF_SOUNDWIRE && snd_intel_dsp_check_soundwire(pci) > 0) { - dev_info(&pci->dev, "SoundWire enabled on CannonLake+ platform, using SOF driver\n"); + dev_info_once(&pci->dev, "SoundWire enabled on CannonLake+ platform, using SOF driver\n"); return SND_INTEL_DSP_DRIVER_SOF; } if (cfg->flags & FLAG_SOF_ONLY_IF_DMIC && snd_intel_dsp_check_dmic(pci)) { - dev_info(&pci->dev, "Digital mics found on Skylake+ platform, using SOF driver\n"); + dev_info_once(&pci->dev, "Digital mics found on Skylake+ platform, using SOF driver\n"); return SND_INTEL_DSP_DRIVER_SOF; } if (!(cfg->flags & FLAG_SOF_ONLY_IF_DMIC_OR_SOUNDWIRE)) @@ -694,7 +694,7 @@ int snd_intel_dsp_driver_probe(struct pci_dev *pci) if (cfg->flags & FLAG_SST) { if (cfg->flags & FLAG_SST_ONLY_IF_DMIC) { if (snd_intel_dsp_check_dmic(pci)) { - dev_info(&pci->dev, "Digital mics found on Skylake+ platform, using SST driver\n"); + dev_info_once(&pci->dev, "Digital mics found on Skylake+ platform, using SST driver\n"); return SND_INTEL_DSP_DRIVER_SST; } } else { From 72b6a2d6506843375c7b91197f49ef38ca0c6d0f Mon Sep 17 00:00:00 2001 From: Alina Yu Date: Tue, 28 May 2024 14:01:13 +0800 Subject: [PATCH 0208/1578] regulator: rtq2208: Fix invalid memory access when devm_of_regulator_put_matches is called In this patch, a software bug has been fixed. rtq2208_ldo_match is no longer a local variable. It prevents invalid memory access when devm_of_regulator_put_matches is called. Signed-off-by: Alina Yu Link: https://msgid.link/r/4ce8c4f16f1cf3aa4e5f36c0694dd3c5ccf3cd1c.1716870419.git.alina_yu@richtek.com Signed-off-by: Mark Brown --- drivers/regulator/rtq2208-regulator.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/drivers/regulator/rtq2208-regulator.c b/drivers/regulator/rtq2208-regulator.c index b90e53d922d6d1..c31b6dc3229cab 100644 --- a/drivers/regulator/rtq2208-regulator.c +++ b/drivers/regulator/rtq2208-regulator.c @@ -228,6 +228,11 @@ static const struct regulator_ops rtq2208_regulator_ldo_ops = { .set_suspend_disable = rtq2208_set_suspend_disable, }; +static struct of_regulator_match rtq2208_ldo_match[] = { + {.name = "ldo2", }, + {.name = "ldo1", }, +}; + static unsigned int rtq2208_of_map_mode(unsigned int mode) { switch (mode) { @@ -322,8 +327,7 @@ static irqreturn_t rtq2208_irq_handler(int irqno, void *devid) return IRQ_HANDLED; } -static int rtq2208_of_get_fixed_voltage(struct device *dev, - struct of_regulator_match *rtq2208_ldo_match, int n_fixed) +static int rtq2208_of_get_ldo_dvs_ability(struct device *dev) { struct device_node *np; struct of_regulator_match *match; @@ -338,14 +342,14 @@ static int rtq2208_of_get_fixed_voltage(struct device *dev, if (!np) np = dev->of_node; - ret = of_regulator_match(dev, np, rtq2208_ldo_match, n_fixed); + ret = of_regulator_match(dev, np, rtq2208_ldo_match, ARRAY_SIZE(rtq2208_ldo_match)); of_node_put(np); if (ret < 0) return ret; - for (i = 0; i < n_fixed; i++) { + for (i = 0; i < ARRAY_SIZE(rtq2208_ldo_match); i++) { match = rtq2208_ldo_match + i; init_data = match->init_data; rdesc = (struct rtq2208_regulator_desc *)match->driver_data; @@ -388,8 +392,7 @@ static const struct linear_range rtq2208_vout_range[] = { REGULATOR_LINEAR_RANGE(1310000, 181, 255, 10000), }; -static void rtq2208_init_regulator_desc(struct rtq2208_regulator_desc *rdesc, int mtp_sel, - int idx, struct of_regulator_match *rtq2208_ldo_match, int *ldo_idx) +static void rtq2208_init_regulator_desc(struct rtq2208_regulator_desc *rdesc, int mtp_sel, int idx) { struct regulator_desc *desc; static const struct { @@ -461,8 +464,7 @@ static void rtq2208_init_regulator_desc(struct rtq2208_regulator_desc *rdesc, in static int rtq2208_parse_regulator_dt_data(int n_regulator, const unsigned int *regulator_idx_table, struct rtq2208_regulator_desc *rdesc[RTQ2208_LDO_MAX], struct device *dev) { - struct of_regulator_match rtq2208_ldo_match[2]; - int mtp_sel, ret, i, idx, ldo_idx = 0; + int mtp_sel, i, idx, ret; /* get mtp_sel0 or mtp_sel1 */ mtp_sel = device_property_read_bool(dev, "richtek,mtp-sel-high"); @@ -474,7 +476,7 @@ static int rtq2208_parse_regulator_dt_data(int n_regulator, const unsigned int * if (!rdesc[i]) return -ENOMEM; - rtq2208_init_regulator_desc(rdesc[i], mtp_sel, idx, rtq2208_ldo_match, &ldo_idx); + rtq2208_init_regulator_desc(rdesc[i], mtp_sel, idx); /* init ldo dvs ability */ if (idx >= RTQ2208_LDO2) @@ -482,7 +484,7 @@ static int rtq2208_parse_regulator_dt_data(int n_regulator, const unsigned int * } /* init ldo fixed_uV */ - ret = rtq2208_of_get_fixed_voltage(dev, rtq2208_ldo_match, ldo_idx); + ret = rtq2208_of_get_ldo_dvs_ability(dev); if (ret) return dev_err_probe(dev, ret, "Failed to get ldo fixed_uV\n"); From 0c07c273a5fe1a25d4f477fe7edf64b3e8b19b3d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 27 May 2024 14:15:22 +0200 Subject: [PATCH 0209/1578] debugfs: continue to ignore unknown mount options Wolfram reported that debugfs remained empty on some of his boards triggering the message "debugfs: Unknown parameter 'auto'". The root of the issue is that we ignored unknown mount options in the old mount api but we started rejecting unknown mount options in the new mount api. Continue to ignore unknown mount options to not regress userspace. Fixes: a20971c18752 ("vfs: Convert debugfs to use the new mount API") Link: https://lore.kernel.org/r/20240527100618.np2wqiw5mz7as3vk@ninjato Reported-by: Wolfram Sang Tested-by: Wolfram Sang Signed-off-by: Christian Brauner --- fs/debugfs/inode.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index dc51df0b118d0a..8fd928899a59e4 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -107,8 +107,16 @@ static int debugfs_parse_param(struct fs_context *fc, struct fs_parameter *param int opt; opt = fs_parse(fc, debugfs_param_specs, param, &result); - if (opt < 0) + if (opt < 0) { + /* + * We might like to report bad mount options here; but + * traditionally debugfs has ignored all mount options + */ + if (opt == -ENOPARAM) + return 0; + return opt; + } switch (opt) { case Opt_uid: From db003a28e03f95f2bcb63f037a2078b8870b1ecd Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 28 May 2024 14:33:20 +0200 Subject: [PATCH 0210/1578] netfs: fix kernel doc for nets_wait_for_outstanding_io() The @inode parameter wasn't documented leading to new doc build warnings. Fixes: f89ea63f1c65 ("netfs, 9p: Fix race between umount and async request completion") Link: https://lore.kernel.org/r/20240528133050.7e09d78e@canb.auug.org.au Signed-off-by: Christian Brauner --- include/linux/netfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 3ca3906bb8da66..5d0288938cc2dc 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -521,7 +521,7 @@ static inline struct fscache_cookie *netfs_i_cookie(struct netfs_inode *ctx) /** * netfs_wait_for_outstanding_io - Wait for outstanding I/O to complete - * @ctx: The netfs inode to wait on + * @inode: The netfs inode to wait on * * Wait for outstanding I/O requests of any type to complete. This is intended * to be called from inode eviction routines. This makes sure that any From 233e27b4d21c3e44eb863f03e566d3a22e81a7ae Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 28 May 2024 15:28:52 +0900 Subject: [PATCH 0211/1578] null_blk: Print correct max open zones limit in null_init_zoned_dev() When changing the maximum number of open zones, print that number instead of the total number of zones. Fixes: dc4d137ee3b7 ("null_blk: add support for max open/active zone limit for zoned devices") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Niklas Cassel Link: https://lore.kernel.org/r/20240528062852.437599-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- drivers/block/null_blk/zoned.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index 5b5a63adacc17c..79c8e5e99f7ff1 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -108,7 +108,7 @@ int null_init_zoned_dev(struct nullb_device *dev, if (dev->zone_max_active && dev->zone_max_open > dev->zone_max_active) { dev->zone_max_open = dev->zone_max_active; pr_info("changed the maximum number of open zones to %u\n", - dev->nr_zones); + dev->zone_max_open); } else if (dev->zone_max_open >= dev->nr_zones - dev->zone_nr_conv) { dev->zone_max_open = 0; pr_info("zone_max_open limit disabled, limit >= zone count\n"); From bafea1c58b24be594d97841ced1b7ae0347bf6e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 23 May 2024 20:26:13 +0200 Subject: [PATCH 0212/1578] sd: also set max_user_sectors when setting max_sectors sd can set a max_sectors value that is lower than the max_hw_sectors limit based on the block limits VPD page. While this is rather unusual, it used to work until the max_user_sectors field was split out to cleanly deal with conflicting hardware and user limits when the hardware limit changes. Also set max_user_sectors to ensure the limit can properly be stacked. Fixes: 4f563a64732d ("block: add a max_user_discard_sectors queue limit") Reported-by: Mike Snitzer Signed-off-by: Christoph Hellwig Acked-by: Mike Snitzer Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240523182618.602003-2-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 332eb9dac22d91..f6c822c9cbd2d3 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3700,8 +3700,10 @@ static int sd_revalidate_disk(struct gendisk *disk) */ if (sdkp->first_scan || q->limits.max_sectors > q->limits.max_dev_sectors || - q->limits.max_sectors > q->limits.max_hw_sectors) + q->limits.max_sectors > q->limits.max_hw_sectors) { q->limits.max_sectors = rw_max; + q->limits.max_user_sectors = rw_max; + } sdkp->first_scan = 0; From e528bede6f4e6822afdf0fa80be46ea9199f0911 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 23 May 2024 20:26:14 +0200 Subject: [PATCH 0213/1578] block: stack max_user_sectors The max_user_sectors is one of the three factors determining the actual max_sectors limit for READ/WRITE requests. Because of that it needs to be stacked at least for the device mapper multi-path case where requests are directly inserted on the lower device. For SCSI disks this is important because the sd driver actually sets it's own advisory limit that is lower than max_hw_sectors based on the block limits VPD page. While this is a bit odd an unusual, the same effect can happen if a user or udev script tweaks the value manually. Fixes: 4f563a64732d ("block: add a max_user_discard_sectors queue limit") Reported-by: Mike Snitzer Signed-off-by: Christoph Hellwig Acked-by: Mike Snitzer Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240523182618.602003-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/blk-settings.c b/block/blk-settings.c index a7fe8e90240a6e..7a672021daee6a 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -611,6 +611,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, unsigned int top, bottom, alignment, ret = 0; t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); + t->max_user_sectors = min_not_zero(t->max_user_sectors, + b->max_user_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, From e993db2d6e5207f1ae061c2ac554ab1f714c741d Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 24 May 2024 12:46:51 +0200 Subject: [PATCH 0214/1578] block: check for max_hw_sectors underflow The logical block size need to be smaller than the max_hw_sector setting, otherwise we can't even transfer a single LBA. Signed-off-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: John Garry Signed-off-by: Jens Axboe --- block/blk-settings.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 7a672021daee6a..effeb9a639bb45 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -104,6 +104,7 @@ static int blk_validate_zoned_limits(struct queue_limits *lim) static int blk_validate_limits(struct queue_limits *lim) { unsigned int max_hw_sectors; + unsigned int logical_block_sectors; /* * Unless otherwise specified, default to 512 byte logical blocks and a @@ -134,8 +135,11 @@ static int blk_validate_limits(struct queue_limits *lim) lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; if (WARN_ON_ONCE(lim->max_hw_sectors < PAGE_SECTORS)) return -EINVAL; + logical_block_sectors = lim->logical_block_size >> SECTOR_SHIFT; + if (WARN_ON_ONCE(logical_block_sectors > lim->max_hw_sectors)) + return -EINVAL; lim->max_hw_sectors = round_down(lim->max_hw_sectors, - lim->logical_block_size >> SECTOR_SHIFT); + logical_block_sectors); /* * The actual max_sectors value is a complex beast and also takes the @@ -153,7 +157,7 @@ static int blk_validate_limits(struct queue_limits *lim) lim->max_sectors = min(max_hw_sectors, BLK_DEF_MAX_SECTORS_CAP); } lim->max_sectors = round_down(lim->max_sectors, - lim->logical_block_size >> SECTOR_SHIFT); + logical_block_sectors); /* * Random default for the maximum number of segments. Driver should not From a14a68b76954e73031ca6399abace17dcb77c17a Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Tue, 28 May 2024 20:09:12 +0800 Subject: [PATCH 0215/1578] bcache: allow allocator to invalidate bucket in gc Currently, if the gc is running, when the allocator found free_inc is empty, allocator has to wait the gc finish. Before that, the IO is blocked. But actually, there would be some buckets is reclaimable before gc, and gc will never mark this kind of bucket to be unreclaimable. So we can put these buckets into free_inc in gc running to avoid IO being blocked. Signed-off-by: Dongsheng Yang Signed-off-by: Mingzhe Zou Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20240528120914.28705-2-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 13 +++++-------- drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/btree.c | 7 ++++++- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index ce13c272c38723..32a46343097dd8 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -129,12 +129,9 @@ static inline bool can_inc_bucket_gen(struct bucket *b) bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b) { - BUG_ON(!ca->set->gc_mark_valid); - - return (!GC_MARK(b) || - GC_MARK(b) == GC_MARK_RECLAIMABLE) && - !atomic_read(&b->pin) && - can_inc_bucket_gen(b); + return (ca->set->gc_mark_valid || b->reclaimable_in_gc) && + ((!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) && + !atomic_read(&b->pin) && can_inc_bucket_gen(b)); } void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) @@ -148,6 +145,7 @@ void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) bch_inc_gen(ca, b); b->prio = INITIAL_PRIO; atomic_inc(&b->pin); + b->reclaimable_in_gc = 0; } static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) @@ -352,8 +350,7 @@ static int bch_allocator_thread(void *arg) */ retry_invalidate: - allocator_wait(ca, ca->set->gc_mark_valid && - !ca->invalidate_needs_gc); + allocator_wait(ca, !ca->invalidate_needs_gc); invalidate_buckets(ca); /* diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 4e6afa89921fe0..1d33e40d26ea51 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -200,6 +200,7 @@ struct bucket { uint8_t gen; uint8_t last_gc; /* Most out of date gen in the btree */ uint16_t gc_mark; /* Bitfield used by GC. See below for field */ + uint16_t reclaimable_in_gc:1; }; /* diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index d011a7154d3304..4e6ccf2c8a0bf3 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1741,18 +1741,20 @@ static void btree_gc_start(struct cache_set *c) mutex_lock(&c->bucket_lock); - c->gc_mark_valid = 0; c->gc_done = ZERO_KEY; ca = c->cache; for_each_bucket(b, ca) { b->last_gc = b->gen; + if (bch_can_invalidate_bucket(ca, b)) + b->reclaimable_in_gc = 1; if (!atomic_read(&b->pin)) { SET_GC_MARK(b, 0); SET_GC_SECTORS_USED(b, 0); } } + c->gc_mark_valid = 0; mutex_unlock(&c->bucket_lock); } @@ -1809,6 +1811,9 @@ static void bch_btree_gc_finish(struct cache_set *c) for_each_bucket(b, ca) { c->need_gc = max(c->need_gc, bucket_gc_gen(b)); + if (b->reclaimable_in_gc) + b->reclaimable_in_gc = 0; + if (atomic_read(&b->pin)) continue; From 05356938a4be356adde4eab4425c6822f3c7d706 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Tue, 28 May 2024 20:09:13 +0800 Subject: [PATCH 0216/1578] bcache: call force_wake_up_gc() if necessary in check_should_bypass() If there are extreme heavy write I/O continuously hit on relative small cache device (512GB in my testing), it is possible to make counter c->gc_stats.in_use continue to increase and exceed CUTOFF_CACHE_ADD. If 'c->gc_stats.in_use > CUTOFF_CACHE_ADD' happens, all following write requests will bypass the cache device because check_should_bypass() returns 'true'. Because all writes bypass the cache device, counter c->sectors_to_gc has no chance to be negative value, and garbage collection thread won't be waken up even the whole cache becomes clean after writeback accomplished. The aftermath is that all write I/Os go directly into backing device even the cache device is clean. To avoid the above situation, this patch uses a quite conservative way to fix: if 'c->gc_stats.in_use > CUTOFF_CACHE_ADD' happens, only wakes up garbage collection thread when the whole cache device is clean. Before the fix, the writes-always-bypass situation happens after 10+ hours write I/O pressure on 512GB Intel optane memory which acts as cache device. After this fix, such situation doesn't happen after 36+ hours testing. Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20240528120914.28705-3-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 83d112bd2b1c0e..af345dc6fde14f 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -369,10 +369,24 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) struct io *i; if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || - c->gc_stats.in_use > CUTOFF_CACHE_ADD || (bio_op(bio) == REQ_OP_DISCARD)) goto skip; + if (c->gc_stats.in_use > CUTOFF_CACHE_ADD) { + /* + * If cached buckets are all clean now, 'true' will be + * returned and all requests will bypass the cache device. + * Then c->sectors_to_gc has no chance to be negative, and + * gc thread won't wake up and caching won't work forever. + * Here call force_wake_up_gc() to avoid such aftermath. + */ + if (BDEV_STATE(&dc->sb) == BDEV_STATE_CLEAN && + c->gc_mark_valid) + force_wake_up_gc(c); + + goto skip; + } + if (mode == CACHE_MODE_NONE || (mode == CACHE_MODE_WRITEAROUND && op_is_write(bio_op(bio)))) From 74d4ce92e08d5669d66fd890403724faa4286c21 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Tue, 28 May 2024 20:09:14 +0800 Subject: [PATCH 0217/1578] bcache: code cleanup in __bch_bucket_alloc_set() In __bch_bucket_alloc_set() the lines after lable 'err:' indeed do nothing useful after multiple cache devices are removed from bcache code. This cleanup patch drops the useless code to save a bit CPU cycles. Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20240528120914.28705-4-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 32a46343097dd8..48ce750bf70af9 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -498,8 +498,8 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, ca = c->cache; b = bch_bucket_alloc(ca, reserve, wait); - if (b == -1) - goto err; + if (b < 0) + return -1; k->ptr[0] = MAKE_PTR(ca->buckets[b].gen, bucket_to_sector(c, b), @@ -508,10 +508,6 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, SET_KEY_PTRS(k, 1); return 0; -err: - bch_bucket_free(c, k); - bkey_put(c, k); - return -1; } int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, From 56a5cf538c3f2d935b0d81040a8303b6e7fc5fd8 Mon Sep 17 00:00:00 2001 From: MD Danish Anwar Date: Mon, 27 May 2024 12:00:15 +0530 Subject: [PATCH 0218/1578] net: ti: icssg-prueth: Fix start counter for ft1 filter The start counter for FT1 filter is wrongly set to 0 in the driver. FT1 is used for source address violation (SAV) check and source address starts at Byte 6 not Byte 0. Fix this by changing start counter to ETH_ALEN in icssg_ft1_set_mac_addr(). Fixes: e9b4ece7d74b ("net: ti: icssg-prueth: Add Firmware config and classification APIs.") Signed-off-by: MD Danish Anwar Link: https://lore.kernel.org/r/20240527063015.263748-1-danishanwar@ti.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/ti/icssg/icssg_classifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/icssg/icssg_classifier.c b/drivers/net/ethernet/ti/icssg/icssg_classifier.c index 79ba47bb3602ec..f7d21da1a0fb62 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_classifier.c +++ b/drivers/net/ethernet/ti/icssg/icssg_classifier.c @@ -455,7 +455,7 @@ void icssg_ft1_set_mac_addr(struct regmap *miig_rt, int slice, u8 *mac_addr) { const u8 mask_addr[] = { 0, 0, 0, 0, 0, 0, }; - rx_class_ft1_set_start_len(miig_rt, slice, 0, 6); + rx_class_ft1_set_start_len(miig_rt, slice, ETH_ALEN, ETH_ALEN); rx_class_ft1_set_da(miig_rt, slice, 0, mac_addr); rx_class_ft1_set_da_mask(miig_rt, slice, 0, mask_addr); rx_class_ft1_cfg_set_type(miig_rt, slice, 0, FT1_CFG_TYPE_EQ); From da42b5229b27bb5c0eff3408c92f025e6041dad3 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 28 May 2024 11:01:20 -0300 Subject: [PATCH 0219/1578] tools headers: Update the syscall tables and unistd.h, mostly to support the new 'mseal' syscall But also to wire up shadow stacks on 32-bit x86, picking up those changes from these csets: ff388fe5c481d39c ("mseal: wire up mseal syscall") 2883f01ec37dd866 ("x86/shstk: Enable shadow stacks for x32") This makes 'perf trace' support it, now its possible, for instance to do: # perf trace -e mseal --max-stack=16 Here is an example with the 'sendmmsg' syscall: root@x1:~# perf trace -e sendmmsg --max-stack 16 --max-events=1 0.000 ( 0.062 ms): dbus-broker/1012 sendmmsg(fd: 150, mmsg: 0x7ffef57cca50, vlen: 1, flags: DONTWAIT|NOSIGNAL) = 1 syscall_exit_to_user_mode_prepare ([kernel.kallsyms]) syscall_exit_to_user_mode_prepare ([kernel.kallsyms]) syscall_exit_to_user_mode ([kernel.kallsyms]) do_syscall_64 ([kernel.kallsyms]) entry_SYSCALL_64 ([kernel.kallsyms]) [0x117ce7] (/usr/lib64/libc.so.6 (deleted)) root@x1:~# To do a system wide tracing of the new 'mseal' syscall with a backtrace of at most 16 entries. This addresses these perf tools build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/asm-generic/unistd.h include/uapi/asm-generic/unistd.h diff -u tools/perf/arch/x86/entry/syscalls/syscall_64.tbl arch/x86/entry/syscalls/syscall_64.tbl diff -u tools/perf/arch/powerpc/entry/syscalls/syscall.tbl arch/powerpc/kernel/syscalls/syscall.tbl diff -u tools/perf/arch/s390/entry/syscalls/syscall.tbl arch/s390/kernel/syscalls/syscall.tbl diff -u tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl arch/mips/kernel/syscalls/syscall_n64.tbl Cc: Adrian Hunter Cc: Andrew Morton Cc: H J Lu Cc: Ian Rogers Cc: Ingo Molnar Cc: Jeff Xu Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/ZlXlo4TNcba4wnVZ@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/asm-generic/unistd.h | 5 ++++- tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl | 1 + tools/perf/arch/powerpc/entry/syscalls/syscall.tbl | 1 + tools/perf/arch/s390/entry/syscalls/syscall.tbl | 1 + tools/perf/arch/x86/entry/syscalls/syscall_64.tbl | 3 ++- 5 files changed, 9 insertions(+), 2 deletions(-) diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index 75f00965ab1586..d983c48a3b6af1 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -842,8 +842,11 @@ __SYSCALL(__NR_lsm_set_self_attr, sys_lsm_set_self_attr) #define __NR_lsm_list_modules 461 __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules) +#define __NR_mseal 462 +__SYSCALL(__NR_mseal, sys_mseal) + #undef __NR_syscalls -#define __NR_syscalls 462 +#define __NR_syscalls 463 /* * 32 bit systems traditionally used different diff --git a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl index 532b855df58957..1464c6be6eb3c7 100644 --- a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl +++ b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl @@ -376,3 +376,4 @@ 459 n64 lsm_get_self_attr sys_lsm_get_self_attr 460 n64 lsm_set_self_attr sys_lsm_set_self_attr 461 n64 lsm_list_modules sys_lsm_list_modules +462 n64 mseal sys_mseal diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index 17173b82ca21dc..3656f1ca7a21c6 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -548,3 +548,4 @@ 459 common lsm_get_self_attr sys_lsm_get_self_attr 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules +462 common mseal sys_mseal diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl index 095bb86339a7d3..bd0fee24ad10a3 100644 --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -464,3 +464,4 @@ 459 common lsm_get_self_attr sys_lsm_get_self_attr sys_lsm_get_self_attr 460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules +462 common mseal sys_mseal sys_mseal diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index 7e8d46f4147f57..a396f6e6ab5bf9 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -374,7 +374,7 @@ 450 common set_mempolicy_home_node sys_set_mempolicy_home_node 451 common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 -453 64 map_shadow_stack sys_map_shadow_stack +453 common map_shadow_stack sys_map_shadow_stack 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait 456 common futex_requeue sys_futex_requeue @@ -383,6 +383,7 @@ 459 common lsm_get_self_attr sys_lsm_get_self_attr 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules +462 common mseal sys_mseal # # Due to a historical design error, certain syscalls are numbered differently From f13e01b89daf42330a4a722f451e48c3e2edfc8d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 17 May 2024 12:48:25 +0100 Subject: [PATCH 0220/1578] btrfs: ensure fast fsync waits for ordered extents after a write failure If a write path in COW mode fails, either before submitting a bio for the new extents or an actual IO error happens, we can end up allowing a fast fsync to log file extent items that point to unwritten extents. This is because dropping the extent maps happens when completing ordered extents, at btrfs_finish_one_ordered(), and the completion of an ordered extent is executed in a work queue. This can result in a fast fsync to start logging file extent items based on existing extent maps before the ordered extents complete, therefore resulting in a log that has file extent items that point to unwritten extents, resulting in a corrupt file if a crash happens after and the log tree is replayed the next time the fs is mounted. This can happen for both direct IO writes and buffered writes. For example consider a direct IO write, in COW mode, that fails at btrfs_dio_submit_io() because btrfs_extract_ordered_extent() returned an error: 1) We call btrfs_finish_ordered_extent() with the 'uptodate' parameter set to false, meaning an error happened; 2) That results in marking the ordered extent with the BTRFS_ORDERED_IOERR flag; 3) btrfs_finish_ordered_extent() queues the completion of the ordered extent - so that btrfs_finish_one_ordered() will be executed later in a work queue. That function will drop extent maps in the range when it's executed, since the extent maps point to unwritten locations (signaled by the BTRFS_ORDERED_IOERR flag); 4) After calling btrfs_finish_ordered_extent() we keep going down the write path and unlock the inode; 5) After that a fast fsync starts and locks the inode; 6) Before the work queue executes btrfs_finish_one_ordered(), the fsync task sees the extent maps that point to the unwritten locations and logs file extent items based on them - it does not know they are unwritten, and the fast fsync path does not wait for ordered extents to complete, which is an intentional behaviour in order to reduce latency. For the buffered write case, here's one example: 1) A fast fsync begins, and it starts by flushing delalloc and waiting for the writeback to complete by calling filemap_fdatawait_range(); 2) Flushing the dellaloc created a new extent map X; 3) During the writeback some IO error happened, and at the end io callback (end_bbio_data_write()) we call btrfs_finish_ordered_extent(), which sets the BTRFS_ORDERED_IOERR flag in the ordered extent and queues its completion; 4) After queuing the ordered extent completion, the end io callback clears the writeback flag from all pages (or folios), and from that moment the fast fsync can proceed; 5) The fast fsync proceeds sees extent map X and logs a file extent item based on extent map X, resulting in a log that points to an unwritten data extent - because the ordered extent completion hasn't run yet, it happens only after the logging. To fix this make btrfs_finish_ordered_extent() set the inode flag BTRFS_INODE_NEEDS_FULL_SYNC in case an error happened for a COW write, so that a fast fsync will wait for ordered extent completion. Note that this issues of using extent maps that point to unwritten locations can not happen for reads, because in read paths we start by locking the extent range and wait for any ordered extents in the range to complete before looking for extent maps. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 10 ++++++++++ fs/btrfs/file.c | 16 ++++++++++++++++ fs/btrfs/ordered-data.c | 31 +++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 91c994b569f3b8..6ed495ca7a311d 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -89,6 +89,16 @@ enum { BTRFS_INODE_FREE_SPACE_INODE, /* Set when there are no capabilities in XATTs for the inode. */ BTRFS_INODE_NO_CAP_XATTR, + /* + * Set if an error happened when doing a COW write before submitting a + * bio or during writeback. Used for both buffered writes and direct IO + * writes. This is to signal a fast fsync that it has to wait for + * ordered extents to complete and therefore not log extent maps that + * point to unwritten extents (when an ordered extent completes and it + * has the BTRFS_ORDERED_IOERR flag set, it drops extent maps in its + * range). + */ + BTRFS_INODE_COW_WRITE_ERROR, }; /* in memory btrfs inode */ diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0c7c1b42028e3d..00670596bf065a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1885,6 +1885,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ if (full_sync || btrfs_is_zoned(fs_info)) { ret = btrfs_wait_ordered_range(inode, start, len); + clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &BTRFS_I(inode)->runtime_flags); } else { /* * Get our ordered extents as soon as possible to avoid doing @@ -1894,6 +1895,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) btrfs_get_ordered_extents_for_logging(BTRFS_I(inode), &ctx.ordered_extents); ret = filemap_fdatawait_range(inode->i_mapping, start, end); + if (ret) + goto out_release_extents; + + /* + * Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after + * starting and waiting for writeback, because for buffered IO + * it may have been set during the end IO callback + * (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in + * case an error happened and we need to wait for ordered + * extents to complete so that any extent maps that point to + * unwritten locations are dropped and we don't log them. + */ + if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR, + &BTRFS_I(inode)->runtime_flags)) + ret = btrfs_wait_ordered_range(inode, start, len); } if (ret) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index c5bdd674f55c53..35a413ce935d9e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -388,6 +388,37 @@ bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate); spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); + /* + * If this is a COW write it means we created new extent maps for the + * range and they point to unwritten locations if we got an error either + * before submitting a bio or during IO. + * + * We have marked the ordered extent with BTRFS_ORDERED_IOERR, and we + * are queuing its completion below. During completion, at + * btrfs_finish_one_ordered(), we will drop the extent maps for the + * unwritten extents. + * + * However because completion runs in a work queue we can end up having + * a fast fsync running before that. In the case of direct IO, once we + * unlock the inode the fsync might start, and we queue the completion + * before unlocking the inode. In the case of buffered IO when writeback + * finishes (end_bbio_data_write()) we queue the completion, so if the + * writeback was triggered by a fast fsync, the fsync might start + * logging before ordered extent completion runs in the work queue. + * + * The fast fsync will log file extent items based on the extent maps it + * finds, so if by the time it collects extent maps the ordered extent + * completion didn't happen yet, it will log file extent items that + * point to unwritten extents, resulting in a corruption if a crash + * happens and the log tree is replayed. Note that a fast fsync does not + * wait for completion of ordered extents in order to reduce latency. + * + * Set a flag in the inode so that the next fast fsync will wait for + * ordered extents to complete before starting to log. + */ + if (!uptodate && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) + set_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags); + if (ret) btrfs_queue_ordered_fn(ordered); return ret; From 43cad521c6d228ea0c51e248f8e5b3a6295a2849 Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Tue, 30 Apr 2024 14:07:06 +0530 Subject: [PATCH 0221/1578] tools/power/cpupower: Fix Pstate frequency reporting on AMD Family 1Ah CPUs Update cpupower's P-State frequency calculation and reporting with AMD Family 1Ah+ processors, when using the acpi-cpufreq driver. This is due to a change in the PStateDef MSR layout in AMD Family 1Ah+. Tested on 4th and 5th Gen AMD EPYC system Signed-off-by: Ananth Narayan Signed-off-by: Dhananjay Ugwekar Reviewed-by: Mario Limonciello Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/helpers/amd.c | 26 +++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/tools/power/cpupower/utils/helpers/amd.c b/tools/power/cpupower/utils/helpers/amd.c index c519cc89c97f42..0a56e22240fc8b 100644 --- a/tools/power/cpupower/utils/helpers/amd.c +++ b/tools/power/cpupower/utils/helpers/amd.c @@ -41,6 +41,16 @@ union core_pstate { unsigned res1:31; unsigned en:1; } pstatedef; + /* since fam 1Ah: */ + struct { + unsigned fid:12; + unsigned res1:2; + unsigned vid:8; + unsigned iddval:8; + unsigned idddiv:2; + unsigned res2:31; + unsigned en:1; + } pstatedef2; unsigned long long val; }; @@ -48,6 +58,10 @@ static int get_did(union core_pstate pstate) { int t; + /* Fam 1Ah onward do not use did */ + if (cpupower_cpu_info.family >= 0x1A) + return 0; + if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_PSTATEDEF) t = pstate.pstatedef.did; else if (cpupower_cpu_info.family == 0x12) @@ -61,12 +75,18 @@ static int get_did(union core_pstate pstate) static int get_cof(union core_pstate pstate) { int t; - int fid, did, cof; + int fid, did, cof = 0; did = get_did(pstate); if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_PSTATEDEF) { - fid = pstate.pstatedef.fid; - cof = 200 * fid / did; + if (cpupower_cpu_info.family >= 0x1A) { + fid = pstate.pstatedef2.fid; + if (fid > 0x0f) + cof = (fid * 5); + } else { + fid = pstate.pstatedef.fid; + cof = 200 * fid / did; + } } else { t = 0x10; fid = pstate.pstate.fid; From 1292bc2ebf63e705ae18bbaaf9cea21b68d37ee6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 May 2024 13:59:34 -0400 Subject: [PATCH 0222/1578] bcachefs: Plumb bkey into __btree_err() It can be useful to know the exact byte offset within a btree node where an error occured. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 85 ++++++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 40 deletions(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index cbf8f5d9060211..829c1b91477d79 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -519,7 +519,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) static void btree_err_msg(struct printbuf *out, struct bch_fs *c, struct bch_dev *ca, - struct btree *b, struct bset *i, + struct btree *b, struct bset *i, struct bkey_packed *k, unsigned offset, int write) { prt_printf(out, bch2_log_msg(c, "%s"), @@ -537,15 +537,20 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, b->written, btree_ptr_sectors_written(&b->key)); if (i) prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s)); + if (k) + prt_printf(out, " bset byte offset %lu", + (unsigned long)(void *)k - + ((unsigned long)(void *)i & ~511UL)); prt_str(out, ": "); } -__printf(9, 10) +__printf(10, 11) static int __btree_err(int ret, struct bch_fs *c, struct bch_dev *ca, struct btree *b, struct bset *i, + struct bkey_packed *k, int write, bool have_retry, enum bch_sb_error_id err_type, @@ -555,7 +560,7 @@ static int __btree_err(int ret, bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes; va_list args; - btree_err_msg(&out, c, ca, b, i, b->written, write); + btree_err_msg(&out, c, ca, b, i, k, b->written, write); va_start(args, fmt); prt_vprintf(&out, fmt, args); @@ -611,9 +616,9 @@ static int __btree_err(int ret, return ret; } -#define btree_err(type, c, ca, b, i, _err_type, msg, ...) \ +#define btree_err(type, c, ca, b, i, k, _err_type, msg, ...) \ ({ \ - int _ret = __btree_err(type, c, ca, b, i, write, have_retry, \ + int _ret = __btree_err(type, c, ca, b, i, k, write, have_retry, \ BCH_FSCK_ERR_##_err_type, \ msg, ##__VA_ARGS__); \ \ @@ -690,7 +695,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(!bch2_version_compatible(version), -BCH_ERR_btree_node_read_err_incompatible, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_unsupported_version, "unsupported bset version %u.%u", BCH_VERSION_MAJOR(version), @@ -698,7 +703,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, if (btree_err_on(version < c->sb.version_min, -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, NULL, btree_node_bset_older_than_sb_min, "bset version %u older than superblock version_min %u", version, c->sb.version_min)) { @@ -711,7 +716,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, if (btree_err_on(BCH_VERSION_MAJOR(version) > BCH_VERSION_MAJOR(c->sb.version), -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, NULL, btree_node_bset_newer_than_sb, "bset version %u newer than superblock version %u", version, c->sb.version)) { @@ -723,13 +728,13 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(BSET_SEPARATE_WHITEOUTS(i), -BCH_ERR_btree_node_read_err_incompatible, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_unsupported_version, "BSET_SEPARATE_WHITEOUTS no longer supported"); if (btree_err_on(offset + sectors > btree_sectors(c), -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, + c, ca, b, i, NULL, bset_past_end_of_btree_node, "bset past end of btree node")) { i->u64s = 0; @@ -739,13 +744,13 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(offset && !i->u64s, -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, + c, ca, b, i, NULL, bset_empty, "empty bset"); btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset, -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, + c, ca, b, i, NULL, bset_wrong_sector_offset, "bset at wrong sector offset"); @@ -761,20 +766,20 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, /* XXX endianness */ btree_err_on(bp->seq != bn->keys.seq, -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, bset_bad_seq, "incorrect sequence number (wrong btree node)"); } btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_bad_btree, "incorrect btree id"); btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_bad_level, "incorrect level"); @@ -793,7 +798,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(!bpos_eq(b->data->min_key, bp->min_key), -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_bad_min_key, "incorrect min_key: got %s should be %s", (printbuf_reset(&buf1), @@ -804,7 +809,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(!bpos_eq(bn->max_key, b->key.k.p), -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_bad_max_key, "incorrect max key %s", (printbuf_reset(&buf1), @@ -816,7 +821,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1), -BCH_ERR_btree_node_read_err_bad_node, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_bad_format, "invalid bkey format: %s\n %s", buf1.buf, (printbuf_reset(&buf2), @@ -883,7 +888,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, if (btree_err_on(bkey_p_next(k) > vstruct_last(i), -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, k, btree_node_bkey_past_bset_end, "key extends past end of bset")) { i->u64s = cpu_to_le16((u64 *) k - i->_data); @@ -892,14 +897,14 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, if (btree_err_on(k->format > KEY_FORMAT_CURRENT, -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, k, btree_node_bkey_bad_format, "invalid bkey format %u", k->format)) goto drop_this_key; if (btree_err_on(!bkeyp_u64s_valid(&b->format, k), -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, k, btree_node_bkey_bad_u64s, "bad k->u64s %u (min %u max %zu)", k->u64s, bkeyp_key_u64s(&b->format, k), @@ -921,7 +926,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, bch2_bkey_val_to_text(&buf, c, u.s_c); btree_err(-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, k, btree_node_bad_bkey, "invalid bkey: %s", buf.buf); goto drop_this_key; @@ -942,7 +947,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, bch2_bkey_to_text(&buf, u.k); if (btree_err(-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, k, btree_node_bkey_out_of_order, "%s", buf.buf)) goto drop_this_key; @@ -1011,13 +1016,13 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, if (bch2_meta_read_fault("btree")) btree_err(-BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_fault_injected, "dynamic fault"); btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_bad_magic, "bad magic: want %llx, got %llx", bset_magic(c), le64_to_cpu(b->data->magic)); @@ -1032,7 +1037,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(b->data->keys.seq != bp->seq, -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_bad_seq, "got wrong btree node: got\n%s", (printbuf_reset(&buf), @@ -1041,7 +1046,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, } else { btree_err_on(!b->data->keys.seq, -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_bad_seq, "bad btree header: seq 0\n%s", (printbuf_reset(&buf), @@ -1060,7 +1065,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, + c, ca, b, i, NULL, bset_unknown_csum, "unknown checksum type %llu", BSET_CSUM_TYPE(i)); @@ -1073,7 +1078,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(csum_bad, -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, + c, ca, b, i, NULL, bset_bad_csum, "%s", (printbuf_reset(&buf), @@ -1088,7 +1093,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), -BCH_ERR_btree_node_read_err_incompatible, - c, NULL, b, NULL, + c, NULL, b, NULL, NULL, btree_node_unsupported_version, "btree node does not have NEW_EXTENT_OVERWRITE set"); @@ -1102,7 +1107,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, + c, ca, b, i, NULL, bset_unknown_csum, "unknown checksum type %llu", BSET_CSUM_TYPE(i)); @@ -1114,7 +1119,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(csum_bad, -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, + c, ca, b, i, NULL, bset_bad_csum, "%s", (printbuf_reset(&buf), @@ -1152,14 +1157,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(blacklisted && first, -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, + c, ca, b, i, NULL, bset_blacklisted_journal_seq, "first btree node bset has blacklisted journal seq (%llu)", le64_to_cpu(i->journal_seq)); btree_err_on(blacklisted && ptr_written, -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, + c, ca, b, i, NULL, first_bset_blacklisted_journal_seq, "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u", le64_to_cpu(i->journal_seq), @@ -1178,7 +1183,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, if (ptr_written) { btree_err_on(b->written < ptr_written, -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_data_missing, "btree node data missing: expected %u sectors, found %u", ptr_written, b->written); @@ -1191,7 +1196,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, le64_to_cpu(bne->keys.journal_seq), true), -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_bset_after_end, "found bset signature after last bset"); } @@ -1235,7 +1240,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bch2_bkey_val_to_text(&buf, c, u.s_c); btree_err(-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, k, btree_node_bad_bkey, "%s", buf.buf); @@ -1471,18 +1476,18 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done) written2 = btree_node_sectors_written(c, ra->buf[i]); if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, NULL, + c, NULL, b, NULL, NULL, btree_node_replicas_sectors_written_mismatch, "btree node sectors written mismatch: %u != %u", written, written2) || btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]), -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, NULL, + c, NULL, b, NULL, NULL, btree_node_bset_after_end, "found bset signature after last bset") || btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9), -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, NULL, + c, NULL, b, NULL, NULL, btree_node_replicas_data_mismatch, "btree node replicas content mismatch")) dump_bset_maps = true; From 9e1a66e66870ebeebea9f674550118df3c12eaf6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 26 May 2024 13:24:31 -0400 Subject: [PATCH 0223/1578] bcachefs: Fix lookup_first_inode() when inode_generations are present This function is used for finding the hash seed (which is the same in all versions of an inode in different snapshots): ff an inode has been deleted in a child snapshot we need to iterate until we find a live version. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index c8f57465131c54..4cd28db9bad8de 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -77,21 +77,17 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, struct bkey_s_c k; int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, - POS(0, inode_nr), - BTREE_ITER_all_snapshots); - k = bch2_btree_iter_peek(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) { - ret = -BCH_ERR_ENOENT_inode; - goto err; + for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inode_nr) + break; + if (!bkey_is_inode(k.k)) + continue; + ret = bch2_inode_unpack(k, inode); + goto found; } - - ret = bch2_inode_unpack(k, inode); -err: + ret = -BCH_ERR_ENOENT_inode; +found: bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); bch2_trans_iter_exit(trans, &iter); return ret; From 218e5e0c2a3acdb29ccbdfbfdd5e2def27d3aae2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 May 2024 02:40:06 -0400 Subject: [PATCH 0224/1578] bcachefs: Fix locking assert We now track whether a transaction is locked, and verify that we don't have nodes locked when the transaction isn't locked; reorder relocks to not pop the new assert. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 75f5e6fe46349f..34056aaece009e 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -424,16 +424,16 @@ static int btree_key_cache_fill(struct btree_trans *trans, goto err; } - if (!bch2_btree_node_relock(trans, ck_path, 0)) { + ret = bch2_trans_relock(trans); + if (ret) { kfree(new_k); - trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); goto err; } - ret = bch2_trans_relock(trans); - if (ret) { + if (!bch2_btree_node_relock(trans, ck_path, 0)) { kfree(new_k); + trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); goto err; } } From 82af5ceb5d9d1f0613be3b9161ec1104b85f00b8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 26 May 2024 22:22:30 -0400 Subject: [PATCH 0225/1578] bcachefs: Refactor delete_dead_snapshots() Consolidate per-key work into delete_dead_snapshots_process_key(), so we now walk all keys once, not twice. Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 65 ++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 40 deletions(-) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 629900a5e6411c..466fa3e6a4b656 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1351,7 +1351,7 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, * that key to snapshot leaf nodes, where we can mutate it */ -static int snapshot_delete_key(struct btree_trans *trans, +static int delete_dead_snapshots_process_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, snapshot_id_list *deleted, @@ -1360,26 +1360,26 @@ static int snapshot_delete_key(struct btree_trans *trans, { struct bch_fs *c = trans->c; u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); + if (!equiv) /* key for invalid snapshot node, but we chose not to delete */ + return 0; if (!bkey_eq(k.k->p, *last_pos)) equiv_seen->nr = 0; - *last_pos = k.k->p; - if (snapshot_list_has_id(deleted, k.k->p.snapshot) || - snapshot_list_has_id(equiv_seen, equiv)) { + if (snapshot_list_has_id(deleted, k.k->p.snapshot)) return bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); - } else { - return snapshot_list_add(c, equiv_seen, equiv); - } -} -static int move_key_to_correct_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); + if (!bpos_eq(*last_pos, k.k->p) && + snapshot_list_has_id(equiv_seen, equiv)) + return bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_internal_snapshot_node); + + *last_pos = k.k->p; + + int ret = snapshot_list_add_nodup(c, equiv_seen, equiv); + if (ret) + return ret; /* * When we have a linear chain of snapshot nodes, we consider @@ -1389,21 +1389,20 @@ static int move_key_to_correct_snapshot(struct btree_trans *trans, * * If there are multiple keys in different snapshots at the same * position, we're only going to keep the one in the newest - * snapshot - the rest have been overwritten and are redundant, - * and for the key we're going to keep we need to move it to the - * equivalance class ID if it's not there already. + * snapshot (we delete the others above) - the rest have been + * overwritten and are redundant, and for the key we're going to keep we + * need to move it to the equivalance class ID if it's not there + * already. */ if (equiv != k.k->p.snapshot) { struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - struct btree_iter new_iter; - int ret; - - ret = PTR_ERR_OR_ZERO(new); + int ret = PTR_ERR_OR_ZERO(new); if (ret) return ret; new->k.p.snapshot = equiv; + struct btree_iter new_iter; bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p, BTREE_ITER_all_snapshots| BTREE_ITER_cached| @@ -1538,7 +1537,6 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) struct btree_trans *trans; snapshot_id_list deleted = { 0 }; snapshot_id_list deleted_interior = { 0 }; - u32 id; int ret = 0; if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) @@ -1585,33 +1583,20 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) if (ret) goto err; - for (id = 0; id < BTREE_ID_NR; id++) { + for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { struct bpos last_pos = POS_MIN; snapshot_id_list equiv_seen = { 0 }; struct disk_reservation res = { 0 }; - if (!btree_type_has_snapshots(id)) - continue; - - /* - * deleted inodes btree is maintained by a trigger on the inodes - * btree - no work for us to do here, and it's not safe to scan - * it because we'll see out of date keys due to the btree write - * buffer: - */ - if (id == BTREE_ID_deleted_inodes) + if (!btree_type_has_snapshots(btree)) continue; ret = for_each_btree_key_commit(trans, iter, - id, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - &res, NULL, BCH_TRANS_COMMIT_no_enospc, - snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?: - for_each_btree_key_commit(trans, iter, - id, POS_MIN, + btree, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, &res, NULL, BCH_TRANS_COMMIT_no_enospc, - move_key_to_correct_snapshot(trans, &iter, k)); + delete_dead_snapshots_process_key(trans, &iter, k, &deleted, + &equiv_seen, &last_pos)); bch2_disk_reservation_put(c, &res); darray_exit(&equiv_seen); From 08f50005e09f3bf74a7cb5fd86335d3c4077df51 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 26 May 2024 12:38:30 -0400 Subject: [PATCH 0226/1578] bcachefs: Run check_key_has_snapshot in snapshot_delete_keys() delete_dead_snapshots now runs before the main fsck.c passes which check for keys for invalid snapshots; thus, it needs those checks as well. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 27 ++++----------------------- fs/bcachefs/snapshot.c | 25 ++++++++++++++++++++++++- fs/bcachefs/snapshot.h | 1 + 3 files changed, 29 insertions(+), 24 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 4cd28db9bad8de..fd277bd58ed34a 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -766,25 +766,6 @@ static int get_visible_inodes(struct btree_trans *trans, return ret; } -static int check_key_has_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0; - - if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c, - bkey_in_missing_snapshot, - "key in missing snapshot: %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node) ?: 1; -fsck_err: - printbuf_exit(&buf); - return ret; -} - static int hash_redo_key(struct btree_trans *trans, const struct bch_hash_desc desc, struct bch_hash_info *hash_info, @@ -979,7 +960,7 @@ static int check_inode(struct btree_trans *trans, bool do_update = false; int ret; - ret = check_key_has_snapshot(trans, iter, k); + ret = bch2_check_key_has_snapshot(trans, iter, k); if (ret < 0) goto err; if (ret) @@ -1483,7 +1464,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, struct printbuf buf = PRINTBUF; int ret = 0; - ret = check_key_has_snapshot(trans, iter, k); + ret = bch2_check_key_has_snapshot(trans, iter, k); if (ret) { ret = ret < 0 ? ret : 0; goto out; @@ -2006,7 +1987,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct printbuf buf = PRINTBUF; int ret = 0; - ret = check_key_has_snapshot(trans, iter, k); + ret = bch2_check_key_has_snapshot(trans, iter, k); if (ret) { ret = ret < 0 ? ret : 0; goto out; @@ -2161,7 +2142,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, struct inode_walker_entry *i; int ret; - ret = check_key_has_snapshot(trans, iter, k); + ret = bch2_check_key_has_snapshot(trans, iter, k); if (ret < 0) return ret; if (ret) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 466fa3e6a4b656..51918acfd72681 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1042,6 +1042,25 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) return ret; } +int bch2_check_key_has_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c, + bkey_in_missing_snapshot, + "key in missing snapshot %s, delete?", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_internal_snapshot_node) ?: 1; +fsck_err: + printbuf_exit(&buf); + return ret; +} + /* * Mark a snapshot as deleted, for future cleanup: */ @@ -1358,6 +1377,10 @@ static int delete_dead_snapshots_process_key(struct btree_trans *trans, snapshot_id_list *equiv_seen, struct bpos *last_pos) { + int ret = bch2_check_key_has_snapshot(trans, iter, k); + if (ret) + return ret < 0 ? ret : 0; + struct bch_fs *c = trans->c; u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); if (!equiv) /* key for invalid snapshot node, but we chose not to delete */ @@ -1377,7 +1400,7 @@ static int delete_dead_snapshots_process_key(struct btree_trans *trans, *last_pos = k.k->p; - int ret = snapshot_list_add_nodup(c, equiv_seen, equiv); + ret = snapshot_list_add_nodup(c, equiv_seen, equiv); if (ret) return ret; diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index ab13d8f4b41e1e..31b0ee03e96288 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -242,6 +242,7 @@ int bch2_snapshot_node_create(struct btree_trans *, u32, int bch2_check_snapshot_trees(struct bch_fs *); int bch2_check_snapshots(struct bch_fs *); int bch2_reconstruct_snapshots(struct bch_fs *); +int bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c); int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); void bch2_delete_dead_snapshots_work(struct work_struct *); From 247c056bde2ebc9fad2fc62332dc7cc99b58d720 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 May 2024 16:30:19 -0400 Subject: [PATCH 0227/1578] bcachefs: Fix setting of downgrade recovery passes/errors bch2_check_version_downgrade() was setting c->sb.version, which bch2_sb_set_downgrade() expects to be at the previous version; and it shouldn't even have been set directly because c->sb.version is updated by write_super(). Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index f1bee6c5222d24..d73a0222f70954 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1132,18 +1132,12 @@ bool bch2_check_version_downgrade(struct bch_fs *c) * c->sb will be checked before we write the superblock, so update it as * well: */ - if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) { + if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); - c->sb.version_upgrade_complete = bcachefs_metadata_version_current; - } - if (c->sb.version > bcachefs_metadata_version_current) { + if (c->sb.version > bcachefs_metadata_version_current) c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); - c->sb.version = bcachefs_metadata_version_current; - } - if (c->sb.version_min > bcachefs_metadata_version_current) { + if (c->sb.version_min > bcachefs_metadata_version_current) c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current); - c->sb.version_min = bcachefs_metadata_version_current; - } c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); return ret; } From b4131076c16fdd2bc6cb09cfa7e0cfe278aa49a1 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Mon, 27 May 2024 12:01:18 -0700 Subject: [PATCH 0228/1578] bcachefs: add missing MODULE_DESCRIPTION() Fix the 'make W=1' warning: WARNING: modpost: missing MODULE_DESCRIPTION() in fs/bcachefs/mean_and_variance_test.o Signed-off-by: Jeff Johnson Signed-off-by: Kent Overstreet --- fs/bcachefs/mean_and_variance_test.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c index 4c298e74723db3..e9d9c0212e44b1 100644 --- a/fs/bcachefs/mean_and_variance_test.c +++ b/fs/bcachefs/mean_and_variance_test.c @@ -217,4 +217,5 @@ static struct kunit_suite mean_and_variance_test_suite = { kunit_test_suite(mean_and_variance_test_suite); MODULE_AUTHOR("Daniel B. Hill"); +MODULE_DESCRIPTION("bcachefs filesystem mean and variance unit tests"); MODULE_LICENSE("GPL"); From 088d0de81220a74d7d553febb81656927f10bb16 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 May 2024 18:40:50 -0400 Subject: [PATCH 0229/1578] bcachefs: btree_gc can now handle unknown btrees Compatibility fix - we no longer have a separate table for which order gc walks btrees in, and special case the stripes btree directly. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 44 +----------------------------------- fs/bcachefs/btree_gc.c | 9 ++++---- fs/bcachefs/btree_gc.h | 44 ++++++++++++++++-------------------- fs/bcachefs/btree_gc_types.h | 29 ++++++++++++++++++++++++ fs/bcachefs/ec.c | 2 +- 5 files changed, 55 insertions(+), 73 deletions(-) create mode 100644 fs/bcachefs/btree_gc_types.h diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index bc0ea2c4efef25..2a538eb2af110c 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -457,6 +457,7 @@ enum bch_time_stats { }; #include "alloc_types.h" +#include "btree_gc_types.h" #include "btree_types.h" #include "btree_node_scan_types.h" #include "btree_write_buffer_types.h" @@ -488,49 +489,6 @@ enum bch_time_stats { struct btree; -enum gc_phase { - GC_PHASE_NOT_RUNNING, - GC_PHASE_START, - GC_PHASE_SB, - - GC_PHASE_BTREE_stripes, - GC_PHASE_BTREE_extents, - GC_PHASE_BTREE_inodes, - GC_PHASE_BTREE_dirents, - GC_PHASE_BTREE_xattrs, - GC_PHASE_BTREE_alloc, - GC_PHASE_BTREE_quotas, - GC_PHASE_BTREE_reflink, - GC_PHASE_BTREE_subvolumes, - GC_PHASE_BTREE_snapshots, - GC_PHASE_BTREE_lru, - GC_PHASE_BTREE_freespace, - GC_PHASE_BTREE_need_discard, - GC_PHASE_BTREE_backpointers, - GC_PHASE_BTREE_bucket_gens, - GC_PHASE_BTREE_snapshot_trees, - GC_PHASE_BTREE_deleted_inodes, - GC_PHASE_BTREE_logged_ops, - GC_PHASE_BTREE_rebalance_work, - GC_PHASE_BTREE_subvolume_children, - - GC_PHASE_PENDING_DELETE, -}; - -struct gc_pos { - enum gc_phase phase; - u16 level; - struct bpos pos; -}; - -struct reflink_gc { - u64 offset; - u32 size; - u32 refcount; -}; - -typedef GENRADIX(struct reflink_gc) reflink_gc_table; - struct io_count { u64 sectors[2][BCH_DATA_NR]; }; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 8035c8b797ab37..e9e901feda2912 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -673,8 +673,7 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool in static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) { - return (int) btree_id_to_gc_phase(l) - - (int) btree_id_to_gc_phase(r); + return cmp_int(gc_btree_order(l), gc_btree_order(r)); } static int bch2_gc_btrees(struct bch_fs *c) @@ -711,7 +710,7 @@ static int bch2_gc_btrees(struct bch_fs *c) static int bch2_mark_superblocks(struct bch_fs *c) { mutex_lock(&c->sb_lock); - gc_pos_set(c, gc_phase(GC_PHASE_SB)); + gc_pos_set(c, gc_phase(GC_PHASE_sb)); int ret = bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc); mutex_unlock(&c->sb_lock); @@ -1209,7 +1208,7 @@ int bch2_check_allocations(struct bch_fs *c) if (ret) goto out; - gc_pos_set(c, gc_phase(GC_PHASE_START)); + gc_pos_set(c, gc_phase(GC_PHASE_start)); ret = bch2_mark_superblocks(c); BUG_ON(ret); @@ -1231,7 +1230,7 @@ int bch2_check_allocations(struct bch_fs *c) percpu_down_write(&c->mark_lock); /* Indicates that gc is no longer in progress: */ - __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); + __gc_pos_set(c, gc_phase(GC_PHASE_not_running)); bch2_gc_free(c); percpu_up_write(&c->mark_lock); diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 1b6489d8e0f4fa..876d81e2017d73 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -3,6 +3,7 @@ #define _BCACHEFS_BTREE_GC_H #include "bkey.h" +#include "btree_gc_types.h" #include "btree_types.h" int bch2_check_topology(struct bch_fs *); @@ -32,36 +33,15 @@ int bch2_check_allocations(struct bch_fs *); /* Position of (the start of) a gc phase: */ static inline struct gc_pos gc_phase(enum gc_phase phase) { - return (struct gc_pos) { - .phase = phase, - .level = 0, - .pos = POS_MIN, - }; -} - -static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) -{ - return cmp_int(l.phase, r.phase) ?: - -cmp_int(l.level, r.level) ?: - bpos_cmp(l.pos, r.pos); -} - -static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) -{ - switch (id) { -#define x(name, v, ...) case BTREE_ID_##name: return GC_PHASE_BTREE_##name; - BCH_BTREE_IDS() -#undef x - default: - BUG(); - } + return (struct gc_pos) { .phase = phase, }; } static inline struct gc_pos gc_pos_btree(enum btree_id btree, unsigned level, struct bpos pos) { return (struct gc_pos) { - .phase = btree_id_to_gc_phase(btree), + .phase = GC_PHASE_btree, + .btree = btree, .level = level, .pos = pos, }; @@ -76,6 +56,22 @@ static inline struct gc_pos gc_pos_btree_node(struct btree *b) return gc_pos_btree(b->c.btree_id, b->c.level, b->key.k.p); } +static inline int gc_btree_order(enum btree_id btree) +{ + if (btree == BTREE_ID_stripes) + return -1; + return btree; +} + +static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) +{ + return cmp_int(l.phase, r.phase) ?: + cmp_int(gc_btree_order(l.btree), + gc_btree_order(r.btree)) ?: + -cmp_int(l.level, r.level) ?: + bpos_cmp(l.pos, r.pos); +} + static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) { unsigned seq; diff --git a/fs/bcachefs/btree_gc_types.h b/fs/bcachefs/btree_gc_types.h new file mode 100644 index 00000000000000..b82c24bcc0880d --- /dev/null +++ b/fs/bcachefs/btree_gc_types.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_GC_TYPES_H +#define _BCACHEFS_BTREE_GC_TYPES_H + +#include + +enum gc_phase { + GC_PHASE_not_running, + GC_PHASE_start, + GC_PHASE_sb, + GC_PHASE_btree, +}; + +struct gc_pos { + enum gc_phase phase:8; + enum btree_id btree:8; + u16 level; + struct bpos pos; +}; + +struct reflink_gc { + u64 offset; + u32 size; + u32 refcount; +}; + +typedef GENRADIX(struct reflink_gc) reflink_gc_table; + +#endif /* _BCACHEFS_BTREE_GC_TYPES_H */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index b26dc742466239..d8b9beca377627 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -908,7 +908,7 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) if (!genradix_ptr_alloc(&c->stripes, idx, gfp)) return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; - if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING && + if (c->gc_pos.phase != GC_PHASE_not_running && !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; From f1d4fed13fa0c0f8c99cdbbc27460f1e1f46fd4e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 May 2024 19:17:09 -0400 Subject: [PATCH 0230/1578] bcachefs: Better fsck error message for key version Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index e9e901feda2912..dc97991bcd6ada 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -585,16 +585,17 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c, bkey_version_in_future, - "key version number higher than recorded: %llu > %llu", - k.k->version.lo, - atomic64_read(&c->key_version))) + "key version number higher than recorded %llu\n %s", + atomic64_read(&c->key_version), + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) atomic64_set(&c->key_version, k.k->version.lo); } if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k), c, btree_bitmap_not_marked, "btree ptr not marked in member info btree allocated bitmap\n %s", - (bch2_bkey_val_to_text(&buf, c, k), + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { mutex_lock(&c->sb_lock); bch2_dev_btree_bitmap_mark(c, k); From be647e2c76b27f409cdd520f66c95be888b553a3 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 21 May 2024 06:41:45 -0700 Subject: [PATCH 0231/1578] nvme: use srcu for iterating namespace list The nvme pci driver synchronizes with all the namespace queues during a reset to ensure that there's no pending timeout work. Meanwhile the timeout work potentially iterates those same namespaces to freeze their queues. Each of those namespace iterations use the same read lock. If a write lock should somehow get between the synchronize and freeze steps, then forward progress is deadlocked. We had been relying on the nvme controller state machine to ensure the reset work wouldn't conflict with timeout work. That guarantee may be a bit fragile to rely on, so iterate the namespace lists without taking potentially circular locks, as reported by lockdep. Link: https://lore.kernel.org/all/20220930001943.zdbvolc3gkekfmcv@shindev/ Reported-by: Shinichiro Kawasaki Tested-by: Shinichiro Kawasaki Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 99 +++++++++++++++++++++-------------- drivers/nvme/host/ioctl.c | 15 +++--- drivers/nvme/host/multipath.c | 21 ++++---- drivers/nvme/host/nvme.h | 4 +- 4 files changed, 83 insertions(+), 56 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 7706df23734945..f5d150c62955d8 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -678,7 +678,7 @@ static void nvme_free_ns(struct kref *kref) kfree(ns); } -static inline bool nvme_get_ns(struct nvme_ns *ns) +bool nvme_get_ns(struct nvme_ns *ns) { return kref_get_unless_zero(&ns->kref); } @@ -3684,9 +3684,10 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info) struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns *ns, *ret = NULL; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) { + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { if (ns->head->ns_id == nsid) { if (!nvme_get_ns(ns)) continue; @@ -3696,7 +3697,7 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (ns->head->ns_id > nsid) break; } - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); return ret; } EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU); @@ -3710,7 +3711,7 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns) list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) { if (tmp->head->ns_id < ns->head->ns_id) { - list_add(&ns->list, &tmp->list); + list_add_rcu(&ns->list, &tmp->list); return; } } @@ -3776,17 +3777,18 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) if (nvme_update_ns_info(ns, info)) goto out_unlink_ns; - down_write(&ctrl->namespaces_rwsem); + mutex_lock(&ctrl->namespaces_lock); /* * Ensure that no namespaces are added to the ctrl list after the queues * are frozen, thereby avoiding a deadlock between scan and reset. */ if (test_bit(NVME_CTRL_FROZEN, &ctrl->flags)) { - up_write(&ctrl->namespaces_rwsem); + mutex_unlock(&ctrl->namespaces_lock); goto out_unlink_ns; } nvme_ns_add_to_ctrl_list(ns); - up_write(&ctrl->namespaces_rwsem); + mutex_unlock(&ctrl->namespaces_lock); + synchronize_srcu(&ctrl->srcu); nvme_get_ctrl(ctrl); if (device_add_disk(ctrl->device, ns->disk, nvme_ns_attr_groups)) @@ -3809,9 +3811,10 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) out_cleanup_ns_from_list: nvme_put_ctrl(ctrl); - down_write(&ctrl->namespaces_rwsem); - list_del_init(&ns->list); - up_write(&ctrl->namespaces_rwsem); + mutex_lock(&ctrl->namespaces_lock); + list_del_rcu(&ns->list); + mutex_unlock(&ctrl->namespaces_lock); + synchronize_srcu(&ctrl->srcu); out_unlink_ns: mutex_lock(&ctrl->subsys->lock); list_del_rcu(&ns->siblings); @@ -3861,9 +3864,10 @@ static void nvme_ns_remove(struct nvme_ns *ns) nvme_cdev_del(&ns->cdev, &ns->cdev_device); del_gendisk(ns->disk); - down_write(&ns->ctrl->namespaces_rwsem); - list_del_init(&ns->list); - up_write(&ns->ctrl->namespaces_rwsem); + mutex_lock(&ns->ctrl->namespaces_lock); + list_del_rcu(&ns->list); + mutex_unlock(&ns->ctrl->namespaces_lock); + synchronize_srcu(&ns->ctrl->srcu); if (last_path) nvme_mpath_shutdown_disk(ns->head); @@ -3953,16 +3957,17 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, struct nvme_ns *ns, *next; LIST_HEAD(rm_list); - down_write(&ctrl->namespaces_rwsem); + mutex_lock(&ctrl->namespaces_lock); list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { if (ns->head->ns_id > nsid) - list_move_tail(&ns->list, &rm_list); + list_splice_init_rcu(&ns->list, &rm_list, + synchronize_rcu); } - up_write(&ctrl->namespaces_rwsem); + mutex_unlock(&ctrl->namespaces_lock); + synchronize_srcu(&ctrl->srcu); list_for_each_entry_safe(ns, next, &rm_list, list) nvme_ns_remove(ns); - } static int nvme_scan_ns_list(struct nvme_ctrl *ctrl) @@ -4132,9 +4137,10 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) /* this is a no-op when called from the controller reset handler */ nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO); - down_write(&ctrl->namespaces_rwsem); - list_splice_init(&ctrl->namespaces, &ns_list); - up_write(&ctrl->namespaces_rwsem); + mutex_lock(&ctrl->namespaces_lock); + list_splice_init_rcu(&ctrl->namespaces, &ns_list, synchronize_rcu); + mutex_unlock(&ctrl->namespaces_lock); + synchronize_srcu(&ctrl->srcu); list_for_each_entry_safe(ns, next, &ns_list, list) nvme_ns_remove(ns); @@ -4582,6 +4588,7 @@ static void nvme_free_ctrl(struct device *dev) key_put(ctrl->tls_key); nvme_free_cels(ctrl); nvme_mpath_uninit(ctrl); + cleanup_srcu_struct(&ctrl->srcu); nvme_auth_stop(ctrl); nvme_auth_free(ctrl); __free_page(ctrl->discard_page); @@ -4614,10 +4621,15 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, ctrl->passthru_err_log_enabled = false; clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags); spin_lock_init(&ctrl->lock); + mutex_init(&ctrl->namespaces_lock); + + ret = init_srcu_struct(&ctrl->srcu); + if (ret) + return ret; + mutex_init(&ctrl->scan_lock); INIT_LIST_HEAD(&ctrl->namespaces); xa_init(&ctrl->cels); - init_rwsem(&ctrl->namespaces_rwsem); ctrl->dev = dev; ctrl->ops = ops; ctrl->quirks = quirks; @@ -4697,6 +4709,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, out: if (ctrl->discard_page) __free_page(ctrl->discard_page); + cleanup_srcu_struct(&ctrl->srcu); return ret; } EXPORT_SYMBOL_GPL(nvme_init_ctrl); @@ -4705,22 +4718,24 @@ EXPORT_SYMBOL_GPL(nvme_init_ctrl); void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) blk_mark_disk_dead(ns->disk); - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); } EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead); void nvme_unfreeze(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) blk_mq_unfreeze_queue(ns->queue); - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); clear_bit(NVME_CTRL_FROZEN, &ctrl->flags); } EXPORT_SYMBOL_GPL(nvme_unfreeze); @@ -4728,14 +4743,15 @@ EXPORT_SYMBOL_GPL(nvme_unfreeze); int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout) { struct nvme_ns *ns; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) { + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout); if (timeout <= 0) break; } - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); return timeout; } EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout); @@ -4743,23 +4759,25 @@ EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout); void nvme_wait_freeze(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) blk_mq_freeze_queue_wait(ns->queue); - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); } EXPORT_SYMBOL_GPL(nvme_wait_freeze); void nvme_start_freeze(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + int srcu_idx; set_bit(NVME_CTRL_FROZEN, &ctrl->flags); - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) blk_freeze_queue_start(ns->queue); - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); } EXPORT_SYMBOL_GPL(nvme_start_freeze); @@ -4802,11 +4820,12 @@ EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue); void nvme_sync_io_queues(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) blk_sync_queue(ns->queue); - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); } EXPORT_SYMBOL_GPL(nvme_sync_io_queues); diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 499a8bb7cac7d1..9d9d2a127c4ec2 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -789,15 +789,15 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp, bool open_for_write) { struct nvme_ns *ns; - int ret; + int ret, srcu_idx; - down_read(&ctrl->namespaces_rwsem); + srcu_idx = srcu_read_lock(&ctrl->srcu); if (list_empty(&ctrl->namespaces)) { ret = -ENOTTY; goto out_unlock; } - ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); + ns = list_first_or_null_rcu(&ctrl->namespaces, struct nvme_ns, list); if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { dev_warn(ctrl->device, "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); @@ -807,15 +807,18 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp, dev_warn(ctrl->device, "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); - kref_get(&ns->kref); - up_read(&ctrl->namespaces_rwsem); + if (!nvme_get_ns(ns)) { + ret = -ENXIO; + goto out_unlock; + } + srcu_read_unlock(&ctrl->srcu, srcu_idx); ret = nvme_user_cmd(ctrl, ns, argp, 0, open_for_write); nvme_put_ns(ns); return ret; out_unlock: - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); return ret; } diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 1bee176fd850e3..d8b6b4648eaff9 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -151,16 +151,17 @@ void nvme_mpath_end_request(struct request *rq) void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) { + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { if (!ns->head->disk) continue; kblockd_schedule_work(&ns->head->requeue_work); if (nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE) disk_uevent(ns->head->disk, KOBJ_CHANGE); } - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); } static const char *nvme_ana_state_names[] = { @@ -194,13 +195,14 @@ bool nvme_mpath_clear_current_path(struct nvme_ns *ns) void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) { + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { nvme_mpath_clear_current_path(ns); kblockd_schedule_work(&ns->head->requeue_work); } - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); } void nvme_mpath_revalidate_paths(struct nvme_ns *ns) @@ -681,6 +683,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl, u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0; unsigned *nr_change_groups = data; struct nvme_ns *ns; + int srcu_idx; dev_dbg(ctrl->device, "ANA group %d: %s.\n", le32_to_cpu(desc->grpid), @@ -692,8 +695,8 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl, if (!nr_nsids) return 0; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) { + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { unsigned nsid; again: nsid = le32_to_cpu(desc->nsids[n]); @@ -706,7 +709,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl, if (ns->head->ns_id > nsid) goto again; } - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); return 0; } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index c43a30753d87a8..f3a41133ac3f97 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -282,7 +282,8 @@ struct nvme_ctrl { struct blk_mq_tag_set *tagset; struct blk_mq_tag_set *admin_tagset; struct list_head namespaces; - struct rw_semaphore namespaces_rwsem; + struct mutex namespaces_lock; + struct srcu_struct srcu; struct device ctrl_device; struct device *device; /* char device */ #ifdef CONFIG_NVME_HWMON @@ -1160,6 +1161,7 @@ void nvme_passthru_end(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u32 effects, struct nvme_command *cmd, int status); struct nvme_ctrl *nvme_ctrl_from_file(struct file *file); struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid); +bool nvme_get_ns(struct nvme_ns *ns); void nvme_put_ns(struct nvme_ns *ns); static inline bool nvme_multi_css(struct nvme_ctrl *ctrl) From c758b77d4a0a0ed3a1292b3fd7a2aeccd1a169a4 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 27 May 2024 22:38:52 +0300 Subject: [PATCH 0232/1578] nvmet: fix a possible leak when destroy a ctrl during qp establishment In nvmet_sq_destroy we capture sq->ctrl early and if it is non-NULL we know that a ctrl was allocated (in the admin connect request handler) and we need to release pending AERs, clear ctrl->sqs and sq->ctrl (for nvme-loop primarily), and drop the final reference on the ctrl. However, a small window is possible where nvmet_sq_destroy starts (as a result of the client giving up and disconnecting) concurrently with the nvme admin connect cmd (which may be in an early stage). But *before* kill_and_confirm of sq->ref (i.e. the admin connect managed to get an sq live reference). In this case, sq->ctrl was allocated however after it was captured in a local variable in nvmet_sq_destroy. This prevented the final reference drop on the ctrl. Solve this by re-capturing the sq->ctrl after all inflight request has completed, where for sure sq->ctrl reference is final, and move forward based on that. This issue was observed in an environment with many hosts connecting multiple ctrls simoutanuosly, creating a delay in allocating a ctrl leading up to this race window. Reported-by: Alex Turin Signed-off-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 2fde22323622e4..06f0c587f3437b 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -818,6 +818,15 @@ void nvmet_sq_destroy(struct nvmet_sq *sq) percpu_ref_exit(&sq->ref); nvmet_auth_sq_free(sq); + /* + * we must reference the ctrl again after waiting for inflight IO + * to complete. Because admin connect may have sneaked in after we + * store sq->ctrl locally, but before we killed the percpu_ref. the + * admin connect allocates and assigns sq->ctrl, which now needs a + * final ref put, as this ctrl is going away. + */ + ctrl = sq->ctrl; + if (ctrl) { /* * The teardown flow may take some time, and the host may not From a0fc1a053b7a212244ff109e44469b8deff280c5 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Fri, 24 May 2024 16:58:26 -0700 Subject: [PATCH 0233/1578] of: of_test: add MODULE_DESCRIPTION() Fix the 'make W=1' warning: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/of/of_test.o Signed-off-by: Jeff Johnson Link: https://lore.kernel.org/r/20240524-md-of-of_test-v1-1-6ebd078d620f@quicinc.com Signed-off-by: Rob Herring (Arm) --- drivers/of/of_test.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/of/of_test.c b/drivers/of/of_test.c index a9301d293f014c..c85a258bc6ae64 100644 --- a/drivers/of/of_test.c +++ b/drivers/of/of_test.c @@ -54,4 +54,5 @@ static struct kunit_suite of_dtb_suite = { kunit_test_suites( &of_dtb_suite, ); +MODULE_DESCRIPTION("KUnit tests for OF APIs"); MODULE_LICENSE("GPL"); From ac4b069035783f7a54b3ab841119f4b6bf435f98 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 28 May 2024 15:10:28 -0300 Subject: [PATCH 0234/1578] tools arch x86: Sync the msr-index.h copy with the kernel sources To pick up the changes from these csets: 53bc516ade85a764 ("x86/msr: Move ARCH_CAP_XAPIC_DISABLE bit definition to its rightful place") That patch just move definitions around, so this just silences this perf build warning: Warning: Kernel ABI header differences: diff -u tools/arch/x86/include/asm/msr-index.h arch/x86/include/asm/msr-index. Cc: Adrian Hunter Cc: Borislav Petkov (AMD) Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: Pawan Gupta Link: https://lore.kernel.org/lkml/ZlYe8jOzd1_DyA7X@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/msr-index.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index e72c2b87295799..e022e6eb766c64 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -170,6 +170,10 @@ * CPU is not affected by Branch * History Injection. */ +#define ARCH_CAP_XAPIC_DISABLE BIT(21) /* + * IA32_XAPIC_DISABLE_STATUS MSR + * supported + */ #define ARCH_CAP_PBRSB_NO BIT(24) /* * Not susceptible to Post-Barrier * Return Stack Buffer Predictions. @@ -192,11 +196,6 @@ * File. */ -#define ARCH_CAP_XAPIC_DISABLE BIT(21) /* - * IA32_XAPIC_DISABLE_STATUS MSR - * supported - */ - #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* * Writeback and invalidate the From 88e520512a68b4e724cb0f4281c79ae28673ccb1 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 28 May 2024 16:24:15 -0300 Subject: [PATCH 0235/1578] tools headers UAPI: Sync kvm headers with the kernel sources To pick the changes in: 4af663c2f64a8d25 ("KVM: SEV: Allow per-guest configuration of GHCB protocol version") 4f5defae708992dd ("KVM: SEV: introduce KVM_SEV_INIT2 operation") 26c44aa9e076ed83 ("KVM: SEV: define VM types for SEV and SEV-ES") ac5c48027bacb1b5 ("KVM: SEV: publish supported VMSA features") 651d61bc8b7d8bb6 ("KVM: PPC: Fix documentation for ppc mmu caps") That don't change functionality in tools/perf, as no new ioctl is added for the 'perf trace' scripts to harvest. This addresses these perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h diff -u tools/arch/x86/include/uapi/asm/kvm.h arch/x86/include/uapi/asm/kvm.h Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Joel Stanley Cc: Michael Ellerman Cc: Michael Roth Cc: Namhyung Kim Cc: Paolo Bonzini Link: https://lore.kernel.org/lkml/ZlYxAdHjyAkvGtMW@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/uapi/asm/kvm.h | 22 ++++++++++++++++++++-- tools/include/uapi/linux/kvm.h | 4 ++-- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index ef11aa4cab4253..9fae1b73b529ca 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -457,8 +457,13 @@ struct kvm_sync_regs { #define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001 -/* attributes for system fd (group 0) */ -#define KVM_X86_XCOMP_GUEST_SUPP 0 +/* vendor-independent attributes for system fd (group 0) */ +#define KVM_X86_GRP_SYSTEM 0 +# define KVM_X86_XCOMP_GUEST_SUPP 0 + +/* vendor-specific groups and attributes for system fd */ +#define KVM_X86_GRP_SEV 1 +# define KVM_X86_SEV_VMSA_FEATURES 0 struct kvm_vmx_nested_state_data { __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; @@ -689,6 +694,9 @@ enum sev_cmd_id { /* Guest Migration Extension */ KVM_SEV_SEND_CANCEL, + /* Second time is the charm; improved versions of the above ioctls. */ + KVM_SEV_INIT2, + KVM_SEV_NR_MAX, }; @@ -700,6 +708,14 @@ struct kvm_sev_cmd { __u32 sev_fd; }; +struct kvm_sev_init { + __u64 vmsa_features; + __u32 flags; + __u16 ghcb_version; + __u16 pad1; + __u32 pad2[8]; +}; + struct kvm_sev_launch_start { __u32 handle; __u32 policy; @@ -856,5 +872,7 @@ struct kvm_hyperv_eventfd { #define KVM_X86_DEFAULT_VM 0 #define KVM_X86_SW_PROTECTED_VM 1 +#define KVM_X86_SEV_VM 2 +#define KVM_X86_SEV_ES_VM 3 #endif /* _ASM_X86_KVM_H */ diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index ea32b101b9999d..d03842abae5784 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1221,9 +1221,9 @@ struct kvm_vfio_spapr_tce { /* Available with KVM_CAP_SPAPR_RESIZE_HPT */ #define KVM_PPC_RESIZE_HPT_PREPARE _IOR(KVMIO, 0xad, struct kvm_ppc_resize_hpt) #define KVM_PPC_RESIZE_HPT_COMMIT _IOR(KVMIO, 0xae, struct kvm_ppc_resize_hpt) -/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_MMU_HASH_V3 */ +/* Available with KVM_CAP_PPC_MMU_RADIX or KVM_CAP_PPC_MMU_HASH_V3 */ #define KVM_PPC_CONFIGURE_V3_MMU _IOW(KVMIO, 0xaf, struct kvm_ppc_mmuv3_cfg) -/* Available with KVM_CAP_PPC_RADIX_MMU */ +/* Available with KVM_CAP_PPC_MMU_RADIX */ #define KVM_PPC_GET_RMMU_INFO _IOW(KVMIO, 0xb0, struct kvm_ppc_rmmu_info) /* Available with KVM_CAP_PPC_GET_CPU_CHAR */ #define KVM_PPC_GET_CPU_CHAR _IOR(KVMIO, 0xb1, struct kvm_ppc_cpu_char) From c7a5096781732e0f9784551309484f3e103f6750 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 28 May 2024 13:25:02 +0300 Subject: [PATCH 0236/1578] PNP: Make dev_is_pnp() to be a function and export it for modules Since we have a dev_is_pnp() macro that utilises the address of the pnp_bus_type variable, the users, which can be compiled as modules, will fail to build. Convert the macro to be a function and export it to the modules to prevent build breakage. Reported-by: Woody Suwalski Closes: https://lore.kernel.org/r/cc8a93b2-2504-9754-e26c-5d5c3bd1265c@gmail.com Fixes: 2a49b45cd0e7 ("PNP: Add dev_is_pnp() macro") Signed-off-by: Andy Shevchenko Reviewed-by: Christoph Hellwig Signed-off-by: Rafael J. Wysocki --- drivers/pnp/driver.c | 6 ++++++ include/linux/pnp.h | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/pnp/driver.c b/drivers/pnp/driver.c index 0a5d0d8befa840..3483e52e3a81d1 100644 --- a/drivers/pnp/driver.c +++ b/drivers/pnp/driver.c @@ -266,6 +266,12 @@ const struct bus_type pnp_bus_type = { .dev_groups = pnp_dev_groups, }; +bool dev_is_pnp(const struct device *dev) +{ + return dev->bus == &pnp_bus_type; +} +EXPORT_SYMBOL_GPL(dev_is_pnp); + int pnp_register_driver(struct pnp_driver *drv) { drv->driver.name = drv->name; diff --git a/include/linux/pnp.h b/include/linux/pnp.h index 82561242cda428..a8def1cea32c00 100644 --- a/include/linux/pnp.h +++ b/include/linux/pnp.h @@ -469,7 +469,7 @@ int compare_pnp_id(struct pnp_id *pos, const char *id); int pnp_register_driver(struct pnp_driver *drv); void pnp_unregister_driver(struct pnp_driver *drv); -#define dev_is_pnp(d) ((d)->bus == &pnp_bus_type) +bool dev_is_pnp(const struct device *dev); #else @@ -502,7 +502,7 @@ static inline int compare_pnp_id(struct pnp_id *pos, const char *id) { return -E static inline int pnp_register_driver(struct pnp_driver *drv) { return -ENODEV; } static inline void pnp_unregister_driver(struct pnp_driver *drv) { } -#define dev_is_pnp(d) false +static inline bool dev_is_pnp(const struct device *dev) { return false; } #endif /* CONFIG_PNP */ From edcde848c01eb071a91d479a6b3101d9cf48e905 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 28 May 2024 13:25:03 +0300 Subject: [PATCH 0237/1578] PNP: Hide pnp_bus_type from the non-PNP code The pnp_bus_type is defined only when CONFIG_PNP=y, while being not guarded by ifdeffery in the header. Moreover, it's not used outside of the PNP code. Move it to the internal header to make sure no-one will try to (ab)use it. Signed-off-by: Andy Shevchenko Reviewed-by: Christoph Hellwig Signed-off-by: Rafael J. Wysocki --- drivers/pnp/base.h | 1 + include/linux/pnp.h | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/pnp/base.h b/drivers/pnp/base.h index e74a0f6a31572b..4e80273dfb1ec9 100644 --- a/drivers/pnp/base.h +++ b/drivers/pnp/base.h @@ -6,6 +6,7 @@ extern struct mutex pnp_lock; extern const struct attribute_group *pnp_dev_groups[]; +extern const struct bus_type pnp_bus_type; int pnp_register_protocol(struct pnp_protocol *protocol); void pnp_unregister_protocol(struct pnp_protocol *protocol); diff --git a/include/linux/pnp.h b/include/linux/pnp.h index a8def1cea32c00..7f2ff95d2deb5e 100644 --- a/include/linux/pnp.h +++ b/include/linux/pnp.h @@ -435,8 +435,6 @@ struct pnp_protocol { #define protocol_for_each_dev(protocol, dev) \ list_for_each_entry(dev, &(protocol)->devices, protocol_list) -extern const struct bus_type pnp_bus_type; - #if defined(CONFIG_PNP) /* device management */ From 2f523f29d3b19a668b8d4ce6f768d8faff976b3a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 28 May 2024 16:54:31 -0300 Subject: [PATCH 0238/1578] tools headers UAPI: Update i915_drm.h with the kernel sources Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/ Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/drm/i915_drm.h | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h index 2ee338860b7e08..d4d86e566e0777 100644 --- a/tools/include/uapi/drm/i915_drm.h +++ b/tools/include/uapi/drm/i915_drm.h @@ -806,6 +806,12 @@ typedef struct drm_i915_irq_wait { */ #define I915_PARAM_PXP_STATUS 58 +/* + * Query if kernel allows marking a context to send a Freq hint to SLPC. This + * will enable use of the strategies allowed by the SLPC algorithm. + */ +#define I915_PARAM_HAS_CONTEXT_FREQ_HINT 59 + /* Must be kept compact -- no holes and well documented */ /** @@ -2148,6 +2154,15 @@ struct drm_i915_gem_context_param { * -EIO: The firmware did not succeed in creating the protected context. */ #define I915_CONTEXT_PARAM_PROTECTED_CONTENT 0xd + +/* + * I915_CONTEXT_PARAM_LOW_LATENCY: + * + * Mark this context as a low latency workload which requires aggressive GT + * frequency scaling. Use I915_PARAM_HAS_CONTEXT_FREQ_HINT to check if the kernel + * supports this per context flag. + */ +#define I915_CONTEXT_PARAM_LOW_LATENCY 0xe /* Must be kept compact -- no holes and well documented */ /** @value: Context parameter value to be set or queried */ @@ -2623,19 +2638,29 @@ struct drm_i915_reg_read { * */ +/* + * struct drm_i915_reset_stats - Return global reset and other context stats + * + * Driver keeps few stats for each contexts and also global reset count. + * This struct can be used to query those stats. + */ struct drm_i915_reset_stats { + /** @ctx_id: ID of the requested context */ __u32 ctx_id; + + /** @flags: MBZ */ __u32 flags; - /* All resets since boot/module reload, for all contexts */ + /** @reset_count: All resets since boot/module reload, for all contexts */ __u32 reset_count; - /* Number of batches lost when active in GPU, for this context */ + /** @batch_active: Number of batches lost when active in GPU, for this context */ __u32 batch_active; - /* Number of batches lost pending for execution, for this context */ + /** @batch_pending: Number of batches lost pending for execution, for this context */ __u32 batch_pending; + /** @pad: MBZ */ __u32 pad; }; From ac62f52138f752d6c74adc6321e4996d84caf5bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 26 May 2024 23:40:01 +0200 Subject: [PATCH 0239/1578] ACPI: AC: Properly notify powermanagement core about changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The powermanagement core does various actions when a powersupply changes. It calls into notifiers, LED triggers, other power supplies and emits an uevent. To make sure that all these actions happen properly call power_supply_changed(). Reported-by: Rajas Paranjpe Closes: https://github.com/MrChromebox/firmware/issues/420#issuecomment-2132251318 Signed-off-by: Thomas Weißschuh Reviewed-by: Sebastian Reichel Signed-off-by: Rafael J. Wysocki --- drivers/acpi/ac.c | 4 ++-- drivers/acpi/sbs.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/ac.c b/drivers/acpi/ac.c index 2d4a35e6dd18d5..09a87fa222c78f 100644 --- a/drivers/acpi/ac.c +++ b/drivers/acpi/ac.c @@ -145,7 +145,7 @@ static void acpi_ac_notify(acpi_handle handle, u32 event, void *data) dev_name(&adev->dev), event, (u32) ac->state); acpi_notifier_call_chain(adev, event, (u32) ac->state); - kobject_uevent(&ac->charger->dev.kobj, KOBJ_CHANGE); + power_supply_changed(ac->charger); } } @@ -268,7 +268,7 @@ static int acpi_ac_resume(struct device *dev) if (acpi_ac_get_state(ac)) return 0; if (old_state != ac->state) - kobject_uevent(&ac->charger->dev.kobj, KOBJ_CHANGE); + power_supply_changed(ac->charger); return 0; } diff --git a/drivers/acpi/sbs.c b/drivers/acpi/sbs.c index 94e3c000df2e16..dc8164b182dccf 100644 --- a/drivers/acpi/sbs.c +++ b/drivers/acpi/sbs.c @@ -610,7 +610,7 @@ static void acpi_sbs_callback(void *context) if (sbs->charger_exists) { acpi_ac_get_present(sbs); if (sbs->charger_present != saved_charger_state) - kobject_uevent(&sbs->charger->dev.kobj, KOBJ_CHANGE); + power_supply_changed(sbs->charger); } if (sbs->manager_present) { @@ -622,7 +622,7 @@ static void acpi_sbs_callback(void *context) acpi_battery_read(bat); if (saved_battery_state == bat->present) continue; - kobject_uevent(&bat->bat->dev.kobj, KOBJ_CHANGE); + power_supply_changed(bat->bat); } } } From 779b8a14afde110dd3502566be907289eba72447 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 28 May 2024 14:09:23 +0200 Subject: [PATCH 0240/1578] cpufreq: amd-pstate: remove global header file When extra warnings are enabled, gcc points out a global variable definition in a header: In file included from drivers/cpufreq/amd-pstate-ut.c:29: include/linux/amd-pstate.h:123:27: error: 'amd_pstate_mode_string' defined but not used [-Werror=unused-const-variable=] 123 | static const char * const amd_pstate_mode_string[] = { | ^~~~~~~~~~~~~~~~~~~~~~ This header is only included from two files in the same directory, and one of them uses only a single definition from it, so clean it up by moving most of the contents into the driver that uses them, and making shared bits a local header file. Fixes: 36c5014e5460 ("cpufreq: amd-pstate: optimize driver working mode selection in amd_pstate_param()") Signed-off-by: Arnd Bergmann Acked-by: Mario Limonciello Signed-off-by: Rafael J. Wysocki --- MAINTAINERS | 1 - drivers/cpufreq/amd-pstate-ut.c | 3 +- drivers/cpufreq/amd-pstate.c | 34 ++++++++++++++++++- .../linux => drivers/cpufreq}/amd-pstate.h | 33 ------------------ 4 files changed, 35 insertions(+), 36 deletions(-) rename {include/linux => drivers/cpufreq}/amd-pstate.h (82%) diff --git a/MAINTAINERS b/MAINTAINERS index d6c90161c7bfe3..fc31870379f84e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1107,7 +1107,6 @@ L: linux-pm@vger.kernel.org S: Supported F: Documentation/admin-guide/pm/amd-pstate.rst F: drivers/cpufreq/amd-pstate* -F: include/linux/amd-pstate.h F: tools/power/x86/amd_pstate_tracer/amd_pstate_trace.py AMD PTDMA DRIVER diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c index f04ae67dda372d..fc275d41d51e9b 100644 --- a/drivers/cpufreq/amd-pstate-ut.c +++ b/drivers/cpufreq/amd-pstate-ut.c @@ -26,10 +26,11 @@ #include #include #include -#include #include +#include "amd-pstate.h" + /* * Abbreviations: * amd_pstate_ut: used as a shortform for AMD P-State unit test. diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 1b7e82a0ad2e62..91993647e09ef8 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -36,7 +36,6 @@ #include #include #include -#include #include #include @@ -46,6 +45,8 @@ #include #include #include + +#include "amd-pstate.h" #include "amd-pstate-trace.h" #define AMD_PSTATE_TRANSITION_LATENCY 20000 @@ -53,6 +54,37 @@ #define CPPC_HIGHEST_PERF_PERFORMANCE 196 #define CPPC_HIGHEST_PERF_DEFAULT 166 +#define AMD_CPPC_EPP_PERFORMANCE 0x00 +#define AMD_CPPC_EPP_BALANCE_PERFORMANCE 0x80 +#define AMD_CPPC_EPP_BALANCE_POWERSAVE 0xBF +#define AMD_CPPC_EPP_POWERSAVE 0xFF + +/* + * enum amd_pstate_mode - driver working mode of amd pstate + */ +enum amd_pstate_mode { + AMD_PSTATE_UNDEFINED = 0, + AMD_PSTATE_DISABLE, + AMD_PSTATE_PASSIVE, + AMD_PSTATE_ACTIVE, + AMD_PSTATE_GUIDED, + AMD_PSTATE_MAX, +}; + +static const char * const amd_pstate_mode_string[] = { + [AMD_PSTATE_UNDEFINED] = "undefined", + [AMD_PSTATE_DISABLE] = "disable", + [AMD_PSTATE_PASSIVE] = "passive", + [AMD_PSTATE_ACTIVE] = "active", + [AMD_PSTATE_GUIDED] = "guided", + NULL, +}; + +struct quirk_entry { + u32 nominal_freq; + u32 lowest_freq; +}; + /* * TODO: We need more time to fine tune processors with shared memory solution * with community together. diff --git a/include/linux/amd-pstate.h b/drivers/cpufreq/amd-pstate.h similarity index 82% rename from include/linux/amd-pstate.h rename to drivers/cpufreq/amd-pstate.h index d58fc022ec466e..e6a28e7f4dbf12 100644 --- a/include/linux/amd-pstate.h +++ b/drivers/cpufreq/amd-pstate.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * linux/include/linux/amd-pstate.h - * * Copyright (C) 2022 Advanced Micro Devices, Inc. * * Author: Meng Li @@ -12,11 +10,6 @@ #include -#define AMD_CPPC_EPP_PERFORMANCE 0x00 -#define AMD_CPPC_EPP_BALANCE_PERFORMANCE 0x80 -#define AMD_CPPC_EPP_BALANCE_POWERSAVE 0xBF -#define AMD_CPPC_EPP_POWERSAVE 0xFF - /********************************************************************* * AMD P-state INTERFACE * *********************************************************************/ @@ -108,30 +101,4 @@ struct amd_cpudata { bool suspended; }; -/* - * enum amd_pstate_mode - driver working mode of amd pstate - */ -enum amd_pstate_mode { - AMD_PSTATE_UNDEFINED = 0, - AMD_PSTATE_DISABLE, - AMD_PSTATE_PASSIVE, - AMD_PSTATE_ACTIVE, - AMD_PSTATE_GUIDED, - AMD_PSTATE_MAX, -}; - -static const char * const amd_pstate_mode_string[] = { - [AMD_PSTATE_UNDEFINED] = "undefined", - [AMD_PSTATE_DISABLE] = "disable", - [AMD_PSTATE_PASSIVE] = "passive", - [AMD_PSTATE_ACTIVE] = "active", - [AMD_PSTATE_GUIDED] = "guided", - NULL, -}; - -struct quirk_entry { - u32 nominal_freq; - u32 lowest_freq; -}; - #endif /* _LINUX_AMD_PSTATE_H */ From e4731baaf29438508197d3a8a6d4f5a8c51663f8 Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Mon, 27 May 2024 10:41:28 +0530 Subject: [PATCH 0241/1578] cpufreq: amd-pstate: Fix the inconsistency in max frequency units The nominal frequency in cpudata is maintained in MHz whereas all other frequencies are in KHz. This means we have to convert nominal frequency value to KHz before we do any interaction with other frequency values. In amd_pstate_set_boost(), this conversion from MHz to KHz is missed, fix that. Tested on a AMD Zen4 EPYC server Before: $ cat /sys/devices/system/cpu/cpufreq/policy*/scaling_max_freq | uniq 2151 $ cat /sys/devices/system/cpu/cpufreq/policy*/cpuinfo_min_freq | uniq 400000 $ cat /sys/devices/system/cpu/cpufreq/policy*/scaling_cur_freq | uniq 2151 409422 After: $ cat /sys/devices/system/cpu/cpufreq/policy*/scaling_max_freq | uniq 2151000 $ cat /sys/devices/system/cpu/cpufreq/policy*/cpuinfo_min_freq | uniq 400000 $ cat /sys/devices/system/cpu/cpufreq/policy*/scaling_cur_freq | uniq 2151000 1799527 Fixes: ec437d71db77 ("cpufreq: amd-pstate: Introduce a new AMD P-State driver to support future processors") Signed-off-by: Dhananjay Ugwekar Acked-by: Mario Limonciello Acked-by: Gautham R. Shenoy Tested-by: Peter Jung Cc: 5.17+ # 5.17+ Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/amd-pstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 91993647e09ef8..9ad62dbe8bfbff 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -701,7 +701,7 @@ static int amd_pstate_set_boost(struct cpufreq_policy *policy, int state) if (state) policy->cpuinfo.max_freq = cpudata->max_freq; else - policy->cpuinfo.max_freq = cpudata->nominal_freq; + policy->cpuinfo.max_freq = cpudata->nominal_freq * 1000; policy->max = policy->cpuinfo.max_freq; From 016c22e410c6eabfc5164f514d2a0ad06eddf3ba Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 May 2024 16:47:34 -0400 Subject: [PATCH 0242/1578] bcachefs: split out sb-members_format.h Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 102 +---------------------------- fs/bcachefs/sb-members_format.h | 110 ++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 101 deletions(-) create mode 100644 fs/bcachefs/sb-members_format.h diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index d801e19cb4890e..c4d10b528b5490 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -513,6 +513,7 @@ struct bch_sb_field { #include "snapshot_format.h" #include "subvolume_format.h" #include "sb-counters_format.h" +#include "sb-members_format.h" enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, @@ -545,106 +546,6 @@ struct bch_sb_field_journal_v2 { } d[]; }; -/* BCH_SB_FIELD_members_v1: */ - -#define BCH_MIN_NR_NBUCKETS (1 << 6) - -#define BCH_IOPS_MEASUREMENTS() \ - x(seqread, 0) \ - x(seqwrite, 1) \ - x(randread, 2) \ - x(randwrite, 3) - -enum bch_iops_measurement { -#define x(t, n) BCH_IOPS_##t = n, - BCH_IOPS_MEASUREMENTS() -#undef x - BCH_IOPS_NR -}; - -#define BCH_MEMBER_ERROR_TYPES() \ - x(read, 0) \ - x(write, 1) \ - x(checksum, 2) - -enum bch_member_error_type { -#define x(t, n) BCH_MEMBER_ERROR_##t = n, - BCH_MEMBER_ERROR_TYPES() -#undef x - BCH_MEMBER_ERROR_NR -}; - -struct bch_member { - __uuid_t uuid; - __le64 nbuckets; /* device size */ - __le16 first_bucket; /* index of first bucket used */ - __le16 bucket_size; /* sectors */ - __u8 btree_bitmap_shift; - __u8 pad[3]; - __le64 last_mount; /* time_t */ - - __le64 flags; - __le32 iops[4]; - __le64 errors[BCH_MEMBER_ERROR_NR]; - __le64 errors_at_reset[BCH_MEMBER_ERROR_NR]; - __le64 errors_reset_time; - __le64 seq; - __le64 btree_allocated_bitmap; - /* - * On recovery from a clean shutdown we don't normally read the journal, - * but we still want to resume writing from where we left off so we - * don't overwrite more than is necessary, for list journal debugging: - */ - __le32 last_journal_bucket; - __le32 last_journal_bucket_offset; -}; - -/* - * This limit comes from the bucket_gens array - it's a single allocation, and - * kernel allocation are limited to INT_MAX - */ -#define BCH_MEMBER_NBUCKETS_MAX (INT_MAX - 64) - -#define BCH_MEMBER_V1_BYTES 56 - -LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4) -/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */ -LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15) -LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20) -LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28) -LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30) -LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, - struct bch_member, flags, 30, 31) - -#if 0 -LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); -LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); -#endif - -#define BCH_MEMBER_STATES() \ - x(rw, 0) \ - x(ro, 1) \ - x(failed, 2) \ - x(spare, 3) - -enum bch_member_state { -#define x(t, n) BCH_MEMBER_STATE_##t = n, - BCH_MEMBER_STATES() -#undef x - BCH_MEMBER_STATE_NR -}; - -struct bch_sb_field_members_v1 { - struct bch_sb_field field; - struct bch_member _members[]; //Members are now variable size -}; - -struct bch_sb_field_members_v2 { - struct bch_sb_field field; - __le16 member_bytes; //size of single member entry - u8 pad[6]; - struct bch_member _members[]; -}; /* BCH_SB_FIELD_crypt: */ @@ -909,7 +810,6 @@ unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_re #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) #define BCH_SB_SECTOR 8 -#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */ #define BCH_SB_LAYOUT_SIZE_BITS_MAX 16 /* 32 MB */ diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h new file mode 100644 index 00000000000000..e2630548c0f681 --- /dev/null +++ b/fs/bcachefs/sb-members_format.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_MEMBERS_FORMAT_H +#define _BCACHEFS_SB_MEMBERS_FORMAT_H + +/* + * We refer to members with bitmasks in various places - but we need to get rid + * of this limit: + */ +#define BCH_SB_MEMBERS_MAX 64 + +#define BCH_MIN_NR_NBUCKETS (1 << 6) + +#define BCH_IOPS_MEASUREMENTS() \ + x(seqread, 0) \ + x(seqwrite, 1) \ + x(randread, 2) \ + x(randwrite, 3) + +enum bch_iops_measurement { +#define x(t, n) BCH_IOPS_##t = n, + BCH_IOPS_MEASUREMENTS() +#undef x + BCH_IOPS_NR +}; + +#define BCH_MEMBER_ERROR_TYPES() \ + x(read, 0) \ + x(write, 1) \ + x(checksum, 2) + +enum bch_member_error_type { +#define x(t, n) BCH_MEMBER_ERROR_##t = n, + BCH_MEMBER_ERROR_TYPES() +#undef x + BCH_MEMBER_ERROR_NR +}; + +struct bch_member { + __uuid_t uuid; + __le64 nbuckets; /* device size */ + __le16 first_bucket; /* index of first bucket used */ + __le16 bucket_size; /* sectors */ + __u8 btree_bitmap_shift; + __u8 pad[3]; + __le64 last_mount; /* time_t */ + + __le64 flags; + __le32 iops[4]; + __le64 errors[BCH_MEMBER_ERROR_NR]; + __le64 errors_at_reset[BCH_MEMBER_ERROR_NR]; + __le64 errors_reset_time; + __le64 seq; + __le64 btree_allocated_bitmap; + /* + * On recovery from a clean shutdown we don't normally read the journal, + * but we still want to resume writing from where we left off so we + * don't overwrite more than is necessary, for list journal debugging: + */ + __le32 last_journal_bucket; + __le32 last_journal_bucket_offset; +}; + +/* + * This limit comes from the bucket_gens array - it's a single allocation, and + * kernel allocation are limited to INT_MAX + */ +#define BCH_MEMBER_NBUCKETS_MAX (INT_MAX - 64) + +#define BCH_MEMBER_V1_BYTES 56 + +LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4) +/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */ +LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15) +LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20) +LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28) +LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30) +LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, + struct bch_member, flags, 30, 31) + +#if 0 +LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); +LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); +#endif + +#define BCH_MEMBER_STATES() \ + x(rw, 0) \ + x(ro, 1) \ + x(failed, 2) \ + x(spare, 3) + +enum bch_member_state { +#define x(t, n) BCH_MEMBER_STATE_##t = n, + BCH_MEMBER_STATES() +#undef x + BCH_MEMBER_STATE_NR +}; + +struct bch_sb_field_members_v1 { + struct bch_sb_field field; + struct bch_member _members[]; //Members are now variable size +}; + +struct bch_sb_field_members_v2 { + struct bch_sb_field field; + __le16 member_bytes; //size of single member entry + u8 pad[6]; + struct bch_member _members[]; +}; + +#endif /* _BCACHEFS_SB_MEMBERS_FORMAT_H */ From 4c5eef0c50ccc71e225f320835dc7cd51f64f961 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 May 2024 16:54:29 -0400 Subject: [PATCH 0243/1578] bcachefs: split out sb-downgrade_format.h Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 14 +------------- fs/bcachefs/sb-downgrade_format.h | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 13 deletions(-) create mode 100644 fs/bcachefs/sb-downgrade_format.h diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index c4d10b528b5490..c966eb7e37b834 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -513,6 +513,7 @@ struct bch_sb_field { #include "snapshot_format.h" #include "subvolume_format.h" #include "sb-counters_format.h" +#include "sb-downgrade_format.h" #include "sb-members_format.h" enum bch_sb_field_type { @@ -546,7 +547,6 @@ struct bch_sb_field_journal_v2 { } d[]; }; - /* BCH_SB_FIELD_crypt: */ struct nonce { @@ -738,18 +738,6 @@ struct bch_sb_field_ext { __le64 btrees_lost_data; }; -struct bch_sb_field_downgrade_entry { - __le16 version; - __le64 recovery_passes[2]; - __le16 nr_errors; - __le16 errors[] __counted_by(nr_errors); -} __packed __aligned(2); - -struct bch_sb_field_downgrade { - struct bch_sb_field field; - struct bch_sb_field_downgrade_entry entries[]; -}; - /* Superblock: */ /* diff --git a/fs/bcachefs/sb-downgrade_format.h b/fs/bcachefs/sb-downgrade_format.h new file mode 100644 index 00000000000000..cffd932be3eca0 --- /dev/null +++ b/fs/bcachefs/sb-downgrade_format.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_DOWNGRADE_FORMAT_H +#define _BCACHEFS_SB_DOWNGRADE_FORMAT_H + +struct bch_sb_field_downgrade_entry { + __le16 version; + __le64 recovery_passes[2]; + __le16 nr_errors; + __le16 errors[] __counted_by(nr_errors); +} __packed __aligned(2); + +struct bch_sb_field_downgrade { + struct bch_sb_field field; + struct bch_sb_field_downgrade_entry entries[]; +}; + +#endif /* _BCACHEFS_SB_DOWNGRADE_FORMAT_H */ From 1cdcc6e3c2cb69a235bd32eb0da51205e432f77f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 May 2024 17:06:51 -0400 Subject: [PATCH 0244/1578] bcachefs: Split out disk_groups_format.h Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 19 +------------------ fs/bcachefs/disk_groups_format.h | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 18 deletions(-) create mode 100644 fs/bcachefs/disk_groups_format.h diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index c966eb7e37b834..b7690fc3ffc62b 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -507,6 +507,7 @@ struct bch_sb_field { #include "ec_format.h" #include "inode_format.h" #include "dirent_format.h" +#include "disk_groups_format.h" #include "xattr_format.h" #include "quota_format.h" #include "logged_ops_format.h" @@ -665,24 +666,6 @@ struct bch_sb_field_replicas { struct bch_replicas_entry_v1 entries[]; } __packed __aligned(8); -/* BCH_SB_FIELD_disk_groups: */ - -#define BCH_SB_LABEL_SIZE 32 - -struct bch_disk_group { - __u8 label[BCH_SB_LABEL_SIZE]; - __le64 flags[2]; -} __packed __aligned(8); - -LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) -LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) -LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) - -struct bch_sb_field_disk_groups { - struct bch_sb_field field; - struct bch_disk_group entries[]; -} __packed __aligned(8); - /* * On clean shutdown, store btree roots and current journal sequence number in * the superblock: diff --git a/fs/bcachefs/disk_groups_format.h b/fs/bcachefs/disk_groups_format.h new file mode 100644 index 00000000000000..698990bbf1d206 --- /dev/null +++ b/fs/bcachefs/disk_groups_format.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DISK_GROUPS_FORMAT_H +#define _BCACHEFS_DISK_GROUPS_FORMAT_H + +#define BCH_SB_LABEL_SIZE 32 + +struct bch_disk_group { + __u8 label[BCH_SB_LABEL_SIZE]; + __le64 flags[2]; +} __packed __aligned(8); + +LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) +LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) +LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) + +struct bch_sb_field_disk_groups { + struct bch_sb_field field; + struct bch_disk_group entries[]; +} __packed __aligned(8); + +#endif /* _BCACHEFS_DISK_GROUPS_FORMAT_H */ From 24998050b69ac1f4caa41b66dbd245f4de366b3c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 May 2024 17:13:52 -0400 Subject: [PATCH 0245/1578] bcachefs: Split out replicas_format.h Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 37 +++++------------------------------ fs/bcachefs/replicas_format.h | 31 +++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 32 deletions(-) create mode 100644 fs/bcachefs/replicas_format.h diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index b7690fc3ffc62b..66df1e5c2a8764 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -503,19 +503,20 @@ struct bch_sb_field { #include "alloc_background_format.h" #include "extents_format.h" -#include "reflink_format.h" #include "ec_format.h" -#include "inode_format.h" #include "dirent_format.h" #include "disk_groups_format.h" -#include "xattr_format.h" -#include "quota_format.h" +#include "inode_format.h" #include "logged_ops_format.h" +#include "quota_format.h" +#include "reflink_format.h" +#include "replicas_format.h" #include "snapshot_format.h" #include "subvolume_format.h" #include "sb-counters_format.h" #include "sb-downgrade_format.h" #include "sb-members_format.h" +#include "xattr_format.h" enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, @@ -596,8 +597,6 @@ LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16); LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32); LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); -/* BCH_SB_FIELD_replicas: */ - #define BCH_DATA_TYPES() \ x(free, 0) \ x(sb, 1) \ @@ -640,32 +639,6 @@ static inline bool data_type_is_hidden(enum bch_data_type type) } } -struct bch_replicas_entry_v0 { - __u8 data_type; - __u8 nr_devs; - __u8 devs[]; -} __packed; - -struct bch_sb_field_replicas_v0 { - struct bch_sb_field field; - struct bch_replicas_entry_v0 entries[]; -} __packed __aligned(8); - -struct bch_replicas_entry_v1 { - __u8 data_type; - __u8 nr_devs; - __u8 nr_required; - __u8 devs[]; -} __packed; - -#define replicas_entry_bytes(_i) \ - (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) - -struct bch_sb_field_replicas { - struct bch_sb_field field; - struct bch_replicas_entry_v1 entries[]; -} __packed __aligned(8); - /* * On clean shutdown, store btree roots and current journal sequence number in * the superblock: diff --git a/fs/bcachefs/replicas_format.h b/fs/bcachefs/replicas_format.h new file mode 100644 index 00000000000000..b97208195d0643 --- /dev/null +++ b/fs/bcachefs/replicas_format.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REPLICAS_FORMAT_H +#define _BCACHEFS_REPLICAS_FORMAT_H + +struct bch_replicas_entry_v0 { + __u8 data_type; + __u8 nr_devs; + __u8 devs[]; +} __packed; + +struct bch_sb_field_replicas_v0 { + struct bch_sb_field field; + struct bch_replicas_entry_v0 entries[]; +} __packed __aligned(8); + +struct bch_replicas_entry_v1 { + __u8 data_type; + __u8 nr_devs; + __u8 nr_required; + __u8 devs[]; +} __packed; + +struct bch_sb_field_replicas { + struct bch_sb_field field; + struct bch_replicas_entry_v1 entries[]; +} __packed __aligned(8); + +#define replicas_entry_bytes(_i) \ + (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) + +#endif /* _BCACHEFS_REPLICAS_FORMAT_H */ From 5c16c5748894652e8013dbcb27e54e8319c53b00 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 May 2024 17:20:40 -0400 Subject: [PATCH 0246/1578] bcachefs: Split out journal_seq_blacklist_format.h Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 11 +---------- fs/bcachefs/journal_seq_blacklist_format.h | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 10 deletions(-) create mode 100644 fs/bcachefs/journal_seq_blacklist_format.h diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 66df1e5c2a8764..89ac286f17f0bf 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -507,6 +507,7 @@ struct bch_sb_field { #include "dirent_format.h" #include "disk_groups_format.h" #include "inode_format.h" +#include "journal_seq_blacklist_format.h" #include "logged_ops_format.h" #include "quota_format.h" #include "reflink_format.h" @@ -666,16 +667,6 @@ struct bch_sb_field_clean { __u64 _data[]; }; -struct journal_seq_blacklist_entry { - __le64 start; - __le64 end; -}; - -struct bch_sb_field_journal_seq_blacklist { - struct bch_sb_field field; - struct journal_seq_blacklist_entry start[]; -}; - struct bch_sb_field_errors { struct bch_sb_field field; struct bch_sb_field_error_entry { diff --git a/fs/bcachefs/journal_seq_blacklist_format.h b/fs/bcachefs/journal_seq_blacklist_format.h new file mode 100644 index 00000000000000..2566b12dbc045e --- /dev/null +++ b/fs/bcachefs/journal_seq_blacklist_format.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H +#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H + +struct journal_seq_blacklist_entry { + __le64 start; + __le64 end; +}; + +struct bch_sb_field_journal_seq_blacklist { + struct bch_sb_field field; + struct journal_seq_blacklist_entry start[]; +}; + +#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H */ From 759bb4eabc727077145f3173f8ef6c2ac745c3d6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 May 2024 17:23:01 -0400 Subject: [PATCH 0247/1578] bcachefs: Split out sb-errors_format.h Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 12 +- fs/bcachefs/sb-errors_format.h | 296 +++++++++++++++++++++++++++++++++ fs/bcachefs/sb-errors_types.h | 281 ------------------------------- 3 files changed, 297 insertions(+), 292 deletions(-) create mode 100644 fs/bcachefs/sb-errors_format.h diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 89ac286f17f0bf..90c12fe2a2cd3b 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -516,6 +516,7 @@ struct bch_sb_field { #include "subvolume_format.h" #include "sb-counters_format.h" #include "sb-downgrade_format.h" +#include "sb-errors_format.h" #include "sb-members_format.h" #include "xattr_format.h" @@ -667,17 +668,6 @@ struct bch_sb_field_clean { __u64 _data[]; }; -struct bch_sb_field_errors { - struct bch_sb_field field; - struct bch_sb_field_error_entry { - __le64 v; - __le64 last_error_time; - } entries[]; -}; - -LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16); -LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64); - struct bch_sb_field_ext { struct bch_sb_field field; __le64 recovery_passes_required[2]; diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h new file mode 100644 index 00000000000000..84d2763bd597bd --- /dev/null +++ b/fs/bcachefs/sb-errors_format.h @@ -0,0 +1,296 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_ERRORS_FORMAT_H +#define _BCACHEFS_SB_ERRORS_FORMAT_H + +#define BCH_SB_ERRS() \ + x(clean_but_journal_not_empty, 0) \ + x(dirty_but_no_journal_entries, 1) \ + x(dirty_but_no_journal_entries_post_drop_nonflushes, 2) \ + x(sb_clean_journal_seq_mismatch, 3) \ + x(sb_clean_btree_root_mismatch, 4) \ + x(sb_clean_missing, 5) \ + x(jset_unsupported_version, 6) \ + x(jset_unknown_csum, 7) \ + x(jset_last_seq_newer_than_seq, 8) \ + x(jset_past_bucket_end, 9) \ + x(jset_seq_blacklisted, 10) \ + x(journal_entries_missing, 11) \ + x(journal_entry_replicas_not_marked, 12) \ + x(journal_entry_past_jset_end, 13) \ + x(journal_entry_replicas_data_mismatch, 14) \ + x(journal_entry_bkey_u64s_0, 15) \ + x(journal_entry_bkey_past_end, 16) \ + x(journal_entry_bkey_bad_format, 17) \ + x(journal_entry_bkey_invalid, 18) \ + x(journal_entry_btree_root_bad_size, 19) \ + x(journal_entry_blacklist_bad_size, 20) \ + x(journal_entry_blacklist_v2_bad_size, 21) \ + x(journal_entry_blacklist_v2_start_past_end, 22) \ + x(journal_entry_usage_bad_size, 23) \ + x(journal_entry_data_usage_bad_size, 24) \ + x(journal_entry_clock_bad_size, 25) \ + x(journal_entry_clock_bad_rw, 26) \ + x(journal_entry_dev_usage_bad_size, 27) \ + x(journal_entry_dev_usage_bad_dev, 28) \ + x(journal_entry_dev_usage_bad_pad, 29) \ + x(btree_node_unreadable, 30) \ + x(btree_node_fault_injected, 31) \ + x(btree_node_bad_magic, 32) \ + x(btree_node_bad_seq, 33) \ + x(btree_node_unsupported_version, 34) \ + x(btree_node_bset_older_than_sb_min, 35) \ + x(btree_node_bset_newer_than_sb, 36) \ + x(btree_node_data_missing, 37) \ + x(btree_node_bset_after_end, 38) \ + x(btree_node_replicas_sectors_written_mismatch, 39) \ + x(btree_node_replicas_data_mismatch, 40) \ + x(bset_unknown_csum, 41) \ + x(bset_bad_csum, 42) \ + x(bset_past_end_of_btree_node, 43) \ + x(bset_wrong_sector_offset, 44) \ + x(bset_empty, 45) \ + x(bset_bad_seq, 46) \ + x(bset_blacklisted_journal_seq, 47) \ + x(first_bset_blacklisted_journal_seq, 48) \ + x(btree_node_bad_btree, 49) \ + x(btree_node_bad_level, 50) \ + x(btree_node_bad_min_key, 51) \ + x(btree_node_bad_max_key, 52) \ + x(btree_node_bad_format, 53) \ + x(btree_node_bkey_past_bset_end, 54) \ + x(btree_node_bkey_bad_format, 55) \ + x(btree_node_bad_bkey, 56) \ + x(btree_node_bkey_out_of_order, 57) \ + x(btree_root_bkey_invalid, 58) \ + x(btree_root_read_error, 59) \ + x(btree_root_bad_min_key, 60) \ + x(btree_root_bad_max_key, 61) \ + x(btree_node_read_error, 62) \ + x(btree_node_topology_bad_min_key, 63) \ + x(btree_node_topology_bad_max_key, 64) \ + x(btree_node_topology_overwritten_by_prev_node, 65) \ + x(btree_node_topology_overwritten_by_next_node, 66) \ + x(btree_node_topology_interior_node_empty, 67) \ + x(fs_usage_hidden_wrong, 68) \ + x(fs_usage_btree_wrong, 69) \ + x(fs_usage_data_wrong, 70) \ + x(fs_usage_cached_wrong, 71) \ + x(fs_usage_reserved_wrong, 72) \ + x(fs_usage_persistent_reserved_wrong, 73) \ + x(fs_usage_nr_inodes_wrong, 74) \ + x(fs_usage_replicas_wrong, 75) \ + x(dev_usage_buckets_wrong, 76) \ + x(dev_usage_sectors_wrong, 77) \ + x(dev_usage_fragmented_wrong, 78) \ + x(dev_usage_buckets_ec_wrong, 79) \ + x(bkey_version_in_future, 80) \ + x(bkey_u64s_too_small, 81) \ + x(bkey_invalid_type_for_btree, 82) \ + x(bkey_extent_size_zero, 83) \ + x(bkey_extent_size_greater_than_offset, 84) \ + x(bkey_size_nonzero, 85) \ + x(bkey_snapshot_nonzero, 86) \ + x(bkey_snapshot_zero, 87) \ + x(bkey_at_pos_max, 88) \ + x(bkey_before_start_of_btree_node, 89) \ + x(bkey_after_end_of_btree_node, 90) \ + x(bkey_val_size_nonzero, 91) \ + x(bkey_val_size_too_small, 92) \ + x(alloc_v1_val_size_bad, 93) \ + x(alloc_v2_unpack_error, 94) \ + x(alloc_v3_unpack_error, 95) \ + x(alloc_v4_val_size_bad, 96) \ + x(alloc_v4_backpointers_start_bad, 97) \ + x(alloc_key_data_type_bad, 98) \ + x(alloc_key_empty_but_have_data, 99) \ + x(alloc_key_dirty_sectors_0, 100) \ + x(alloc_key_data_type_inconsistency, 101) \ + x(alloc_key_to_missing_dev_bucket, 102) \ + x(alloc_key_cached_inconsistency, 103) \ + x(alloc_key_cached_but_read_time_zero, 104) \ + x(alloc_key_to_missing_lru_entry, 105) \ + x(alloc_key_data_type_wrong, 106) \ + x(alloc_key_gen_wrong, 107) \ + x(alloc_key_dirty_sectors_wrong, 108) \ + x(alloc_key_cached_sectors_wrong, 109) \ + x(alloc_key_stripe_wrong, 110) \ + x(alloc_key_stripe_redundancy_wrong, 111) \ + x(bucket_sector_count_overflow, 112) \ + x(bucket_metadata_type_mismatch, 113) \ + x(need_discard_key_wrong, 114) \ + x(freespace_key_wrong, 115) \ + x(freespace_hole_missing, 116) \ + x(bucket_gens_val_size_bad, 117) \ + x(bucket_gens_key_wrong, 118) \ + x(bucket_gens_hole_wrong, 119) \ + x(bucket_gens_to_invalid_dev, 120) \ + x(bucket_gens_to_invalid_buckets, 121) \ + x(bucket_gens_nonzero_for_invalid_buckets, 122) \ + x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \ + x(need_discard_freespace_key_bad, 124) \ + x(backpointer_bucket_offset_wrong, 125) \ + x(backpointer_to_missing_device, 126) \ + x(backpointer_to_missing_alloc, 127) \ + x(backpointer_to_missing_ptr, 128) \ + x(lru_entry_at_time_0, 129) \ + x(lru_entry_to_invalid_bucket, 130) \ + x(lru_entry_bad, 131) \ + x(btree_ptr_val_too_big, 132) \ + x(btree_ptr_v2_val_too_big, 133) \ + x(btree_ptr_has_non_ptr, 134) \ + x(extent_ptrs_invalid_entry, 135) \ + x(extent_ptrs_no_ptrs, 136) \ + x(extent_ptrs_too_many_ptrs, 137) \ + x(extent_ptrs_redundant_crc, 138) \ + x(extent_ptrs_redundant_stripe, 139) \ + x(extent_ptrs_unwritten, 140) \ + x(extent_ptrs_written_and_unwritten, 141) \ + x(ptr_to_invalid_device, 142) \ + x(ptr_to_duplicate_device, 143) \ + x(ptr_after_last_bucket, 144) \ + x(ptr_before_first_bucket, 145) \ + x(ptr_spans_multiple_buckets, 146) \ + x(ptr_to_missing_backpointer, 147) \ + x(ptr_to_missing_alloc_key, 148) \ + x(ptr_to_missing_replicas_entry, 149) \ + x(ptr_to_missing_stripe, 150) \ + x(ptr_to_incorrect_stripe, 151) \ + x(ptr_gen_newer_than_bucket_gen, 152) \ + x(ptr_too_stale, 153) \ + x(stale_dirty_ptr, 154) \ + x(ptr_bucket_data_type_mismatch, 155) \ + x(ptr_cached_and_erasure_coded, 156) \ + x(ptr_crc_uncompressed_size_too_small, 157) \ + x(ptr_crc_csum_type_unknown, 158) \ + x(ptr_crc_compression_type_unknown, 159) \ + x(ptr_crc_redundant, 160) \ + x(ptr_crc_uncompressed_size_too_big, 161) \ + x(ptr_crc_nonce_mismatch, 162) \ + x(ptr_stripe_redundant, 163) \ + x(reservation_key_nr_replicas_invalid, 164) \ + x(reflink_v_refcount_wrong, 165) \ + x(reflink_p_to_missing_reflink_v, 166) \ + x(stripe_pos_bad, 167) \ + x(stripe_val_size_bad, 168) \ + x(stripe_sector_count_wrong, 169) \ + x(snapshot_tree_pos_bad, 170) \ + x(snapshot_tree_to_missing_snapshot, 171) \ + x(snapshot_tree_to_missing_subvol, 172) \ + x(snapshot_tree_to_wrong_subvol, 173) \ + x(snapshot_tree_to_snapshot_subvol, 174) \ + x(snapshot_pos_bad, 175) \ + x(snapshot_parent_bad, 176) \ + x(snapshot_children_not_normalized, 177) \ + x(snapshot_child_duplicate, 178) \ + x(snapshot_child_bad, 179) \ + x(snapshot_skiplist_not_normalized, 180) \ + x(snapshot_skiplist_bad, 181) \ + x(snapshot_should_not_have_subvol, 182) \ + x(snapshot_to_bad_snapshot_tree, 183) \ + x(snapshot_bad_depth, 184) \ + x(snapshot_bad_skiplist, 185) \ + x(subvol_pos_bad, 186) \ + x(subvol_not_master_and_not_snapshot, 187) \ + x(subvol_to_missing_root, 188) \ + x(subvol_root_wrong_bi_subvol, 189) \ + x(bkey_in_missing_snapshot, 190) \ + x(inode_pos_inode_nonzero, 191) \ + x(inode_pos_blockdev_range, 192) \ + x(inode_unpack_error, 193) \ + x(inode_str_hash_invalid, 194) \ + x(inode_v3_fields_start_bad, 195) \ + x(inode_snapshot_mismatch, 196) \ + x(inode_unlinked_but_clean, 197) \ + x(inode_unlinked_but_nlink_nonzero, 198) \ + x(inode_checksum_type_invalid, 199) \ + x(inode_compression_type_invalid, 200) \ + x(inode_subvol_root_but_not_dir, 201) \ + x(inode_i_size_dirty_but_clean, 202) \ + x(inode_i_sectors_dirty_but_clean, 203) \ + x(inode_i_sectors_wrong, 204) \ + x(inode_dir_wrong_nlink, 205) \ + x(inode_dir_multiple_links, 206) \ + x(inode_multiple_links_but_nlink_0, 207) \ + x(inode_wrong_backpointer, 208) \ + x(inode_wrong_nlink, 209) \ + x(inode_unreachable, 210) \ + x(deleted_inode_but_clean, 211) \ + x(deleted_inode_missing, 212) \ + x(deleted_inode_is_dir, 213) \ + x(deleted_inode_not_unlinked, 214) \ + x(extent_overlapping, 215) \ + x(extent_in_missing_inode, 216) \ + x(extent_in_non_reg_inode, 217) \ + x(extent_past_end_of_inode, 218) \ + x(dirent_empty_name, 219) \ + x(dirent_val_too_big, 220) \ + x(dirent_name_too_long, 221) \ + x(dirent_name_embedded_nul, 222) \ + x(dirent_name_dot_or_dotdot, 223) \ + x(dirent_name_has_slash, 224) \ + x(dirent_d_type_wrong, 225) \ + x(inode_bi_parent_wrong, 226) \ + x(dirent_in_missing_dir_inode, 227) \ + x(dirent_in_non_dir_inode, 228) \ + x(dirent_to_missing_inode, 229) \ + x(dirent_to_missing_subvol, 230) \ + x(dirent_to_itself, 231) \ + x(quota_type_invalid, 232) \ + x(xattr_val_size_too_small, 233) \ + x(xattr_val_size_too_big, 234) \ + x(xattr_invalid_type, 235) \ + x(xattr_name_invalid_chars, 236) \ + x(xattr_in_missing_inode, 237) \ + x(root_subvol_missing, 238) \ + x(root_dir_missing, 239) \ + x(root_inode_not_dir, 240) \ + x(dir_loop, 241) \ + x(hash_table_key_duplicate, 242) \ + x(hash_table_key_wrong_offset, 243) \ + x(unlinked_inode_not_on_deleted_list, 244) \ + x(reflink_p_front_pad_bad, 245) \ + x(journal_entry_dup_same_device, 246) \ + x(inode_bi_subvol_missing, 247) \ + x(inode_bi_subvol_wrong, 248) \ + x(inode_points_to_missing_dirent, 249) \ + x(inode_points_to_wrong_dirent, 250) \ + x(inode_bi_parent_nonzero, 251) \ + x(dirent_to_missing_parent_subvol, 252) \ + x(dirent_not_visible_in_parent_subvol, 253) \ + x(subvol_fs_path_parent_wrong, 254) \ + x(subvol_root_fs_path_parent_nonzero, 255) \ + x(subvol_children_not_set, 256) \ + x(subvol_children_bad, 257) \ + x(subvol_loop, 258) \ + x(subvol_unreachable, 259) \ + x(btree_node_bkey_bad_u64s, 260) \ + x(btree_node_topology_empty_interior_node, 261) \ + x(btree_ptr_v2_min_key_bad, 262) \ + x(btree_root_unreadable_and_scan_found_nothing, 263) \ + x(snapshot_node_missing, 264) \ + x(dup_backpointer_to_bad_csum_extent, 265) \ + x(btree_bitmap_not_marked, 266) \ + x(sb_clean_entry_overrun, 267) \ + x(btree_ptr_v2_written_0, 268) \ + x(subvol_snapshot_bad, 269) \ + x(subvol_inode_bad, 270) + +enum bch_sb_error_id { +#define x(t, n) BCH_FSCK_ERR_##t = n, + BCH_SB_ERRS() +#undef x + BCH_SB_ERR_MAX +}; + +struct bch_sb_field_errors { + struct bch_sb_field field; + struct bch_sb_field_error_entry { + __le64 v; + __le64 last_error_time; + } entries[]; +}; + +LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16); +LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64); + +#endif /* _BCACHEFS_SB_ERRORS_FORMAT_H */ diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 666599d3fb9d7d..40325239c3b0f2 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -4,286 +4,6 @@ #include "darray.h" -#define BCH_SB_ERRS() \ - x(clean_but_journal_not_empty, 0) \ - x(dirty_but_no_journal_entries, 1) \ - x(dirty_but_no_journal_entries_post_drop_nonflushes, 2) \ - x(sb_clean_journal_seq_mismatch, 3) \ - x(sb_clean_btree_root_mismatch, 4) \ - x(sb_clean_missing, 5) \ - x(jset_unsupported_version, 6) \ - x(jset_unknown_csum, 7) \ - x(jset_last_seq_newer_than_seq, 8) \ - x(jset_past_bucket_end, 9) \ - x(jset_seq_blacklisted, 10) \ - x(journal_entries_missing, 11) \ - x(journal_entry_replicas_not_marked, 12) \ - x(journal_entry_past_jset_end, 13) \ - x(journal_entry_replicas_data_mismatch, 14) \ - x(journal_entry_bkey_u64s_0, 15) \ - x(journal_entry_bkey_past_end, 16) \ - x(journal_entry_bkey_bad_format, 17) \ - x(journal_entry_bkey_invalid, 18) \ - x(journal_entry_btree_root_bad_size, 19) \ - x(journal_entry_blacklist_bad_size, 20) \ - x(journal_entry_blacklist_v2_bad_size, 21) \ - x(journal_entry_blacklist_v2_start_past_end, 22) \ - x(journal_entry_usage_bad_size, 23) \ - x(journal_entry_data_usage_bad_size, 24) \ - x(journal_entry_clock_bad_size, 25) \ - x(journal_entry_clock_bad_rw, 26) \ - x(journal_entry_dev_usage_bad_size, 27) \ - x(journal_entry_dev_usage_bad_dev, 28) \ - x(journal_entry_dev_usage_bad_pad, 29) \ - x(btree_node_unreadable, 30) \ - x(btree_node_fault_injected, 31) \ - x(btree_node_bad_magic, 32) \ - x(btree_node_bad_seq, 33) \ - x(btree_node_unsupported_version, 34) \ - x(btree_node_bset_older_than_sb_min, 35) \ - x(btree_node_bset_newer_than_sb, 36) \ - x(btree_node_data_missing, 37) \ - x(btree_node_bset_after_end, 38) \ - x(btree_node_replicas_sectors_written_mismatch, 39) \ - x(btree_node_replicas_data_mismatch, 40) \ - x(bset_unknown_csum, 41) \ - x(bset_bad_csum, 42) \ - x(bset_past_end_of_btree_node, 43) \ - x(bset_wrong_sector_offset, 44) \ - x(bset_empty, 45) \ - x(bset_bad_seq, 46) \ - x(bset_blacklisted_journal_seq, 47) \ - x(first_bset_blacklisted_journal_seq, 48) \ - x(btree_node_bad_btree, 49) \ - x(btree_node_bad_level, 50) \ - x(btree_node_bad_min_key, 51) \ - x(btree_node_bad_max_key, 52) \ - x(btree_node_bad_format, 53) \ - x(btree_node_bkey_past_bset_end, 54) \ - x(btree_node_bkey_bad_format, 55) \ - x(btree_node_bad_bkey, 56) \ - x(btree_node_bkey_out_of_order, 57) \ - x(btree_root_bkey_invalid, 58) \ - x(btree_root_read_error, 59) \ - x(btree_root_bad_min_key, 60) \ - x(btree_root_bad_max_key, 61) \ - x(btree_node_read_error, 62) \ - x(btree_node_topology_bad_min_key, 63) \ - x(btree_node_topology_bad_max_key, 64) \ - x(btree_node_topology_overwritten_by_prev_node, 65) \ - x(btree_node_topology_overwritten_by_next_node, 66) \ - x(btree_node_topology_interior_node_empty, 67) \ - x(fs_usage_hidden_wrong, 68) \ - x(fs_usage_btree_wrong, 69) \ - x(fs_usage_data_wrong, 70) \ - x(fs_usage_cached_wrong, 71) \ - x(fs_usage_reserved_wrong, 72) \ - x(fs_usage_persistent_reserved_wrong, 73) \ - x(fs_usage_nr_inodes_wrong, 74) \ - x(fs_usage_replicas_wrong, 75) \ - x(dev_usage_buckets_wrong, 76) \ - x(dev_usage_sectors_wrong, 77) \ - x(dev_usage_fragmented_wrong, 78) \ - x(dev_usage_buckets_ec_wrong, 79) \ - x(bkey_version_in_future, 80) \ - x(bkey_u64s_too_small, 81) \ - x(bkey_invalid_type_for_btree, 82) \ - x(bkey_extent_size_zero, 83) \ - x(bkey_extent_size_greater_than_offset, 84) \ - x(bkey_size_nonzero, 85) \ - x(bkey_snapshot_nonzero, 86) \ - x(bkey_snapshot_zero, 87) \ - x(bkey_at_pos_max, 88) \ - x(bkey_before_start_of_btree_node, 89) \ - x(bkey_after_end_of_btree_node, 90) \ - x(bkey_val_size_nonzero, 91) \ - x(bkey_val_size_too_small, 92) \ - x(alloc_v1_val_size_bad, 93) \ - x(alloc_v2_unpack_error, 94) \ - x(alloc_v3_unpack_error, 95) \ - x(alloc_v4_val_size_bad, 96) \ - x(alloc_v4_backpointers_start_bad, 97) \ - x(alloc_key_data_type_bad, 98) \ - x(alloc_key_empty_but_have_data, 99) \ - x(alloc_key_dirty_sectors_0, 100) \ - x(alloc_key_data_type_inconsistency, 101) \ - x(alloc_key_to_missing_dev_bucket, 102) \ - x(alloc_key_cached_inconsistency, 103) \ - x(alloc_key_cached_but_read_time_zero, 104) \ - x(alloc_key_to_missing_lru_entry, 105) \ - x(alloc_key_data_type_wrong, 106) \ - x(alloc_key_gen_wrong, 107) \ - x(alloc_key_dirty_sectors_wrong, 108) \ - x(alloc_key_cached_sectors_wrong, 109) \ - x(alloc_key_stripe_wrong, 110) \ - x(alloc_key_stripe_redundancy_wrong, 111) \ - x(bucket_sector_count_overflow, 112) \ - x(bucket_metadata_type_mismatch, 113) \ - x(need_discard_key_wrong, 114) \ - x(freespace_key_wrong, 115) \ - x(freespace_hole_missing, 116) \ - x(bucket_gens_val_size_bad, 117) \ - x(bucket_gens_key_wrong, 118) \ - x(bucket_gens_hole_wrong, 119) \ - x(bucket_gens_to_invalid_dev, 120) \ - x(bucket_gens_to_invalid_buckets, 121) \ - x(bucket_gens_nonzero_for_invalid_buckets, 122) \ - x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \ - x(need_discard_freespace_key_bad, 124) \ - x(backpointer_bucket_offset_wrong, 125) \ - x(backpointer_to_missing_device, 126) \ - x(backpointer_to_missing_alloc, 127) \ - x(backpointer_to_missing_ptr, 128) \ - x(lru_entry_at_time_0, 129) \ - x(lru_entry_to_invalid_bucket, 130) \ - x(lru_entry_bad, 131) \ - x(btree_ptr_val_too_big, 132) \ - x(btree_ptr_v2_val_too_big, 133) \ - x(btree_ptr_has_non_ptr, 134) \ - x(extent_ptrs_invalid_entry, 135) \ - x(extent_ptrs_no_ptrs, 136) \ - x(extent_ptrs_too_many_ptrs, 137) \ - x(extent_ptrs_redundant_crc, 138) \ - x(extent_ptrs_redundant_stripe, 139) \ - x(extent_ptrs_unwritten, 140) \ - x(extent_ptrs_written_and_unwritten, 141) \ - x(ptr_to_invalid_device, 142) \ - x(ptr_to_duplicate_device, 143) \ - x(ptr_after_last_bucket, 144) \ - x(ptr_before_first_bucket, 145) \ - x(ptr_spans_multiple_buckets, 146) \ - x(ptr_to_missing_backpointer, 147) \ - x(ptr_to_missing_alloc_key, 148) \ - x(ptr_to_missing_replicas_entry, 149) \ - x(ptr_to_missing_stripe, 150) \ - x(ptr_to_incorrect_stripe, 151) \ - x(ptr_gen_newer_than_bucket_gen, 152) \ - x(ptr_too_stale, 153) \ - x(stale_dirty_ptr, 154) \ - x(ptr_bucket_data_type_mismatch, 155) \ - x(ptr_cached_and_erasure_coded, 156) \ - x(ptr_crc_uncompressed_size_too_small, 157) \ - x(ptr_crc_csum_type_unknown, 158) \ - x(ptr_crc_compression_type_unknown, 159) \ - x(ptr_crc_redundant, 160) \ - x(ptr_crc_uncompressed_size_too_big, 161) \ - x(ptr_crc_nonce_mismatch, 162) \ - x(ptr_stripe_redundant, 163) \ - x(reservation_key_nr_replicas_invalid, 164) \ - x(reflink_v_refcount_wrong, 165) \ - x(reflink_p_to_missing_reflink_v, 166) \ - x(stripe_pos_bad, 167) \ - x(stripe_val_size_bad, 168) \ - x(stripe_sector_count_wrong, 169) \ - x(snapshot_tree_pos_bad, 170) \ - x(snapshot_tree_to_missing_snapshot, 171) \ - x(snapshot_tree_to_missing_subvol, 172) \ - x(snapshot_tree_to_wrong_subvol, 173) \ - x(snapshot_tree_to_snapshot_subvol, 174) \ - x(snapshot_pos_bad, 175) \ - x(snapshot_parent_bad, 176) \ - x(snapshot_children_not_normalized, 177) \ - x(snapshot_child_duplicate, 178) \ - x(snapshot_child_bad, 179) \ - x(snapshot_skiplist_not_normalized, 180) \ - x(snapshot_skiplist_bad, 181) \ - x(snapshot_should_not_have_subvol, 182) \ - x(snapshot_to_bad_snapshot_tree, 183) \ - x(snapshot_bad_depth, 184) \ - x(snapshot_bad_skiplist, 185) \ - x(subvol_pos_bad, 186) \ - x(subvol_not_master_and_not_snapshot, 187) \ - x(subvol_to_missing_root, 188) \ - x(subvol_root_wrong_bi_subvol, 189) \ - x(bkey_in_missing_snapshot, 190) \ - x(inode_pos_inode_nonzero, 191) \ - x(inode_pos_blockdev_range, 192) \ - x(inode_unpack_error, 193) \ - x(inode_str_hash_invalid, 194) \ - x(inode_v3_fields_start_bad, 195) \ - x(inode_snapshot_mismatch, 196) \ - x(inode_unlinked_but_clean, 197) \ - x(inode_unlinked_but_nlink_nonzero, 198) \ - x(inode_checksum_type_invalid, 199) \ - x(inode_compression_type_invalid, 200) \ - x(inode_subvol_root_but_not_dir, 201) \ - x(inode_i_size_dirty_but_clean, 202) \ - x(inode_i_sectors_dirty_but_clean, 203) \ - x(inode_i_sectors_wrong, 204) \ - x(inode_dir_wrong_nlink, 205) \ - x(inode_dir_multiple_links, 206) \ - x(inode_multiple_links_but_nlink_0, 207) \ - x(inode_wrong_backpointer, 208) \ - x(inode_wrong_nlink, 209) \ - x(inode_unreachable, 210) \ - x(deleted_inode_but_clean, 211) \ - x(deleted_inode_missing, 212) \ - x(deleted_inode_is_dir, 213) \ - x(deleted_inode_not_unlinked, 214) \ - x(extent_overlapping, 215) \ - x(extent_in_missing_inode, 216) \ - x(extent_in_non_reg_inode, 217) \ - x(extent_past_end_of_inode, 218) \ - x(dirent_empty_name, 219) \ - x(dirent_val_too_big, 220) \ - x(dirent_name_too_long, 221) \ - x(dirent_name_embedded_nul, 222) \ - x(dirent_name_dot_or_dotdot, 223) \ - x(dirent_name_has_slash, 224) \ - x(dirent_d_type_wrong, 225) \ - x(inode_bi_parent_wrong, 226) \ - x(dirent_in_missing_dir_inode, 227) \ - x(dirent_in_non_dir_inode, 228) \ - x(dirent_to_missing_inode, 229) \ - x(dirent_to_missing_subvol, 230) \ - x(dirent_to_itself, 231) \ - x(quota_type_invalid, 232) \ - x(xattr_val_size_too_small, 233) \ - x(xattr_val_size_too_big, 234) \ - x(xattr_invalid_type, 235) \ - x(xattr_name_invalid_chars, 236) \ - x(xattr_in_missing_inode, 237) \ - x(root_subvol_missing, 238) \ - x(root_dir_missing, 239) \ - x(root_inode_not_dir, 240) \ - x(dir_loop, 241) \ - x(hash_table_key_duplicate, 242) \ - x(hash_table_key_wrong_offset, 243) \ - x(unlinked_inode_not_on_deleted_list, 244) \ - x(reflink_p_front_pad_bad, 245) \ - x(journal_entry_dup_same_device, 246) \ - x(inode_bi_subvol_missing, 247) \ - x(inode_bi_subvol_wrong, 248) \ - x(inode_points_to_missing_dirent, 249) \ - x(inode_points_to_wrong_dirent, 250) \ - x(inode_bi_parent_nonzero, 251) \ - x(dirent_to_missing_parent_subvol, 252) \ - x(dirent_not_visible_in_parent_subvol, 253) \ - x(subvol_fs_path_parent_wrong, 254) \ - x(subvol_root_fs_path_parent_nonzero, 255) \ - x(subvol_children_not_set, 256) \ - x(subvol_children_bad, 257) \ - x(subvol_loop, 258) \ - x(subvol_unreachable, 259) \ - x(btree_node_bkey_bad_u64s, 260) \ - x(btree_node_topology_empty_interior_node, 261) \ - x(btree_ptr_v2_min_key_bad, 262) \ - x(btree_root_unreadable_and_scan_found_nothing, 263) \ - x(snapshot_node_missing, 264) \ - x(dup_backpointer_to_bad_csum_extent, 265) \ - x(btree_bitmap_not_marked, 266) \ - x(sb_clean_entry_overrun, 267) \ - x(btree_ptr_v2_written_0, 268) \ - x(subvol_snapshot_bad, 269) \ - x(subvol_inode_bad, 270) - -enum bch_sb_error_id { -#define x(t, n) BCH_FSCK_ERR_##t = n, - BCH_SB_ERRS() -#undef x - BCH_SB_ERR_MAX -}; - struct bch_sb_error_entry_cpu { u64 id:16, nr:48; @@ -293,4 +13,3 @@ struct bch_sb_error_entry_cpu { typedef DARRAY(struct bch_sb_error_entry_cpu) bch_sb_errors_cpu; #endif /* _BCACHEFS_SB_ERRORS_TYPES_H */ - From 8528bde1b66bab9a0abc2f521523abd00049c81b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 May 2024 18:20:52 -0400 Subject: [PATCH 0248/1578] bcachefs: Fix uninitialized var warning Can't actually be used uninitialized, but gcc was being silly. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 2 +- fs/bcachefs/buckets.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 692b1c7d5018c8..4321f9fb73bd9f 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -690,7 +690,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans, ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bpos bucket_pos; + struct bpos bucket_pos = POS_MIN; struct bch_backpointer bp; if (p.ptr.cached) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index b469586517a86d..ed97712d0db1ed 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1134,7 +1134,7 @@ static int __trigger_extent(struct btree_trans *trans, r.e.nr_required = 1; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - s64 disk_sectors; + s64 disk_sectors = 0; ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags); if (ret < 0) return ret; From 33c563ebf8d3deed7d8addd20d77398ac737ef9a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 8 May 2024 22:50:34 +0200 Subject: [PATCH 0249/1578] netfilter: nft_payload: skbuff vlan metadata mangle support Userspace assumes vlan header is present at a given offset, but vlan offload allows to store this in metadata fields of the skbuff. Hence mangling vlan results in a garbled packet. Handle this transparently by adding a parser to the kernel. If vlan metadata is present and payload offset is over 12 bytes (source and destination mac address fields), then subtract vlan header present in vlan metadata, otherwise mangle vlan metadata based on offset and length, extracting data from the source register. This is similar to: 8cfd23e67401 ("netfilter: nft_payload: work around vlan header stripping") to deal with vlan payload mangling. Fixes: 7ec3f7b47b8d ("netfilter: nft_payload: add packet mangling support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_payload.c | 72 +++++++++++++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 7 deletions(-) diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index a3cb5dbcb362cc..0c43d748e23ae1 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -145,12 +145,12 @@ int nft_payload_inner_offset(const struct nft_pktinfo *pkt) return pkt->inneroff; } -static bool nft_payload_need_vlan_copy(const struct nft_payload *priv) +static bool nft_payload_need_vlan_adjust(u32 offset, u32 len) { - unsigned int len = priv->offset + priv->len; + unsigned int boundary = offset + len; /* data past ether src/dst requested, copy needed */ - if (len > offsetof(struct ethhdr, h_proto)) + if (boundary > offsetof(struct ethhdr, h_proto)) return true; return false; @@ -174,7 +174,7 @@ void nft_payload_eval(const struct nft_expr *expr, goto err; if (skb_vlan_tag_present(skb) && - nft_payload_need_vlan_copy(priv)) { + nft_payload_need_vlan_adjust(priv->offset, priv->len)) { if (!nft_payload_copy_vlan(dest, skb, priv->offset, priv->len)) goto err; @@ -801,21 +801,79 @@ struct nft_payload_set { u8 csum_flags; }; +/* This is not struct vlan_hdr. */ +struct nft_payload_vlan_hdr { + __be16 h_vlan_proto; + __be16 h_vlan_TCI; +}; + +static bool +nft_payload_set_vlan(const u32 *src, struct sk_buff *skb, u8 offset, u8 len, + int *vlan_hlen) +{ + struct nft_payload_vlan_hdr *vlanh; + __be16 vlan_proto; + u16 vlan_tci; + + if (offset >= offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto)) { + *vlan_hlen = VLAN_HLEN; + return true; + } + + switch (offset) { + case offsetof(struct vlan_ethhdr, h_vlan_proto): + if (len == 2) { + vlan_proto = nft_reg_load_be16(src); + skb->vlan_proto = vlan_proto; + } else if (len == 4) { + vlanh = (struct nft_payload_vlan_hdr *)src; + __vlan_hwaccel_put_tag(skb, vlanh->h_vlan_proto, + ntohs(vlanh->h_vlan_TCI)); + } else { + return false; + } + break; + case offsetof(struct vlan_ethhdr, h_vlan_TCI): + if (len != 2) + return false; + + vlan_tci = ntohs(nft_reg_load_be16(src)); + skb->vlan_tci = vlan_tci; + break; + default: + return false; + } + + return true; +} + static void nft_payload_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_payload_set *priv = nft_expr_priv(expr); - struct sk_buff *skb = pkt->skb; const u32 *src = ®s->data[priv->sreg]; - int offset, csum_offset; + int offset, csum_offset, vlan_hlen = 0; + struct sk_buff *skb = pkt->skb; __wsum fsum, tsum; switch (priv->base) { case NFT_PAYLOAD_LL_HEADER: if (!skb_mac_header_was_set(skb)) goto err; - offset = skb_mac_header(skb) - skb->data; + + if (skb_vlan_tag_present(skb) && + nft_payload_need_vlan_adjust(priv->offset, priv->len)) { + if (!nft_payload_set_vlan(src, skb, + priv->offset, priv->len, + &vlan_hlen)) + goto err; + + if (!vlan_hlen) + return; + } + + offset = skb_mac_header(skb) - skb->data - vlan_hlen; break; case NFT_PAYLOAD_NETWORK_HEADER: offset = skb_network_offset(skb); From 21a673bddc8fd4873c370caf9ae70ffc6d47e8d3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 13 May 2024 12:27:15 +0200 Subject: [PATCH 0250/1578] netfilter: tproxy: bail out if IP has been disabled on the device syzbot reports: general protection fault, probably for non-canonical address 0xdffffc0000000003: 0000 [#1] PREEMPT SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000018-0x000000000000001f] [..] RIP: 0010:nf_tproxy_laddr4+0xb7/0x340 net/ipv4/netfilter/nf_tproxy_ipv4.c:62 Call Trace: nft_tproxy_eval_v4 net/netfilter/nft_tproxy.c:56 [inline] nft_tproxy_eval+0xa9a/0x1a00 net/netfilter/nft_tproxy.c:168 __in_dev_get_rcu() can return NULL, so check for this. Reported-and-tested-by: syzbot+b94a6818504ea90d7661@syzkaller.appspotmail.com Fixes: cc6eb4338569 ("tproxy: use the interface primary IP address as a default value for --on-ip") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_tproxy_ipv4.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c index 69e33179960430..73e66a088e25eb 100644 --- a/net/ipv4/netfilter/nf_tproxy_ipv4.c +++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c @@ -58,6 +58,8 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) laddr = 0; indev = __in_dev_get_rcu(skb->dev); + if (!indev) + return daddr; in_dev_for_each_ifa_rcu(ifa, indev) { if (ifa->ifa_flags & IFA_F_SECONDARY) From e8ded22ef0f4831279c363c264cd41cd9d59ca9e Mon Sep 17 00:00:00 2001 From: Eric Garver Date: Tue, 21 May 2024 10:25:05 -0400 Subject: [PATCH 0251/1578] netfilter: nft_fib: allow from forward/input without iif selector This removes the restriction of needing iif selector in the forward/input hooks for fib lookups when requested result is oif/oifname. Removing this restriction allows "loose" lookups from the forward hooks. Fixes: be8be04e5ddb ("netfilter: nft_fib: reverse path filter for policy-based routing on iif") Signed-off-by: Eric Garver Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_fib.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c index 37cfe6dd712d8b..b58f62195ff3ee 100644 --- a/net/netfilter/nft_fib.c +++ b/net/netfilter/nft_fib.c @@ -35,11 +35,9 @@ int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, switch (priv->result) { case NFT_FIB_RESULT_OIF: case NFT_FIB_RESULT_OIFNAME: - hooks = (1 << NF_INET_PRE_ROUTING); - if (priv->flags & NFTA_FIB_F_IIF) { - hooks |= (1 << NF_INET_LOCAL_IN) | - (1 << NF_INET_FORWARD); - } + hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD); break; case NFT_FIB_RESULT_ADDRTYPE: if (priv->flags & NFTA_FIB_F_IIF) From a69ce592cbe0417664bc5a075205aa75c2ec1273 Mon Sep 17 00:00:00 2001 From: Jonathan Denose Date: Fri, 3 May 2024 16:12:07 +0000 Subject: [PATCH 0252/1578] Input: elantech - fix touchpad state on resume for Lenovo N24 The Lenovo N24 on resume becomes stuck in a state where it sends incorrect packets, causing elantech_packet_check_v4 to fail. The only way for the device to resume sending the correct packets is for it to be disabled and then re-enabled. This change adds a dmi check to trigger this behavior on resume. Signed-off-by: Jonathan Denose Link: https://lore.kernel.org/r/20240503155020.v2.1.Ifa0e25ebf968d8f307f58d678036944141ab17e6@changeid Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/elantech.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/drivers/input/mouse/elantech.c b/drivers/input/mouse/elantech.c index 4e38229404b4b0..b4723ea395eb9f 100644 --- a/drivers/input/mouse/elantech.c +++ b/drivers/input/mouse/elantech.c @@ -1476,16 +1476,47 @@ static void elantech_disconnect(struct psmouse *psmouse) psmouse->private = NULL; } +/* + * Some hw_version 4 models fail to properly activate absolute mode on + * resume without going through disable/enable cycle. + */ +static const struct dmi_system_id elantech_needs_reenable[] = { +#if defined(CONFIG_DMI) && defined(CONFIG_X86) + { + /* Lenovo N24 */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "81AF"), + }, + }, +#endif + { } +}; + /* * Put the touchpad back into absolute mode when reconnecting */ static int elantech_reconnect(struct psmouse *psmouse) { + int err; + psmouse_reset(psmouse); if (elantech_detect(psmouse, 0)) return -1; + if (dmi_check_system(elantech_needs_reenable)) { + err = ps2_command(&psmouse->ps2dev, NULL, PSMOUSE_CMD_DISABLE); + if (err) + psmouse_warn(psmouse, "failed to deactivate mouse on %s: %d\n", + psmouse->ps2dev.serio->phys, err); + + err = ps2_command(&psmouse->ps2dev, NULL, PSMOUSE_CMD_ENABLE); + if (err) + psmouse_warn(psmouse, "failed to reactivate mouse on %s: %d\n", + psmouse->ps2dev.serio->phys, err); + } + if (elantech_set_absolute_mode(psmouse)) { psmouse_err(psmouse, "failed to put touchpad back into absolute mode.\n"); From d55510527153d17a3af8cc2df69c04f95ae1350d Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 28 May 2024 15:55:51 -0700 Subject: [PATCH 0253/1578] cxl/test: Add missing vmalloc.h for tools/testing/cxl/test/mem.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tools/testing/cxl/test/mem.c uses vmalloc() and vfree() but does not include linux/vmalloc.h. Kernel v6.10 made changes that causes the currently included headers not depend on vmalloc.h and therefore mem.c can no longer compile. Add linux/vmalloc.h to fix compile issue. CC [M] tools/testing/cxl/test/mem.o tools/testing/cxl/test/mem.c: In function ‘label_area_release’: tools/testing/cxl/test/mem.c:1428:9: error: implicit declaration of function ‘vfree’; did you mean ‘kvfree’? [-Werror=implicit-function-declaration] 1428 | vfree(lsa); | ^~~~~ | kvfree tools/testing/cxl/test/mem.c: In function ‘cxl_mock_mem_probe’: tools/testing/cxl/test/mem.c:1466:22: error: implicit declaration of function ‘vmalloc’; did you mean ‘kmalloc’? [-Werror=implicit-function-declaration] 1466 | mdata->lsa = vmalloc(LSA_SIZE); | ^~~~~~~ | kmalloc Fixes: 7d3eb23c4ccf ("tools/testing/cxl: Introduce a mock memory device + driver") Reviewed-by: Dan Williams Reviewed-by: Alison Schofield Link: https://lore.kernel.org/r/20240528225551.1025977-1-dave.jiang@intel.com Signed-off-by: Dave Jiang --- tools/testing/cxl/test/mem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index 6584443144de89..eaf091a3d33137 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -3,6 +3,7 @@ #include #include +#include #include #include #include From 49ba7b515c4c0719b866d16f068e62d16a8a3dd1 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 7 May 2024 13:34:21 +0800 Subject: [PATCH 0254/1578] cxl/region: Fix memregion leaks in devm_cxl_add_region() Move the mode verification to __create_region() before allocating the memregion to avoid the memregion leaks. Fixes: 6e099264185d ("cxl/region: Add volatile region creation support") Signed-off-by: Li Zhijian Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20240507053421.456439-1-lizhijian@fujitsu.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 00a9f0eef8dd2c..3c2b6144be23cd 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -2352,15 +2352,6 @@ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd, struct device *dev; int rc; - switch (mode) { - case CXL_DECODER_RAM: - case CXL_DECODER_PMEM: - break; - default: - dev_err(&cxlrd->cxlsd.cxld.dev, "unsupported mode %d\n", mode); - return ERR_PTR(-EINVAL); - } - cxlr = cxl_region_alloc(cxlrd, id); if (IS_ERR(cxlr)) return cxlr; @@ -2415,6 +2406,15 @@ static struct cxl_region *__create_region(struct cxl_root_decoder *cxlrd, { int rc; + switch (mode) { + case CXL_DECODER_RAM: + case CXL_DECODER_PMEM: + break; + default: + dev_err(&cxlrd->cxlsd.cxld.dev, "unsupported mode %d\n", mode); + return ERR_PTR(-EINVAL); + } + rc = memregion_alloc(GFP_KERNEL); if (rc < 0) return ERR_PTR(rc); From 83208cbf2f08c270033003e10f3e7351de64a5c5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 May 2024 19:21:59 -0400 Subject: [PATCH 0255/1578] bcachefs: Don't return -EROFS from mount on inconsistency error We were accidentally returning -EROFS during recovery on filesystem inconsistency - since this is what the journal returns on emergency shutdown. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 96040a95cf4667..cd388f1702dc85 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1939,8 +1939,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, if (IS_ERR(sb)) { ret = PTR_ERR(sb); - ret = bch2_err_class(ret); - return ERR_PTR(ret); + goto err; } c = sb->s_fs_info; @@ -2016,6 +2015,15 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, err_put_super: __bch2_fs_stop(c); deactivate_locked_super(sb); +err: + /* + * On an inconsistency error in recovery we might see an -EROFS derived + * errorcode (from the journal), but we don't want to return that to + * userspace as that causes util-linux to retry the mount RO - which is + * confusing: + */ + if (bch2_err_matches(ret, EROFS) && ret != -EROFS) + ret = -EIO; return ERR_PTR(bch2_err_class(ret)); } From e634134180885574d1fe7aa162777ba41e7fcd5b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 27 May 2024 18:39:54 +0300 Subject: [PATCH 0256/1578] net/sched: taprio: make q->picos_per_byte available to fill_sched_entry() In commit b5b73b26b3ca ("taprio: Fix allowing too small intervals"), a comparison of user input against length_to_duration(q, ETH_ZLEN) was introduced, to avoid RCU stalls due to frequent hrtimers. The implementation of length_to_duration() depends on q->picos_per_byte being set for the link speed. The blamed commit in the Fixes: tag has moved this too late, so the checks introduced above are ineffective. The q->picos_per_byte is zero at parse_taprio_schedule() -> parse_sched_list() -> parse_sched_entry() -> fill_sched_entry() time. Move the taprio_set_picos_per_byte() call as one of the first things in taprio_change(), before the bulk of the netlink attribute parsing is done. That's because it is needed there. Add a selftest to make sure the issue doesn't get reintroduced. Fixes: 09dbdf28f9f9 ("net/sched: taprio: fix calculation of maximum gate durations") Signed-off-by: Vladimir Oltean Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240527153955.553333-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 4 +++- .../tc-testing/tc-tests/qdiscs/taprio.json | 22 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 1ab17e8a726053..11891505536081 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1848,6 +1848,9 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, } q->flags = taprio_flags; + /* Needed for length_to_duration() during netlink attribute parsing */ + taprio_set_picos_per_byte(dev, q); + err = taprio_parse_mqprio_opt(dev, mqprio, extack, q->flags); if (err < 0) return err; @@ -1907,7 +1910,6 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, if (err < 0) goto free_sched; - taprio_set_picos_per_byte(dev, q); taprio_update_queue_max_sdu(q, new_admin, stab); if (FULL_OFFLOAD_IS_ENABLED(q->flags)) diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json index 12da0a939e3e5f..8f12f00a4f5720 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json @@ -132,6 +132,28 @@ "echo \"1\" > /sys/bus/netdevsim/del_device" ] }, + { + "id": "6f62", + "name": "Add taprio Qdisc with too short interval", + "category": [ + "qdisc", + "taprio" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 8\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH root handle 1: taprio num_tc 2 queues 1@0 1@1 sched-entry S 01 300 sched-entry S 02 1700 clockid CLOCK_TAI", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc taprio 1: root refcnt", + "matchCount": "0", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, { "id": "3e1e", "name": "Add taprio Qdisc with an invalid cycle-time", From fb66df20a7201e60f2b13d7f95d031b31a8831d3 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 27 May 2024 18:39:55 +0300 Subject: [PATCH 0257/1578] net/sched: taprio: extend minimum interval restriction to entire cycle too It is possible for syzbot to side-step the restriction imposed by the blamed commit in the Fixes: tag, because the taprio UAPI permits a cycle-time different from (and potentially shorter than) the sum of entry intervals. We need one more restriction, which is that the cycle time itself must be larger than N * ETH_ZLEN bit times, where N is the number of schedule entries. This restriction needs to apply regardless of whether the cycle time came from the user or was the implicit, auto-calculated value, so we move the existing "cycle == 0" check outside the "if "(!new->cycle_time)" branch. This way covers both conditions and scenarios. Add a selftest which illustrates the issue triggered by syzbot. Fixes: b5b73b26b3ca ("taprio: Fix allowing too small intervals") Reported-by: syzbot+a7d2b1d5d1af83035567@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/0000000000007d66bc06196e7c66@google.com/ Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20240527153955.553333-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 10 ++++----- .../tc-testing/tc-tests/qdiscs/taprio.json | 22 +++++++++++++++++++ 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 11891505536081..937a0c513c174b 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1151,11 +1151,6 @@ static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb, list_for_each_entry(entry, &new->entries, list) cycle = ktime_add_ns(cycle, entry->interval); - if (!cycle) { - NL_SET_ERR_MSG(extack, "'cycle_time' can never be 0"); - return -EINVAL; - } - if (cycle < 0 || cycle > INT_MAX) { NL_SET_ERR_MSG(extack, "'cycle_time' is too big"); return -EINVAL; @@ -1164,6 +1159,11 @@ static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb, new->cycle_time = cycle; } + if (new->cycle_time < new->num_entries * length_to_duration(q, ETH_ZLEN)) { + NL_SET_ERR_MSG(extack, "'cycle_time' is too small"); + return -EINVAL; + } + taprio_calculate_gate_durations(q, new); return 0; diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json index 8f12f00a4f5720..557fb074acf0ca 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json @@ -154,6 +154,28 @@ "echo \"1\" > /sys/bus/netdevsim/del_device" ] }, + { + "id": "831f", + "name": "Add taprio Qdisc with too short cycle-time", + "category": [ + "qdisc", + "taprio" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 8\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH root handle 1: taprio num_tc 2 queues 1@0 1@1 sched-entry S 01 200000 sched-entry S 02 200000 cycle-time 100 clockid CLOCK_TAI", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc taprio 1: root refcnt", + "matchCount": "0", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, { "id": "3e1e", "name": "Add taprio Qdisc with an invalid cycle-time", From e57f2187ccc125f1f14f6d2c83da80831fc3ce9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Larumbe?= Date: Thu, 23 May 2024 12:32:17 +0100 Subject: [PATCH 0258/1578] drm/panfrost: Fix dma_resv deadlock at drm object pin time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When Panfrost must pin an object that is being prepared a dma-buf attachment for on behalf of another driver, the core drm gem object pinning code already takes a lock on the object's dma reservation. However, Panfrost GEM object's pinning callback would eventually try taking the lock on the same dma reservation when delegating pinning of the object onto the shmem subsystem, which led to a deadlock. This can be shown by enabling CONFIG_DEBUG_WW_MUTEX_SLOWPATH, which throws the following recursive locking situation: weston/3440 is trying to acquire lock: ffff000000e235a0 (reservation_ww_class_mutex){+.+.}-{3:3}, at: drm_gem_shmem_pin+0x34/0xb8 [drm_shmem_helper] but task is already holding lock: ffff000000e235a0 (reservation_ww_class_mutex){+.+.}-{3:3}, at: drm_gem_pin+0x2c/0x80 [drm] Fix it by replacing drm_gem_shmem_pin with its locked version, as the lock had already been taken by drm_gem_pin(). Cc: Thomas Zimmermann Cc: Dmitry Osipenko Cc: Boris Brezillon Cc: Steven Price Fixes: a78027847226 ("drm/gem: Acquire reservation lock in drm_gem_{pin/unpin}()") Signed-off-by: Adrián Larumbe Reviewed-by: Boris Brezillon Signed-off-by: Boris Brezillon Link: https://patchwork.freedesktop.org/patch/msgid/20240523113236.432585-2-adrian.larumbe@collabora.com --- drivers/gpu/drm/panfrost/panfrost_gem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/panfrost/panfrost_gem.c b/drivers/gpu/drm/panfrost/panfrost_gem.c index d47b40b82b0bc4..8e0ff3efede7e3 100644 --- a/drivers/gpu/drm/panfrost/panfrost_gem.c +++ b/drivers/gpu/drm/panfrost/panfrost_gem.c @@ -192,7 +192,7 @@ static int panfrost_gem_pin(struct drm_gem_object *obj) if (bo->is_heap) return -EINVAL; - return drm_gem_shmem_pin(&bo->base); + return drm_gem_shmem_pin_locked(&bo->base); } static enum drm_gem_object_status panfrost_gem_status(struct drm_gem_object *obj) From 8c2f5dd0c362ec036f0217da1d413ce2b8361080 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Larumbe?= Date: Thu, 23 May 2024 12:32:18 +0100 Subject: [PATCH 0259/1578] drm/lima: Fix dma_resv deadlock at drm object pin time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit a78027847226 ("drm/gem: Acquire reservation lock in drm_gem_{pin/unpin}()") moved locking the DRM object's dma reservation to drm_gem_pin(), but Lima's pin callback kept calling drm_gem_shmem_pin, which also tries to lock the same dma_resv, leading to a double lock situation. As was already done for Panfrost in the previous commit, fix it by replacing drm_gem_shmem_pin() with its locked variant. Cc: Thomas Zimmermann Cc: Dmitry Osipenko Cc: Boris Brezillon Cc: Steven Price Fixes: a78027847226 ("drm/gem: Acquire reservation lock in drm_gem_{pin/unpin}()") Signed-off-by: Adrián Larumbe Reviewed-by: Boris Brezillon Tested-by: Val Packett Signed-off-by: Boris Brezillon Link: https://patchwork.freedesktop.org/patch/msgid/20240523113236.432585-3-adrian.larumbe@collabora.com --- drivers/gpu/drm/lima/lima_gem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/lima/lima_gem.c b/drivers/gpu/drm/lima/lima_gem.c index 7ea244d876ca63..9bb997dbb4b901 100644 --- a/drivers/gpu/drm/lima/lima_gem.c +++ b/drivers/gpu/drm/lima/lima_gem.c @@ -185,7 +185,7 @@ static int lima_gem_pin(struct drm_gem_object *obj) if (bo->heap_size) return -EINVAL; - return drm_gem_shmem_pin(&bo->base); + return drm_gem_shmem_pin_locked(&bo->base); } static int lima_gem_vmap(struct drm_gem_object *obj, struct iosys_map *map) From 3b8407e81ed76c0d84d710c2a177a8fe24292702 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Larumbe?= Date: Thu, 23 May 2024 12:32:19 +0100 Subject: [PATCH 0260/1578] drm/gem-shmem: Add import attachment warning to locked pin function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit ec144244a43f ("drm/gem-shmem: Acquire reservation lock in GEM pin/unpin callbacks") moved locking DRM object's dma reservation to drm_gem_shmem_object_pin, and made drm_gem_shmem_pin_locked public, so we need to make sure the not-imported check warning is also added to the latter. Cc: Thomas Zimmermann Cc: Dmitry Osipenko Cc: Boris Brezillon Fixes: a78027847226 ("drm/gem: Acquire reservation lock in drm_gem_{pin/unpin}()") Signed-off-by: Adrián Larumbe Reviewed-by: Boris Brezillon Signed-off-by: Boris Brezillon Link: https://patchwork.freedesktop.org/patch/msgid/20240523113236.432585-4-adrian.larumbe@collabora.com --- drivers/gpu/drm/drm_gem_shmem_helper.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/drm_gem_shmem_helper.c b/drivers/gpu/drm/drm_gem_shmem_helper.c index 885a62c2e1be23..53c003983ad183 100644 --- a/drivers/gpu/drm/drm_gem_shmem_helper.c +++ b/drivers/gpu/drm/drm_gem_shmem_helper.c @@ -233,6 +233,8 @@ int drm_gem_shmem_pin_locked(struct drm_gem_shmem_object *shmem) dma_resv_assert_held(shmem->base.resv); + drm_WARN_ON(shmem->base.dev, shmem->base.import_attach); + ret = drm_gem_shmem_get_pages(shmem); return ret; From a607468b521cc99ca64f19947cb7a40f8c814730 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 19 May 2024 13:24:30 +0900 Subject: [PATCH 0261/1578] kconfig: remove unused expr_is_no() This has not been used since commit e911503085ae ("Kconfig: Remove bad inference rules expr_eliminate_dups2()"). Signed-off-by: Masahiro Yamada --- scripts/kconfig/expr.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/scripts/kconfig/expr.h b/scripts/kconfig/expr.h index d965e427753eb7..fa50fc45622e38 100644 --- a/scripts/kconfig/expr.h +++ b/scripts/kconfig/expr.h @@ -302,11 +302,6 @@ static inline int expr_is_yes(struct expr *e) return !e || (e->type == E_SYMBOL && e->left.sym == &symbol_yes); } -static inline int expr_is_no(struct expr *e) -{ - return e && (e->type == E_SYMBOL && e->left.sym == &symbol_no); -} - #ifdef __cplusplus } #endif From aabdc960a283ba78086b0bf66ee74326f49e218e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 19 May 2024 18:22:27 +0900 Subject: [PATCH 0262/1578] kconfig: fix comparison to constant symbols, 'm', 'n' Currently, comparisons to 'm' or 'n' result in incorrect output. [Test Code] config MODULES def_bool y modules config A def_tristate m config B def_bool A > n CONFIG_B is unset, while CONFIG_B=y is expected. The reason for the issue is because Kconfig compares the tristate values as strings. Currently, the .type fields in the constant symbol definitions, symbol_{yes,mod,no} are unspecified, i.e., S_UNKNOWN. When expr_calc_value() evaluates 'A > n', it checks the types of 'A' and 'n' to determine how to compare them. The left-hand side, 'A', is a tristate symbol with a value of 'm', which corresponds to a numeric value of 1. (Internally, 'y', 'm', and 'n' are represented as 2, 1, and 0, respectively.) The right-hand side, 'n', has an unknown type, so it is treated as the string "n" during the comparison. expr_calc_value() compares two values numerically only when both can have numeric values. Otherwise, they are compared as strings. symbol numeric value ASCII code ------------------------------------- y 2 0x79 m 1 0x6d n 0 0x6e 'm' is greater than 'n' if compared numerically (since 1 is greater than 0), but smaller than 'n' if compared as strings (since the ASCII code 0x6d is smaller than 0x6e). Specifying .type=S_TRISTATE for symbol_{yes,mod,no} fixes the above test code. Doing so, however, would cause a regression to the following test code. [Test Code 2] config MODULES def_bool n modules config A def_tristate n config B def_bool A = m You would get CONFIG_B=y, while CONFIG_B should not be set. The reason is because sym_get_string_value() turns 'm' into 'n' when the module feature is disabled. Consequently, expr_calc_value() evaluates 'A = n' instead of 'A = m'. This oddity has been hidden because the type of 'm' was previously S_UNKNOWN instead of S_TRISTATE. sym_get_string_value() should not tweak the string because the tristate value has already been correctly calculated. There is no reason to return the string "n" where its tristate value is mod. Fixes: 31847b67bec0 ("kconfig: allow use of relations other than (in)equality") Signed-off-by: Masahiro Yamada --- scripts/kconfig/symbol.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/kconfig/symbol.c b/scripts/kconfig/symbol.c index aa0e25ee5119ef..0e439d3d48d11b 100644 --- a/scripts/kconfig/symbol.c +++ b/scripts/kconfig/symbol.c @@ -14,6 +14,7 @@ struct symbol symbol_yes = { .name = "y", + .type = S_TRISTATE, .curr = { "y", yes }, .menus = LIST_HEAD_INIT(symbol_yes.menus), .flags = SYMBOL_CONST|SYMBOL_VALID, @@ -21,6 +22,7 @@ struct symbol symbol_yes = { struct symbol symbol_mod = { .name = "m", + .type = S_TRISTATE, .curr = { "m", mod }, .menus = LIST_HEAD_INIT(symbol_mod.menus), .flags = SYMBOL_CONST|SYMBOL_VALID, @@ -28,6 +30,7 @@ struct symbol symbol_mod = { struct symbol symbol_no = { .name = "n", + .type = S_TRISTATE, .curr = { "n", no }, .menus = LIST_HEAD_INIT(symbol_no.menus), .flags = SYMBOL_CONST|SYMBOL_VALID, @@ -820,8 +823,7 @@ const char *sym_get_string_value(struct symbol *sym) case no: return "n"; case mod: - sym_calc_value(modules_sym); - return (modules_sym->curr.tri == no) ? "n" : "m"; + return "m"; case yes: return "y"; } From 31894d35b51ba61e2931cbf28e80114a4f72bc2b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 19 May 2024 22:34:02 +0900 Subject: [PATCH 0263/1578] kconfig: remove redundant check in expr_join_or() The check for 'sym1 == sym2' is redundant here because it has already been done a few lines above: if (sym1 != sym2) return NULL; Signed-off-by: Masahiro Yamada --- scripts/kconfig/expr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/kconfig/expr.c b/scripts/kconfig/expr.c index a290de36307ba8..4d95fce5f9a7ab 100644 --- a/scripts/kconfig/expr.c +++ b/scripts/kconfig/expr.c @@ -476,7 +476,7 @@ static struct expr *expr_join_or(struct expr *e1, struct expr *e2) return expr_alloc_comp(E_UNEQUAL, sym1, &symbol_yes); } } - if (sym1->type == S_BOOLEAN && sym1 == sym2) { + if (sym1->type == S_BOOLEAN) { if ((e1->type == E_NOT && e1->left.expr->type == E_SYMBOL && e2->type == E_SYMBOL) || (e2->type == E_NOT && e2->left.expr->type == E_SYMBOL && e1->type == E_SYMBOL)) return expr_alloc_symbol(&symbol_yes); From 659bbf7e1b08267b8e1dd900b316edcb6f6d9e2e Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Mon, 20 May 2024 12:56:52 -0700 Subject: [PATCH 0264/1578] kbuild: scripts/gdb: Replace missed $(srctree)/$(src) w/ $(src) Recently we went through the source tree and replaced $(srctree)/$(src) w/ $(src). However, the gdb scripts Makefile had a hidden $(srctree)/$(src) that looked like this: $(abspath $(srctree))/$(src) Because we missed that then my installed kernel had symlinks that looked like this: __init__.py -> ${INSTALL_DIR}/$(INSTALL_DIR}/scripts/gdb/linux/__init__.py Let's also replace the midden $(abspath $(srctree))/$(src) with $(src). Now: __init__.py -> $(INSTALL_DIR}/scripts/gdb/linux/__init__.py Fixes: b1992c3772e6 ("kbuild: use $(src) instead of $(srctree)/$(src) for source directory") Signed-off-by: Douglas Anderson Signed-off-by: Masahiro Yamada --- scripts/gdb/linux/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gdb/linux/Makefile b/scripts/gdb/linux/Makefile index d77ad9079d0f92..fd1402c0a1a18f 100644 --- a/scripts/gdb/linux/Makefile +++ b/scripts/gdb/linux/Makefile @@ -5,7 +5,7 @@ ifdef building_out_of_srctree symlinks := $(patsubst $(src)/%,%,$(wildcard $(src)/*.py)) quiet_cmd_symlink = SYMLINK $@ - cmd_symlink = ln -fsn $(patsubst $(obj)/%,$(abspath $(srctree))/$(src)/%,$@) $@ + cmd_symlink = ln -fsn $(patsubst $(obj)/%,$(src)/%,$@) $@ always-y += $(symlinks) $(addprefix $(obj)/, $(symlinks)): FORCE From 04b8cb0945b4bf679c71dc2351e0d3c25481e3c6 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 20 May 2024 21:42:09 +0900 Subject: [PATCH 0265/1578] kbuild: avoid unneeded kallsyms step 3 Since commit 951bcae6c5a0 ("kallsyms: Avoid weak references for kallsyms symbols"), the kallsyms step 3 always occurs. You can compare the build logs. [Before 951bcae6c5a0] $ git checkout 951bcae6c5a0^ $ make defconfig all [ snip ] LD .tmp_vmlinux.kallsyms1 NM .tmp_vmlinux.kallsyms1.syms KSYMS .tmp_vmlinux.kallsyms1.S AS .tmp_vmlinux.kallsyms1.S LD .tmp_vmlinux.kallsyms2 NM .tmp_vmlinux.kallsyms2.syms KSYMS .tmp_vmlinux.kallsyms2.S AS .tmp_vmlinux.kallsyms2.S LD vmlinux [After 951bcae6c5a0] $ git checkout 951bcae6c5a0 $ make defconfig all [ snip ] LD .tmp_vmlinux.kallsyms1 NM .tmp_vmlinux.kallsyms1.syms KSYMS .tmp_vmlinux.kallsyms1.S AS .tmp_vmlinux.kallsyms1.S LD .tmp_vmlinux.kallsyms2 NM .tmp_vmlinux.kallsyms2.syms KSYMS .tmp_vmlinux.kallsyms2.S AS .tmp_vmlinux.kallsyms2.S LD .tmp_vmlinux.kallsyms3 # should not happen NM .tmp_vmlinux.kallsyms3.syms # should not happen KSYMS .tmp_vmlinux.kallsyms3.S # should not happen AS .tmp_vmlinux.kallsyms3.S # should not happen LD vmlinux The resulting vmlinux is correct, but it always requires an additional linking step. The symbols produced by kallsyms are excluded from kallsyms itself because they were previously missing in step 1. With those symbols excluded, the symbol lists matched between step 1 and step 2, eliminating the need for step 3. Now, this has a negative effect. Since 951bcae6c5a0, the PROVIDE() directives provide the fallback definitions, which are not trimmed from the sysbol list in step 1 because ${kallsymso_prev} is empty at this point. In step 2, ${kallsymso_prev} is set, and the kallsyms_* symbols are trimmed from the symbol list. Due to the table size difference between step 1 and step 2 (the former is larger due to the presence of kallsyms_*), step 3 is triggered. Now that the kallsyms_* symbols are always linked, let's stop omitting them from kallsyms. This avoids unnecessary step 3. Fixes: 951bcae6c5a0 ("kallsyms: Avoid weak references for kallsyms symbols") Signed-off-by: Masahiro Yamada --- scripts/link-vmlinux.sh | 6 +++--- scripts/mksysmap | 11 +---------- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 7862a81017477d..b0d39a927fbc8d 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -179,7 +179,7 @@ kallsyms_step() kallsyms_S=${kallsyms_vmlinux}.S vmlinux_link ${kallsyms_vmlinux} "${kallsymso_prev}" ${btf_vmlinux_bin_o} - mksysmap ${kallsyms_vmlinux} ${kallsyms_vmlinux}.syms ${kallsymso_prev} + mksysmap ${kallsyms_vmlinux} ${kallsyms_vmlinux}.syms kallsyms ${kallsyms_vmlinux}.syms ${kallsyms_S} info AS ${kallsyms_S} @@ -193,7 +193,7 @@ kallsyms_step() mksysmap() { info NM ${2} - ${CONFIG_SHELL} "${srctree}/scripts/mksysmap" ${1} ${2} ${3} + ${CONFIG_SHELL} "${srctree}/scripts/mksysmap" ${1} ${2} } sorttable() @@ -282,7 +282,7 @@ if is_enabled CONFIG_DEBUG_INFO_BTF && is_enabled CONFIG_BPF; then ${RESOLVE_BTFIDS} vmlinux fi -mksysmap vmlinux System.map ${kallsymso} +mksysmap vmlinux System.map if is_enabled CONFIG_BUILDTIME_TABLE_SORT; then info SORTTAB vmlinux diff --git a/scripts/mksysmap b/scripts/mksysmap index 57ff5656d566fb..e46bafe333bdc2 100755 --- a/scripts/mksysmap +++ b/scripts/mksysmap @@ -4,7 +4,7 @@ # tools to retrieve the actual addresses of symbols in the kernel. # # Usage -# mksysmap vmlinux System.map [exclude] +# mksysmap vmlinux System.map ##### @@ -92,13 +92,4 @@ ${NM} -n ${1} | sed >${2} -e " # ppc stub /\.long_branch\./d /\.plt_branch\./d - -# --------------------------------------------------------------------------- -# Ignored kallsyms symbols -# -# If the 3rd parameter exists, symbols from it will be omitted from the output. -# This makes kallsyms have the identical symbol lists in the step 1 and 2. -# Without this, the step2 would get new symbols generated by scripts/kallsyms.c -# when CONFIG_KALLSYMS_ALL is enabled. That might require one more pass. -$(if [ $# -ge 3 ]; then ${NM} ${3} | sed -n '/ U /!s:.* \([^ ]*\)$:/ \1$/d:p'; fi) " From b18b047002b7d3b19d9fb905c1bd2a214016c153 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 20 May 2024 21:42:10 +0900 Subject: [PATCH 0266/1578] kbuild: change scripts/mksysmap into sed script The previous commit removed the subshell execution from scripts/mksysmap, which is now simple enough to become a sed script. Signed-off-by: Masahiro Yamada --- scripts/link-vmlinux.sh | 2 +- scripts/mksysmap | 19 ++++++------------- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index b0d39a927fbc8d..c22a213ea6a9c5 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -193,7 +193,7 @@ kallsyms_step() mksysmap() { info NM ${2} - ${CONFIG_SHELL} "${srctree}/scripts/mksysmap" ${1} ${2} + ${NM} -n "${1}" | "${srctree}/scripts/mksysmap" > "${2}" } sorttable() diff --git a/scripts/mksysmap b/scripts/mksysmap index e46bafe333bdc2..c12723a0465562 100755 --- a/scripts/mksysmap +++ b/scripts/mksysmap @@ -1,22 +1,16 @@ -#!/bin/sh -x -# Based on the vmlinux file create the System.map file +#!/bin/sed -f +# SPDX-License-Identifier: GPL-2.0-only +# +# sed script to filter out symbols that are not needed for System.map, +# or not suitable for kallsyms. The input should be 'nm -n '. +# # System.map is used by module-init tools and some debugging # tools to retrieve the actual addresses of symbols in the kernel. # -# Usage -# mksysmap vmlinux System.map - - -##### -# Generate System.map (actual filename passed as second argument) -# The following refers to the symbol type as per nm(1). - # readprofile starts reading symbols when _stext is found, and # continue until it finds a symbol which is not either of 'T', 't', # 'W' or 'w'. # - -${NM} -n ${1} | sed >${2} -e " # --------------------------------------------------------------------------- # Ignored symbol types # @@ -92,4 +86,3 @@ ${NM} -n ${1} | sed >${2} -e " # ppc stub /\.long_branch\./d /\.plt_branch\./d -" From 3430f65d6130ccbc86f0ff45642eeb9e2032a600 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 20 May 2024 21:42:11 +0900 Subject: [PATCH 0267/1578] kbuild: fix short log for AS in link-vmlinux.sh In convention, short logs print the output file, not the input file. Let's change the suffix for 'AS' since it assembles *.S into *.o. [Before] LD .tmp_vmlinux.kallsyms1 NM .tmp_vmlinux.kallsyms1.syms KSYMS .tmp_vmlinux.kallsyms1.S AS .tmp_vmlinux.kallsyms1.S LD .tmp_vmlinux.kallsyms2 NM .tmp_vmlinux.kallsyms2.syms KSYMS .tmp_vmlinux.kallsyms2.S AS .tmp_vmlinux.kallsyms2.S LD vmlinux [After] LD .tmp_vmlinux.kallsyms1 NM .tmp_vmlinux.kallsyms1.syms KSYMS .tmp_vmlinux.kallsyms1.S AS .tmp_vmlinux.kallsyms1.o LD .tmp_vmlinux.kallsyms2 NM .tmp_vmlinux.kallsyms2.syms KSYMS .tmp_vmlinux.kallsyms2.S AS .tmp_vmlinux.kallsyms2.o LD vmlinux Signed-off-by: Masahiro Yamada --- scripts/link-vmlinux.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index c22a213ea6a9c5..7aca51b24e9fbb 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -182,7 +182,7 @@ kallsyms_step() mksysmap ${kallsyms_vmlinux} ${kallsyms_vmlinux}.syms kallsyms ${kallsyms_vmlinux}.syms ${kallsyms_S} - info AS ${kallsyms_S} + info AS ${kallsymso} ${CC} ${NOSTDINC_FLAGS} ${LINUXINCLUDE} ${KBUILD_CPPFLAGS} \ ${KBUILD_AFLAGS} ${KBUILD_AFLAGS_KERNEL} \ -c -o ${kallsymso} ${kallsyms_S} From 3c562a70cf4da331baef60ebb3f0e30b254006e9 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 22 May 2024 19:43:11 +0900 Subject: [PATCH 0268/1578] kbuild: remove a stale comment about cleaning in link-vmlinux.sh Remove the left-over of commit 51eb95e2da41 ("kbuild: Don't remove link-vmlinux temporary files on exit/signal"). Signed-off-by: Masahiro Yamada --- scripts/link-vmlinux.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 7aca51b24e9fbb..46ce5d04dbeb10 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -201,7 +201,6 @@ sorttable() ${objtree}/scripts/sorttable ${1} } -# Delete output files in case of error cleanup() { rm -f .btf.* From e06a698ae62b9ee5ca98e65be2c90a61464192e6 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Tue, 28 May 2024 16:52:18 +0800 Subject: [PATCH 0269/1578] scripts/make_fit: Drop fdt image entry compatible string According to the FIT image source file format document found in U-boot [1] and the split-out FIT image specification [2], under "'/images' node" -> "Conditionally mandatory property", the "compatible" property is described as "compatible method for loading image", i.e., not the compatible string embedded in the FDT or used for matching. Drop the compatible string from the fdt image entry node. While at it also fix up a typo in the document section of output_dtb. [1] U-boot source "doc/usage/fit/source_file_format.rst", or on the website: https://docs.u-boot.org/en/latest/usage/fit/source_file_format.html [2] https://github.com/open-source-firmware/flat-image-tree/blob/main/source/chapter2-source-file-format.rst Fixes: 7a23b027ec17 ("arm64: boot: Support Flat Image Tree") Signed-off-by: Chen-Yu Tsai Reviewed-by: Simon Glass Signed-off-by: Masahiro Yamada --- scripts/make_fit.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/make_fit.py b/scripts/make_fit.py index 3de90c5a094b7a..263147df80a445 100755 --- a/scripts/make_fit.py +++ b/scripts/make_fit.py @@ -190,7 +190,7 @@ def output_dtb(fsw, seq, fname, arch, compress): Args: fsw (libfdt.FdtSw): Object to use for writing seq (int): Sequence number (1 for first) - fmame (str): Filename containing the DTB + fname (str): Filename containing the DTB arch: FIT architecture, e.g. 'arm64' compress (str): Compressed algorithm, e.g. 'gzip' @@ -211,7 +211,6 @@ def output_dtb(fsw, seq, fname, arch, compress): fsw.property_string('type', 'flat_dt') fsw.property_string('arch', arch) fsw.property_string('compression', compress) - fsw.property('compatible', bytes(compat)) with open(fname, 'rb') as inf: compressed = compress_data(inf, compress) From 647535760a00a854c185dd4d7e6eccfea30ea0d5 Mon Sep 17 00:00:00 2001 From: Janusz Krzysztofik Date: Mon, 6 May 2024 20:02:50 +0200 Subject: [PATCH 0270/1578] Revert "drm/i915: Remove extra multi-gt pm-references" This reverts commit 1f33dc0c1189efb9ae19c6fc22b64dd3e26261fb. There was a patch supposed to fix an issue of illegal attempts to free a still active i915 VMA object when parking a GT believed to be idle, reported by CI on 2-GT Meteor Lake. As a solution, an extra wakeref for a Primary GT was acquired from i915_gem_do_execbuffer() -- see commit f56fe3e91787 ("drm/i915: Fix a VMA UAF for multi-gt platform"). However, that fix occurred insufficient -- the issue was still reported by CI. That wakeref was released on exit from i915_gem_do_execbuffer(), then potentially before completion of the request and deactivation of its associated VMAs. Moreover, CI reports indicated that single-GT platforms also suffered sporadically from the same race. Since that issue was fixed by another commit f3c71b2ded5c ("drm/i915/vma: Fix UAF on destroy against retire race"), the changes introduced by that insufficient fix were dropped as no longer useful. However, that series resulted in another VMA UAF scenario now being triggered in CI. <4> [260.290809] ------------[ cut here ]------------ <4> [260.290988] list_del corruption. prev->next should be ffff888118c5d990, but was ffff888118c5a510. (prev=ffff888118c5a510) <4> [260.291004] WARNING: CPU: 2 PID: 1143 at lib/list_debug.c:62 __list_del_entry_valid_or_report+0xb7/0xe0 .. <4> [260.291055] CPU: 2 PID: 1143 Comm: kms_plane Not tainted 6.9.0-rc2-CI_DRM_14524-ga25d180c6853+ #1 <4> [260.291058] Hardware name: Intel Corporation Meteor Lake Client Platform/MTL-P LP5x T3 RVP, BIOS MTLPFWI1.R00.3471.D91.2401310918 01/31/2024 <4> [260.291060] RIP: 0010:__list_del_entry_valid_or_report+0xb7/0xe0 ... <4> [260.291087] Call Trace: <4> [260.291089] <4> [260.291124] i915_vma_reopen+0x43/0x80 [i915] <4> [260.291298] eb_lookup_vmas+0x9cb/0xcc0 [i915] <4> [260.291579] i915_gem_do_execbuffer+0xc9a/0x26d0 [i915] <4> [260.291883] i915_gem_execbuffer2_ioctl+0x123/0x2a0 [i915] ... <4> [260.292301] ... <4> [260.292506] ---[ end trace 0000000000000000 ]--- <4> [260.292782] general protection fault, probably for non-canonical address 0x6b6b6b6b6b6b6ca3: 0000 [#1] PREEMPT SMP NOPTI <4> [260.303575] CPU: 2 PID: 1143 Comm: kms_plane Tainted: G W 6.9.0-rc2-CI_DRM_14524-ga25d180c6853+ #1 <4> [260.313851] Hardware name: Intel Corporation Meteor Lake Client Platform/MTL-P LP5x T3 RVP, BIOS MTLPFWI1.R00.3471.D91.2401310918 01/31/2024 <4> [260.326359] RIP: 0010:eb_validate_vmas+0x114/0xd80 [i915] ... <4> [260.428756] Call Trace: <4> [260.431192] <4> [639.283393] i915_gem_do_execbuffer+0xd05/0x26d0 [i915] <4> [639.305245] i915_gem_execbuffer2_ioctl+0x123/0x2a0 [i915] ... <4> [639.411134] ... <4> [639.449979] ---[ end trace 0000000000000000 ]--- We defer actually closing, unbinding and destroying a VMA until next idle point, or until the object is freed in the meantime. By postponing the unbind, we allow for the VMA to be reopened by the client, avoiding the work required to rebind the VMA. Starting from commit b0647a5e79b1 ("drm/i915: Avoid live-lock with i915_vma_parked()"), we assume that as long as a GT is held idle, no VMA would be reopened while we destroy them. That assumption is no longer true in multi-GT configurations, where a VMA we reopen may be handled by a GT different from the one that we already keep active via its engine while we set up an execbuf request. Restoring the extra GT0 PM wakeref removed from i915_gem_do_execbuffer() processing path seems to fix this issue. Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10608 Signed-off-by: Janusz Krzysztofik Cc: Rodrigo Vivi Cc: Nirmoy Das Reviewed-by: Nirmoy Das Fixes: 1f33dc0c1189 ("drm/i915: Remove extra multi-gt pm-references") Link: https://patchwork.freedesktop.org/patch/msgid/20240506180253.96858-2-janusz.krzysztofik@linux.intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit 749670a58d935303ad1ce529acc73f12de25832e) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index 42619fc05de484..090724fa766c9e 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -255,6 +255,7 @@ struct i915_execbuffer { struct intel_context *context; /* logical state for the request */ struct i915_gem_context *gem_context; /** caller's context */ intel_wakeref_t wakeref; + intel_wakeref_t wakeref_gt0; /** our requests to build */ struct i915_request *requests[MAX_ENGINE_INSTANCE + 1]; @@ -2685,6 +2686,7 @@ static int eb_select_engine(struct i915_execbuffer *eb) { struct intel_context *ce, *child; + struct intel_gt *gt; unsigned int idx; int err; @@ -2708,10 +2710,17 @@ eb_select_engine(struct i915_execbuffer *eb) } } eb->num_batches = ce->parallel.number_children + 1; + gt = ce->engine->gt; for_each_child(ce, child) intel_context_get(child); eb->wakeref = intel_gt_pm_get(ce->engine->gt); + /* + * Keep GT0 active on MTL so that i915_vma_parked() doesn't + * free VMAs while execbuf ioctl is validating VMAs. + */ + if (gt->info.id) + eb->wakeref_gt0 = intel_gt_pm_get(to_gt(gt->i915)); if (!test_bit(CONTEXT_ALLOC_BIT, &ce->flags)) { err = intel_context_alloc_state(ce); @@ -2750,6 +2759,9 @@ eb_select_engine(struct i915_execbuffer *eb) return err; err: + if (gt->info.id) + intel_gt_pm_put(to_gt(gt->i915), eb->wakeref_gt0); + intel_gt_pm_put(ce->engine->gt, eb->wakeref); for_each_child(ce, child) intel_context_put(child); @@ -2763,6 +2775,12 @@ eb_put_engine(struct i915_execbuffer *eb) struct intel_context *child; i915_vm_put(eb->context->vm); + /* + * This works in conjunction with eb_select_engine() to prevent + * i915_vma_parked() from interfering while execbuf validates vmas. + */ + if (eb->gt->info.id) + intel_gt_pm_put(to_gt(eb->gt->i915), eb->wakeref_gt0); intel_gt_pm_put(eb->context->engine->gt, eb->wakeref); for_each_child(eb->context, child) intel_context_put(child); From 70cb9188ffc75e643debf292fcddff36c9dbd4ae Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 23 Apr 2024 18:23:10 +0200 Subject: [PATCH 0271/1578] drm/i915/gt: Disarm breadcrumbs if engines are already idle The breadcrumbs use a GT wakeref for guarding the interrupt, but are disarmed during release of the engine wakeref. This leaves a hole where we may attach a breadcrumb just as the engine is parking (after it has parked its breadcrumbs), execute the irq worker with some signalers still attached, but never be woken again. That issue manifests itself in CI with IGT runner timeouts while tests are waiting indefinitely for release of all GT wakerefs. <6> [209.151778] i915: Running live_engine_pm_selftests/live_engine_busy_stats <7> [209.231628] i915 0000:00:02.0: [drm:intel_power_well_disable [i915]] disabling PW_5 <7> [209.231816] i915 0000:00:02.0: [drm:intel_power_well_disable [i915]] disabling PW_4 <7> [209.231944] i915 0000:00:02.0: [drm:intel_power_well_disable [i915]] disabling PW_3 <7> [209.232056] i915 0000:00:02.0: [drm:intel_power_well_disable [i915]] disabling PW_2 <7> [209.232166] i915 0000:00:02.0: [drm:intel_power_well_disable [i915]] disabling DC_off <7> [209.232270] i915 0000:00:02.0: [drm:skl_enable_dc6 [i915]] Enabling DC6 <7> [209.232368] i915 0000:00:02.0: [drm:gen9_set_dc_state.part.0 [i915]] Setting DC state from 00 to 02 <4> [299.356116] [IGT] Inactivity timeout exceeded. Killing the current test with SIGQUIT. ... <6> [299.356526] sysrq: Show State ... <6> [299.373964] task:i915_selftest state:D stack:11784 pid:5578 tgid:5578 ppid:873 flags:0x00004002 <6> [299.373967] Call Trace: <6> [299.373968] <6> [299.373970] __schedule+0x3bb/0xda0 <6> [299.373974] schedule+0x41/0x110 <6> [299.373976] intel_wakeref_wait_for_idle+0x82/0x100 [i915] <6> [299.374083] ? __pfx_var_wake_function+0x10/0x10 <6> [299.374087] live_engine_busy_stats+0x9b/0x500 [i915] <6> [299.374173] __i915_subtests+0xbe/0x240 [i915] <6> [299.374277] ? __pfx___intel_gt_live_setup+0x10/0x10 [i915] <6> [299.374369] ? __pfx___intel_gt_live_teardown+0x10/0x10 [i915] <6> [299.374456] intel_engine_live_selftests+0x1c/0x30 [i915] <6> [299.374547] __run_selftests+0xbb/0x190 [i915] <6> [299.374635] i915_live_selftests+0x4b/0x90 [i915] <6> [299.374717] i915_pci_probe+0x10d/0x210 [i915] At the end of the interrupt worker, if there are no more engines awake, disarm the breadcrumb and go to sleep. Fixes: 9d5612ca165a ("drm/i915/gt: Defer enabling the breadcrumb interrupt to after submission") Closes: https://gitlab.freedesktop.org/drm/intel/issues/10026 Signed-off-by: Chris Wilson Cc: Andrzej Hajda Cc: # v5.12+ Signed-off-by: Janusz Krzysztofik Acked-by: Nirmoy Das Reviewed-by: Andrzej Hajda Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/20240423165505.465734-2-janusz.krzysztofik@linux.intel.com (cherry picked from commit fbad43eccae5cb14594195c20113369aabaa22b5) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gt/intel_breadcrumbs.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c index d650beb8ed22f6..20b9b04ec1e0bf 100644 --- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c @@ -263,8 +263,13 @@ static void signal_irq_work(struct irq_work *work) i915_request_put(rq); } + /* Lazy irq enabling after HW submission */ if (!READ_ONCE(b->irq_armed) && !list_empty(&b->signalers)) intel_breadcrumbs_arm_irq(b); + + /* And confirm that we still want irqs enabled before we yield */ + if (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) + intel_breadcrumbs_disarm_irq(b); } struct intel_breadcrumbs * @@ -315,13 +320,7 @@ void __intel_breadcrumbs_park(struct intel_breadcrumbs *b) return; /* Kick the work once more to drain the signalers, and disarm the irq */ - irq_work_sync(&b->irq_work); - while (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) { - local_irq_disable(); - signal_irq_work(&b->irq_work); - local_irq_enable(); - cond_resched(); - } + irq_work_queue(&b->irq_work); } void intel_breadcrumbs_free(struct kref *kref) @@ -404,7 +403,7 @@ static void insert_breadcrumb(struct i915_request *rq) * the request as it may have completed and raised the interrupt as * we were attaching it into the lists. */ - if (!b->irq_armed || __i915_request_is_complete(rq)) + if (!READ_ONCE(b->irq_armed) || __i915_request_is_complete(rq)) irq_work_queue(&b->irq_work); } From d4f36db62396b73bed383c0b6e48d36278cafa78 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 30 Apr 2024 09:48:09 -0700 Subject: [PATCH 0272/1578] drm/i915/guc: avoid FIELD_PREP warning With gcc-7 and earlier, there are lots of warnings like In file included from :0:0: In function '__guc_context_policy_add_priority.isra.66', inlined from '__guc_context_set_prio.isra.67' at drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c:3292:3, inlined from 'guc_context_set_prio' at drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c:3320:2: include/linux/compiler_types.h:399:38: error: call to '__compiletime_assert_631' declared with attribute error: FIELD_PREP: mask is not constant _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) ^ ... drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c:2422:3: note: in expansion of macro 'FIELD_PREP' FIELD_PREP(GUC_KLV_0_KEY, GUC_CONTEXT_POLICIES_KLV_ID_##id) | \ ^~~~~~~~~~ Make sure that GUC_KLV_0_KEY is an unsigned value to avoid the warning. Fixes: 77b6f79df66e ("drm/i915/guc: Update to GuC version 69.0.3") Signed-off-by: Arnd Bergmann Reviewed-by: Michal Wajdeczko Signed-off-by: Julia Filipchuk Signed-off-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20240430164809.482131-1-julia.filipchuk@intel.com (cherry picked from commit 364e039827ef628c650c21c1afe1c54d9c3296d9) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gt/uc/abi/guc_klvs_abi.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_klvs_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_klvs_abi.h index bebf28e3c4794b..525587cfe1af94 100644 --- a/drivers/gpu/drm/i915/gt/uc/abi/guc_klvs_abi.h +++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_klvs_abi.h @@ -29,9 +29,9 @@ */ #define GUC_KLV_LEN_MIN 1u -#define GUC_KLV_0_KEY (0xffff << 16) -#define GUC_KLV_0_LEN (0xffff << 0) -#define GUC_KLV_n_VALUE (0xffffffff << 0) +#define GUC_KLV_0_KEY (0xffffu << 16) +#define GUC_KLV_0_LEN (0xffffu << 0) +#define GUC_KLV_n_VALUE (0xffffffffu << 0) /** * DOC: GuC Self Config KLVs From 33defcacd207196a6b35857087e6335590adad62 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Tue, 28 May 2024 22:39:18 +0300 Subject: [PATCH 0273/1578] drm/panel/lg-sw43408: select CONFIG_DRM_DISPLAY_DP_HELPER This panel driver uses DSC PPS functions and as such depends on the DRM_DISPLAY_DP_HELPER. Select this symbol to make required functions available to the driver. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202404200800.kYsRYyli-lkp@intel.com/ Fixes: 069a6c0e94f9 ("drm: panel: Add LG sw43408 panel driver") Reviewed-by: Neil Armstrong Reviewed-by: Marijn Suijten Link: https://patchwork.freedesktop.org/patch/msgid/20240528-panel-sw43408-fix-v4-1-330b42445bcc@linaro.org Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/panel/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/panel/Kconfig b/drivers/gpu/drm/panel/Kconfig index 982324ef5a41b8..2ae0eb0638f325 100644 --- a/drivers/gpu/drm/panel/Kconfig +++ b/drivers/gpu/drm/panel/Kconfig @@ -340,6 +340,8 @@ config DRM_PANEL_LG_SW43408 depends on OF depends on DRM_MIPI_DSI depends on BACKLIGHT_CLASS_DEVICE + select DRM_DISPLAY_DP_HELPER + select DRM_DISPLAY_HELPER help Say Y here if you want to enable support for LG sw43408 panel. The panel has a 1080x2160@60Hz resolution and uses 24 bit RGB per From 659a3062c705753a9ec6fd28a4c67ee4254f9584 Mon Sep 17 00:00:00 2001 From: Nirmoy Das Date: Thu, 16 May 2024 17:14:03 +0200 Subject: [PATCH 0274/1578] drm/i915/selftests: Set always_coherent to false when reading from CPU Commit 8d4ba9fc1c6c ("drm/i915/selftests: Pick correct caching mode.") was not complete as for non LLC sharing platforms cpu read can happen from LLC which probably doesn't have the latest changes made by GPU. Cc: Andi Shyti Cc: Janusz Krzysztofik Cc: Jonathan Cavitt Fixes: 8d4ba9fc1c6c ("drm/i915/selftests: Pick correct caching mode.") Reviewed-by: Jonathan Cavitt Reviewed-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/20240516151403.2875-1-nirmoy.das@intel.com Signed-off-by: Nirmoy Das (cherry picked from commit 007ed70831426d4cc108d879d688de6b8e3e6d45) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c index 65a931ea80e9b8..3527b8f446fe3b 100644 --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c @@ -196,7 +196,7 @@ static int verify_access(struct drm_i915_private *i915, if (err) goto out_file; - mode = intel_gt_coherent_map_type(to_gt(i915), native_obj, true); + mode = intel_gt_coherent_map_type(to_gt(i915), native_obj, false); vaddr = i915_gem_object_pin_map_unlocked(native_obj, mode); if (IS_ERR(vaddr)) { err = PTR_ERR(vaddr); From 8c318cb70c88aa02068db7518e852b909c9b400f Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Tue, 28 May 2024 22:39:19 +0300 Subject: [PATCH 0275/1578] drm/panel/lg-sw43408: mark sw43408_backlight_ops as static Fix sparse warning regarding symbol 'sw43408_backlight_ops' not being declared. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202404200739.hbWZvOhR-lkp@intel.com/ Reviewed-by: Neil Armstrong Fixes: 069a6c0e94f9 ("drm: panel: Add LG sw43408 panel driver") Reviewed-by: Marijn Suijten Link: https://patchwork.freedesktop.org/patch/msgid/20240528-panel-sw43408-fix-v4-2-330b42445bcc@linaro.org Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/panel/panel-lg-sw43408.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/panel/panel-lg-sw43408.c b/drivers/gpu/drm/panel/panel-lg-sw43408.c index 115f4702d59f76..2b3a73696dcec7 100644 --- a/drivers/gpu/drm/panel/panel-lg-sw43408.c +++ b/drivers/gpu/drm/panel/panel-lg-sw43408.c @@ -182,7 +182,7 @@ static int sw43408_backlight_update_status(struct backlight_device *bl) return mipi_dsi_dcs_set_display_brightness_large(dsi, brightness); } -const struct backlight_ops sw43408_backlight_ops = { +static const struct backlight_ops sw43408_backlight_ops = { .update_status = sw43408_backlight_update_status, }; From ee01b6a386eaf9984b58a2476e8f531149679da9 Mon Sep 17 00:00:00 2001 From: Andi Shyti Date: Fri, 17 May 2024 11:06:16 +0200 Subject: [PATCH 0276/1578] drm/i915/gt: Fix CCS id's calculation for CCS mode setting The whole point of the previous fixes has been to change the CCS hardware configuration to generate only one stream available to the compute users. We did this by changing the info.engine_mask that is set during device probe, reset during the detection of the fused engines, and finally reset again when choosing the CCS mode. We can't use the engine_mask variable anymore, as with the current configuration, it imposes only one CCS no matter what the hardware configuration is. Before changing the engine_mask for the third time, save it and use it for calculating the CCS mode. After the previous changes, the user reported a performance drop to around 1/4. We have tested that the compute operations, with the current patch, have improved by the same factor. Fixes: 6db31251bb26 ("drm/i915/gt: Enable only one CCS for compute workload") Signed-off-by: Andi Shyti Cc: Chris Wilson Cc: Gnattu OC Cc: Joonas Lahtinen Cc: Matt Roper Tested-by: Jian Ye Reviewed-by: Umesh Nerlige Ramappa Tested-by: Gnattu OC Link: https://patchwork.freedesktop.org/patch/msgid/20240517090616.242529-1-andi.shyti@linux.intel.com (cherry picked from commit a09d2327a9ba8e3f5be238bc1b7ca2809255b464) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gt/intel_engine_cs.c | 6 ++++++ drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c | 2 +- drivers/gpu/drm/i915/gt/intel_gt_types.h | 8 ++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index 5c8e9ee3b00837..3b740ca2500091 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -885,6 +885,12 @@ static intel_engine_mask_t init_engine_mask(struct intel_gt *gt) if (IS_DG2(gt->i915)) { u8 first_ccs = __ffs(CCS_MASK(gt)); + /* + * Store the number of active cslices before + * changing the CCS engine configuration + */ + gt->ccs.cslices = CCS_MASK(gt); + /* Mask off all the CCS engine */ info->engine_mask &= ~GENMASK(CCS3, CCS0); /* Put back in the first CCS engine */ diff --git a/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c b/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c index 99b71bb7da0a6b..3c62a44e9106ce 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c @@ -19,7 +19,7 @@ unsigned int intel_gt_apply_ccs_mode(struct intel_gt *gt) /* Build the value for the fixed CCS load balancing */ for (cslice = 0; cslice < I915_MAX_CCS; cslice++) { - if (CCS_MASK(gt) & BIT(cslice)) + if (gt->ccs.cslices & BIT(cslice)) /* * If available, assign the cslice * to the first available engine... diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h index def7dd0eb6f196..cfdd2ad5e9549c 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h @@ -207,6 +207,14 @@ struct intel_gt { [MAX_ENGINE_INSTANCE + 1]; enum intel_submission_method submission_method; + struct { + /* + * Mask of the non fused CCS slices + * to be used for the load balancing + */ + intel_engine_mask_t cslices; + } ccs; + /* * Default address space (either GGTT or ppGTT depending on arch). * From 43e2b37e2ab660c3565d4cff27922bc70e79c3f1 Mon Sep 17 00:00:00 2001 From: Vidya Srinivas Date: Mon, 20 May 2024 22:26:34 +0530 Subject: [PATCH 0277/1578] drm/i915/dpt: Make DPT object unshrinkable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In some scenarios, the DPT object gets shrunk but the actual framebuffer did not and thus its still there on the DPT's vm->bound_list. Then it tries to rewrite the PTEs via a stale CPU mapping. This causes panic. Cc: stable@vger.kernel.org Reported-by: Shawn Lee Fixes: 0dc987b699ce ("drm/i915/display: Add smem fallback allocation for dpt") Signed-off-by: Vidya Srinivas [vsyrjala: Add TODO comment] Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20240520165634.1162470-1-vidya.srinivas@intel.com (cherry picked from commit 51064d471c53dcc8eddd2333c3f1c1d9131ba36c) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gem/i915_gem_object.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.h b/drivers/gpu/drm/i915/gem/i915_gem_object.h index 3560a062d2872f..5d7446a48ae790 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object.h @@ -284,7 +284,9 @@ bool i915_gem_object_has_iomem(const struct drm_i915_gem_object *obj); static inline bool i915_gem_object_is_shrinkable(const struct drm_i915_gem_object *obj) { - return i915_gem_object_type_has(obj, I915_GEM_OBJECT_IS_SHRINKABLE); + /* TODO: make DPT shrinkable when it has no bound vmas */ + return i915_gem_object_type_has(obj, I915_GEM_OBJECT_IS_SHRINKABLE) && + !obj->is_dpt; } static inline bool From 75800e2e4203ea83bbc9d4f63ad97ea582244a08 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Tue, 21 May 2024 17:30:22 +0300 Subject: [PATCH 0278/1578] drm/i915: Fix audio component initialization After registering the audio component in i915_audio_component_init() the audio driver may call i915_audio_component_get_power() via the component ops. This could program AUD_FREQ_CNTRL with an uninitialized value if the latter function is called before display.audio.freq_cntrl gets initialized. The get_power() function also does a modeset which in the above case happens too early before the initialization step and triggers the "Reject display access from task" error message added by the Fixes: commit below. Fix the above issue by registering the audio component only after the initialization step. Fixes: 87c1694533c9 ("drm/i915: save AUD_FREQ_CNTRL state at audio domain suspend") Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/10291 Cc: stable@vger.kernel.org # v5.5+ Signed-off-by: Imre Deak Reviewed-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20240521143022.3784539-1-imre.deak@intel.com (cherry picked from commit fdd0b80172758ce284f19fa8a26d90c61e4371d2) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_audio.c | 32 ++++++++++++------- drivers/gpu/drm/i915/display/intel_audio.h | 1 + .../drm/i915/display/intel_display_driver.c | 2 ++ 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_audio.c b/drivers/gpu/drm/i915/display/intel_audio.c index ed81e1466c4b5a..40e7d862675eed 100644 --- a/drivers/gpu/drm/i915/display/intel_audio.c +++ b/drivers/gpu/drm/i915/display/intel_audio.c @@ -1252,17 +1252,6 @@ static const struct component_ops i915_audio_component_bind_ops = { static void i915_audio_component_init(struct drm_i915_private *i915) { u32 aud_freq, aud_freq_init; - int ret; - - ret = component_add_typed(i915->drm.dev, - &i915_audio_component_bind_ops, - I915_COMPONENT_AUDIO); - if (ret < 0) { - drm_err(&i915->drm, - "failed to add audio component (%d)\n", ret); - /* continue with reduced functionality */ - return; - } if (DISPLAY_VER(i915) >= 9) { aud_freq_init = intel_de_read(i915, AUD_FREQ_CNTRL); @@ -1285,6 +1274,21 @@ static void i915_audio_component_init(struct drm_i915_private *i915) /* init with current cdclk */ intel_audio_cdclk_change_post(i915); +} + +static void i915_audio_component_register(struct drm_i915_private *i915) +{ + int ret; + + ret = component_add_typed(i915->drm.dev, + &i915_audio_component_bind_ops, + I915_COMPONENT_AUDIO); + if (ret < 0) { + drm_err(&i915->drm, + "failed to add audio component (%d)\n", ret); + /* continue with reduced functionality */ + return; + } i915->display.audio.component_registered = true; } @@ -1317,6 +1321,12 @@ void intel_audio_init(struct drm_i915_private *i915) i915_audio_component_init(i915); } +void intel_audio_register(struct drm_i915_private *i915) +{ + if (!i915->display.audio.lpe.platdev) + i915_audio_component_register(i915); +} + /** * intel_audio_deinit() - deinitialize the audio driver * @i915: the i915 drm device private data diff --git a/drivers/gpu/drm/i915/display/intel_audio.h b/drivers/gpu/drm/i915/display/intel_audio.h index 9327954b801e57..576c061d72a450 100644 --- a/drivers/gpu/drm/i915/display/intel_audio.h +++ b/drivers/gpu/drm/i915/display/intel_audio.h @@ -28,6 +28,7 @@ void intel_audio_codec_get_config(struct intel_encoder *encoder, void intel_audio_cdclk_change_pre(struct drm_i915_private *dev_priv); void intel_audio_cdclk_change_post(struct drm_i915_private *dev_priv); void intel_audio_init(struct drm_i915_private *dev_priv); +void intel_audio_register(struct drm_i915_private *i915); void intel_audio_deinit(struct drm_i915_private *dev_priv); void intel_audio_sdp_split_update(const struct intel_crtc_state *crtc_state); diff --git a/drivers/gpu/drm/i915/display/intel_display_driver.c b/drivers/gpu/drm/i915/display/intel_display_driver.c index 89bd032ed995e7..794b4af380558d 100644 --- a/drivers/gpu/drm/i915/display/intel_display_driver.c +++ b/drivers/gpu/drm/i915/display/intel_display_driver.c @@ -540,6 +540,8 @@ void intel_display_driver_register(struct drm_i915_private *i915) intel_display_driver_enable_user_access(i915); + intel_audio_register(i915); + intel_display_debugfs_register(i915); /* From edb32776196afa393c074d6a2733e3a69e66b299 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 29 May 2024 10:37:59 +0200 Subject: [PATCH 0279/1578] ALSA: seq: Fix incorrect UMP type for system messages When converting a legacy system message to a UMP packet, it forgot to modify the UMP type field but keeping the default type (either type 2 or 4). Correct to the right type for system messages. Fixes: e9e02819a98a ("ALSA: seq: Automatic conversion of UMP events") Cc: Link: https://lore.kernel.org/r/20240529083800.5742-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/seq/seq_ump_convert.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/core/seq/seq_ump_convert.c b/sound/core/seq/seq_ump_convert.c index f5d22dd008426f..add70b4c2885de 100644 --- a/sound/core/seq/seq_ump_convert.c +++ b/sound/core/seq/seq_ump_convert.c @@ -739,6 +739,7 @@ static int system_1p_ev_to_ump_midi1(const struct snd_seq_event *event, union snd_ump_midi1_msg *data, unsigned char status) { + data->system.type = UMP_MSG_TYPE_SYSTEM; // override data->system.status = status; data->system.parm1 = event->data.control.value & 0x7f; return 1; @@ -750,6 +751,7 @@ static int system_2p_ev_to_ump_midi1(const struct snd_seq_event *event, union snd_ump_midi1_msg *data, unsigned char status) { + data->system.type = UMP_MSG_TYPE_SYSTEM; // override data->system.status = status; data->system.parm1 = (event->data.control.value >> 7) & 0x7f; data->system.parm2 = event->data.control.value & 0x7f; From fe85f6e607d75b856e7229924c71f55e005f8284 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 29 May 2024 10:38:21 +0200 Subject: [PATCH 0280/1578] ALSA: ump: Don't clear bank selection after sending a program change The current code clears the bank selection MSB/LSB after sending a program change, but this can be wrong, as many apps may not send the full bank selection with both MSB and LSB but sending only one. Better to keep the previous bank set. Fixes: 0b5288f5fe63 ("ALSA: ump: Add legacy raw MIDI support") Cc: Link: https://lore.kernel.org/r/20240529083823.5778-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/ump_convert.c | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/core/ump_convert.c b/sound/core/ump_convert.c index de04799fdb69aa..f67c44c83fde44 100644 --- a/sound/core/ump_convert.c +++ b/sound/core/ump_convert.c @@ -404,7 +404,6 @@ static int cvt_legacy_cmd_to_ump(struct ump_cvt_to_ump *cvt, midi2->pg.bank_msb = cc->cc_bank_msb; midi2->pg.bank_lsb = cc->cc_bank_lsb; cc->bank_set = 0; - cc->cc_bank_msb = cc->cc_bank_lsb = 0; } break; case UMP_MSG_STATUS_CHANNEL_PRESSURE: From ed7ee6a69f9289337af4835a908aa782263d4852 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 29 May 2024 10:39:01 +0200 Subject: [PATCH 0281/1578] statx: Update offset commentary for struct statx In commit 2a82bb02941f ("statx: stx_subvol"), a new member was added to struct statx, but the offset comment was not correct. Update it. Signed-off-by: John Garry Link: https://lore.kernel.org/r/20240529081725.3769290-1-john.g.garry@oracle.com Signed-off-by: Christian Brauner --- include/uapi/linux/stat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index 67626d53531664..95770941ee2c6a 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -126,8 +126,8 @@ struct statx { __u64 stx_mnt_id; __u32 stx_dio_mem_align; /* Memory buffer alignment for direct I/O */ __u32 stx_dio_offset_align; /* File offset alignment for direct I/O */ - __u64 stx_subvol; /* Subvolume identifier */ /* 0xa0 */ + __u64 stx_subvol; /* Subvolume identifier */ __u64 __spare3[11]; /* Spare space for future expansion */ /* 0x100 */ }; From 6d40dbc75877110c5e0d661dd77f6cfce916765e Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Tue, 28 May 2024 21:18:50 +0200 Subject: [PATCH 0282/1578] ALSA: pcm: fix typo in comment Fix the typo in the comment for SNDRV_PCM_RATE_KNOT Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20240528191850.63314-1-alexandre.belloni@bootlin.com Signed-off-by: Takashi Iwai --- include/sound/pcm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/sound/pcm.h b/include/sound/pcm.h index 61c6054618c8ea..3edd7a7346daaa 100644 --- a/include/sound/pcm.h +++ b/include/sound/pcm.h @@ -124,7 +124,7 @@ struct snd_pcm_ops { #define SNDRV_PCM_RATE_768000 (1U<<16) /* 768000Hz */ #define SNDRV_PCM_RATE_CONTINUOUS (1U<<30) /* continuous range */ -#define SNDRV_PCM_RATE_KNOT (1U<<31) /* supports more non-continuos rates */ +#define SNDRV_PCM_RATE_KNOT (1U<<31) /* supports more non-continuous rates */ #define SNDRV_PCM_RATE_8000_44100 (SNDRV_PCM_RATE_8000|SNDRV_PCM_RATE_11025|\ SNDRV_PCM_RATE_16000|SNDRV_PCM_RATE_22050|\ From b062938fd9afec844c50571fddd8d81623a60ee1 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 27 May 2024 14:19:40 -0500 Subject: [PATCH 0283/1578] ASoC: Intel: sof-sdw: fix missing SPI_MASTER dependency The addition of the Cirrus Logic 'sidecar' amps adds a dependency on SPI_MASTER. Kconfig warnings: (for reference only) WARNING: unmet direct dependencies detected for SND_SOC_CS35L56_SPI Depends on [n]: SOUND [=y] && SND [=y] && SND_SOC [=y] && SPI_MASTER [=n] && (SOUNDWIRE [=y] || !SOUNDWIRE [=y]) Selected by [y]: - SND_SOC_INTEL_SOUNDWIRE_SOF_MACH [=y] && SOUND [=y] && SND [=y] && SND_SOC [=y] && SND_SOC_INTEL_MACH [=y] && SND_SOC_SOF_INTEL_SOUNDWIRE [=y] && I2C [=y] && ACPI [=y] && (MFD_INTEL_LPSS [=y] || COMPILE_TEST [=n]) && (SND_SOC_INTEL_USER_FRIENDLY_LONG_NAMES [=y] || COMPILE_TEST [=n]) && SOUNDWIRE [=y] Fixes: b831b4dca48d ("ASoC: intel: sof_sdw: Add support for cs42l43-cs35l56 sidecar amps") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202405140758.o2HY4nYD-lkp@intel.com/ Signed-off-by: Pierre-Louis Bossart Reviewed-by: Bard Liao Link: https://msgid.link/r/20240527191940.30107-1-pierre-louis.bossart@linux.intel.com Reviewed-by: Charles Keepax Signed-off-by: Mark Brown --- sound/soc/intel/boards/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/intel/boards/Kconfig b/sound/soc/intel/boards/Kconfig index 3ed81ab649c53b..4e0586034de4b2 100644 --- a/sound/soc/intel/boards/Kconfig +++ b/sound/soc/intel/boards/Kconfig @@ -652,7 +652,7 @@ if SND_SOC_SOF_INTEL_SOUNDWIRE config SND_SOC_INTEL_SOUNDWIRE_SOF_MACH tristate "SoundWire generic machine driver" - depends on I2C && ACPI + depends on I2C && SPI_MASTER && ACPI depends on MFD_INTEL_LPSS || COMPILE_TEST depends on SND_SOC_INTEL_USER_FRIENDLY_LONG_NAMES || COMPILE_TEST depends on SOUNDWIRE From cc5ac966f26193ab185cc43d64d9f1ae998ccb6e Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 22 May 2024 19:42:57 +0800 Subject: [PATCH 0284/1578] cachefiles: add output string to cachefiles_obj_[get|put]_ondemand_fd This lets us see the correct trace output. Fixes: c8383054506c ("cachefiles: notify the user daemon when looking up cookie") Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240522114308.2402121-2-libaokun@huaweicloud.com Acked-by: Jeff Layton Reviewed-by: Jingbo Xu Signed-off-by: Christian Brauner --- include/trace/events/cachefiles.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/trace/events/cachefiles.h b/include/trace/events/cachefiles.h index cf4b98b9a9edc7..e3213af847cdf3 100644 --- a/include/trace/events/cachefiles.h +++ b/include/trace/events/cachefiles.h @@ -127,7 +127,9 @@ enum cachefiles_error_trace { EM(cachefiles_obj_see_lookup_cookie, "SEE lookup_cookie") \ EM(cachefiles_obj_see_lookup_failed, "SEE lookup_failed") \ EM(cachefiles_obj_see_withdraw_cookie, "SEE withdraw_cookie") \ - E_(cachefiles_obj_see_withdrawal, "SEE withdrawal") + EM(cachefiles_obj_see_withdrawal, "SEE withdrawal") \ + EM(cachefiles_obj_get_ondemand_fd, "GET ondemand_fd") \ + E_(cachefiles_obj_put_ondemand_fd, "PUT ondemand_fd") #define cachefiles_coherency_traces \ EM(cachefiles_coherency_check_aux, "BAD aux ") \ From 0fc75c5940fa634d84e64c93bfc388e1274ed013 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 22 May 2024 19:42:58 +0800 Subject: [PATCH 0285/1578] cachefiles: remove requests from xarray during flushing requests Even with CACHEFILES_DEAD set, we can still read the requests, so in the following concurrency the request may be used after it has been freed: mount | daemon_thread1 | daemon_thread2 ------------------------------------------------------------ cachefiles_ondemand_init_object cachefiles_ondemand_send_req REQ_A = kzalloc(sizeof(*req) + data_len) wait_for_completion(&REQ_A->done) cachefiles_daemon_read cachefiles_ondemand_daemon_read // close dev fd cachefiles_flush_reqs complete(&REQ_A->done) kfree(REQ_A) xa_lock(&cache->reqs); cachefiles_ondemand_select_req req->msg.opcode != CACHEFILES_OP_READ // req use-after-free !!! xa_unlock(&cache->reqs); xa_destroy(&cache->reqs) Hence remove requests from cache->reqs when flushing them to avoid accessing freed requests. Fixes: c8383054506c ("cachefiles: notify the user daemon when looking up cookie") Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240522114308.2402121-3-libaokun@huaweicloud.com Acked-by: Jeff Layton Reviewed-by: Jia Zhu Reviewed-by: Gao Xiang Reviewed-by: Jingbo Xu Signed-off-by: Christian Brauner --- fs/cachefiles/daemon.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 6465e257423091..ccb7b707ea4b76 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -159,6 +159,7 @@ static void cachefiles_flush_reqs(struct cachefiles_cache *cache) xa_for_each(xa, index, req) { req->error = -EIO; complete(&req->done); + __xa_erase(xa, index); } xa_unlock(xa); From de3e26f9e5b76fc628077578c001c4a51bf54d06 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 22 May 2024 19:42:59 +0800 Subject: [PATCH 0286/1578] cachefiles: fix slab-use-after-free in cachefiles_ondemand_get_fd() We got the following issue in a fuzz test of randomly issuing the restore command: ================================================================== BUG: KASAN: slab-use-after-free in cachefiles_ondemand_daemon_read+0x609/0xab0 Write of size 4 at addr ffff888109164a80 by task ondemand-04-dae/4962 CPU: 11 PID: 4962 Comm: ondemand-04-dae Not tainted 6.8.0-rc7-dirty #542 Call Trace: kasan_report+0x94/0xc0 cachefiles_ondemand_daemon_read+0x609/0xab0 vfs_read+0x169/0xb50 ksys_read+0xf5/0x1e0 Allocated by task 626: __kmalloc+0x1df/0x4b0 cachefiles_ondemand_send_req+0x24d/0x690 cachefiles_create_tmpfile+0x249/0xb30 cachefiles_create_file+0x6f/0x140 cachefiles_look_up_object+0x29c/0xa60 cachefiles_lookup_cookie+0x37d/0xca0 fscache_cookie_state_machine+0x43c/0x1230 [...] Freed by task 626: kfree+0xf1/0x2c0 cachefiles_ondemand_send_req+0x568/0x690 cachefiles_create_tmpfile+0x249/0xb30 cachefiles_create_file+0x6f/0x140 cachefiles_look_up_object+0x29c/0xa60 cachefiles_lookup_cookie+0x37d/0xca0 fscache_cookie_state_machine+0x43c/0x1230 [...] ================================================================== Following is the process that triggers the issue: mount | daemon_thread1 | daemon_thread2 ------------------------------------------------------------ cachefiles_ondemand_init_object cachefiles_ondemand_send_req REQ_A = kzalloc(sizeof(*req) + data_len) wait_for_completion(&REQ_A->done) cachefiles_daemon_read cachefiles_ondemand_daemon_read REQ_A = cachefiles_ondemand_select_req cachefiles_ondemand_get_fd copy_to_user(_buffer, msg, n) process_open_req(REQ_A) ------ restore ------ cachefiles_ondemand_restore xas_for_each(&xas, req, ULONG_MAX) xas_set_mark(&xas, CACHEFILES_REQ_NEW); cachefiles_daemon_read cachefiles_ondemand_daemon_read REQ_A = cachefiles_ondemand_select_req write(devfd, ("copen %u,%llu", msg->msg_id, size)); cachefiles_ondemand_copen xa_erase(&cache->reqs, id) complete(&REQ_A->done) kfree(REQ_A) cachefiles_ondemand_get_fd(REQ_A) fd = get_unused_fd_flags file = anon_inode_getfile fd_install(fd, file) load = (void *)REQ_A->msg.data; load->fd = fd; // load UAF !!! This issue is caused by issuing a restore command when the daemon is still alive, which results in a request being processed multiple times thus triggering a UAF. So to avoid this problem, add an additional reference count to cachefiles_req, which is held while waiting and reading, and then released when the waiting and reading is over. Note that since there is only one reference count for waiting, we need to avoid the same request being completed multiple times, so we can only complete the request if it is successfully removed from the xarray. Fixes: e73fa11a356c ("cachefiles: add restore command to recover inflight ondemand read requests") Suggested-by: Hou Tao Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240522114308.2402121-4-libaokun@huaweicloud.com Acked-by: Jeff Layton Reviewed-by: Jia Zhu Reviewed-by: Jingbo Xu Signed-off-by: Christian Brauner --- fs/cachefiles/internal.h | 1 + fs/cachefiles/ondemand.c | 23 +++++++++++++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index d33169f0018b10..7745b8abc3aa3a 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -138,6 +138,7 @@ static inline bool cachefiles_in_ondemand_mode(struct cachefiles_cache *cache) struct cachefiles_req { struct cachefiles_object *object; struct completion done; + refcount_t ref; int error; struct cachefiles_msg msg; }; diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index 4ba42f1fa3b407..c011fb24d23824 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -4,6 +4,12 @@ #include #include "internal.h" +static inline void cachefiles_req_put(struct cachefiles_req *req) +{ + if (refcount_dec_and_test(&req->ref)) + kfree(req); +} + static int cachefiles_ondemand_fd_release(struct inode *inode, struct file *file) { @@ -330,6 +336,7 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, xas_clear_mark(&xas, CACHEFILES_REQ_NEW); cache->req_id_next = xas.xa_index + 1; + refcount_inc(&req->ref); xa_unlock(&cache->reqs); id = xas.xa_index; @@ -356,15 +363,22 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, complete(&req->done); } + cachefiles_req_put(req); return n; err_put_fd: if (msg->opcode == CACHEFILES_OP_OPEN) close_fd(((struct cachefiles_open *)msg->data)->fd); error: - xa_erase(&cache->reqs, id); - req->error = ret; - complete(&req->done); + xas_reset(&xas); + xas_lock(&xas); + if (xas_load(&xas) == req) { + req->error = ret; + complete(&req->done); + xas_store(&xas, NULL); + } + xas_unlock(&xas); + cachefiles_req_put(req); return ret; } @@ -395,6 +409,7 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object, goto out; } + refcount_set(&req->ref, 1); req->object = object; init_completion(&req->done); req->msg.opcode = opcode; @@ -456,7 +471,7 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object, wake_up_all(&cache->daemon_pollwq); wait_for_completion(&req->done); ret = req->error; - kfree(req); + cachefiles_req_put(req); return ret; out: /* Reset the object to close state in error handling path. From da4a827416066191aafeeccee50a8836a826ba10 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 22 May 2024 19:43:00 +0800 Subject: [PATCH 0287/1578] cachefiles: fix slab-use-after-free in cachefiles_ondemand_daemon_read() We got the following issue in a fuzz test of randomly issuing the restore command: ================================================================== BUG: KASAN: slab-use-after-free in cachefiles_ondemand_daemon_read+0xb41/0xb60 Read of size 8 at addr ffff888122e84088 by task ondemand-04-dae/963 CPU: 13 PID: 963 Comm: ondemand-04-dae Not tainted 6.8.0-dirty #564 Call Trace: kasan_report+0x93/0xc0 cachefiles_ondemand_daemon_read+0xb41/0xb60 vfs_read+0x169/0xb50 ksys_read+0xf5/0x1e0 Allocated by task 116: kmem_cache_alloc+0x140/0x3a0 cachefiles_lookup_cookie+0x140/0xcd0 fscache_cookie_state_machine+0x43c/0x1230 [...] Freed by task 792: kmem_cache_free+0xfe/0x390 cachefiles_put_object+0x241/0x480 fscache_cookie_state_machine+0x5c8/0x1230 [...] ================================================================== Following is the process that triggers the issue: mount | daemon_thread1 | daemon_thread2 ------------------------------------------------------------ cachefiles_withdraw_cookie cachefiles_ondemand_clean_object(object) cachefiles_ondemand_send_req REQ_A = kzalloc(sizeof(*req) + data_len) wait_for_completion(&REQ_A->done) cachefiles_daemon_read cachefiles_ondemand_daemon_read REQ_A = cachefiles_ondemand_select_req msg->object_id = req->object->ondemand->ondemand_id ------ restore ------ cachefiles_ondemand_restore xas_for_each(&xas, req, ULONG_MAX) xas_set_mark(&xas, CACHEFILES_REQ_NEW) cachefiles_daemon_read cachefiles_ondemand_daemon_read REQ_A = cachefiles_ondemand_select_req copy_to_user(_buffer, msg, n) xa_erase(&cache->reqs, id) complete(&REQ_A->done) ------ close(fd) ------ cachefiles_ondemand_fd_release cachefiles_put_object cachefiles_put_object kmem_cache_free(cachefiles_object_jar, object) REQ_A->object->ondemand->ondemand_id // object UAF !!! When we see the request within xa_lock, req->object must not have been freed yet, so grab the reference count of object before xa_unlock to avoid the above issue. Fixes: 0a7e54c1959c ("cachefiles: resend an open request if the read request's object is closed") Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240522114308.2402121-5-libaokun@huaweicloud.com Acked-by: Jeff Layton Reviewed-by: Jia Zhu Reviewed-by: Jingbo Xu Signed-off-by: Christian Brauner --- fs/cachefiles/ondemand.c | 3 +++ include/trace/events/cachefiles.h | 6 +++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index c011fb24d23824..3dd002108a8721 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -337,6 +337,7 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, xas_clear_mark(&xas, CACHEFILES_REQ_NEW); cache->req_id_next = xas.xa_index + 1; refcount_inc(&req->ref); + cachefiles_grab_object(req->object, cachefiles_obj_get_read_req); xa_unlock(&cache->reqs); id = xas.xa_index; @@ -357,6 +358,7 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, goto err_put_fd; } + cachefiles_put_object(req->object, cachefiles_obj_put_read_req); /* CLOSE request has no reply */ if (msg->opcode == CACHEFILES_OP_CLOSE) { xa_erase(&cache->reqs, id); @@ -370,6 +372,7 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, if (msg->opcode == CACHEFILES_OP_OPEN) close_fd(((struct cachefiles_open *)msg->data)->fd); error: + cachefiles_put_object(req->object, cachefiles_obj_put_read_req); xas_reset(&xas); xas_lock(&xas); if (xas_load(&xas) == req) { diff --git a/include/trace/events/cachefiles.h b/include/trace/events/cachefiles.h index e3213af847cdf3..7d931db02b9346 100644 --- a/include/trace/events/cachefiles.h +++ b/include/trace/events/cachefiles.h @@ -33,6 +33,8 @@ enum cachefiles_obj_ref_trace { cachefiles_obj_see_withdrawal, cachefiles_obj_get_ondemand_fd, cachefiles_obj_put_ondemand_fd, + cachefiles_obj_get_read_req, + cachefiles_obj_put_read_req, }; enum fscache_why_object_killed { @@ -129,7 +131,9 @@ enum cachefiles_error_trace { EM(cachefiles_obj_see_withdraw_cookie, "SEE withdraw_cookie") \ EM(cachefiles_obj_see_withdrawal, "SEE withdrawal") \ EM(cachefiles_obj_get_ondemand_fd, "GET ondemand_fd") \ - E_(cachefiles_obj_put_ondemand_fd, "PUT ondemand_fd") + EM(cachefiles_obj_put_ondemand_fd, "PUT ondemand_fd") \ + EM(cachefiles_obj_get_read_req, "GET read_req") \ + E_(cachefiles_obj_put_read_req, "PUT read_req") #define cachefiles_coherency_traces \ EM(cachefiles_coherency_check_aux, "BAD aux ") \ From 3e6d704f02aa4c50c7bc5fe91a4401df249a137b Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 22 May 2024 19:43:01 +0800 Subject: [PATCH 0288/1578] cachefiles: remove err_put_fd label in cachefiles_ondemand_daemon_read() The err_put_fd label is only used once, so remove it to make the code more readable. In addition, the logic for deleting error request and CLOSE request is merged to simplify the code. Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240522114308.2402121-6-libaokun@huaweicloud.com Acked-by: Jeff Layton Reviewed-by: Jia Zhu Reviewed-by: Gao Xiang Reviewed-by: Jingbo Xu Signed-off-by: Christian Brauner --- fs/cachefiles/ondemand.c | 45 ++++++++++++++-------------------------- 1 file changed, 16 insertions(+), 29 deletions(-) diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index 3dd002108a8721..bb94ef6a6f61b7 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -305,7 +305,6 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, { struct cachefiles_req *req; struct cachefiles_msg *msg; - unsigned long id = 0; size_t n; int ret = 0; XA_STATE(xas, &cache->reqs, cache->req_id_next); @@ -340,49 +339,37 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, cachefiles_grab_object(req->object, cachefiles_obj_get_read_req); xa_unlock(&cache->reqs); - id = xas.xa_index; - if (msg->opcode == CACHEFILES_OP_OPEN) { ret = cachefiles_ondemand_get_fd(req); if (ret) { cachefiles_ondemand_set_object_close(req->object); - goto error; + goto out; } } - msg->msg_id = id; + msg->msg_id = xas.xa_index; msg->object_id = req->object->ondemand->ondemand_id; if (copy_to_user(_buffer, msg, n) != 0) { ret = -EFAULT; - goto err_put_fd; - } - - cachefiles_put_object(req->object, cachefiles_obj_put_read_req); - /* CLOSE request has no reply */ - if (msg->opcode == CACHEFILES_OP_CLOSE) { - xa_erase(&cache->reqs, id); - complete(&req->done); + if (msg->opcode == CACHEFILES_OP_OPEN) + close_fd(((struct cachefiles_open *)msg->data)->fd); } - - cachefiles_req_put(req); - return n; - -err_put_fd: - if (msg->opcode == CACHEFILES_OP_OPEN) - close_fd(((struct cachefiles_open *)msg->data)->fd); -error: +out: cachefiles_put_object(req->object, cachefiles_obj_put_read_req); - xas_reset(&xas); - xas_lock(&xas); - if (xas_load(&xas) == req) { - req->error = ret; - complete(&req->done); - xas_store(&xas, NULL); + /* Remove error request and CLOSE request has no reply */ + if (ret || msg->opcode == CACHEFILES_OP_CLOSE) { + xas_reset(&xas); + xas_lock(&xas); + if (xas_load(&xas) == req) { + req->error = ret; + complete(&req->done); + xas_store(&xas, NULL); + } + xas_unlock(&xas); } - xas_unlock(&xas); cachefiles_req_put(req); - return ret; + return ret ? ret : n; } typedef int (*init_req_fn)(struct cachefiles_req *req, void *private); From a26dc49df37e996876f50a0210039b2d211fdd6f Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 22 May 2024 19:43:02 +0800 Subject: [PATCH 0289/1578] cachefiles: add consistency check for copen/cread This prevents malicious processes from completing random copen/cread requests and crashing the system. Added checks are listed below: * Generic, copen can only complete open requests, and cread can only complete read requests. * For copen, ondemand_id must not be 0, because this indicates that the request has not been read by the daemon. * For cread, the object corresponding to fd and req should be the same. Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240522114308.2402121-7-libaokun@huaweicloud.com Acked-by: Jeff Layton Reviewed-by: Jingbo Xu Signed-off-by: Christian Brauner --- fs/cachefiles/ondemand.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index bb94ef6a6f61b7..898fab68332bcf 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -82,12 +82,12 @@ static loff_t cachefiles_ondemand_fd_llseek(struct file *filp, loff_t pos, } static long cachefiles_ondemand_fd_ioctl(struct file *filp, unsigned int ioctl, - unsigned long arg) + unsigned long id) { struct cachefiles_object *object = filp->private_data; struct cachefiles_cache *cache = object->volume->cache; struct cachefiles_req *req; - unsigned long id; + XA_STATE(xas, &cache->reqs, id); if (ioctl != CACHEFILES_IOC_READ_COMPLETE) return -EINVAL; @@ -95,10 +95,15 @@ static long cachefiles_ondemand_fd_ioctl(struct file *filp, unsigned int ioctl, if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags)) return -EOPNOTSUPP; - id = arg; - req = xa_erase(&cache->reqs, id); - if (!req) + xa_lock(&cache->reqs); + req = xas_load(&xas); + if (!req || req->msg.opcode != CACHEFILES_OP_READ || + req->object != object) { + xa_unlock(&cache->reqs); return -EINVAL; + } + xas_store(&xas, NULL); + xa_unlock(&cache->reqs); trace_cachefiles_ondemand_cread(object, id); complete(&req->done); @@ -126,6 +131,7 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args) unsigned long id; long size; int ret; + XA_STATE(xas, &cache->reqs, 0); if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags)) return -EOPNOTSUPP; @@ -149,9 +155,16 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args) if (ret) return ret; - req = xa_erase(&cache->reqs, id); - if (!req) + xa_lock(&cache->reqs); + xas.xa_index = id; + req = xas_load(&xas); + if (!req || req->msg.opcode != CACHEFILES_OP_OPEN || + !req->object->ondemand->ondemand_id) { + xa_unlock(&cache->reqs); return -EINVAL; + } + xas_store(&xas, NULL); + xa_unlock(&cache->reqs); /* fail OPEN request if copen format is invalid */ ret = kstrtol(psize, 0, &size); From 0a790040838c736495d5afd6b2d636f159f817f1 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 22 May 2024 19:43:03 +0800 Subject: [PATCH 0290/1578] cachefiles: add spin_lock for cachefiles_ondemand_info The following concurrency may cause a read request to fail to be completed and result in a hung: t1 | t2 --------------------------------------------------------- cachefiles_ondemand_copen req = xa_erase(&cache->reqs, id) // Anon fd is maliciously closed. cachefiles_ondemand_fd_release xa_lock(&cache->reqs) cachefiles_ondemand_set_object_close(object) xa_unlock(&cache->reqs) cachefiles_ondemand_set_object_open // No one will ever close it again. cachefiles_ondemand_daemon_read cachefiles_ondemand_select_req // Get a read req but its fd is already closed. // The daemon can't issue a cread ioctl with an closed fd, then hung. So add spin_lock for cachefiles_ondemand_info to protect ondemand_id and state, thus we can avoid the above problem in cachefiles_ondemand_copen() by using ondemand_id to determine if fd has been closed. Fixes: c8383054506c ("cachefiles: notify the user daemon when looking up cookie") Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240522114308.2402121-8-libaokun@huaweicloud.com Acked-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/cachefiles/internal.h | 1 + fs/cachefiles/ondemand.c | 35 ++++++++++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 7745b8abc3aa3a..45c8bed6053834 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -55,6 +55,7 @@ struct cachefiles_ondemand_info { int ondemand_id; enum cachefiles_object_state state; struct cachefiles_object *object; + spinlock_t lock; }; /* diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index 898fab68332bcf..d04ddc6576e3b6 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -16,13 +16,16 @@ static int cachefiles_ondemand_fd_release(struct inode *inode, struct cachefiles_object *object = file->private_data; struct cachefiles_cache *cache = object->volume->cache; struct cachefiles_ondemand_info *info = object->ondemand; - int object_id = info->ondemand_id; + int object_id; struct cachefiles_req *req; XA_STATE(xas, &cache->reqs, 0); xa_lock(&cache->reqs); + spin_lock(&info->lock); + object_id = info->ondemand_id; info->ondemand_id = CACHEFILES_ONDEMAND_ID_CLOSED; cachefiles_ondemand_set_object_close(object); + spin_unlock(&info->lock); /* Only flush CACHEFILES_REQ_NEW marked req to avoid race with daemon_read */ xas_for_each_marked(&xas, req, ULONG_MAX, CACHEFILES_REQ_NEW) { @@ -127,6 +130,7 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args) { struct cachefiles_req *req; struct fscache_cookie *cookie; + struct cachefiles_ondemand_info *info; char *pid, *psize; unsigned long id; long size; @@ -185,6 +189,33 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args) goto out; } + info = req->object->ondemand; + spin_lock(&info->lock); + /* + * The anonymous fd was closed before copen ? Fail the request. + * + * t1 | t2 + * --------------------------------------------------------- + * cachefiles_ondemand_copen + * req = xa_erase(&cache->reqs, id) + * // Anon fd is maliciously closed. + * cachefiles_ondemand_fd_release + * xa_lock(&cache->reqs) + * cachefiles_ondemand_set_object_close(object) + * xa_unlock(&cache->reqs) + * cachefiles_ondemand_set_object_open + * // No one will ever close it again. + * cachefiles_ondemand_daemon_read + * cachefiles_ondemand_select_req + * + * Get a read req but its fd is already closed. The daemon can't + * issue a cread ioctl with an closed fd, then hung. + */ + if (info->ondemand_id == CACHEFILES_ONDEMAND_ID_CLOSED) { + spin_unlock(&info->lock); + req->error = -EBADFD; + goto out; + } cookie = req->object->cookie; cookie->object_size = size; if (size) @@ -194,6 +225,7 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args) trace_cachefiles_ondemand_copen(req->object, id, size); cachefiles_ondemand_set_object_open(req->object); + spin_unlock(&info->lock); wake_up_all(&cache->daemon_pollwq); out: @@ -596,6 +628,7 @@ int cachefiles_ondemand_init_obj_info(struct cachefiles_object *object, return -ENOMEM; object->ondemand->object = object; + spin_lock_init(&object->ondemand->lock); INIT_WORK(&object->ondemand->ondemand_work, ondemand_object_worker); return 0; } From 4988e35e95fc938bdde0e15880fe72042fc86acf Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 22 May 2024 19:43:04 +0800 Subject: [PATCH 0291/1578] cachefiles: never get a new anonymous fd if ondemand_id is valid Now every time the daemon reads an open request, it gets a new anonymous fd and ondemand_id. With the introduction of "restore", it is possible to read the same open request more than once, and therefore an object can have more than one anonymous fd. If the anonymous fd is not unique, the following concurrencies will result in an fd leak: t1 | t2 | t3 ------------------------------------------------------------ cachefiles_ondemand_init_object cachefiles_ondemand_send_req REQ_A = kzalloc(sizeof(*req) + data_len) wait_for_completion(&REQ_A->done) cachefiles_daemon_read cachefiles_ondemand_daemon_read REQ_A = cachefiles_ondemand_select_req cachefiles_ondemand_get_fd load->fd = fd0 ondemand_id = object_id0 ------ restore ------ cachefiles_ondemand_restore // restore REQ_A cachefiles_daemon_read cachefiles_ondemand_daemon_read REQ_A = cachefiles_ondemand_select_req cachefiles_ondemand_get_fd load->fd = fd1 ondemand_id = object_id1 process_open_req(REQ_A) write(devfd, ("copen %u,%llu", msg->msg_id, size)) cachefiles_ondemand_copen xa_erase(&cache->reqs, id) complete(&REQ_A->done) kfree(REQ_A) process_open_req(REQ_A) // copen fails due to no req // daemon close(fd1) cachefiles_ondemand_fd_release // set object closed -- umount -- cachefiles_withdraw_cookie cachefiles_ondemand_clean_object cachefiles_ondemand_init_close_req if (!cachefiles_ondemand_object_is_open(object)) return -ENOENT; // The fd0 is not closed until the daemon exits. However, the anonymous fd holds the reference count of the object and the object holds the reference count of the cookie. So even though the cookie has been relinquished, it will not be unhashed and freed until the daemon exits. In fscache_hash_cookie(), when the same cookie is found in the hash list, if the cookie is set with the FSCACHE_COOKIE_RELINQUISHED bit, then the new cookie waits for the old cookie to be unhashed, while the old cookie is waiting for the leaked fd to be closed, if the daemon does not exit in time it will trigger a hung task. To avoid this, allocate a new anonymous fd only if no anonymous fd has been allocated (ondemand_id == 0) or if the previously allocated anonymous fd has been closed (ondemand_id == -1). Moreover, returns an error if ondemand_id is valid, letting the daemon know that the current userland restore logic is abnormal and needs to be checked. Fixes: c8383054506c ("cachefiles: notify the user daemon when looking up cookie") Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240522114308.2402121-9-libaokun@huaweicloud.com Acked-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/cachefiles/ondemand.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index d04ddc6576e3b6..d2d4e27fca6f52 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -14,11 +14,18 @@ static int cachefiles_ondemand_fd_release(struct inode *inode, struct file *file) { struct cachefiles_object *object = file->private_data; - struct cachefiles_cache *cache = object->volume->cache; - struct cachefiles_ondemand_info *info = object->ondemand; + struct cachefiles_cache *cache; + struct cachefiles_ondemand_info *info; int object_id; struct cachefiles_req *req; - XA_STATE(xas, &cache->reqs, 0); + XA_STATE(xas, NULL, 0); + + if (!object) + return 0; + + info = object->ondemand; + cache = object->volume->cache; + xas.xa = &cache->reqs; xa_lock(&cache->reqs); spin_lock(&info->lock); @@ -288,22 +295,39 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req) goto err_put_fd; } + spin_lock(&object->ondemand->lock); + if (object->ondemand->ondemand_id > 0) { + spin_unlock(&object->ondemand->lock); + /* Pair with check in cachefiles_ondemand_fd_release(). */ + file->private_data = NULL; + ret = -EEXIST; + goto err_put_file; + } + file->f_mode |= FMODE_PWRITE | FMODE_LSEEK; fd_install(fd, file); load = (void *)req->msg.data; load->fd = fd; object->ondemand->ondemand_id = object_id; + spin_unlock(&object->ondemand->lock); cachefiles_get_unbind_pincount(cache); trace_cachefiles_ondemand_open(object, &req->msg, load); return 0; +err_put_file: + fput(file); err_put_fd: put_unused_fd(fd); err_free_id: xa_erase(&cache->ondemand_ids, object_id); err: + spin_lock(&object->ondemand->lock); + /* Avoid marking an opened object as closed. */ + if (object->ondemand->ondemand_id <= 0) + cachefiles_ondemand_set_object_close(object); + spin_unlock(&object->ondemand->lock); cachefiles_put_object(object, cachefiles_obj_put_ondemand_fd); return ret; } @@ -386,10 +410,8 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, if (msg->opcode == CACHEFILES_OP_OPEN) { ret = cachefiles_ondemand_get_fd(req); - if (ret) { - cachefiles_ondemand_set_object_close(req->object); + if (ret) goto out; - } } msg->msg_id = xas.xa_index; From 4b4391e77a6bf24cba2ef1590e113d9b73b11039 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 22 May 2024 19:43:05 +0800 Subject: [PATCH 0292/1578] cachefiles: defer exposing anon_fd until after copy_to_user() succeeds After installing the anonymous fd, we can now see it in userland and close it. However, at this point we may not have gotten the reference count of the cache, but we will put it during colse fd, so this may cause a cache UAF. So grab the cache reference count before fd_install(). In addition, by kernel convention, fd is taken over by the user land after fd_install(), and the kernel should not call close_fd() after that, i.e., it should call fd_install() after everything is ready, thus fd_install() is called after copy_to_user() succeeds. Fixes: c8383054506c ("cachefiles: notify the user daemon when looking up cookie") Suggested-by: Hou Tao Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240522114308.2402121-10-libaokun@huaweicloud.com Acked-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/cachefiles/ondemand.c | 53 +++++++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index d2d4e27fca6f52..6f815e7c508674 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -4,6 +4,11 @@ #include #include "internal.h" +struct ondemand_anon_file { + struct file *file; + int fd; +}; + static inline void cachefiles_req_put(struct cachefiles_req *req) { if (refcount_dec_and_test(&req->ref)) @@ -263,14 +268,14 @@ int cachefiles_ondemand_restore(struct cachefiles_cache *cache, char *args) return 0; } -static int cachefiles_ondemand_get_fd(struct cachefiles_req *req) +static int cachefiles_ondemand_get_fd(struct cachefiles_req *req, + struct ondemand_anon_file *anon_file) { struct cachefiles_object *object; struct cachefiles_cache *cache; struct cachefiles_open *load; - struct file *file; u32 object_id; - int ret, fd; + int ret; object = cachefiles_grab_object(req->object, cachefiles_obj_get_ondemand_fd); @@ -282,16 +287,16 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req) if (ret < 0) goto err; - fd = get_unused_fd_flags(O_WRONLY); - if (fd < 0) { - ret = fd; + anon_file->fd = get_unused_fd_flags(O_WRONLY); + if (anon_file->fd < 0) { + ret = anon_file->fd; goto err_free_id; } - file = anon_inode_getfile("[cachefiles]", &cachefiles_ondemand_fd_fops, - object, O_WRONLY); - if (IS_ERR(file)) { - ret = PTR_ERR(file); + anon_file->file = anon_inode_getfile("[cachefiles]", + &cachefiles_ondemand_fd_fops, object, O_WRONLY); + if (IS_ERR(anon_file->file)) { + ret = PTR_ERR(anon_file->file); goto err_put_fd; } @@ -299,16 +304,15 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req) if (object->ondemand->ondemand_id > 0) { spin_unlock(&object->ondemand->lock); /* Pair with check in cachefiles_ondemand_fd_release(). */ - file->private_data = NULL; + anon_file->file->private_data = NULL; ret = -EEXIST; goto err_put_file; } - file->f_mode |= FMODE_PWRITE | FMODE_LSEEK; - fd_install(fd, file); + anon_file->file->f_mode |= FMODE_PWRITE | FMODE_LSEEK; load = (void *)req->msg.data; - load->fd = fd; + load->fd = anon_file->fd; object->ondemand->ondemand_id = object_id; spin_unlock(&object->ondemand->lock); @@ -317,9 +321,11 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req) return 0; err_put_file: - fput(file); + fput(anon_file->file); + anon_file->file = NULL; err_put_fd: - put_unused_fd(fd); + put_unused_fd(anon_file->fd); + anon_file->fd = ret; err_free_id: xa_erase(&cache->ondemand_ids, object_id); err: @@ -376,6 +382,7 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, struct cachefiles_msg *msg; size_t n; int ret = 0; + struct ondemand_anon_file anon_file; XA_STATE(xas, &cache->reqs, cache->req_id_next); xa_lock(&cache->reqs); @@ -409,7 +416,7 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, xa_unlock(&cache->reqs); if (msg->opcode == CACHEFILES_OP_OPEN) { - ret = cachefiles_ondemand_get_fd(req); + ret = cachefiles_ondemand_get_fd(req, &anon_file); if (ret) goto out; } @@ -417,10 +424,16 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, msg->msg_id = xas.xa_index; msg->object_id = req->object->ondemand->ondemand_id; - if (copy_to_user(_buffer, msg, n) != 0) { + if (copy_to_user(_buffer, msg, n) != 0) ret = -EFAULT; - if (msg->opcode == CACHEFILES_OP_OPEN) - close_fd(((struct cachefiles_open *)msg->data)->fd); + + if (msg->opcode == CACHEFILES_OP_OPEN) { + if (ret < 0) { + fput(anon_file.file); + put_unused_fd(anon_file.fd); + goto out; + } + fd_install(anon_file.fd, anon_file.file); } out: cachefiles_put_object(req->object, cachefiles_obj_put_read_req); From 4f8703fb3482f92edcfd31661857b16fec89c2c0 Mon Sep 17 00:00:00 2001 From: Zizhi Wo Date: Wed, 22 May 2024 19:43:06 +0800 Subject: [PATCH 0293/1578] cachefiles: Set object to close if ondemand_id < 0 in copen If copen is maliciously called in the user mode, it may delete the request corresponding to the random id. And the request may have not been read yet. Note that when the object is set to reopen, the open request will be done with the still reopen state in above case. As a result, the request corresponding to this object is always skipped in select_req function, so the read request is never completed and blocks other process. Fix this issue by simply set object to close if its id < 0 in copen. Signed-off-by: Zizhi Wo Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240522114308.2402121-11-libaokun@huaweicloud.com Acked-by: Jeff Layton Reviewed-by: Jia Zhu Signed-off-by: Christian Brauner --- fs/cachefiles/ondemand.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index 6f815e7c508674..922cab1a314b25 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -182,6 +182,7 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args) xas_store(&xas, NULL); xa_unlock(&cache->reqs); + info = req->object->ondemand; /* fail OPEN request if copen format is invalid */ ret = kstrtol(psize, 0, &size); if (ret) { @@ -201,7 +202,6 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args) goto out; } - info = req->object->ondemand; spin_lock(&info->lock); /* * The anonymous fd was closed before copen ? Fail the request. @@ -241,6 +241,11 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args) wake_up_all(&cache->daemon_pollwq); out: + spin_lock(&info->lock); + /* Need to set object close to avoid reopen status continuing */ + if (info->ondemand_id == CACHEFILES_ONDEMAND_ID_CLOSED) + cachefiles_ondemand_set_object_close(req->object); + spin_unlock(&info->lock); complete(&req->done); return ret; } From 85e833cd7243bda7285492b0653c3abb1e2e757b Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 22 May 2024 19:43:07 +0800 Subject: [PATCH 0294/1578] cachefiles: flush all requests after setting CACHEFILES_DEAD In ondemand mode, when the daemon is processing an open request, if the kernel flags the cache as CACHEFILES_DEAD, the cachefiles_daemon_write() will always return -EIO, so the daemon can't pass the copen to the kernel. Then the kernel process that is waiting for the copen triggers a hung_task. Since the DEAD state is irreversible, it can only be exited by closing /dev/cachefiles. Therefore, after calling cachefiles_io_error() to mark the cache as CACHEFILES_DEAD, if in ondemand mode, flush all requests to avoid the above hungtask. We may still be able to read some of the cached data before closing the fd of /dev/cachefiles. Note that this relies on the patch that adds reference counting to the req, otherwise it may UAF. Fixes: c8383054506c ("cachefiles: notify the user daemon when looking up cookie") Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240522114308.2402121-12-libaokun@huaweicloud.com Acked-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/cachefiles/daemon.c | 2 +- fs/cachefiles/internal.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index ccb7b707ea4b76..06cdf1a8a16f6a 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -133,7 +133,7 @@ static int cachefiles_daemon_open(struct inode *inode, struct file *file) return 0; } -static void cachefiles_flush_reqs(struct cachefiles_cache *cache) +void cachefiles_flush_reqs(struct cachefiles_cache *cache) { struct xarray *xa = &cache->reqs; struct cachefiles_req *req; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 45c8bed6053834..6845a90cdfcce9 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -188,6 +188,7 @@ extern int cachefiles_has_space(struct cachefiles_cache *cache, * daemon.c */ extern const struct file_operations cachefiles_daemon_fops; +extern void cachefiles_flush_reqs(struct cachefiles_cache *cache); extern void cachefiles_get_unbind_pincount(struct cachefiles_cache *cache); extern void cachefiles_put_unbind_pincount(struct cachefiles_cache *cache); @@ -426,6 +427,8 @@ do { \ pr_err("I/O Error: " FMT"\n", ##__VA_ARGS__); \ fscache_io_error((___cache)->cache); \ set_bit(CACHEFILES_DEAD, &(___cache)->flags); \ + if (cachefiles_in_ondemand_mode(___cache)) \ + cachefiles_flush_reqs(___cache); \ } while (0) #define cachefiles_io_error_obj(object, FMT, ...) \ From bc9dde6155464e906e630a0a5c17a4cab241ffbb Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 22 May 2024 19:43:08 +0800 Subject: [PATCH 0295/1578] cachefiles: make on-demand read killable Replacing wait_for_completion() with wait_for_completion_killable() in cachefiles_ondemand_send_req() allows us to kill processes that might trigger a hunk_task if the daemon is abnormal. But now only CACHEFILES_OP_READ is killable, because OP_CLOSE and OP_OPEN is initiated from kworker context and the signal is prohibited in these kworker. Note that when the req in xas changes, i.e. xas_load(&xas) != req, it means that a process will complete the current request soon, so wait again for the request to be completed. In addition, add the cachefiles_ondemand_finish_req() helper function to simplify the code. Suggested-by: Hou Tao Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240522114308.2402121-13-libaokun@huaweicloud.com Acked-by: Jeff Layton Reviewed-by: Jia Zhu Signed-off-by: Christian Brauner --- fs/cachefiles/ondemand.c | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index 922cab1a314b25..58bd80956c5a64 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -380,6 +380,20 @@ static struct cachefiles_req *cachefiles_ondemand_select_req(struct xa_state *xa return NULL; } +static inline bool cachefiles_ondemand_finish_req(struct cachefiles_req *req, + struct xa_state *xas, int err) +{ + if (unlikely(!xas || !req)) + return false; + + if (xa_cmpxchg(xas->xa, xas->xa_index, req, NULL, 0) != req) + return false; + + req->error = err; + complete(&req->done); + return true; +} + ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, char __user *_buffer, size_t buflen) { @@ -443,16 +457,8 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, out: cachefiles_put_object(req->object, cachefiles_obj_put_read_req); /* Remove error request and CLOSE request has no reply */ - if (ret || msg->opcode == CACHEFILES_OP_CLOSE) { - xas_reset(&xas); - xas_lock(&xas); - if (xas_load(&xas) == req) { - req->error = ret; - complete(&req->done); - xas_store(&xas, NULL); - } - xas_unlock(&xas); - } + if (ret || msg->opcode == CACHEFILES_OP_CLOSE) + cachefiles_ondemand_finish_req(req, &xas, ret); cachefiles_req_put(req); return ret ? ret : n; } @@ -544,8 +550,18 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object, goto out; wake_up_all(&cache->daemon_pollwq); - wait_for_completion(&req->done); - ret = req->error; +wait: + ret = wait_for_completion_killable(&req->done); + if (!ret) { + ret = req->error; + } else { + ret = -EINTR; + if (!cachefiles_ondemand_finish_req(req, &xas, ret)) { + /* Someone will complete it soon. */ + cpu_relax(); + goto wait; + } + } cachefiles_req_put(req); return ret; out: From e9022b31db80019025967b03df1d059433e9f26d Mon Sep 17 00:00:00 2001 From: Minda Chen Date: Tue, 28 May 2024 09:51:20 +0800 Subject: [PATCH 0296/1578] MAINTAINERS: dwmac: starfive: update Maintainer Update the maintainer of starfive dwmac driver. Signed-off-by: Minda Chen Acked-by: Emil Renner Berthing Signed-off-by: David S. Miller --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index e80db76fe3937d..90930a03e7a565 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21318,7 +21318,7 @@ F: arch/riscv/boot/dts/starfive/ STARFIVE DWMAC GLUE LAYER M: Emil Renner Berthing -M: Samin Guo +M: Minda Chen S: Maintained F: Documentation/devicetree/bindings/net/starfive,jh7110-dwmac.yaml F: drivers/net/ethernet/stmicro/stmmac/dwmac-starfive.c From 068648aab72c9ba7b0597354ef4d81ffaac7b979 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Tue, 28 May 2024 11:12:31 +0800 Subject: [PATCH 0297/1578] nfc/nci: Add the inconsistency check between the input data length and count write$nci(r0, &(0x7f0000000740)=ANY=[@ANYBLOB="610501"], 0xf) Syzbot constructed a write() call with a data length of 3 bytes but a count value of 15, which passed too little data to meet the basic requirements of the function nci_rf_intf_activated_ntf_packet(). Therefore, increasing the comparison between data length and count value to avoid problems caused by inconsistent data length and count. Reported-and-tested-by: syzbot+71bfed2b2bcea46c98f2@syzkaller.appspotmail.com Signed-off-by: Edward Adam Davis Signed-off-by: David S. Miller --- drivers/nfc/virtual_ncidev.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/nfc/virtual_ncidev.c b/drivers/nfc/virtual_ncidev.c index 590b038e449e5c..6b89d596ba9afe 100644 --- a/drivers/nfc/virtual_ncidev.c +++ b/drivers/nfc/virtual_ncidev.c @@ -125,6 +125,10 @@ static ssize_t virtual_ncidev_write(struct file *file, kfree_skb(skb); return -EFAULT; } + if (strnlen(skb->data, count) != count) { + kfree_skb(skb); + return -EINVAL; + } nci_recv_frame(vdev->ndev, skb); return count; From b1e7cee96127468c2483cf10c2899c9b5cf79bf8 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 13 May 2024 10:02:48 +0000 Subject: [PATCH 0298/1578] powerpc/bpf: enforce full ordering for ATOMIC operations with BPF_FETCH The Linux Kernel Memory Model [1][2] requires RMW operations that have a return value to be fully ordered. BPF atomic operations with BPF_FETCH (including BPF_XCHG and BPF_CMPXCHG) return a value back so they need to be JITed to fully ordered operations. POWERPC currently emits relaxed operations for these. We can show this by running the following litmus-test: PPC SB+atomic_add+fetch { 0:r0=x; (* dst reg assuming offset is 0 *) 0:r1=2; (* src reg *) 0:r2=1; 0:r4=y; (* P0 writes to this, P1 reads this *) 0:r5=z; (* P1 writes to this, P0 reads this *) 0:r6=0; 1:r2=1; 1:r4=y; 1:r5=z; } P0 | P1 ; stw r2, 0(r4) | stw r2,0(r5) ; | ; loop:lwarx r3, r6, r0 | ; mr r8, r3 | ; add r3, r3, r1 | sync ; stwcx. r3, r6, r0 | ; bne loop | ; mr r1, r8 | ; | ; lwa r7, 0(r5) | lwa r7,0(r4) ; ~exists(0:r7=0 /\ 1:r7=0) Witnesses Positive: 9 Negative: 3 Condition ~exists (0:r7=0 /\ 1:r7=0) Observation SB+atomic_add+fetch Sometimes 3 9 This test shows that the older store in P0 is reordered with a newer load to a different address. Although there is a RMW operation with fetch between them. Adding a sync before and after RMW fixes the issue: Witnesses Positive: 9 Negative: 0 Condition ~exists (0:r7=0 /\ 1:r7=0) Observation SB+atomic_add+fetch Never 0 9 [1] https://www.kernel.org/doc/Documentation/memory-barriers.txt [2] https://www.kernel.org/doc/Documentation/atomic_t.txt Fixes: aea7ef8a82c0 ("powerpc/bpf/32: add support for BPF_ATOMIC bitwise operations") Fixes: 2d9206b22743 ("powerpc/bpf/32: Add instructions for atomic_[cmp]xchg") Fixes: dbe6e2456fb0 ("powerpc/bpf/64: add support for atomic fetch operations") Fixes: 1e82dfaa7819 ("powerpc/bpf/64: Add instructions for atomic_[cmp]xchg") Cc: stable@vger.kernel.org # v6.0+ Signed-off-by: Puranjay Mohan Reviewed-by: Christophe Leroy Reviewed-by: Naveen N Rao Acked-by: Paul E. McKenney Signed-off-by: Michael Ellerman Link: https://msgid.link/20240513100248.110535-1-puranjay@kernel.org --- arch/powerpc/net/bpf_jit_comp32.c | 12 ++++++++++++ arch/powerpc/net/bpf_jit_comp64.c | 12 ++++++++++++ 2 files changed, 24 insertions(+) diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c index 43b97032a91c00..a0c4f1bde83e86 100644 --- a/arch/powerpc/net/bpf_jit_comp32.c +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -900,6 +900,15 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code /* Get offset into TMP_REG */ EMIT(PPC_RAW_LI(tmp_reg, off)); + /* + * Enforce full ordering for operations with BPF_FETCH by emitting a 'sync' + * before and after the operation. + * + * This is a requirement in the Linux Kernel Memory Model. + * See __cmpxchg_u32() in asm/cmpxchg.h as an example. + */ + if ((imm & BPF_FETCH) && IS_ENABLED(CONFIG_SMP)) + EMIT(PPC_RAW_SYNC()); tmp_idx = ctx->idx * 4; /* load value from memory into r0 */ EMIT(PPC_RAW_LWARX(_R0, tmp_reg, dst_reg, 0)); @@ -953,6 +962,9 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code /* For the BPF_FETCH variant, get old data into src_reg */ if (imm & BPF_FETCH) { + /* Emit 'sync' to enforce full ordering */ + if (IS_ENABLED(CONFIG_SMP)) + EMIT(PPC_RAW_SYNC()); EMIT(PPC_RAW_MR(ret_reg, ax_reg)); if (!fp->aux->verifier_zext) EMIT(PPC_RAW_LI(ret_reg - 1, 0)); /* higher 32-bit */ diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 8afc14a4a1258e..7703dcf48be86b 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -846,6 +846,15 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code /* Get offset into TMP_REG_1 */ EMIT(PPC_RAW_LI(tmp1_reg, off)); + /* + * Enforce full ordering for operations with BPF_FETCH by emitting a 'sync' + * before and after the operation. + * + * This is a requirement in the Linux Kernel Memory Model. + * See __cmpxchg_u64() in asm/cmpxchg.h as an example. + */ + if ((imm & BPF_FETCH) && IS_ENABLED(CONFIG_SMP)) + EMIT(PPC_RAW_SYNC()); tmp_idx = ctx->idx * 4; /* load value from memory into TMP_REG_2 */ if (size == BPF_DW) @@ -908,6 +917,9 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code PPC_BCC_SHORT(COND_NE, tmp_idx); if (imm & BPF_FETCH) { + /* Emit 'sync' to enforce full ordering */ + if (IS_ENABLED(CONFIG_SMP)) + EMIT(PPC_RAW_SYNC()); EMIT(PPC_RAW_MR(ret_reg, _R0)); /* * Skip unnecessary zero-extension for 32-bit cmpxchg. From ffa077b2f6ad124ec3d23fbddc5e4b0ff2647af8 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Wed, 29 May 2024 15:12:01 +0300 Subject: [PATCH 0299/1578] ASoC: SOF: ipc4-topology: Fix input format query of process modules without base extension If a process module does not have base config extension then the same format applies to all of it's inputs and the process->base_config_ext is NULL, causing NULL dereference when specifically crafted topology and sequences used. Fixes: 648fea128476 ("ASoC: SOF: ipc4-topology: set copier output format for process module") Signed-off-by: Peter Ujfalusi Reviewed-by: Pierre-Louis Bossart Reviewed-by: Seppo Ingalsuo Reviewed-by: Ranjani Sridharan Cc: stable@vger.kernel.org Link: https://msgid.link/r/20240529121201.14687-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc4-topology.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sound/soc/sof/ipc4-topology.c b/sound/soc/sof/ipc4-topology.c index beff1098932475..33e8c5f7d9ca07 100644 --- a/sound/soc/sof/ipc4-topology.c +++ b/sound/soc/sof/ipc4-topology.c @@ -217,6 +217,14 @@ sof_ipc4_get_input_pin_audio_fmt(struct snd_sof_widget *swidget, int pin_index) } process = swidget->private; + + /* + * For process modules without base config extension, base module config + * format is used for all input pins + */ + if (process->init_config != SOF_IPC4_MODULE_INIT_CONFIG_TYPE_BASE_CFG_WITH_EXT) + return &process->base_config.audio_fmt; + base_cfg_ext = process->base_config_ext; /* From 4a69c1264ff41bc5bf7c03101ada0454fbf08868 Mon Sep 17 00:00:00 2001 From: Witold Sadowski Date: Wed, 29 May 2024 00:40:32 -0700 Subject: [PATCH 0300/1578] spi: cadence: Ensure data lines set to low during dummy-cycle period During dummy-cycles xSPI will switch GPIO into Hi-Z mode. In that dummy period voltage on data lines will slowly drop, what can cause unintentional modebyte transmission. Value send to SPI memory chip will depend on last address, and clock frequency. To prevent unforeseen consequences of that behaviour, force send single modebyte(0x00). Modebyte will be send only if number of dummy-cycles is not equal to 0. Code must also reduce dummycycle byte count by one - as one byte is send as modebyte. Signed-off-by: Witold Sadowski Link: https://msgid.link/r/20240529074037.1345882-2-wsadowski@marvell.com Signed-off-by: Mark Brown --- drivers/spi/spi-cadence-xspi.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/spi/spi-cadence-xspi.c b/drivers/spi/spi-cadence-xspi.c index 2209e9fc378fa9..2e3eacd46b7232 100644 --- a/drivers/spi/spi-cadence-xspi.c +++ b/drivers/spi/spi-cadence-xspi.c @@ -145,6 +145,9 @@ #define CDNS_XSPI_STIG_DONE_FLAG BIT(0) #define CDNS_XSPI_TRD_STATUS 0x0104 +#define MODE_NO_OF_BYTES GENMASK(25, 24) +#define MODEBYTES_COUNT 1 + /* Helper macros for filling command registers */ #define CDNS_XSPI_CMD_FLD_P1_INSTR_CMD_1(op, data_phase) ( \ FIELD_PREP(CDNS_XSPI_CMD_INSTR_TYPE, (data_phase) ? \ @@ -157,9 +160,10 @@ FIELD_PREP(CDNS_XSPI_CMD_P1_R2_ADDR3, ((op)->addr.val >> 24) & 0xFF) | \ FIELD_PREP(CDNS_XSPI_CMD_P1_R2_ADDR4, ((op)->addr.val >> 32) & 0xFF)) -#define CDNS_XSPI_CMD_FLD_P1_INSTR_CMD_3(op) ( \ +#define CDNS_XSPI_CMD_FLD_P1_INSTR_CMD_3(op, modebytes) ( \ FIELD_PREP(CDNS_XSPI_CMD_P1_R3_ADDR5, ((op)->addr.val >> 40) & 0xFF) | \ FIELD_PREP(CDNS_XSPI_CMD_P1_R3_CMD, (op)->cmd.opcode) | \ + FIELD_PREP(MODE_NO_OF_BYTES, modebytes) | \ FIELD_PREP(CDNS_XSPI_CMD_P1_R3_NUM_ADDR_BYTES, (op)->addr.nbytes)) #define CDNS_XSPI_CMD_FLD_P1_INSTR_CMD_4(op, chipsel) ( \ @@ -173,12 +177,12 @@ #define CDNS_XSPI_CMD_FLD_DSEQ_CMD_2(op) \ FIELD_PREP(CDNS_XSPI_CMD_DSEQ_R2_DCNT_L, (op)->data.nbytes & 0xFFFF) -#define CDNS_XSPI_CMD_FLD_DSEQ_CMD_3(op) ( \ +#define CDNS_XSPI_CMD_FLD_DSEQ_CMD_3(op, dummybytes) ( \ FIELD_PREP(CDNS_XSPI_CMD_DSEQ_R3_DCNT_H, \ ((op)->data.nbytes >> 16) & 0xffff) | \ FIELD_PREP(CDNS_XSPI_CMD_DSEQ_R3_NUM_OF_DUMMY, \ (op)->dummy.buswidth != 0 ? \ - (((op)->dummy.nbytes * 8) / (op)->dummy.buswidth) : \ + (((dummybytes) * 8) / (op)->dummy.buswidth) : \ 0)) #define CDNS_XSPI_CMD_FLD_DSEQ_CMD_4(op, chipsel) ( \ @@ -351,6 +355,7 @@ static int cdns_xspi_send_stig_command(struct cdns_xspi_dev *cdns_xspi, u32 cmd_regs[6]; u32 cmd_status; int ret; + int dummybytes = op->dummy.nbytes; ret = cdns_xspi_wait_for_controller_idle(cdns_xspi); if (ret < 0) @@ -365,7 +370,12 @@ static int cdns_xspi_send_stig_command(struct cdns_xspi_dev *cdns_xspi, memset(cmd_regs, 0, sizeof(cmd_regs)); cmd_regs[1] = CDNS_XSPI_CMD_FLD_P1_INSTR_CMD_1(op, data_phase); cmd_regs[2] = CDNS_XSPI_CMD_FLD_P1_INSTR_CMD_2(op); - cmd_regs[3] = CDNS_XSPI_CMD_FLD_P1_INSTR_CMD_3(op); + if (dummybytes != 0) { + cmd_regs[3] = CDNS_XSPI_CMD_FLD_P1_INSTR_CMD_3(op, 1); + dummybytes--; + } else { + cmd_regs[3] = CDNS_XSPI_CMD_FLD_P1_INSTR_CMD_3(op, 0); + } cmd_regs[4] = CDNS_XSPI_CMD_FLD_P1_INSTR_CMD_4(op, cdns_xspi->cur_cs); @@ -375,7 +385,7 @@ static int cdns_xspi_send_stig_command(struct cdns_xspi_dev *cdns_xspi, cmd_regs[0] = CDNS_XSPI_STIG_DONE_FLAG; cmd_regs[1] = CDNS_XSPI_CMD_FLD_DSEQ_CMD_1(op); cmd_regs[2] = CDNS_XSPI_CMD_FLD_DSEQ_CMD_2(op); - cmd_regs[3] = CDNS_XSPI_CMD_FLD_DSEQ_CMD_3(op); + cmd_regs[3] = CDNS_XSPI_CMD_FLD_DSEQ_CMD_3(op, dummybytes); cmd_regs[4] = CDNS_XSPI_CMD_FLD_DSEQ_CMD_4(op, cdns_xspi->cur_cs); From b7d7f11a291830fdf69d3301075dd0fb347ced84 Mon Sep 17 00:00:00 2001 From: Nicolas Escande Date: Tue, 28 May 2024 16:26:05 +0200 Subject: [PATCH 0301/1578] wifi: mac80211: mesh: Fix leak of mesh_preq_queue objects The hwmp code use objects of type mesh_preq_queue, added to a list in ieee80211_if_mesh, to keep track of mpath we need to resolve. If the mpath gets deleted, ex mesh interface is removed, the entries in that list will never get cleaned. Fix this by flushing all corresponding items of the preq_queue in mesh_path_flush_pending(). This should take care of KASAN reports like this: unreferenced object 0xffff00000668d800 (size 128): comm "kworker/u8:4", pid 67, jiffies 4295419552 (age 1836.444s) hex dump (first 32 bytes): 00 1f 05 09 00 00 ff ff 00 d5 68 06 00 00 ff ff ..........h..... 8e 97 ea eb 3e b8 01 00 00 00 00 00 00 00 00 00 ....>........... backtrace: [<000000007302a0b6>] __kmem_cache_alloc_node+0x1e0/0x35c [<00000000049bd418>] kmalloc_trace+0x34/0x80 [<0000000000d792bb>] mesh_queue_preq+0x44/0x2a8 [<00000000c99c3696>] mesh_nexthop_resolve+0x198/0x19c [<00000000926bf598>] ieee80211_xmit+0x1d0/0x1f4 [<00000000fc8c2284>] __ieee80211_subif_start_xmit+0x30c/0x764 [<000000005926ee38>] ieee80211_subif_start_xmit+0x9c/0x7a4 [<000000004c86e916>] dev_hard_start_xmit+0x174/0x440 [<0000000023495647>] __dev_queue_xmit+0xe24/0x111c [<00000000cfe9ca78>] batadv_send_skb_packet+0x180/0x1e4 [<000000007bacc5d5>] batadv_v_elp_periodic_work+0x2f4/0x508 [<00000000adc3cd94>] process_one_work+0x4b8/0xa1c [<00000000b36425d1>] worker_thread+0x9c/0x634 [<0000000005852dd5>] kthread+0x1bc/0x1c4 [<000000005fccd770>] ret_from_fork+0x10/0x20 unreferenced object 0xffff000009051f00 (size 128): comm "kworker/u8:4", pid 67, jiffies 4295419553 (age 1836.440s) hex dump (first 32 bytes): 90 d6 92 0d 00 00 ff ff 00 d8 68 06 00 00 ff ff ..........h..... 36 27 92 e4 02 e0 01 00 00 58 79 06 00 00 ff ff 6'.......Xy..... backtrace: [<000000007302a0b6>] __kmem_cache_alloc_node+0x1e0/0x35c [<00000000049bd418>] kmalloc_trace+0x34/0x80 [<0000000000d792bb>] mesh_queue_preq+0x44/0x2a8 [<00000000c99c3696>] mesh_nexthop_resolve+0x198/0x19c [<00000000926bf598>] ieee80211_xmit+0x1d0/0x1f4 [<00000000fc8c2284>] __ieee80211_subif_start_xmit+0x30c/0x764 [<000000005926ee38>] ieee80211_subif_start_xmit+0x9c/0x7a4 [<000000004c86e916>] dev_hard_start_xmit+0x174/0x440 [<0000000023495647>] __dev_queue_xmit+0xe24/0x111c [<00000000cfe9ca78>] batadv_send_skb_packet+0x180/0x1e4 [<000000007bacc5d5>] batadv_v_elp_periodic_work+0x2f4/0x508 [<00000000adc3cd94>] process_one_work+0x4b8/0xa1c [<00000000b36425d1>] worker_thread+0x9c/0x634 [<0000000005852dd5>] kthread+0x1bc/0x1c4 [<000000005fccd770>] ret_from_fork+0x10/0x20 Fixes: 050ac52cbe1f ("mac80211: code for on-demand Hybrid Wireless Mesh Protocol") Signed-off-by: Nicolas Escande Link: https://msgid.link/20240528142605.1060566-1-nico.escande@gmail.com Signed-off-by: Johannes Berg --- net/mac80211/mesh_pathtbl.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index a6b62169f08483..c0a5c75cddcb9f 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -1017,10 +1017,23 @@ void mesh_path_discard_frame(struct ieee80211_sub_if_data *sdata, */ void mesh_path_flush_pending(struct mesh_path *mpath) { + struct ieee80211_sub_if_data *sdata = mpath->sdata; + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + struct mesh_preq_queue *preq, *tmp; struct sk_buff *skb; while ((skb = skb_dequeue(&mpath->frame_queue)) != NULL) mesh_path_discard_frame(mpath->sdata, skb); + + spin_lock_bh(&ifmsh->mesh_preq_queue_lock); + list_for_each_entry_safe(preq, tmp, &ifmsh->preq_queue.list, list) { + if (ether_addr_equal(mpath->dst, preq->dst)) { + list_del(&preq->list); + kfree(preq); + --ifmsh->preq_queue_len; + } + } + spin_unlock_bh(&ifmsh->mesh_preq_queue_lock); } /** From 6f6291f09a322c1c1578badac8072d049363f4e6 Mon Sep 17 00:00:00 2001 From: Nicolas Escande Date: Mon, 27 May 2024 16:17:59 +0200 Subject: [PATCH 0302/1578] wifi: mac80211: mesh: init nonpeer_pm to active by default in mesh sdata With a ath9k device I can see that: iw phy phy0 interface add mesh0 type mp ip link set mesh0 up iw dev mesh0 scan Will start a scan with the Power Management bit set in the Frame Control Field. This is because we set this bit depending on the nonpeer_pm variable of the mesh iface sdata and when there are no active links on the interface it remains to NL80211_MESH_POWER_UNKNOWN. As soon as links starts to be established, it wil switch to NL80211_MESH_POWER_ACTIVE as it is the value set by befault on the per sta nonpeer_pm field. As we want no power save by default, (as expressed with the per sta ini values), lets init it to the expected default value of NL80211_MESH_POWER_ACTIVE. Also please note that we cannot change the default value from userspace prior to establishing a link as using NL80211_CMD_SET_MESH_CONFIG will not work before NL80211_CMD_JOIN_MESH has been issued. So too late for our initial scan. Signed-off-by: Nicolas Escande Link: https://msgid.link/20240527141759.299411-1-nico.escande@gmail.com Signed-off-by: Johannes Berg --- net/mac80211/mesh.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index cbc9b5e40cb35e..6d4510221c98e6 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -1776,6 +1776,7 @@ void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata) ifmsh->last_preq = jiffies; ifmsh->next_perr = jiffies; ifmsh->csa_role = IEEE80211_MESH_CSA_ROLE_NONE; + ifmsh->nonpeer_pm = NL80211_MESH_POWER_ACTIVE; /* Allocate all mesh structures when creating the first mesh interface. */ if (!mesh_allocated) ieee80211s_init(); From 44c06bbde6443de206b30f513100b5670b23fc5e Mon Sep 17 00:00:00 2001 From: Remi Pommarel Date: Wed, 29 May 2024 08:57:53 +0200 Subject: [PATCH 0303/1578] wifi: mac80211: Fix deadlock in ieee80211_sta_ps_deliver_wakeup() The ieee80211_sta_ps_deliver_wakeup() function takes sta->ps_lock to synchronizes with ieee80211_tx_h_unicast_ps_buf() which is called from softirq context. However using only spin_lock() to get sta->ps_lock in ieee80211_sta_ps_deliver_wakeup() does not prevent softirq to execute on this same CPU, to run ieee80211_tx_h_unicast_ps_buf() and try to take this same lock ending in deadlock. Below is an example of rcu stall that arises in such situation. rcu: INFO: rcu_sched self-detected stall on CPU rcu: 2-....: (42413413 ticks this GP) idle=b154/1/0x4000000000000000 softirq=1763/1765 fqs=21206996 rcu: (t=42586894 jiffies g=2057 q=362405 ncpus=4) CPU: 2 PID: 719 Comm: wpa_supplicant Tainted: G W 6.4.0-02158-g1b062f552873 #742 Hardware name: RPT (r1) (DT) pstate: 00000005 (nzcv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : queued_spin_lock_slowpath+0x58/0x2d0 lr : invoke_tx_handlers_early+0x5b4/0x5c0 sp : ffff00001ef64660 x29: ffff00001ef64660 x28: ffff000009bc1070 x27: ffff000009bc0ad8 x26: ffff000009bc0900 x25: ffff00001ef647a8 x24: 0000000000000000 x23: ffff000009bc0900 x22: ffff000009bc0900 x21: ffff00000ac0e000 x20: ffff00000a279e00 x19: ffff00001ef646e8 x18: 0000000000000000 x17: ffff800016468000 x16: ffff00001ef608c0 x15: 0010533c93f64f80 x14: 0010395c9faa3946 x13: 0000000000000000 x12: 00000000fa83b2da x11: 000000012edeceea x10: ffff0000010fbe00 x9 : 0000000000895440 x8 : 000000000010533c x7 : ffff00000ad8b740 x6 : ffff00000c350880 x5 : 0000000000000007 x4 : 0000000000000001 x3 : 0000000000000000 x2 : 0000000000000000 x1 : 0000000000000001 x0 : ffff00000ac0e0e8 Call trace: queued_spin_lock_slowpath+0x58/0x2d0 ieee80211_tx+0x80/0x12c ieee80211_tx_pending+0x110/0x278 tasklet_action_common.constprop.0+0x10c/0x144 tasklet_action+0x20/0x28 _stext+0x11c/0x284 ____do_softirq+0xc/0x14 call_on_irq_stack+0x24/0x34 do_softirq_own_stack+0x18/0x20 do_softirq+0x74/0x7c __local_bh_enable_ip+0xa0/0xa4 _ieee80211_wake_txqs+0x3b0/0x4b8 __ieee80211_wake_queue+0x12c/0x168 ieee80211_add_pending_skbs+0xec/0x138 ieee80211_sta_ps_deliver_wakeup+0x2a4/0x480 ieee80211_mps_sta_status_update.part.0+0xd8/0x11c ieee80211_mps_sta_status_update+0x18/0x24 sta_apply_parameters+0x3bc/0x4c0 ieee80211_change_station+0x1b8/0x2dc nl80211_set_station+0x444/0x49c genl_family_rcv_msg_doit.isra.0+0xa4/0xfc genl_rcv_msg+0x1b0/0x244 netlink_rcv_skb+0x38/0x10c genl_rcv+0x34/0x48 netlink_unicast+0x254/0x2bc netlink_sendmsg+0x190/0x3b4 ____sys_sendmsg+0x1e8/0x218 ___sys_sendmsg+0x68/0x8c __sys_sendmsg+0x44/0x84 __arm64_sys_sendmsg+0x20/0x28 do_el0_svc+0x6c/0xe8 el0_svc+0x14/0x48 el0t_64_sync_handler+0xb0/0xb4 el0t_64_sync+0x14c/0x150 Using spin_lock_bh()/spin_unlock_bh() instead prevents softirq to raise on the same CPU that is holding the lock. Fixes: 1d147bfa6429 ("mac80211: fix AP powersave TX vs. wakeup race") Signed-off-by: Remi Pommarel Link: https://msgid.link/8e36fe07d0fbc146f89196cd47a53c8a0afe84aa.1716910344.git.repk@triplefau.lt Signed-off-by: Johannes Berg --- net/mac80211/sta_info.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index da5fdd6f5c852b..aa22f09e6d145f 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1724,7 +1724,7 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) skb_queue_head_init(&pending); /* sync with ieee80211_tx_h_unicast_ps_buf */ - spin_lock(&sta->ps_lock); + spin_lock_bh(&sta->ps_lock); /* Send all buffered frames to the station */ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { int count = skb_queue_len(&pending), tmp; @@ -1753,7 +1753,7 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) */ clear_sta_flag(sta, WLAN_STA_PSPOLL); clear_sta_flag(sta, WLAN_STA_UAPSD); - spin_unlock(&sta->ps_lock); + spin_unlock_bh(&sta->ps_lock); atomic_dec(&ps->num_sta_ps); From 4dc3a3893dae5a7f73e5809273aca0f1f3548d55 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 23 May 2024 12:05:33 +0200 Subject: [PATCH 0304/1578] wifi: cfg80211: validate HE operation element parsing Validate that the HE operation element has the correct length before parsing it. Cc: stable@vger.kernel.org Fixes: 645f3d85129d ("wifi: cfg80211: handle UHB AP and STA power type") Reviewed-by: Miriam Rachel Korenblit Link: https://msgid.link/20240523120533.677025eb4a92.I44c091029ef113c294e8fe8b9bf871bf5dbeeb27@changeid Signed-off-by: Johannes Berg --- net/wireless/scan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 127853877a0ad1..8daed8232b0549 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -2128,7 +2128,8 @@ static bool cfg80211_6ghz_power_type_valid(const u8 *ie, size_t ielen, struct ieee80211_he_operation *he_oper; tmp = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_OPERATION, ie, ielen); - if (tmp && tmp->datalen >= sizeof(*he_oper) + 1) { + if (tmp && tmp->datalen >= sizeof(*he_oper) + 1 && + tmp->datalen >= ieee80211_he_oper_size(tmp->data + 1)) { const struct ieee80211_he_6ghz_oper *he_6ghz_oper; he_oper = (void *)&tmp->data[1]; From e296c95eac655008d5a709b8cf54d0018da1c916 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 22 May 2024 12:41:25 +0200 Subject: [PATCH 0305/1578] wifi: cfg80211: fully move wiphy work to unbound workqueue Previously I had moved the wiphy work to the unbound system workqueue, but missed that when it restarts and during resume it was still using the normal system workqueue. Fix that. Fixes: 91d20ab9d9ca ("wifi: cfg80211: use system_unbound_wq for wiphy work") Reviewed-by: Miriam Rachel Korenblit Link: https://msgid.link/20240522124126.7ca959f2cbd3.I3e2a71ef445d167b84000ccf934ea245aef8d395@changeid Signed-off-by: Johannes Berg --- net/wireless/core.c | 2 +- net/wireless/sysfs.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/wireless/core.c b/net/wireless/core.c index 3fb1b637352a9d..4b1f45e3070e06 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -431,7 +431,7 @@ static void cfg80211_wiphy_work(struct work_struct *work) if (wk) { list_del_init(&wk->entry); if (!list_empty(&rdev->wiphy_work_list)) - schedule_work(work); + queue_work(system_unbound_wq, work); spin_unlock_irq(&rdev->wiphy_work_lock); wk->func(&rdev->wiphy, wk); diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c index 565511a3f461ed..62f26618f67474 100644 --- a/net/wireless/sysfs.c +++ b/net/wireless/sysfs.c @@ -5,7 +5,7 @@ * * Copyright 2005-2006 Jiri Benc * Copyright 2006 Johannes Berg - * Copyright (C) 2020-2021, 2023 Intel Corporation + * Copyright (C) 2020-2021, 2023-2024 Intel Corporation */ #include @@ -137,7 +137,7 @@ static int wiphy_resume(struct device *dev) if (rdev->wiphy.registered && rdev->ops->resume) ret = rdev_resume(rdev); rdev->suspended = false; - schedule_work(&rdev->wiphy_work); + queue_work(system_unbound_wq, &rdev->wiphy_work); wiphy_unlock(&rdev->wiphy); if (ret) From 642f89daa34567d02f312d03e41523a894906dae Mon Sep 17 00:00:00 2001 From: Remi Pommarel Date: Tue, 21 May 2024 21:47:26 +0200 Subject: [PATCH 0306/1578] wifi: cfg80211: Lock wiphy in cfg80211_get_station Wiphy should be locked before calling rdev_get_station() (see lockdep assert in ieee80211_get_station()). This fixes the following kernel NULL dereference: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000050 Mem abort info: ESR = 0x0000000096000006 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x06: level 2 translation fault Data abort info: ISV = 0, ISS = 0x00000006 CM = 0, WnR = 0 user pgtable: 4k pages, 48-bit VAs, pgdp=0000000003001000 [0000000000000050] pgd=0800000002dca003, p4d=0800000002dca003, pud=08000000028e9003, pmd=0000000000000000 Internal error: Oops: 0000000096000006 [#1] SMP Modules linked in: netconsole dwc3_meson_g12a dwc3_of_simple dwc3 ip_gre gre ath10k_pci ath10k_core ath9k ath9k_common ath9k_hw ath CPU: 0 PID: 1091 Comm: kworker/u8:0 Not tainted 6.4.0-02144-g565f9a3a7911-dirty #705 Hardware name: RPT (r1) (DT) Workqueue: bat_events batadv_v_elp_throughput_metric_update pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : ath10k_sta_statistics+0x10/0x2dc [ath10k_core] lr : sta_set_sinfo+0xcc/0xbd4 sp : ffff000007b43ad0 x29: ffff000007b43ad0 x28: ffff0000071fa900 x27: ffff00000294ca98 x26: ffff000006830880 x25: ffff000006830880 x24: ffff00000294c000 x23: 0000000000000001 x22: ffff000007b43c90 x21: ffff800008898acc x20: ffff00000294c6e8 x19: ffff000007b43c90 x18: 0000000000000000 x17: 445946354d552d78 x16: 62661f7200000000 x15: 57464f445946354d x14: 0000000000000000 x13: 00000000000000e3 x12: d5f0acbcebea978e x11: 00000000000000e3 x10: 000000010048fe41 x9 : 0000000000000000 x8 : ffff000007b43d90 x7 : 000000007a1e2125 x6 : 0000000000000000 x5 : ffff0000024e0900 x4 : ffff800000a0250c x3 : ffff000007b43c90 x2 : ffff00000294ca98 x1 : ffff000006831920 x0 : 0000000000000000 Call trace: ath10k_sta_statistics+0x10/0x2dc [ath10k_core] sta_set_sinfo+0xcc/0xbd4 ieee80211_get_station+0x2c/0x44 cfg80211_get_station+0x80/0x154 batadv_v_elp_get_throughput+0x138/0x1fc batadv_v_elp_throughput_metric_update+0x1c/0xa4 process_one_work+0x1ec/0x414 worker_thread+0x70/0x46c kthread+0xdc/0xe0 ret_from_fork+0x10/0x20 Code: a9bb7bfd 910003fd a90153f3 f9411c40 (f9402814) This happens because STA has time to disconnect and reconnect before batadv_v_elp_throughput_metric_update() delayed work gets scheduled. In this situation, ath10k_sta_state() can be in the middle of resetting arsta data when the work queue get chance to be scheduled and ends up accessing it. Locking wiphy prevents that. Fixes: 7406353d43c8 ("cfg80211: implement cfg80211_get_station cfg80211 API") Signed-off-by: Remi Pommarel Reviewed-by: Nicolas Escande Acked-by: Antonio Quartulli Link: https://msgid.link/983b24a6a176e0800c01aedcd74480d9b551cb13.1716046653.git.repk@triplefau.lt Signed-off-by: Johannes Berg --- net/wireless/util.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/wireless/util.c b/net/wireless/util.c index 2bde8a35463132..082c6f9c5416eb 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -2549,6 +2549,7 @@ int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr, { struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; + int ret; wdev = dev->ieee80211_ptr; if (!wdev) @@ -2560,7 +2561,11 @@ int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr, memset(sinfo, 0, sizeof(*sinfo)); - return rdev_get_station(rdev, dev, mac_addr, sinfo); + wiphy_lock(&rdev->wiphy); + ret = rdev_get_station(rdev, dev, mac_addr, sinfo); + wiphy_unlock(&rdev->wiphy); + + return ret; } EXPORT_SYMBOL(cfg80211_get_station); From ab904521f4de52fef4f179d2dfc1877645ef5f5c Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Tue, 21 May 2024 15:50:59 +0800 Subject: [PATCH 0307/1578] wifi: cfg80211: pmsr: use correct nla_get_uX functions The commit 9bb7e0f24e7e ("cfg80211: add peer measurement with FTM initiator API") defines four attributes NL80211_PMSR_FTM_REQ_ATTR_ {NUM_BURSTS_EXP}/{BURST_PERIOD}/{BURST_DURATION}/{FTMS_PER_BURST} in following ways. static const struct nla_policy nl80211_pmsr_ftm_req_attr_policy[NL80211_PMSR_FTM_REQ_ATTR_MAX + 1] = { ... [NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP] = NLA_POLICY_MAX(NLA_U8, 15), [NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD] = { .type = NLA_U16 }, [NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION] = NLA_POLICY_MAX(NLA_U8, 15), [NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST] = NLA_POLICY_MAX(NLA_U8, 31), ... }; That is, those attributes are expected to be NLA_U8 and NLA_U16 types. However, the consumers of these attributes in `pmsr_parse_ftm` blindly all use `nla_get_u32`, which is incorrect and causes functionality issues on little-endian platforms. Hence, fix them with the correct `nla_get_u8` and `nla_get_u16` functions. Fixes: 9bb7e0f24e7e ("cfg80211: add peer measurement with FTM initiator API") Signed-off-by: Lin Ma Link: https://msgid.link/20240521075059.47999-1-linma@zju.edu.cn Signed-off-by: Johannes Berg --- net/wireless/pmsr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index e106dcea397782..c569c37da31758 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -56,7 +56,7 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, out->ftm.burst_period = 0; if (tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD]) out->ftm.burst_period = - nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD]); + nla_get_u16(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD]); out->ftm.asap = !!tb[NL80211_PMSR_FTM_REQ_ATTR_ASAP]; if (out->ftm.asap && !capa->ftm.asap) { @@ -75,7 +75,7 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, out->ftm.num_bursts_exp = 0; if (tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP]) out->ftm.num_bursts_exp = - nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP]); + nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP]); if (capa->ftm.max_bursts_exponent >= 0 && out->ftm.num_bursts_exp > capa->ftm.max_bursts_exponent) { @@ -88,7 +88,7 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, out->ftm.burst_duration = 15; if (tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]) out->ftm.burst_duration = - nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]); + nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]); out->ftm.ftms_per_burst = 0; if (tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST]) @@ -107,7 +107,7 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, out->ftm.ftmr_retries = 3; if (tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES]) out->ftm.ftmr_retries = - nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES]); + nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES]); out->ftm.request_lci = !!tb[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_LCI]; if (out->ftm.request_lci && !capa->ftm.request_lci) { From 02c665f048a439c0d58cc45334c94634bd7c18e6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 15 May 2024 13:34:10 +0200 Subject: [PATCH 0308/1578] wifi: mac80211: apply mcast rate only if interface is up If the interface isn't enabled, don't apply multicast rate changes immediately. Reported-by: syzbot+de87c09cc7b964ea2e23@syzkaller.appspotmail.com Link: https://msgid.link/20240515133410.d6cffe5756cc.I47b624a317e62bdb4609ff7fa79403c0c444d32d@changeid Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index b08e5d7687e3fc..99abbb9e8477bd 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2958,8 +2958,9 @@ static int ieee80211_set_mcast_rate(struct wiphy *wiphy, struct net_device *dev, memcpy(sdata->vif.bss_conf.mcast_rate, rate, sizeof(int) * NUM_NL80211_BANDS); - ieee80211_link_info_change_notify(sdata, &sdata->deflink, - BSS_CHANGED_MCAST_RATE); + if (ieee80211_sdata_running(sdata)) + ieee80211_link_info_change_notify(sdata, &sdata->deflink, + BSS_CHANGED_MCAST_RATE); return 0; } From 177c6ae9725d783f9e96f02593ce8fb2639be22f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 15 May 2024 13:53:19 +0200 Subject: [PATCH 0309/1578] wifi: mac80211: handle tasklet frames before stopping The code itself doesn't want to handle frames from the driver if it's already stopped, but if the tasklet was queued before and runs after the stop, then all bets are off. Flush queues before actually stopping, RX should be off at this point since all the interfaces are removed already, etc. Reported-by: syzbot+8830db5d3593b5546d2e@syzkaller.appspotmail.com Link: https://msgid.link/20240515135318.b05f11385c9a.I41c1b33a2e1814c3a7ef352cd7f2951b91785617@changeid Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 2 ++ net/mac80211/main.c | 10 ++++++++-- net/mac80211/util.c | 2 ++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index eb62b7d4b4f7e2..3cedfdc9099b7d 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1845,6 +1845,8 @@ void ieee80211_link_info_change_notify(struct ieee80211_sub_if_data *sdata, void ieee80211_configure_filter(struct ieee80211_local *local); u64 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata); +void ieee80211_handle_queued_frames(struct ieee80211_local *local); + u64 ieee80211_mgmt_tx_cookie(struct ieee80211_local *local); int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, u64 *cookie, gfp_t gfp); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 4eaea0a9975b47..1132dea0e290ea 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -423,9 +423,8 @@ u64 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata) BSS_CHANGED_ERP_SLOT; } -static void ieee80211_tasklet_handler(struct tasklet_struct *t) +void ieee80211_handle_queued_frames(struct ieee80211_local *local) { - struct ieee80211_local *local = from_tasklet(local, t, tasklet); struct sk_buff *skb; while ((skb = skb_dequeue(&local->skb_queue)) || @@ -450,6 +449,13 @@ static void ieee80211_tasklet_handler(struct tasklet_struct *t) } } +static void ieee80211_tasklet_handler(struct tasklet_struct *t) +{ + struct ieee80211_local *local = from_tasklet(local, t, tasklet); + + ieee80211_handle_queued_frames(local); +} + static void ieee80211_restart_work(struct work_struct *work) { struct ieee80211_local *local = diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 0b893e95895944..283bfc99417e57 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1567,6 +1567,8 @@ u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, void ieee80211_stop_device(struct ieee80211_local *local) { + ieee80211_handle_queued_frames(local); + ieee80211_led_radio(local, false); ieee80211_mod_tpt_led_trig(local, 0, IEEE80211_TPT_LEDTRIG_FL_RADIO); From f7a8b10bfd614d7a9a16fbe80d28ead4f063cb00 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 10 May 2024 11:37:38 +0200 Subject: [PATCH 0310/1578] wifi: cfg80211: fix 6 GHz scan request building The 6 GHz scan request struct allocated by cfg80211_scan_6ghz() is meant to be formed this way: [base struct][channels][ssids][6ghz_params] It is allocated with [channels] as the maximum number of channels supported by the driver in the 6 GHz band, since allocation is before knowing how many there will be. However, the inner pointers are set incorrectly: initially, the 6 GHz scan parameters pointer is set: [base struct][channels] ^ scan_6ghz_params and later the SSID pointer is set to the end of the actually _used_ channels. [base struct][channels] ^ ssids If many APs were to be discovered, and many channels used, and there were many SSIDs, then the SSIDs could overlap the 6 GHz parameters. Additionally, the request->ssids for most of the function points to the original request still (given the struct copy) but is used normally, which is confusing. Clear this up, by actually using the allocated space for 6 GHz parameters _after_ the SSIDs, and set up the SSIDs initially so they are used more clearly. Just like in nl80211.c, set them only if there actually are SSIDs though. Finally, also copy the elements (ie/ie_len) so they're part of the same request, not pointing to the old request. Co-developed-by: Miri Korenblit Signed-off-by: Miri Korenblit Reviewed-by: Ilan Peer Signed-off-by: Johannes Berg Link: https://msgid.link/20240510113738.4190692ef4ee.I0cb19188be17a8abd029805e3373c0a7777c214c@changeid Signed-off-by: Johannes Berg --- net/wireless/rdev-ops.h | 6 +++++- net/wireless/scan.c | 47 +++++++++++++++++++++++++++-------------- 2 files changed, 36 insertions(+), 17 deletions(-) diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 43897a5269b6a9..755af47b88b91a 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -2,7 +2,7 @@ /* * Portions of this file * Copyright(c) 2016-2017 Intel Deutschland GmbH - * Copyright (C) 2018, 2021-2023 Intel Corporation + * Copyright (C) 2018, 2021-2024 Intel Corporation */ #ifndef __CFG80211_RDEV_OPS #define __CFG80211_RDEV_OPS @@ -458,6 +458,10 @@ static inline int rdev_scan(struct cfg80211_registered_device *rdev, struct cfg80211_scan_request *request) { int ret; + + if (WARN_ON_ONCE(!request->n_ssids && request->ssids)) + return -EINVAL; + trace_rdev_scan(&rdev->wiphy, request); ret = rdev->ops->scan(&rdev->wiphy, request); trace_rdev_return_int(&rdev->wiphy, ret); diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 8daed8232b0549..2f2a3163968a7c 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -812,6 +812,7 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) LIST_HEAD(coloc_ap_list); bool need_scan_psc = true; const struct ieee80211_sband_iftype_data *iftd; + size_t size, offs_ssids, offs_6ghz_params, offs_ies; rdev_req->scan_6ghz = true; @@ -877,10 +878,15 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) spin_unlock_bh(&rdev->bss_lock); } - request = kzalloc(struct_size(request, channels, n_channels) + - sizeof(*request->scan_6ghz_params) * count + - sizeof(*request->ssids) * rdev_req->n_ssids, - GFP_KERNEL); + size = struct_size(request, channels, n_channels); + offs_ssids = size; + size += sizeof(*request->ssids) * rdev_req->n_ssids; + offs_6ghz_params = size; + size += sizeof(*request->scan_6ghz_params) * count; + offs_ies = size; + size += rdev_req->ie_len; + + request = kzalloc(size, GFP_KERNEL); if (!request) { cfg80211_free_coloc_ap_list(&coloc_ap_list); return -ENOMEM; @@ -888,8 +894,26 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) *request = *rdev_req; request->n_channels = 0; - request->scan_6ghz_params = - (void *)&request->channels[n_channels]; + request->n_6ghz_params = 0; + if (rdev_req->n_ssids) { + /* + * Add the ssids from the parent scan request to the new + * scan request, so the driver would be able to use them + * in its probe requests to discover hidden APs on PSC + * channels. + */ + request->ssids = (void *)request + offs_ssids; + memcpy(request->ssids, rdev_req->ssids, + sizeof(*request->ssids) * request->n_ssids); + } + request->scan_6ghz_params = (void *)request + offs_6ghz_params; + + if (rdev_req->ie_len) { + void *ie = (void *)request + offs_ies; + + memcpy(ie, rdev_req->ie, rdev_req->ie_len); + request->ie = ie; + } /* * PSC channels should not be scanned in case of direct scan with 1 SSID @@ -978,17 +1002,8 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) if (request->n_channels) { struct cfg80211_scan_request *old = rdev->int_scan_req; - rdev->int_scan_req = request; - /* - * Add the ssids from the parent scan request to the new scan - * request, so the driver would be able to use them in its - * probe requests to discover hidden APs on PSC channels. - */ - request->ssids = (void *)&request->channels[request->n_channels]; - request->n_ssids = rdev_req->n_ssids; - memcpy(request->ssids, rdev_req->ssids, sizeof(*request->ssids) * - request->n_ssids); + rdev->int_scan_req = request; /* * If this scan follows a previous scan, save the scan start From 8ecc4d7a7cd3e9704b63b8e4f6cd8b6b7314210f Mon Sep 17 00:00:00 2001 From: Aditya Kumar Singh Date: Thu, 9 May 2024 08:55:55 +0530 Subject: [PATCH 0311/1578] wifi: mac80211: pass proper link id for channel switch started notification Original changes[1] posted is having proper changes. However, at the same time, there was chandef puncturing changes which had a conflict with this. While applying, two errors crept in - a) Whitespace error. b) Link ID being passed to channel switch started notifier function is 0. However proper link ID is present in the function. Fix these now. [1] https://lore.kernel.org/all/20240130140918.1172387-5-quic_adisi@quicinc.com/ Fixes: 1a96bb4e8a79 ("wifi: mac80211: start and finalize channel switch on link basis") Signed-off-by: Aditya Kumar Singh Link: https://msgid.link/20240509032555.263933-1-quic_adisi@quicinc.com Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 99abbb9e8477bd..83ad6c9709fe60 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -4017,7 +4017,7 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, goto out; } - link_data->csa_chanreq = chanreq; + link_data->csa_chanreq = chanreq; link_conf->csa_active = true; if (params->block_tx && @@ -4028,7 +4028,7 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, } cfg80211_ch_switch_started_notify(sdata->dev, - &link_data->csa_chanreq.oper, 0, + &link_data->csa_chanreq.oper, link_id, params->count, params->block_tx); if (changed) { From 92158790ce4391ce4c35d8dfbce759195e4724cb Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Sun, 12 May 2024 15:25:00 +0300 Subject: [PATCH 0312/1578] wifi: iwlwifi: mvm: don't initialize csa_work twice The initialization of this worker moved to iwl_mvm_mac_init_mvmvif but we removed only from the pre-MLD version of the add_interface callback. Remove it also from the MLD version. Fixes: 0bcc2155983e ("wifi: iwlwifi: mvm: init vif works only once") Signed-off-by: Miri Korenblit Reviewed-by: Johannes Berg Link: https://msgid.link/20240512152312.4f15b41604f0.Iec912158e5a706175531d3736d77d25adf02fba4@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c index 0a3b7284eeddf9..fcfd2dd7568e50 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c @@ -75,8 +75,6 @@ static int iwl_mvm_mld_mac_add_interface(struct ieee80211_hw *hw, goto out_free_bf; iwl_mvm_tcm_add_vif(mvm, vif); - INIT_DELAYED_WORK(&mvmvif->csa_work, - iwl_mvm_channel_switch_disconnect_wk); if (vif->type == NL80211_IFTYPE_MONITOR) { mvm->monitor_on = true; From 98b7017ddb914a32152ed9aec7468386fff85f8f Mon Sep 17 00:00:00 2001 From: Shaul Triebitz Date: Sun, 12 May 2024 15:25:04 +0300 Subject: [PATCH 0313/1578] wifi: iwlwifi: mvm: always set the TWT IE offset In beacon template version 14, make sure to always set the TWT IE offset before sending the beacon template command, also in the debugfs inject_beacon_ie path. If the TWT IE does not exist, the offset will be set to zero. Fixes: bf0212fd8faa ("wifi: iwlwifi: mvm: add beacon template version 14") Signed-off-by: Shaul Triebitz Signed-off-by: Miri Korenblit Link: https://msgid.link/20240512152312.eb27175c345a.If30ef24aba10fe47fd42a7a9703eb8903035e294@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c | 9 +++++++++ drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c | 2 +- drivers/net/wireless/intel/iwlwifi/mvm/mvm.h | 1 + 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c b/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c index 79f4ac8cbc729e..8101ecbb478b6f 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c @@ -1617,6 +1617,15 @@ static int _iwl_dbgfs_inject_beacon_ie(struct iwl_mvm *mvm, char *bin, int len) &beacon_cmd.tim_size, beacon->data, beacon->len); + if (iwl_fw_lookup_cmd_ver(mvm->fw, + BEACON_TEMPLATE_CMD, 0) >= 14) { + u32 offset = iwl_mvm_find_ie_offset(beacon->data, + WLAN_EID_S1G_TWT, + beacon->len); + + beacon_cmd.btwt_offset = cpu_to_le32(offset); + } + iwl_mvm_mac_ctxt_send_beacon_cmd(mvm, beacon, &beacon_cmd, sizeof(beacon_cmd)); } diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c index 5a06f887769a6a..5144fa0f96b0e0 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c @@ -873,7 +873,7 @@ void iwl_mvm_mac_ctxt_set_tim(struct iwl_mvm *mvm, } } -static u32 iwl_mvm_find_ie_offset(u8 *beacon, u8 eid, u32 frame_size) +u32 iwl_mvm_find_ie_offset(u8 *beacon, u8 eid, u32 frame_size) { struct ieee80211_mgmt *mgmt = (void *)beacon; const u8 *ie; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h index 1f58c727fa632d..0a1959bd407999 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h @@ -1758,6 +1758,7 @@ u8 iwl_mvm_next_antenna(struct iwl_mvm *mvm, u8 valid, u8 last_idx); void iwl_mvm_get_sync_time(struct iwl_mvm *mvm, int clock_type, u32 *gp2, u64 *boottime, ktime_t *realtime); u32 iwl_mvm_get_systime(struct iwl_mvm *mvm); +u32 iwl_mvm_find_ie_offset(u8 *beacon, u8 eid, u32 frame_size); /* Tx / Host Commands */ int __must_check iwl_mvm_send_cmd(struct iwl_mvm *mvm, From 788e4c75f831d06fcfbbec1d455fac429521e607 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Sun, 12 May 2024 07:31:06 +0300 Subject: [PATCH 0314/1578] wifi: iwlwifi: mvm: fix a crash on 7265 Since IWL_FW_CMD_VER_UNKNOWN = 99, then my change to consider cmd_ver >= 7 instead of cmd_ver = 7 included also firmwares that don't advertise the command version at all. This made us send a command with a bad size and because of that, the firmware hit a BAD_COMMAND immediately after handling the REDUCE_TX_POWER_CMD command. Fixes: 8f892e225f41 ("wifi: iwlwifi: mvm: support iwl_dev_tx_power_cmd_v8") Signed-off-by: Emmanuel Grumbach Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://msgid.link/20240512072733.eb20ff5050d3.Ie4fc6f5496cd296fd6ff20d15e98676f28a3cccd@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/fw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c index e7f5978ef2d719..ca7fcae6995ee3 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c @@ -895,8 +895,8 @@ int iwl_mvm_sar_select_profile(struct iwl_mvm *mvm, int prof_a, int prof_b) int ret; u16 len = 0; u32 n_subbands; - u8 cmd_ver = iwl_fw_lookup_cmd_ver(mvm->fw, cmd_id, - IWL_FW_CMD_VER_UNKNOWN); + u8 cmd_ver = iwl_fw_lookup_cmd_ver(mvm->fw, cmd_id, 3); + if (cmd_ver >= 7) { len = sizeof(cmd.v7); n_subbands = IWL_NUM_SUB_BANDS_V2; From b7ffca99313d856f7d1cc89038d9061b128e8e97 Mon Sep 17 00:00:00 2001 From: Yedidya Benshimol Date: Fri, 10 May 2024 17:06:29 +0300 Subject: [PATCH 0315/1578] wifi: iwlwifi: mvm: d3: fix WoWLAN command version lookup After moving from commands to notificaitons in the d3 resume flow, removing the WOWLAN_GET_STATUSES and REPLY_OFFLOADS_QUERY_CMD causes the return of the default value when looking up their version. Returning zero here results in the driver sending the not supported NON_QOS_TX_COUNTER_CMD. Signed-off-by: Yedidya Benshimol Reviewed-by: Gregory Greenman Signed-off-by: Miri Korenblit Link: https://msgid.link/20240510170500.8cabfd580614.If3a0db9851f56041f8f5360959354abd5379224a@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/d3.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c index 71e6b06481a93b..e37ae34acba606 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c @@ -2341,7 +2341,8 @@ static bool iwl_mvm_setup_connection_keep(struct iwl_mvm *mvm, out: if (iwl_fw_lookup_notif_ver(mvm->fw, LONG_GROUP, - WOWLAN_GET_STATUSES, 0) < 10) { + WOWLAN_GET_STATUSES, + IWL_FW_CMD_VER_UNKNOWN) < 10) { mvmvif->seqno_valid = true; /* +0x10 because the set API expects next-to-use, not last-used */ mvmvif->seqno = status->non_qos_seq_ctr + 0x10; From 4a7aace2899711592327463c1a29ffee44fcc66e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 10 May 2024 17:06:33 +0300 Subject: [PATCH 0316/1578] wifi: iwlwifi: mvm: revert gen2 TX A-MPDU size to 64 We don't actually support >64 even for HE devices, so revert back to 64. This fixes an issue where the session is refused because the queue is configured differently from the actual session later. Fixes: 514c30696fbc ("iwlwifi: add support for IEEE802.11ax") Signed-off-by: Johannes Berg Reviewed-by: Liad Kaufman Reviewed-by: Luciano Coelho Signed-off-by: Miri Korenblit Link: https://msgid.link/20240510170500.52f7b4cf83aa.If47e43adddf7fe250ed7f5571fbb35d8221c7c47@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/rs.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rs.h b/drivers/net/wireless/intel/iwlwifi/mvm/rs.h index 376b23b409dcad..6cd4ec4d8f3441 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rs.h +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rs.h @@ -122,13 +122,8 @@ enum { #define LINK_QUAL_AGG_FRAME_LIMIT_DEF (63) #define LINK_QUAL_AGG_FRAME_LIMIT_MAX (63) -/* - * FIXME - various places in firmware API still use u8, - * e.g. LQ command and SCD config command. - * This should be 256 instead. - */ -#define LINK_QUAL_AGG_FRAME_LIMIT_GEN2_DEF (255) -#define LINK_QUAL_AGG_FRAME_LIMIT_GEN2_MAX (255) +#define LINK_QUAL_AGG_FRAME_LIMIT_GEN2_DEF (64) +#define LINK_QUAL_AGG_FRAME_LIMIT_GEN2_MAX (64) #define LINK_QUAL_AGG_FRAME_LIMIT_MIN (0) #define LQ_SIZE 2 /* 2 mode tables: "Active" and "Search" */ From 0f2e9f6f21d1ff292363cdfb5bc4d492eeaff76e Mon Sep 17 00:00:00 2001 From: Mordechay Goodstein Date: Fri, 10 May 2024 17:06:35 +0300 Subject: [PATCH 0317/1578] wifi: iwlwifi: mvm: set properly mac header In the driver we only use skb_put* for adding data to the skb, hence data never moves and skb_reset_mac_haeder would set mac_header to the first time data was added and not to mac80211 header, fix this my using the actual len of bytes added for setting the mac header. Fixes: 3f7a9d577d47 ("wifi: iwlwifi: mvm: simplify by using SKB MAC header pointer") Signed-off-by: Mordechay Goodstein Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://msgid.link/20240510170500.12f2de2909c3.I72a819b96f2fe55bde192a8fd31a4b96c301aa73@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c index d78af29281522e..489cfb0a4ab1ec 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c @@ -2450,8 +2450,11 @@ void iwl_mvm_rx_monitor_no_data(struct iwl_mvm *mvm, struct napi_struct *napi, * * We mark it as mac header, for upper layers to know where * all radio tap header ends. + * + * Since data doesn't move data while putting data on skb and that is + * the only way we use, data + len is the next place that hdr would be put */ - skb_reset_mac_header(skb); + skb_set_mac_header(skb, skb->len); /* * Override the nss from the rx_vec since the rate_n_flags has From 87821b67dea87addbc4ab093ba752753b002176a Mon Sep 17 00:00:00 2001 From: Shahar S Matityahu Date: Fri, 10 May 2024 17:06:39 +0300 Subject: [PATCH 0318/1578] wifi: iwlwifi: dbg_ini: move iwl_dbg_tlv_free outside of debugfs ifdef The driver should call iwl_dbg_tlv_free even if debugfs is not defined since ini mode does not depend on debugfs ifdef. Fixes: 68f6f492c4fa ("iwlwifi: trans: support loading ini TLVs from external file") Signed-off-by: Shahar S Matityahu Reviewed-by: Luciano Coelho Signed-off-by: Miri Korenblit Link: https://msgid.link/20240510170500.c8e3723f55b0.I5e805732b0be31ee6b83c642ec652a34e974ff10@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/iwl-drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-drv.c b/drivers/net/wireless/intel/iwlwifi/iwl-drv.c index 33654f228ee871..d156a9c6419404 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-drv.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-drv.c @@ -1815,8 +1815,8 @@ struct iwl_drv *iwl_drv_start(struct iwl_trans *trans) err_fw: #ifdef CONFIG_IWLWIFI_DEBUGFS debugfs_remove_recursive(drv->dbgfs_drv); - iwl_dbg_tlv_free(drv->trans); #endif + iwl_dbg_tlv_free(drv->trans); kfree(drv); err: return ERR_PTR(ret); From cc3ba78f202de9752aceb16342ab62bdfbffac7e Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Mon, 13 May 2024 13:27:08 +0300 Subject: [PATCH 0319/1578] wifi: iwlwifi: mvm: remove stale STA link data during restart If pre-recovery mac80211 tried to disable a link but this disablement failed, then there might be a mismatch between mac80211 assuming the link has been disabled and the driver still having the data around. During recover itself, that is not a problem, but should the link be activated again at a later point, iwlwifi will refuse the activation as it detects the inconsistent state. Solve this corner-case by iterating the station in the restart cleanup handler. Signed-off-by: Benjamin Berg Signed-off-by: Miri Korenblit Link: https://msgid.link/20240513132416.d2fd60338055.I840d4fdce5fd49fe69896d928b071067e3730259@changeid Signed-off-by: Johannes Berg --- .../net/wireless/intel/iwlwifi/mvm/mac80211.c | 37 +++++++++++++++++++ .../net/wireless/intel/iwlwifi/mvm/mld-sta.c | 10 ++--- drivers/net/wireless/intel/iwlwifi/mvm/sta.h | 5 +++ 3 files changed, 47 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 486a6b8f3c97f7..628b50ee02443c 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -1128,6 +1128,39 @@ static void iwl_mvm_cleanup_iterator(void *data, u8 *mac, RCU_INIT_POINTER(mvmvif->deflink.probe_resp_data, NULL); } +static void iwl_mvm_cleanup_sta_iterator(void *data, struct ieee80211_sta *sta) +{ + struct iwl_mvm *mvm = data; + struct iwl_mvm_sta *mvm_sta; + struct ieee80211_vif *vif; + int link_id; + + mvm_sta = iwl_mvm_sta_from_mac80211(sta); + vif = mvm_sta->vif; + + if (!sta->valid_links) + return; + + for (link_id = 0; link_id < ARRAY_SIZE((sta)->link); link_id++) { + struct iwl_mvm_link_sta *mvm_link_sta; + + mvm_link_sta = + rcu_dereference_check(mvm_sta->link[link_id], + lockdep_is_held(&mvm->mutex)); + if (mvm_link_sta && !(vif->active_links & BIT(link_id))) { + /* + * We have a link STA but the link is inactive in + * mac80211. This will happen if we failed to + * deactivate the link but mac80211 roll back the + * deactivation of the link. + * Delete the stale data to avoid issues later on. + */ + iwl_mvm_mld_free_sta_link(mvm, mvm_sta, mvm_link_sta, + link_id, false); + } + } +} + static void iwl_mvm_restart_cleanup(struct iwl_mvm *mvm) { iwl_mvm_stop_device(mvm); @@ -1150,6 +1183,10 @@ static void iwl_mvm_restart_cleanup(struct iwl_mvm *mvm) */ ieee80211_iterate_interfaces(mvm->hw, 0, iwl_mvm_cleanup_iterator, mvm); + /* cleanup stations as links may be gone after restart */ + ieee80211_iterate_stations_atomic(mvm->hw, + iwl_mvm_cleanup_sta_iterator, mvm); + mvm->p2p_device_vif = NULL; iwl_mvm_reset_phy_ctxts(mvm); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mld-sta.c b/drivers/net/wireless/intel/iwlwifi/mvm/mld-sta.c index b7a461dba41ee3..ae3cde0e64f3f8 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mld-sta.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mld-sta.c @@ -515,11 +515,11 @@ static int iwl_mvm_mld_cfg_sta(struct iwl_mvm *mvm, struct ieee80211_sta *sta, return iwl_mvm_mld_send_sta_cmd(mvm, &cmd); } -static void iwl_mvm_mld_free_sta_link(struct iwl_mvm *mvm, - struct iwl_mvm_sta *mvm_sta, - struct iwl_mvm_link_sta *mvm_sta_link, - unsigned int link_id, - bool is_in_fw) +void iwl_mvm_mld_free_sta_link(struct iwl_mvm *mvm, + struct iwl_mvm_sta *mvm_sta, + struct iwl_mvm_link_sta *mvm_sta_link, + unsigned int link_id, + bool is_in_fw) { RCU_INIT_POINTER(mvm->fw_id_to_mac_id[mvm_sta_link->sta_id], is_in_fw ? ERR_PTR(-EINVAL) : NULL); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.h b/drivers/net/wireless/intel/iwlwifi/mvm/sta.h index 264f1f9394b6de..754a05a8c189bc 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.h +++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.h @@ -662,6 +662,11 @@ int iwl_mvm_mld_update_sta(struct iwl_mvm *mvm, struct ieee80211_vif *vif, struct ieee80211_sta *sta); int iwl_mvm_mld_rm_sta(struct iwl_mvm *mvm, struct ieee80211_vif *vif, struct ieee80211_sta *sta); +void iwl_mvm_mld_free_sta_link(struct iwl_mvm *mvm, + struct iwl_mvm_sta *mvm_sta, + struct iwl_mvm_link_sta *mvm_sta_link, + unsigned int link_id, + bool is_in_fw); int iwl_mvm_mld_rm_sta_id(struct iwl_mvm *mvm, u8 sta_id); int iwl_mvm_mld_update_sta_links(struct iwl_mvm *mvm, struct ieee80211_vif *vif, From 08b16d1b5997dc378533318e2a9cd73c7a898284 Mon Sep 17 00:00:00 2001 From: Yedidya Benshimol Date: Mon, 13 May 2024 13:27:09 +0300 Subject: [PATCH 0320/1578] wifi: iwlwifi: mvm: Handle BIGTK cipher in kek_kck cmd The BIGTK cipher field was added to the kek_kck_material_cmd but wasn't assigned. Fix that by differentiating between the IGTK/BIGTK keys and assign the ciphers fields accordingly. Signed-off-by: Yedidya Benshimol Signed-off-by: Miri Korenblit Link: https://msgid.link/20240513132416.7fd0b22b7267.Ie9b581652b74bd7806980364d59e1b2e78e682c0@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/d3.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c index e37ae34acba606..54f4acbbd05bd4 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c @@ -595,6 +595,12 @@ static void iwl_mvm_wowlan_gtk_type_iter(struct ieee80211_hw *hw, void *_data) { struct wowlan_key_gtk_type_iter *data = _data; + __le32 *cipher = NULL; + + if (key->keyidx == 4 || key->keyidx == 5) + cipher = &data->kek_kck_cmd->igtk_cipher; + if (key->keyidx == 6 || key->keyidx == 7) + cipher = &data->kek_kck_cmd->bigtk_cipher; switch (key->cipher) { default: @@ -606,10 +612,13 @@ static void iwl_mvm_wowlan_gtk_type_iter(struct ieee80211_hw *hw, return; case WLAN_CIPHER_SUITE_BIP_GMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: - data->kek_kck_cmd->igtk_cipher = cpu_to_le32(STA_KEY_FLG_GCMP); + if (cipher) + *cipher = cpu_to_le32(STA_KEY_FLG_GCMP); return; case WLAN_CIPHER_SUITE_AES_CMAC: - data->kek_kck_cmd->igtk_cipher = cpu_to_le32(STA_KEY_FLG_CCM); + case WLAN_CIPHER_SUITE_BIP_CMAC_256: + if (cipher) + *cipher = cpu_to_le32(STA_KEY_FLG_CCM); return; case WLAN_CIPHER_SUITE_CCMP: if (!sta) From 4d08c0b3357cba0aeffaf3abc62cae0c154f2816 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 13 May 2024 13:27:10 +0300 Subject: [PATCH 0321/1578] wifi: iwlwifi: mvm: handle BA session teardown in RF-kill When entering RF-kill, mac80211 tears down BA sessions, but due to RF-kill the commands aren't sent to the device. As a result, there can be frames pending on the reorder buffer or perhaps even received while doing so, leading to warnings. Avoid the warnings by doing the BA session teardown normally even in RF-kill, which also requires queue sync. Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://msgid.link/20240513132416.0762cd80fb3d.I43c5877f3b546159b2db4f36d6d956b333c41cf0@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 2 +- drivers/net/wireless/intel/iwlwifi/mvm/mld-sta.c | 3 ++- drivers/net/wireless/intel/iwlwifi/mvm/sta.c | 12 ++++++++---- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 628b50ee02443c..de9f0b44654562 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -6385,7 +6385,7 @@ void iwl_mvm_sync_rx_queues_internal(struct iwl_mvm *mvm, .len[0] = sizeof(cmd), .data[1] = data, .len[1] = size, - .flags = sync ? 0 : CMD_ASYNC, + .flags = CMD_SEND_IN_RFKILL | (sync ? 0 : CMD_ASYNC), }; int ret; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mld-sta.c b/drivers/net/wireless/intel/iwlwifi/mvm/mld-sta.c index ae3cde0e64f3f8..9d139b56e1527c 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mld-sta.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mld-sta.c @@ -1014,7 +1014,8 @@ static int iwl_mvm_mld_update_sta_baids(struct iwl_mvm *mvm, cmd.modify.tid = cpu_to_le32(data->tid); - ret = iwl_mvm_send_cmd_pdu(mvm, cmd_id, 0, sizeof(cmd), &cmd); + ret = iwl_mvm_send_cmd_pdu(mvm, cmd_id, CMD_SEND_IN_RFKILL, + sizeof(cmd), &cmd); data->sta_mask = new_sta_mask; if (ret) return ret; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c index 20d4968d692a36..cc79fe991c2633 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c @@ -2848,7 +2848,12 @@ static int iwl_mvm_fw_baid_op_cmd(struct iwl_mvm *mvm, .action = start ? cpu_to_le32(IWL_RX_BAID_ACTION_ADD) : cpu_to_le32(IWL_RX_BAID_ACTION_REMOVE), }; - u32 cmd_id = WIDE_ID(DATA_PATH_GROUP, RX_BAID_ALLOCATION_CONFIG_CMD); + struct iwl_host_cmd hcmd = { + .id = WIDE_ID(DATA_PATH_GROUP, RX_BAID_ALLOCATION_CONFIG_CMD), + .flags = CMD_SEND_IN_RFKILL, + .len[0] = sizeof(cmd), + .data[0] = &cmd, + }; int ret; BUILD_BUG_ON(sizeof(struct iwl_rx_baid_cfg_resp) != sizeof(baid)); @@ -2860,7 +2865,7 @@ static int iwl_mvm_fw_baid_op_cmd(struct iwl_mvm *mvm, cmd.alloc.ssn = cpu_to_le16(ssn); cmd.alloc.win_size = cpu_to_le16(buf_size); baid = -EIO; - } else if (iwl_fw_lookup_cmd_ver(mvm->fw, cmd_id, 1) == 1) { + } else if (iwl_fw_lookup_cmd_ver(mvm->fw, hcmd.id, 1) == 1) { cmd.remove_v1.baid = cpu_to_le32(baid); BUILD_BUG_ON(sizeof(cmd.remove_v1) > sizeof(cmd.remove)); } else { @@ -2869,8 +2874,7 @@ static int iwl_mvm_fw_baid_op_cmd(struct iwl_mvm *mvm, cmd.remove.tid = cpu_to_le32(tid); } - ret = iwl_mvm_send_cmd_pdu_status(mvm, cmd_id, sizeof(cmd), - &cmd, &baid); + ret = iwl_mvm_send_cmd_status(mvm, &hcmd, &baid); if (ret) return ret; From 989830d1cf16bd149bf0690d889a9caef95fb5b1 Mon Sep 17 00:00:00 2001 From: Ayala Beker Date: Mon, 13 May 2024 13:27:11 +0300 Subject: [PATCH 0322/1578] wifi: iwlwifi: mvm: properly set 6 GHz channel direct probe option Ensure that the 6 GHz channel is configured with a valid direct BSSID, avoiding any invalid or multicast BSSID addresses. Signed-off-by: Ayala Beker Reviewed-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://msgid.link/20240513132416.91a631a0fe60.I2ea2616af9b8a2eaf959b156c69cf65a2f1204d4@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/scan.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c index a7ec172eeade85..890d4d79b172b7 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c @@ -1730,7 +1730,10 @@ iwl_mvm_umac_scan_fill_6g_chan_list(struct iwl_mvm *mvm, break; } - if (k == idex_b && idex_b < SCAN_BSSID_MAX_SIZE) { + if (k == idex_b && idex_b < SCAN_BSSID_MAX_SIZE && + !WARN_ONCE(!is_valid_ether_addr(scan_6ghz_params[j].bssid), + "scan: invalid BSSID at index %u, index_b=%u\n", + j, idex_b)) { memcpy(&pp->bssid_array[idex_b++], scan_6ghz_params[j].bssid, ETH_ALEN); } From 60d62757df30b74bf397a2847a6db7385c6ee281 Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Mon, 13 May 2024 13:27:12 +0300 Subject: [PATCH 0323/1578] wifi: iwlwifi: mvm: check n_ssids before accessing the ssids In some versions of cfg80211, the ssids poinet might be a valid one even though n_ssids is 0. Accessing the pointer in this case will cuase an out-of-bound access. Fix this by checking n_ssids first. Fixes: c1a7515393e4 ("iwlwifi: mvm: add adaptive dwell support") Signed-off-by: Miri Korenblit Reviewed-by: Ilan Peer Reviewed-by: Johannes Berg Link: https://msgid.link/20240513132416.6e4d1762bf0d.I5a0e6cc8f02050a766db704d15594c61fe583d45@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/scan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c index 890d4d79b172b7..9739c38ef8241c 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c @@ -1313,7 +1313,7 @@ static void iwl_mvm_scan_umac_dwell(struct iwl_mvm *mvm, if (IWL_MVM_ADWELL_MAX_BUDGET) cmd->v7.adwell_max_budget = cpu_to_le16(IWL_MVM_ADWELL_MAX_BUDGET); - else if (params->ssids && params->ssids[0].ssid_len) + else if (params->n_ssids && params->ssids[0].ssid_len) cmd->v7.adwell_max_budget = cpu_to_le16(IWL_SCAN_ADWELL_MAX_BUDGET_DIRECTED_SCAN); else @@ -1418,7 +1418,7 @@ iwl_mvm_scan_umac_dwell_v11(struct iwl_mvm *mvm, if (IWL_MVM_ADWELL_MAX_BUDGET) general_params->adwell_max_budget = cpu_to_le16(IWL_MVM_ADWELL_MAX_BUDGET); - else if (params->ssids && params->ssids[0].ssid_len) + else if (params->n_ssids && params->ssids[0].ssid_len) general_params->adwell_max_budget = cpu_to_le16(IWL_SCAN_ADWELL_MAX_BUDGET_DIRECTED_SCAN); else From e6dd2936ce7ce94a1915b799f8af8193ec628e87 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 13 May 2024 13:27:13 +0300 Subject: [PATCH 0324/1578] wifi: iwlwifi: mvm: Fix scan abort handling with HW rfkill When HW rfkill is toggled to disable the RF, the flow to stop scan is called. When trying to send the command to abort the scan, since HW rfkill is toggled, the command is not sent due to rfkill being asserted, and -ERFKILL is returned from iwl_trans_send_cmd(), but this is silently ignored in iwl_mvm_send_cmd() and thus the scan abort flow continues to wait for scan complete notification and fails. Since it fails, the UID to type mapping is not cleared, and thus a warning is later fired when trying to stop the interface. To fix this, modify the UMAC scan abort flow to force sending the scan abort command even when in rfkill, so stop the FW from accessing the radio etc. Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://msgid.link/20240513132416.8cbe2f8c1a97.Iffe235c12a919dafec88eef399eb1f7bae2c5bdb@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/scan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c index 9739c38ef8241c..b5f664ae5a17d0 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c @@ -3322,10 +3322,11 @@ static int iwl_mvm_umac_scan_abort(struct iwl_mvm *mvm, int type) ret = iwl_mvm_send_cmd_pdu(mvm, WIDE_ID(IWL_ALWAYS_LONG_GROUP, SCAN_ABORT_UMAC), - 0, sizeof(cmd), &cmd); + CMD_SEND_IN_RFKILL, sizeof(cmd), &cmd); if (!ret) mvm->scan_uid_status[uid] = type << IWL_MVM_SCAN_STOPPING_SHIFT; + IWL_DEBUG_SCAN(mvm, "Scan abort: ret=%d\n", ret); return ret; } From 4bb95f4535489ed830cf9b34b0a891e384d1aee4 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Mon, 13 May 2024 13:27:14 +0300 Subject: [PATCH 0325/1578] wifi: iwlwifi: mvm: don't read past the mfuart notifcation In case the firmware sends a notification that claims it has more data than it has, we will read past that was allocated for the notification. Remove the print of the buffer, we won't see it by default. If needed, we can see the content with tracing. This was reported by KFENCE. Fixes: bdccdb854f2f ("iwlwifi: mvm: support MFUART dump in case of MFUART assert") Signed-off-by: Emmanuel Grumbach Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://msgid.link/20240513132416.ba82a01a559e.Ia91dd20f5e1ca1ad380b95e68aebf2794f553d9b@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/fw.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c index ca7fcae6995ee3..f4937a100cbe9b 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c @@ -94,20 +94,10 @@ void iwl_mvm_mfu_assert_dump_notif(struct iwl_mvm *mvm, { struct iwl_rx_packet *pkt = rxb_addr(rxb); struct iwl_mfu_assert_dump_notif *mfu_dump_notif = (void *)pkt->data; - __le32 *dump_data = mfu_dump_notif->data; - int n_words = le32_to_cpu(mfu_dump_notif->data_size) / sizeof(__le32); - int i; if (mfu_dump_notif->index_num == 0) IWL_INFO(mvm, "MFUART assert id 0x%x occurred\n", le32_to_cpu(mfu_dump_notif->assert_id)); - - for (i = 0; i < n_words; i++) - IWL_DEBUG_INFO(mvm, - "MFUART assert dump, dword %u: 0x%08x\n", - le16_to_cpu(mfu_dump_notif->index_num) * - n_words + i, - le32_to_cpu(dump_data[i])); } static bool iwl_alive_fn(struct iwl_notif_wait_data *notif_wait, From 0c2fd18f7ec552796179c14f13a0e06942f09d16 Mon Sep 17 00:00:00 2001 From: Lingbo Kong Date: Thu, 16 May 2024 10:18:53 +0800 Subject: [PATCH 0326/1578] wifi: mac80211: fix Spatial Reuse element size check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the way to check the size of Spatial Reuse IE data in the ieee80211_parse_extension_element() is incorrect. This is because the len variable in the ieee80211_parse_extension_element() function is equal to the size of Spatial Reuse IE data minus one and the value of returned by the ieee80211_he_spr_size() function is equal to the length of Spatial Reuse IE data. So the result of the len >= ieee80211_he_spr_size(data) statement always false. To address this issue and make it consistent with the logic used elsewhere with ieee80211_he_oper_size(), change the "len >= ieee80211_he_spr_size(data)" to “len >= ieee80211_he_spr_size(data) - 1”. Fixes: 9d0480a7c05b ("wifi: mac80211: move element parsing to a new file") Signed-off-by: Lingbo Kong Link: https://msgid.link/20240516021854.5682-2-quic_lingbok@quicinc.com Signed-off-by: Johannes Berg --- net/mac80211/parse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/parse.c b/net/mac80211/parse.c index 55e5497f89781c..055a60e90979b3 100644 --- a/net/mac80211/parse.c +++ b/net/mac80211/parse.c @@ -111,7 +111,7 @@ ieee80211_parse_extension_element(u32 *crc, if (params->mode < IEEE80211_CONN_MODE_HE) break; if (len >= sizeof(*elems->he_spr) && - len >= ieee80211_he_spr_size(data)) + len >= ieee80211_he_spr_size(data) - 1) elems->he_spr = data; break; case WLAN_EID_EXT_HE_6GHZ_CAPA: From a26d8dc5227f449a54518a8b40733a54c6600a8b Mon Sep 17 00:00:00 2001 From: Lingbo Kong Date: Thu, 16 May 2024 10:18:54 +0800 Subject: [PATCH 0327/1578] wifi: mac80211: correctly parse Spatial Reuse Parameter Set element Currently, the way of parsing Spatial Reuse Parameter Set element is incorrect and some members of struct ieee80211_he_obss_pd are not assigned. To address this issue, it must be parsed in the order of the elements of Spatial Reuse Parameter Set defined in the IEEE Std 802.11ax specification. The diagram of the Spatial Reuse Parameter Set element (IEEE Std 802.11ax -2021-9.4.2.252). ------------------------------------------------------------------------- | | | | |Non-SRG| SRG | SRG | SRG | SRG | |Element|Length| Element | SR |OBSS PD|OBSS PD|OBSS PD| BSS |Partial| | ID | | ID |Control| Max | Min | Max |Color | BSSID | | | |Extension| | Offset| Offset|Offset |Bitmap|Bitmap | ------------------------------------------------------------------------- Fixes: 1ced169cc1c2 ("mac80211: allow setting spatial reuse parameters from bss_conf") Signed-off-by: Lingbo Kong Link: https://msgid.link/20240516021854.5682-3-quic_lingbok@quicinc.com Signed-off-by: Johannes Berg --- net/mac80211/he.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/mac80211/he.c b/net/mac80211/he.c index 9f5ffdc9db284a..ecbb042dd0433e 100644 --- a/net/mac80211/he.c +++ b/net/mac80211/he.c @@ -230,15 +230,21 @@ ieee80211_he_spr_ie_to_bss_conf(struct ieee80211_vif *vif, if (!he_spr_ie_elem) return; + + he_obss_pd->sr_ctrl = he_spr_ie_elem->he_sr_control; data = he_spr_ie_elem->optional; if (he_spr_ie_elem->he_sr_control & IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT) - data++; + he_obss_pd->non_srg_max_offset = *data++; + if (he_spr_ie_elem->he_sr_control & IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT) { - he_obss_pd->max_offset = *data++; he_obss_pd->min_offset = *data++; + he_obss_pd->max_offset = *data++; + memcpy(he_obss_pd->bss_color_bitmap, data, 8); + data += 8; + memcpy(he_obss_pd->partial_bssid_bitmap, data, 8); he_obss_pd->enable = true; } } From 92ecbb3ac6f3fe8ae9edf3226c76aa17b6800699 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Fri, 17 May 2024 18:33:32 +0300 Subject: [PATCH 0328/1578] wifi: mac80211: fix UBSAN noise in ieee80211_prep_hw_scan() When testing the previous patch with CONFIG_UBSAN_BOUNDS, I've noticed the following: UBSAN: array-index-out-of-bounds in net/mac80211/scan.c:372:4 index 0 is out of range for type 'struct ieee80211_channel *[]' CPU: 0 PID: 1435 Comm: wpa_supplicant Not tainted 6.9.0+ #1 Hardware name: LENOVO 20UN005QRT/20UN005QRT <...BIOS details...> Call Trace: dump_stack_lvl+0x2d/0x90 __ubsan_handle_out_of_bounds+0xe7/0x140 ? timerqueue_add+0x98/0xb0 ieee80211_prep_hw_scan+0x2db/0x480 [mac80211] ? __kmalloc+0xe1/0x470 __ieee80211_start_scan+0x541/0x760 [mac80211] rdev_scan+0x1f/0xe0 [cfg80211] nl80211_trigger_scan+0x9b6/0xae0 [cfg80211] ... Since '__ieee80211_start_scan()' leaves 'hw_scan_req->req.n_channels' uninitialized, actual boundaries of 'hw_scan_req->req.channels' can't be checked in 'ieee80211_prep_hw_scan()'. Although an initialization of 'hw_scan_req->req.n_channels' introduces some confusion around allocated vs. used VLA members, this shouldn't be a problem since everything is correctly adjusted soon in 'ieee80211_prep_hw_scan()'. Cleanup 'kmalloc()' math in '__ieee80211_start_scan()' by using the convenient 'struct_size()' as well. Signed-off-by: Dmitry Antipov Link: https://msgid.link/20240517153332.18271-2-dmantipov@yandex.ru [improve (imho) indentation a bit] Signed-off-by: Johannes Berg --- net/mac80211/scan.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 3da1c5c450358c..8ecc4b710b0e64 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -744,15 +744,21 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, local->hw_scan_ies_bufsize *= n_bands; } - local->hw_scan_req = kmalloc( - sizeof(*local->hw_scan_req) + - req->n_channels * sizeof(req->channels[0]) + - local->hw_scan_ies_bufsize, GFP_KERNEL); + local->hw_scan_req = kmalloc(struct_size(local->hw_scan_req, + req.channels, + req->n_channels) + + local->hw_scan_ies_bufsize, + GFP_KERNEL); if (!local->hw_scan_req) return -ENOMEM; local->hw_scan_req->req.ssids = req->ssids; local->hw_scan_req->req.n_ssids = req->n_ssids; + /* None of the channels are actually set + * up but let UBSAN know the boundaries. + */ + local->hw_scan_req->req.n_channels = req->n_channels; + ies = (u8 *)local->hw_scan_req + sizeof(*local->hw_scan_req) + req->n_channels * sizeof(req->channels[0]); From 9dedabe95b49ec9b0d16ce8f0ed1f9a12dd4a040 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 29 May 2024 11:42:35 -0400 Subject: [PATCH 0329/1578] spi: Assign dummy scatterlist to unidirectional transfers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 8cc3bad9d9d6 ("spi: Remove unneded check for orig_nents") introduced a regression: unmapped data could now be passed to the DMA APIs, resulting in null pointer dereferences. Commit 9f788ba457b4 ("spi: Don't mark message DMA mapped when no transfer in it is") and commit da560097c056 ("spi: Check if transfer is mapped before calling DMA sync APIs") addressed the problem, but only partially. Unidirectional transactions will still result in null pointer dereference. To prevent that from happening, assign a dummy scatterlist when no data is mapped, so that the DMA API can be called and not result in a null pointer dereference. Signed-off-by: Andy Shevchenko Reported-by: Neil Armstrong Closes: https://lore.kernel.org/r/8ae675b5-fcf9-4c9b-b06a-4462f70e1322@linaro.org Reported-by: Nícolas F. R. A. Prado Closes: https://lore.kernel.org/all/d3679496-2e4e-4a7c-97ed-f193bd53af1d@notapiano Closes: https://lore.kernel.org/all/4748499f-789c-45a8-b50a-2dd09f4bac8c@notapiano Fixes: 8cc3bad9d9d6 ("spi: Remove unneded check for orig_nents") Tested-by: Nícolas F. R. A. Prado [nfraprado: wrote the commit message] Signed-off-by: Nícolas F. R. A. Prado Link: https://msgid.link/r/20240529-dma-oops-dummy-v1-1-bb43aacfb11b@collabora.com Signed-off-by: Mark Brown --- drivers/spi/spi.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index f94420858c224f..9bc9fd10d538d2 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -1220,6 +1220,11 @@ void spi_unmap_buf(struct spi_controller *ctlr, struct device *dev, spi_unmap_buf_attrs(ctlr, dev, sgt, dir, 0); } +/* Dummy SG for unidirect transfers */ +static struct scatterlist dummy_sg = { + .page_link = SG_END, +}; + static int __spi_map_msg(struct spi_controller *ctlr, struct spi_message *msg) { struct device *tx_dev, *rx_dev; @@ -1258,6 +1263,8 @@ static int __spi_map_msg(struct spi_controller *ctlr, struct spi_message *msg) attrs); if (ret != 0) return ret; + } else { + xfer->tx_sg.sgl = &dummy_sg; } if (xfer->rx_buf != NULL) { @@ -1271,6 +1278,8 @@ static int __spi_map_msg(struct spi_controller *ctlr, struct spi_message *msg) return ret; } + } else { + xfer->rx_sg.sgl = &dummy_sg; } } /* No transfer has been mapped, bail out with success */ From 06fe9b1df1086b42718d632aa57e8f7cd1a66a21 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 29 May 2024 09:38:38 -0600 Subject: [PATCH 0330/1578] io_uring: don't attempt to mmap larger than what the user asks for MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If IORING_FEAT_SINGLE_MMAP is ignored, as can happen if an application uses an ancient liburing or does setup manually, then 3 mmap's are required to map the ring into userspace. The kernel will still have collapsed the mappings, however userspace may ask for mapping them individually. If so, then we should not use the full number of ring pages, as it may exceed the partial mapping. Doing so will yield an -EFAULT from vm_insert_pages(), as we pass in more pages than what the application asked for. Cap the number of pages to match what the application asked for, for the particular mapping operation. Reported-by: Lucas Mülling Link: https://github.com/axboe/liburing/issues/1157 Fixes: 3ab1db3c6039 ("io_uring: get rid of remap_pfn_range() for mapping rings/sqes") Signed-off-by: Jens Axboe --- io_uring/memmap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 4785d6af5fee92..a0f32a255fd1e1 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -244,6 +244,7 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) struct io_ring_ctx *ctx = file->private_data; size_t sz = vma->vm_end - vma->vm_start; long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned int npages; void *ptr; ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); @@ -253,8 +254,8 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) switch (offset & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: - return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, - ctx->n_ring_pages); + npages = min(ctx->n_ring_pages, (sz + PAGE_SIZE - 1) >> PAGE_SHIFT); + return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, npages); case IORING_OFF_SQES: return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages, ctx->n_sqe_pages); From 3bd27a847a3a4827a948387cc8f0dbc9fa5931d5 Mon Sep 17 00:00:00 2001 From: Matthias Maennich Date: Tue, 28 May 2024 11:32:43 +0000 Subject: [PATCH 0331/1578] kheaders: explicitly define file modes for archived headers Build environments might be running with different umask settings resulting in indeterministic file modes for the files contained in kheaders.tar.xz. The file itself is served with 444, i.e. world readable. Archive the files explicitly with 744,a+X to improve reproducibility across build environments. --mode=0444 is not suitable as directories need to be executable. Also, 444 makes it hard to delete all the readonly files after extraction. Cc: stable@vger.kernel.org Signed-off-by: Matthias Maennich Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 6d443ea22bb732..8b6e0c2bc0dfe0 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -84,7 +84,7 @@ find $cpio_dir -type f -print0 | # Create archive and try to normalize metadata for reproducibility. tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ - --owner=0 --group=0 --sort=name --numeric-owner \ + --owner=0 --group=0 --sort=name --numeric-owner --mode=u=rw,go=r,a+X \ -I $XZ -cf $tarfile -C $cpio_dir/ . > /dev/null echo $headers_md5 > kernel/kheaders.md5 From 6e58e0173507e506a5627741358bc770f220e356 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 28 May 2024 18:31:50 +0200 Subject: [PATCH 0332/1578] kheaders: use `command -v` to test for existence of `cpio` Commit 13e1df09284d ("kheaders: explicitly validate existence of cpio command") added an explicit check for `cpio` using `type`. However, `type` in `dash` (which is used in some popular distributions and base images as the shell script runner) prints the missing message to standard output, and thus no error is printed: $ bash -c 'type missing >/dev/null' bash: line 1: type: missing: not found $ dash -c 'type missing >/dev/null' $ For instance, this issue may be seen by loongarch builders, given its defconfig enables CONFIG_IKHEADERS since commit 9cc1df421f00 ("LoongArch: Update Loongson-3 default config file"). Therefore, use `command -v` instead to have consistent behavior, and take the chance to provide a more explicit error. Fixes: 13e1df09284d ("kheaders: explicitly validate existence of cpio command") Signed-off-by: Miguel Ojeda Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 8b6e0c2bc0dfe0..383fd43ac61222 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -14,7 +14,12 @@ include/ arch/$SRCARCH/include/ " -type cpio > /dev/null +if ! command -v cpio >/dev/null; then + echo >&2 "***" + echo >&2 "*** 'cpio' could not be found." + echo >&2 "***" + exit 1 +fi # Support incremental builds by skipping archive generation # if timestamps of files being archived are not changed. From 1b1c9f0fd3fb70adf1f3b0aec58ab037d6e595d0 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Tue, 28 May 2024 15:02:32 -0500 Subject: [PATCH 0333/1578] dt-bindings: kbuild: Fix dt_binding_check on unconfigured build The 'dt_binding_check' target shouldn't depend on the kernel configuration, but it has since commit 604a57ba9781 ("dt-bindings: kbuild: Add separate target/dependency for processed-schema.json"). That is because CHECK_DT_BINDING make variable was dropped, but scripts/dtc/Makefile was missed. The CHECK_DTBS variable can be used instead. Reported-by: Francesco Dolcini Fixes: 604a57ba9781 ("dt-bindings: kbuild: Add separate target/dependency for processed-schema.json") Signed-off-by: "Rob Herring (Arm)" Signed-off-by: Masahiro Yamada --- scripts/dtc/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/dtc/Makefile b/scripts/dtc/Makefile index a186570725412f..b47f4daa4515df 100644 --- a/scripts/dtc/Makefile +++ b/scripts/dtc/Makefile @@ -3,7 +3,7 @@ # *** Also keep .gitignore in sync when changing *** hostprogs-always-$(CONFIG_DTC) += dtc fdtoverlay -hostprogs-always-$(CHECK_DT_BINDING) += dtc +hostprogs-always-$(CHECK_DTBS) += dtc dtc-objs := dtc.o flattree.o fstree.o data.o livetree.o treesource.o \ srcpos.o checks.o util.o From 4a4be1ad3a6efea16c56615f31117590fd881358 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 29 May 2024 09:39:34 -0700 Subject: [PATCH 0334/1578] Revert "vfs: Delete the associated dentry when deleting a file" This reverts commit 681ce8623567ba7e7333908e9826b77145312dda. We gave it a try, but it turns out the kernel test robot did in fact find performance regressions for it, so we'll have to look at the more involved alternative fixes for Yafang Shao's Elasticsearch load issue. There were several alternatives discussed, they just weren't as simple as this first attempt. The report is of a -7.4% regression of filebench.sum_operations/s, which appears significant enough to trigger my "this patch may get reverted if somebody finds a performance regression on some other load" rule. So it's still the case that we should end up deleting dentries more aggressively - or just be better at pruning them later - but it needs a bit more finesse than this simple thing. Link: https://lore.kernel.org/all/202405291318.4dfbb352-oliver.sang@intel.com/ Cc: Yafang Shao Cc: Al Viro Cc: Christian Brauner Signed-off-by: Linus Torvalds --- fs/dcache.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 1ee6404b430baa..407095188f83a7 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2360,17 +2360,19 @@ EXPORT_SYMBOL(d_hash_and_lookup); * - unhash this dentry and free it. * * Usually, we want to just turn this into - * a negative dentry, but certain workloads can - * generate a large number of negative dentries. - * Therefore, it would be better to simply - * unhash it. + * a negative dentry, but if anybody else is + * currently using the dentry or the inode + * we can't do that and we fall back on removing + * it from the hash queues and waiting for + * it to be deleted later when it has no users */ /** * d_delete - delete a dentry * @dentry: The dentry to delete * - * Remove the dentry from the hash queues so it can be deleted later. + * Turn the dentry into a negative dentry if possible, otherwise + * remove it from the hash queues so it can be deleted later */ void d_delete(struct dentry * dentry) @@ -2379,8 +2381,6 @@ void d_delete(struct dentry * dentry) spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); - __d_drop(dentry); - /* * Are we the only user? */ @@ -2388,6 +2388,7 @@ void d_delete(struct dentry * dentry) dentry->d_flags &= ~DCACHE_CANT_MOUNT; dentry_unlink_inode(dentry); } else { + __d_drop(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); } From ac0d71ee534e67c7e53439e8e9cb45ed40731660 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 29 May 2024 18:47:16 +0200 Subject: [PATCH 0335/1578] ALSA: ump: Don't accept an invalid UMP protocol number When a UMP Stream Configuration message is received, the driver tries to switch the protocol, but there was no sanity check of the protocol, hence it can pass an invalid value. Add the check and bail out if a wrong value is passed. Fixes: a79807683781 ("ALSA: ump: Add helper to change MIDI protocol") Cc: Link: https://lore.kernel.org/r/20240529164723.18309-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/ump.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sound/core/ump.c b/sound/core/ump.c index fd6a68a5427886..117c7ecc48563b 100644 --- a/sound/core/ump.c +++ b/sound/core/ump.c @@ -685,10 +685,17 @@ static void seq_notify_protocol(struct snd_ump_endpoint *ump) */ int snd_ump_switch_protocol(struct snd_ump_endpoint *ump, unsigned int protocol) { + unsigned int type; + protocol &= ump->info.protocol_caps; if (protocol == ump->info.protocol) return 0; + type = protocol & SNDRV_UMP_EP_INFO_PROTO_MIDI_MASK; + if (type != SNDRV_UMP_EP_INFO_PROTO_MIDI1 && + type != SNDRV_UMP_EP_INFO_PROTO_MIDI2) + return 0; + ump->info.protocol = protocol; ump_dbg(ump, "New protocol = %x (caps = %x)\n", protocol, ump->info.protocol_caps); From bc42ca002d5d211f9c57334b9b4c25ddb0b4ec35 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 29 May 2024 18:47:17 +0200 Subject: [PATCH 0336/1578] ALSA: ump: Set default protocol when not given explicitly When an inquiry of the current protocol via UMP Stream Configuration message fails by some reason, we may leave the current protocol undefined, which may lead to unexpected behavior. Better to assume a valid protocol found in the protocol capability bits instead. For a device that doesn't support the UMP v1.2 feature, it won't reach to this code path, and USB MIDI GTB descriptor would be used for determining the protocol, instead. Link: https://lore.kernel.org/r/20240529164723.18309-2-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/ump.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sound/core/ump.c b/sound/core/ump.c index 117c7ecc48563b..3f61220c23b4ec 100644 --- a/sound/core/ump.c +++ b/sound/core/ump.c @@ -967,6 +967,14 @@ int snd_ump_parse_endpoint(struct snd_ump_endpoint *ump) if (err < 0) ump_dbg(ump, "Unable to get UMP EP stream config\n"); + /* If no protocol is set by some reason, assume the valid one */ + if (!(ump->info.protocol & SNDRV_UMP_EP_INFO_PROTO_MIDI_MASK)) { + if (ump->info.protocol_caps & SNDRV_UMP_EP_INFO_PROTO_MIDI2) + ump->info.protocol |= SNDRV_UMP_EP_INFO_PROTO_MIDI2; + else if (ump->info.protocol_caps & SNDRV_UMP_EP_INFO_PROTO_MIDI1) + ump->info.protocol |= SNDRV_UMP_EP_INFO_PROTO_MIDI1; + } + /* Query and create blocks from Function Blocks */ for (blk = 0; blk < ump->info.num_blocks; blk++) { err = create_block_from_fb_info(ump, blk); From 95d7c452a26564ef0c427f2806761b857106d8c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 21 May 2024 12:52:42 +0200 Subject: [PATCH 0337/1578] spi: stm32: Don't warn about spurious interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dev_warn to notify about a spurious interrupt was introduced with the reasoning that these are unexpected. However spurious interrupts tend to trigger continously and the error message on the serial console prevents that the core's detection of spurious interrupts kicks in (which disables the irq) and just floods the console. Fixes: c64e7efe46b7 ("spi: stm32: make spurious and overrun interrupts visible") Signed-off-by: Uwe Kleine-König Link: https://msgid.link/r/20240521105241.62400-2-u.kleine-koenig@pengutronix.de Signed-off-by: Mark Brown --- drivers/spi/spi-stm32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-stm32.c b/drivers/spi/spi-stm32.c index e4e7ddb7524a99..4c4ff074e3f6f8 100644 --- a/drivers/spi/spi-stm32.c +++ b/drivers/spi/spi-stm32.c @@ -1057,7 +1057,7 @@ static irqreturn_t stm32h7_spi_irq_thread(int irq, void *dev_id) mask |= STM32H7_SPI_SR_TXP | STM32H7_SPI_SR_RXP; if (!(sr & mask)) { - dev_warn(spi->dev, "spurious IT (sr=0x%08x, ier=0x%08x)\n", + dev_vdbg(spi->dev, "spurious IT (sr=0x%08x, ier=0x%08x)\n", sr, ier); spin_unlock_irqrestore(&spi->lock, flags); return IRQ_NONE; From 84081a885394fc94055c24c727c99c321df6abac Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Fri, 3 May 2024 10:44:02 -0500 Subject: [PATCH 0338/1578] dt-bindings: arm: sunxi: Fix incorrect '-' usage Commit 6bc6bf8a940a ("dt-bindings: arm: sunxi: document Anbernic RG35XX handheld gaming device variants") mistakenly added '-' on each line which created empty (i.e. description only) schemas matching anything. This causes validation to fail on all the root node compatibles as there are multiple oneOf clauses passing. Fixes: 6bc6bf8a940a ("dt-bindings: arm: sunxi: document Anbernic RG35XX handheld gaming device variants") Reviewed-by: Ryan Walklin Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240503154402.967632-1-robh@kernel.org Signed-off-by: Rob Herring (Arm) --- Documentation/devicetree/bindings/arm/sunxi.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/arm/sunxi.yaml b/Documentation/devicetree/bindings/arm/sunxi.yaml index c6d0d8d81ed4de..c2a158b75e4979 100644 --- a/Documentation/devicetree/bindings/arm/sunxi.yaml +++ b/Documentation/devicetree/bindings/arm/sunxi.yaml @@ -57,17 +57,17 @@ properties: - const: allwinner,sun8i-v3s - description: Anbernic RG35XX (2024) - - items: + items: - const: anbernic,rg35xx-2024 - const: allwinner,sun50i-h700 - description: Anbernic RG35XX Plus - - items: + items: - const: anbernic,rg35xx-plus - const: allwinner,sun50i-h700 - description: Anbernic RG35XX H - - items: + items: - const: anbernic,rg35xx-h - const: allwinner,sun50i-h700 From 321e4fa68ce15660ec578bdec5cc9607635087cf Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Thu, 23 May 2024 10:42:07 -0500 Subject: [PATCH 0339/1578] dt-bindings: arm: stm32: st,mlahb: Drop spurious "reg" property from example "reg" is not documented nor used for st,mlahb, so drop it from the example to fix the warning: Documentation/devicetree/bindings/arm/stm32/st,mlahb.example.dtb: ahb@38000000: Unevaluated properties are not allowed ('reg' was unexpected) from schema $id: http://devicetree.org/schemas/arm/stm32/st,mlahb.yaml# Since "reg" is dropped, the unit-address must be as well. Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20240523154208.2457864-1-robh@kernel.org Signed-off-by: Rob Herring (Arm) --- Documentation/devicetree/bindings/arm/stm32/st,mlahb.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/arm/stm32/st,mlahb.yaml b/Documentation/devicetree/bindings/arm/stm32/st,mlahb.yaml index d2dce238ff5d68..3e996346b26448 100644 --- a/Documentation/devicetree/bindings/arm/stm32/st,mlahb.yaml +++ b/Documentation/devicetree/bindings/arm/stm32/st,mlahb.yaml @@ -54,11 +54,10 @@ unevaluatedProperties: false examples: - | - mlahb: ahb@38000000 { + ahb { compatible = "st,mlahb", "simple-bus"; #address-cells = <1>; #size-cells = <1>; - reg = <0x10000000 0x40000>; ranges; dma-ranges = <0x00000000 0x38000000 0x10000>, <0x10000000 0x10000000 0x60000>, From 2032e61e24fe9fe55d6c7a34fb5506c911b3e280 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 16 May 2024 16:27:33 +0100 Subject: [PATCH 0340/1578] kselftest/alsa: Ensure _GNU_SOURCE is defined The pcmtest driver tests use the kselftest harness which requires that _GNU_SOURCE is defined but nothing causes it to be defined. Since the KHDR_INCLUDES Makefile variable has had the required define added let's use that, this should provide some futureproofing. Fixes: daef47b89efd ("selftests: Compile kselftest headers with -D_GNU_SOURCE") Signed-off-by: Mark Brown Reviewed-by: Muhammad Usama Anjum Signed-off-by: Shuah Khan --- tools/testing/selftests/alsa/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/alsa/Makefile b/tools/testing/selftests/alsa/Makefile index 5af9ba8a4645bc..c1ce39874e2b54 100644 --- a/tools/testing/selftests/alsa/Makefile +++ b/tools/testing/selftests/alsa/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 # -CFLAGS += $(shell pkg-config --cflags alsa) +CFLAGS += $(shell pkg-config --cflags alsa) $(KHDR_INCLUDES) LDLIBS += $(shell pkg-config --libs alsa) ifeq ($(LDLIBS),) LDLIBS += -lasound From f6c3c83db1d939ebdb8c8922748ae647d8126d91 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 21 May 2024 09:00:22 +0900 Subject: [PATCH 0341/1578] selftests/ftrace: Fix to check required event file The dynevent/test_duplicates.tc test case uses `syscalls/sys_enter_openat` event for defining eprobe on it. Since this `syscalls` events depend on CONFIG_FTRACE_SYSCALLS=y, if it is not set, the test will fail. Add the event file to `required` line so that the test will return `unsupported` result. Fixes: 297e1dcdca3d ("selftests/ftrace: Add selftest for testing duplicate eprobes and kprobes") Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Shuah Khan --- .../testing/selftests/ftrace/test.d/dynevent/test_duplicates.tc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/test_duplicates.tc b/tools/testing/selftests/ftrace/test.d/dynevent/test_duplicates.tc index d3a79da215c8b0..5f72abe6fa79bb 100644 --- a/tools/testing/selftests/ftrace/test.d/dynevent/test_duplicates.tc +++ b/tools/testing/selftests/ftrace/test.d/dynevent/test_duplicates.tc @@ -1,7 +1,7 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 # description: Generic dynamic event - check if duplicate events are caught -# requires: dynamic_events "e[:[/][]] . []":README +# requires: dynamic_events "e[:[/][]] . []":README events/syscalls/sys_enter_openat echo 0 > events/enable From 7ea794604bf6a3be8bb4b0f1483eb1d3972eac93 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 21 May 2024 09:00:32 +0900 Subject: [PATCH 0342/1578] selftests/ftrace: Update required config Update required config options for running all tests. This also sorts the config entries alphabetically. Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Shuah Khan --- tools/testing/selftests/ftrace/config | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/ftrace/config b/tools/testing/selftests/ftrace/config index e59d985eeff0c8..048a312abf4052 100644 --- a/tools/testing/selftests/ftrace/config +++ b/tools/testing/selftests/ftrace/config @@ -1,16 +1,28 @@ -CONFIG_KPROBES=y +CONFIG_BPF_SYSCALL=y +CONFIG_DEBUG_INFO_BTF=y +CONFIG_DEBUG_INFO_DWARF4=y +CONFIG_EPROBE_EVENTS=y +CONFIG_FPROBE=y +CONFIG_FPROBE_EVENTS=y CONFIG_FTRACE=y +CONFIG_FTRACE_SYSCALLS=y +CONFIG_FUNCTION_GRAPH_RETVAL=y CONFIG_FUNCTION_PROFILER=y -CONFIG_TRACER_SNAPSHOT=y -CONFIG_STACK_TRACER=y CONFIG_HIST_TRIGGERS=y -CONFIG_SCHED_TRACER=y -CONFIG_PREEMPT_TRACER=y CONFIG_IRQSOFF_TRACER=y -CONFIG_PREEMPTIRQ_DELAY_TEST=m +CONFIG_KALLSYMS_ALL=y +CONFIG_KPROBES=y +CONFIG_KPROBE_EVENTS=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y +CONFIG_PREEMPTIRQ_DELAY_TEST=m +CONFIG_PREEMPT_TRACER=y +CONFIG_PROBE_EVENTS_BTF_ARGS=y CONFIG_SAMPLES=y CONFIG_SAMPLE_FTRACE_DIRECT=m CONFIG_SAMPLE_TRACE_PRINTK=m -CONFIG_KALLSYMS_ALL=y +CONFIG_SCHED_TRACER=y +CONFIG_STACK_TRACER=y +CONFIG_TRACER_SNAPSHOT=y +CONFIG_UPROBES=y +CONFIG_UPROBE_EVENTS=y From 23a4b108accc29a6125ed14de4a044689ffeda78 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Mon, 20 May 2024 20:57:37 -0400 Subject: [PATCH 0343/1578] tracing/selftests: Fix kprobe event name test for .isra. functions The kprobe_eventname.tc test checks if a function with .isra. can have a kprobe attached to it. It loops through the kallsyms file for all the functions that have the .isra. name, and checks if it exists in the available_filter_functions file, and if it does, it uses it to attach a kprobe to it. The issue is that kprobes can not attach to functions that are listed more than once in available_filter_functions. With the latest kernel, the function that is found is: rapl_event_update.isra.0 # grep rapl_event_update.isra.0 /sys/kernel/tracing/available_filter_functions rapl_event_update.isra.0 rapl_event_update.isra.0 It is listed twice. This causes the attached kprobe to it to fail which in turn fails the test. Instead of just picking the function function that is found in available_filter_functions, pick the first one that is listed only once in available_filter_functions. Cc: stable@vger.kernel.org Fixes: 604e3548236d ("selftests/ftrace: Select an existing function in kprobe_eventname test") Signed-off-by: Steven Rostedt (Google) Acked-by: Masami Hiramatsu (Google) Signed-off-by: Shuah Khan --- .../testing/selftests/ftrace/test.d/kprobe/kprobe_eventname.tc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_eventname.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_eventname.tc index 1f6981ef7afa06..ba19b81cef39af 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_eventname.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_eventname.tc @@ -30,7 +30,8 @@ find_dot_func() { fi grep " [tT] .*\.isra\..*" /proc/kallsyms | cut -f 3 -d " " | while read f; do - if grep -s $f available_filter_functions; then + cnt=`grep -s $f available_filter_functions | wc -l`; + if [ $cnt -eq 1 ]; then echo $f break fi From bc4d5f5d2debf8bb65fba188313481549ead8576 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 21 May 2024 13:01:11 +1000 Subject: [PATCH 0344/1578] selftests: cachestat: Fix build warnings on ppc64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix warnings like: test_cachestat.c: In function ‘print_cachestat’: test_cachestat.c:30:38: warning: format ‘%llu’ expects argument of type ‘long long unsigned int’, but argument 2 has type ‘__u64’ {aka ‘long unsigned int’} [-Wformat=] By switching to unsigned long long for u64 for ppc64 builds. Signed-off-by: Michael Ellerman Signed-off-by: Shuah Khan --- tools/testing/selftests/cachestat/test_cachestat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/cachestat/test_cachestat.c b/tools/testing/selftests/cachestat/test_cachestat.c index b171fd53b004e8..632ab44737ec3a 100644 --- a/tools/testing/selftests/cachestat/test_cachestat.c +++ b/tools/testing/selftests/cachestat/test_cachestat.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #define _GNU_SOURCE +#define __SANE_USERSPACE_TYPES__ // Use ll64 #include #include From 84b6df4c49a1cc2854a16937acd5fd3e6315d083 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 21 May 2024 13:03:25 +1000 Subject: [PATCH 0345/1578] selftests/openat2: Fix build warnings on ppc64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix warnings like: openat2_test.c: In function ‘test_openat2_flags’: openat2_test.c:303:73: warning: format ‘%llX’ expects argument of type ‘long long unsigned int’, but argument 5 has type ‘__u64’ {aka ‘long unsigned int’} [-Wformat=] By switching to unsigned long long for u64 for ppc64 builds. Signed-off-by: Michael Ellerman Reviewed-by: Muhammad Usama Anjum Signed-off-by: Shuah Khan --- tools/testing/selftests/openat2/openat2_test.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/openat2/openat2_test.c b/tools/testing/selftests/openat2/openat2_test.c index 9024754530b230..5790ab446527f1 100644 --- a/tools/testing/selftests/openat2/openat2_test.c +++ b/tools/testing/selftests/openat2/openat2_test.c @@ -5,6 +5,7 @@ */ #define _GNU_SOURCE +#define __SANE_USERSPACE_TYPES__ // Use ll64 #include #include #include From e8b8c5264d4ebd248f60a5cef077fe615806e7a0 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 21 May 2024 12:26:16 +1000 Subject: [PATCH 0346/1578] selftests/overlayfs: Fix build error on ppc64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix build error on ppc64: dev_in_maps.c: In function ‘get_file_dev_and_inode’: dev_in_maps.c:60:59: error: format ‘%llu’ expects argument of type ‘long long unsigned int *’, but argument 7 has type ‘__u64 *’ {aka ‘long unsigned int *’} [-Werror=format=] By switching to unsigned long long for u64 for ppc64 builds. Signed-off-by: Michael Ellerman Signed-off-by: Shuah Khan --- tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c b/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c index 759f86e7d263e4..2862aae58b79ac 100644 --- a/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c +++ b/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #define _GNU_SOURCE +#define __SANE_USERSPACE_TYPES__ // Use ll64 #include #include From 2607133196c35f31892ee199ce7ffa717bea4ad1 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Mon, 27 May 2024 17:14:12 -0700 Subject: [PATCH 0347/1578] clk: sifive: Do not register clkdevs for PRCI clocks These clkdevs were unnecessary, because systems using this driver always look up clocks using the devicetree. And as Russell King points out[1], since the provided device name was truncated, lookups via clkdev would never match. Recently, commit 8d532528ff6a ("clkdev: report over-sized strings when creating clkdev entries") caused clkdev registration to fail due to the truncation, and this now prevents the driver from probing. Fix the driver by removing the clkdev registration. Link: https://lore.kernel.org/linux-clk/ZkfYqj+OcAxd9O2t@shell.armlinux.org.uk/ [1] Fixes: 30b8e27e3b58 ("clk: sifive: add a driver for the SiFive FU540 PRCI IP block") Fixes: 8d532528ff6a ("clkdev: report over-sized strings when creating clkdev entries") Reported-by: Guenter Roeck Closes: https://lore.kernel.org/linux-clk/7eda7621-0dde-4153-89e4-172e4c095d01@roeck-us.net/ Suggested-by: Russell King Signed-off-by: Samuel Holland Link: https://lore.kernel.org/r/20240528001432.1200403-1-samuel.holland@sifive.com Signed-off-by: Stephen Boyd --- drivers/clk/sifive/sifive-prci.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/clk/sifive/sifive-prci.c b/drivers/clk/sifive/sifive-prci.c index 25b8e1a80ddce0..b32a59fe55e749 100644 --- a/drivers/clk/sifive/sifive-prci.c +++ b/drivers/clk/sifive/sifive-prci.c @@ -4,7 +4,6 @@ * Copyright (C) 2020 Zong Li */ -#include #include #include #include @@ -537,13 +536,6 @@ static int __prci_register_clocks(struct device *dev, struct __prci_data *pd, return r; } - r = clk_hw_register_clkdev(&pic->hw, pic->name, dev_name(dev)); - if (r) { - dev_warn(dev, "Failed to register clkdev for %s: %d\n", - init.name, r); - return r; - } - pd->hw_clks.hws[i] = &pic->hw; } From 7b038b564b3e2a752d2211e7b0c3c29fd2f6e197 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 29 May 2024 16:28:41 -0400 Subject: [PATCH 0348/1578] bcachefs: Fix failure to return error on misaligned dio write This was reported as an error when running coreutils shred. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-direct.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 09d21aef879af8..049b61bc9a5b38 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -609,8 +609,10 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) if (unlikely(ret)) goto err_put_write_ref; - if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) + if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) { + ret = -EINVAL; goto err_put_write_ref; + } inode_dio_begin(&inode->v); bch2_pagecache_block_get(inode); From ba46b3bda296c4f82b061ac40b90f49d2a00a380 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 15 May 2024 11:25:49 -0400 Subject: [PATCH 0349/1578] drm/amdgpu: Adjust logic in amdgpu_device_partner_bandwidth() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use current speed/width on devices which don't support dynamic PCIe switching. Fixes: 466a7d115326 ("drm/amd: Use the first non-dGPU PCI device for BW limits") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3289 Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 861ccff78af954..932dc93b2e631d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5944,13 +5944,18 @@ static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, *speed = PCI_SPEED_UNKNOWN; *width = PCIE_LNK_WIDTH_UNKNOWN; - while ((parent = pci_upstream_bridge(parent))) { - /* skip upstream/downstream switches internal to dGPU*/ - if (parent->vendor == PCI_VENDOR_ID_ATI) - continue; - *speed = pcie_get_speed_cap(parent); - *width = pcie_get_width_cap(parent); - break; + if (amdgpu_device_pcie_dynamic_switching_supported(adev)) { + while ((parent = pci_upstream_bridge(parent))) { + /* skip upstream/downstream switches internal to dGPU*/ + if (parent->vendor == PCI_VENDOR_ID_ATI) + continue; + *speed = pcie_get_speed_cap(parent); + *width = pcie_get_width_cap(parent); + break; + } + } else { + /* use the current speeds rather than max if switching is not supported */ + pcie_bandwidth_available(adev->pdev, NULL, speed, width); } } From 05d9e24ddb15160164ba6e917a88c00907dc2434 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 16 May 2024 09:51:26 -0400 Subject: [PATCH 0350/1578] drm/amdgpu: silence UBSAN warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert a variable sized array from [1] to []. Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/include/atomfirmware.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/include/atomfirmware.h b/drivers/gpu/drm/amd/include/atomfirmware.h index 1acb2d2c5597b5..5716918372001a 100644 --- a/drivers/gpu/drm/amd/include/atomfirmware.h +++ b/drivers/gpu/drm/amd/include/atomfirmware.h @@ -3583,7 +3583,7 @@ struct atom_gpio_voltage_object_v4 uint8_t phase_delay_us; // phase delay in unit of micro second uint8_t reserved; uint32_t gpio_mask_val; // GPIO Mask value - struct atom_voltage_gpio_map_lut voltage_gpio_lut[1]; + struct atom_voltage_gpio_map_lut voltage_gpio_lut[] __counted_by(gpio_entry_num); }; struct atom_svid2_voltage_object_v4 From a0cf36546cc24ae1c95d72253c7795d4d2fc77aa Mon Sep 17 00:00:00 2001 From: Jesse Zhang Date: Thu, 23 May 2024 17:14:45 +0800 Subject: [PATCH 0351/1578] drm/amdgpu: fix dereference null return value for the function amdgpu_vm_pt_parent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pointer parent may be NULLed by the function amdgpu_vm_pt_parent. To make the code more robust, check the pointer parent. Signed-off-by: Jesse Zhang Suggested-by: Christian König Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c index 7fdd306a48a0ec..f07647a9a9d97b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c @@ -706,11 +706,15 @@ int amdgpu_vm_pde_update(struct amdgpu_vm_update_params *params, struct amdgpu_vm_bo_base *entry) { struct amdgpu_vm_bo_base *parent = amdgpu_vm_pt_parent(entry); - struct amdgpu_bo *bo = parent->bo, *pbo; + struct amdgpu_bo *bo, *pbo; struct amdgpu_vm *vm = params->vm; uint64_t pde, pt, flags; unsigned int level; + if (WARN_ON(!parent)) + return -EINVAL; + + bo = parent->bo; for (level = 0, pbo = bo->parent; pbo; ++level) pbo = pbo->parent; From dd2b75fd9a79bf418e088656822af06fc253dbe3 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 20 May 2024 14:41:31 -0400 Subject: [PATCH 0352/1578] Revert "drm/amdkfd: fix gfx_target_version for certain 11.0.3 devices" This reverts commit 28ebbb4981cb1fad12e0b1227dbecc88810b1ee8. Revert this commit as apparently the LLVM code to take advantage of this never landed. Reviewed-by: Feifei Xu Signed-off-by: Alex Deucher Cc: Feifei Xu --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 9596bca572129d..afc57df421cd9c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -408,15 +408,8 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf) f2g = &gfx_v11_kfd2kgd; break; case IP_VERSION(11, 0, 3): - if ((adev->pdev->device == 0x7460 && - adev->pdev->revision == 0x00) || - (adev->pdev->device == 0x7461 && - adev->pdev->revision == 0x00)) - /* Note: Compiler version is 11.0.5 while HW version is 11.0.3 */ - gfx_target_version = 110005; - else - /* Note: Compiler version is 11.0.1 while HW version is 11.0.3 */ - gfx_target_version = 110001; + /* Note: Compiler version is 11.0.1 while HW version is 11.0.3 */ + gfx_target_version = 110001; f2g = &gfx_v11_kfd2kgd; break; case IP_VERSION(11, 5, 0): From 1f327dfc846ae82e16e52ed9c559d566826486d2 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 22 May 2024 15:26:50 -0400 Subject: [PATCH 0353/1578] drm/amdkfd: simplify APU VRAM handling With commit 89773b85599a ("drm/amdkfd: Let VRAM allocations go to GTT domain on small APUs") big and small APU "VRAM" handling in KFD was unified. Since AMD_IS_APU is set for both big and small APUs, we can simplify the checks in the code. v2: clean up a few more places (Lang) Acked-by: Felix Kuehling Reviewed-by: Lang Yu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 ++++++++-------- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 6 ++---- drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 1 - 4 files changed, 11 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 8975cf41a91ac9..48ad0c04aa72b7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -196,7 +196,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, return -EINVAL; vram_size = KFD_XCP_MEMORY_SIZE(adev, xcp_id); - if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) { + if (adev->flags & AMD_IS_APU) { system_mem_needed = size; ttm_mem_needed = size; } @@ -233,7 +233,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, if (adev && xcp_id >= 0) { adev->kfd.vram_used[xcp_id] += vram_needed; adev->kfd.vram_used_aligned[xcp_id] += - (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) ? + (adev->flags & AMD_IS_APU) ? vram_needed : ALIGN(vram_needed, VRAM_AVAILABLITY_ALIGN); } @@ -261,7 +261,7 @@ void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev, if (adev) { adev->kfd.vram_used[xcp_id] -= size; - if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) { + if (adev->flags & AMD_IS_APU) { adev->kfd.vram_used_aligned[xcp_id] -= size; kfd_mem_limit.system_mem_used -= size; kfd_mem_limit.ttm_mem_used -= size; @@ -890,7 +890,7 @@ static int kfd_mem_attach(struct amdgpu_device *adev, struct kgd_mem *mem, * if peer device has large BAR. In contrast, access over xGMI is * allowed for both small and large BAR configurations of peer device */ - if ((adev != bo_adev && !(adev->gmc.is_app_apu || adev->flags & AMD_IS_APU)) && + if ((adev != bo_adev && !(adev->flags & AMD_IS_APU)) && ((mem->domain == AMDGPU_GEM_DOMAIN_VRAM) || (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) || (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP))) { @@ -1658,7 +1658,7 @@ size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev, - atomic64_read(&adev->vram_pin_size) - reserved_for_pt; - if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) { + if (adev->flags & AMD_IS_APU) { system_mem_available = no_system_mem_limit ? kfd_mem_limit.max_system_mem_limit : kfd_mem_limit.max_system_mem_limit - @@ -1706,7 +1706,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( if (flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) { domain = alloc_domain = AMDGPU_GEM_DOMAIN_VRAM; - if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) { + if (adev->flags & AMD_IS_APU) { domain = AMDGPU_GEM_DOMAIN_GTT; alloc_domain = AMDGPU_GEM_DOMAIN_GTT; alloc_flags = 0; @@ -1953,7 +1953,7 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( if (size) { if (!is_imported && (mem->bo->preferred_domains == AMDGPU_GEM_DOMAIN_VRAM || - ((adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) && + ((adev->flags & AMD_IS_APU) && mem->bo->preferred_domains == AMDGPU_GEM_DOMAIN_GTT))) *size = bo_size; else @@ -2376,7 +2376,7 @@ static int import_obj_create(struct amdgpu_device *adev, (*mem)->bo = bo; (*mem)->va = va; (*mem)->domain = (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) && - !(adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) ? + !(adev->flags & AMD_IS_APU) ? AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT; (*mem)->mapped_to_gpu_memory = 0; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index 4816fcb9803a9d..8ee3d07ffbdfa2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -1023,7 +1023,7 @@ int kgd2kfd_init_zone_device(struct amdgpu_device *adev) if (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(9, 0, 1)) return -EINVAL; - if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) + if (adev->flags & AMD_IS_APU) return 0; pgmap = &kfddev->pgmap; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 069b81eeea03cb..31e500859ab012 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -2619,8 +2619,7 @@ svm_range_best_restore_location(struct svm_range *prange, return -1; } - if (node->adev->gmc.is_app_apu || - node->adev->flags & AMD_IS_APU) + if (node->adev->flags & AMD_IS_APU) return 0; if (prange->preferred_loc == gpuid || @@ -3338,8 +3337,7 @@ svm_range_best_prefetch_location(struct svm_range *prange) goto out; } - if (bo_node->adev->gmc.is_app_apu || - bo_node->adev->flags & AMD_IS_APU) { + if (bo_node->adev->flags & AMD_IS_APU) { best_loc = 0; goto out; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h index 9c37bd0567efa5..70c1776611c472 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h @@ -201,7 +201,6 @@ void svm_range_list_lock_and_flush_work(struct svm_range_list *svms, struct mm_s * is initialized to not 0 when page migration register device memory. */ #define KFD_IS_SVM_API_SUPPORTED(adev) ((adev)->kfd.pgmap.type != 0 ||\ - (adev)->gmc.is_app_apu ||\ ((adev)->flags & AMD_IS_APU)) void svm_range_bo_unref_async(struct svm_range_bo *svm_bo); From a9bc5a19e4958fe664254d1ad2dc2a9f5868c210 Mon Sep 17 00:00:00 2001 From: Rajneesh Bhardwaj Date: Wed, 22 May 2024 15:04:29 -0400 Subject: [PATCH 0354/1578] drm/amdgpu: Make CPX mode auto default in NPS4 On GFXIP9.4.3, make CPX mode as the default compute mode if the node is setup in NPS4 memory partition mode. This change is only applicable for dGPU, for APU, continue to use TPX mode. Reviewed-by: Felix Kuehling Signed-off-by: Rajneesh Bhardwaj Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c index 414ea3f560a7a5..d4e2aed2efa331 100644 --- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c +++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c @@ -422,7 +422,7 @@ __aqua_vanjaram_get_auto_mode(struct amdgpu_xcp_mgr *xcp_mgr) if (adev->gmc.num_mem_partitions == num_xcc / 2) return (adev->flags & AMD_IS_APU) ? AMDGPU_TPX_PARTITION_MODE : - AMDGPU_QPX_PARTITION_MODE; + AMDGPU_CPX_PARTITION_MODE; if (adev->gmc.num_mem_partitions == 2 && !(adev->flags & AMD_IS_APU)) return AMDGPU_DPX_PARTITION_MODE; From 67c7d4fa267bcfe8d68fb36d938e3c6e0912b57d Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 9 May 2024 13:37:27 +0200 Subject: [PATCH 0355/1578] drm/amd/pm: remove deprecated I2C_CLASS_SPD support from newly added SMU_14_0_2 Support for I2C_CLASS_SPD is currently being removed from the kernel. Only remaining step is to remove the definition of I2C_CLASS_SPD. Setting I2C_CLASS_SPD in a driver is a no-op meanwhile, so remove it here. Reviewed-by: Alex Deucher Signed-off-by: Heiner Kallweit Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c index 706265220292a9..90703f4542aba5 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c @@ -1562,7 +1562,6 @@ static int smu_v14_0_2_i2c_control_init(struct smu_context *smu) smu_i2c->port = i; mutex_init(&smu_i2c->mutex); control->owner = THIS_MODULE; - control->class = I2C_CLASS_SPD; control->dev.parent = &adev->pdev->dev; control->algo = &smu_v14_0_2_i2c_algo; snprintf(control->name, sizeof(control->name), "AMDGPU SMU %d", i); From fa0bc8f297b29126b5ae983406e9bc76d48a9a8e Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Wed, 22 May 2024 23:08:09 +0200 Subject: [PATCH 0356/1578] hwmon: (dell-smm) Add Dell G15 5511 to fan control whitelist MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A user reported that he needs to disable BIOS fan control on his Dell G15 5511 in order to be able to control the fans. Closes: https://github.com/Wer-Wolf/i8kutils/issues/5 Signed-off-by: Armin Wolf Acked-by: Pali Rohár Link: https://lore.kernel.org/r/20240522210809.294488-1-W_Armin@gmx.de Signed-off-by: Guenter Roeck --- drivers/hwmon/dell-smm-hwmon.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/hwmon/dell-smm-hwmon.c b/drivers/hwmon/dell-smm-hwmon.c index 48a81c64f00d25..942526bd4775fa 100644 --- a/drivers/hwmon/dell-smm-hwmon.c +++ b/drivers/hwmon/dell-smm-hwmon.c @@ -1545,6 +1545,14 @@ static const struct dmi_system_id i8k_whitelist_fan_control[] __initconst = { }, .driver_data = (void *)&i8k_fan_control_data[I8K_FAN_30A3_31A3], }, + { + .ident = "Dell G15 5511", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "Dell G15 5511"), + }, + .driver_data = (void *)&i8k_fan_control_data[I8K_FAN_30A3_31A3], + }, { } }; From a94ff8e50c20bde6d50864849a98b106e45d30c6 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Thu, 23 May 2024 17:47:14 +0200 Subject: [PATCH 0357/1578] hwmon: (ltc2992) Fix memory leak in ltc2992_parse_dt() A new error path was added to the fwnode_for_each_available_node() loop in ltc2992_parse_dt(), which leads to an early return that requires a call to fwnode_handle_put() to avoid a memory leak in that case. Add the missing fwnode_handle_put() in the error path from a zero value shunt resistor. Cc: stable@vger.kernel.org Fixes: 10b029020487 ("hwmon: (ltc2992) Avoid division by zero") Signed-off-by: Javier Carrasco Link: https://lore.kernel.org/r/20240523-fwnode_for_each_available_child_node_scoped-v2-1-701f3a03f2fb@gmail.com Signed-off-by: Guenter Roeck --- drivers/hwmon/ltc2992.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/hwmon/ltc2992.c b/drivers/hwmon/ltc2992.c index 229aed15d5caa7..d4a93223cd3b34 100644 --- a/drivers/hwmon/ltc2992.c +++ b/drivers/hwmon/ltc2992.c @@ -876,9 +876,11 @@ static int ltc2992_parse_dt(struct ltc2992_state *st) ret = fwnode_property_read_u32(child, "shunt-resistor-micro-ohms", &val); if (!ret) { - if (!val) + if (!val) { + fwnode_handle_put(child); return dev_err_probe(&st->client->dev, -EINVAL, "shunt resistor value cannot be zero\n"); + } st->r_sense_uohm[addr] = val; } } From 92f1655aa2b2294d0b49925f3b875a634bd3b59e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 May 2024 11:43:53 +0000 Subject: [PATCH 0358/1578] net: fix __dst_negative_advice() race __dst_negative_advice() does not enforce proper RCU rules when sk->dst_cache must be cleared, leading to possible UAF. RCU rules are that we must first clear sk->sk_dst_cache, then call dst_release(old_dst). Note that sk_dst_reset(sk) is implementing this protocol correctly, while __dst_negative_advice() uses the wrong order. Given that ip6_negative_advice() has special logic against RTF_CACHE, this means each of the three ->negative_advice() existing methods must perform the sk_dst_reset() themselves. Note the check against NULL dst is centralized in __dst_negative_advice(), there is no need to duplicate it in various callbacks. Many thanks to Clement Lecigne for tracking this issue. This old bug became visible after the blamed commit, using UDP sockets. Fixes: a87cb3e48ee8 ("net: Facility to report route quality of connected sockets") Reported-by: Clement Lecigne Diagnosed-by: Clement Lecigne Signed-off-by: Eric Dumazet Cc: Tom Herbert Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240528114353.1794151-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/dst_ops.h | 2 +- include/net/sock.h | 13 +++---------- net/ipv4/route.c | 22 ++++++++-------------- net/ipv6/route.c | 29 +++++++++++++++-------------- net/xfrm/xfrm_policy.c | 11 +++-------- 5 files changed, 30 insertions(+), 47 deletions(-) diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h index 6d1c8541183dbe..3a9001a042a5c3 100644 --- a/include/net/dst_ops.h +++ b/include/net/dst_ops.h @@ -24,7 +24,7 @@ struct dst_ops { void (*destroy)(struct dst_entry *); void (*ifdown)(struct dst_entry *, struct net_device *dev); - struct dst_entry * (*negative_advice)(struct dst_entry *); + void (*negative_advice)(struct sock *sk, struct dst_entry *); void (*link_failure)(struct sk_buff *); void (*update_pmtu)(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, diff --git a/include/net/sock.h b/include/net/sock.h index 5f4d0629348f3f..953c8dc4e259e8 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2063,17 +2063,10 @@ sk_dst_get(const struct sock *sk) static inline void __dst_negative_advice(struct sock *sk) { - struct dst_entry *ndst, *dst = __sk_dst_get(sk); + struct dst_entry *dst = __sk_dst_get(sk); - if (dst && dst->ops->negative_advice) { - ndst = dst->ops->negative_advice(dst); - - if (ndst != dst) { - rcu_assign_pointer(sk->sk_dst_cache, ndst); - sk_tx_queue_clear(sk); - WRITE_ONCE(sk->sk_dst_pending_confirm, 0); - } - } + if (dst && dst->ops->negative_advice) + dst->ops->negative_advice(sk, dst); } static inline void dst_negative_advice(struct sock *sk) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 5fd54103174f72..b3073d1c8f8f71 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -129,7 +129,8 @@ struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ipv4_default_advmss(const struct dst_entry *dst); INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst); -static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); +static void ipv4_negative_advice(struct sock *sk, + struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, @@ -825,22 +826,15 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf __ip_do_redirect(rt, skb, &fl4, true); } -static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) +static void ipv4_negative_advice(struct sock *sk, + struct dst_entry *dst) { struct rtable *rt = dst_rtable(dst); - struct dst_entry *ret = dst; - if (rt) { - if (dst->obsolete > 0) { - ip_rt_put(rt); - ret = NULL; - } else if ((rt->rt_flags & RTCF_REDIRECTED) || - rt->dst.expires) { - ip_rt_put(rt); - ret = NULL; - } - } - return ret; + if ((dst->obsolete > 0) || + (rt->rt_flags & RTCF_REDIRECTED) || + rt->dst.expires) + sk_dst_reset(sk); } /* diff --git a/net/ipv6/route.c b/net/ipv6/route.c index bbc2a0dd931429..a504b88ec06b5a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -87,7 +87,8 @@ struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ip6_default_advmss(const struct dst_entry *dst); INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst); -static struct dst_entry *ip6_negative_advice(struct dst_entry *); +static void ip6_negative_advice(struct sock *sk, + struct dst_entry *dst); static void ip6_dst_destroy(struct dst_entry *); static void ip6_dst_ifdown(struct dst_entry *, struct net_device *dev); @@ -2770,24 +2771,24 @@ INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, } EXPORT_INDIRECT_CALLABLE(ip6_dst_check); -static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) +static void ip6_negative_advice(struct sock *sk, + struct dst_entry *dst) { struct rt6_info *rt = dst_rt6_info(dst); - if (rt) { - if (rt->rt6i_flags & RTF_CACHE) { - rcu_read_lock(); - if (rt6_check_expired(rt)) { - rt6_remove_exception_rt(rt); - dst = NULL; - } - rcu_read_unlock(); - } else { - dst_release(dst); - dst = NULL; + if (rt->rt6i_flags & RTF_CACHE) { + rcu_read_lock(); + if (rt6_check_expired(rt)) { + /* counteract the dst_release() in sk_dst_reset() */ + dst_hold(dst); + sk_dst_reset(sk); + + rt6_remove_exception_rt(rt); } + rcu_read_unlock(); + return; } - return dst; + sk_dst_reset(sk); } static void ip6_link_failure(struct sk_buff *skb) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 475b904fe68b8f..66e07de2de35cd 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3910,15 +3910,10 @@ static void xfrm_link_failure(struct sk_buff *skb) /* Impossible. Such dst must be popped before reaches point of failure. */ } -static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst) +static void xfrm_negative_advice(struct sock *sk, struct dst_entry *dst) { - if (dst) { - if (dst->obsolete) { - dst_release(dst); - dst = NULL; - } - } - return dst; + if (dst->obsolete) + sk_dst_reset(sk); } static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr) From b8c8abefc07b47f0dc9342530b7618237df96724 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 28 May 2024 22:30:30 +0200 Subject: [PATCH 0359/1578] ipv4: correctly iterate over the target netns in inet_dump_ifaddr() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A recent change to inet_dump_ifaddr had the function incorrectly iterate over net rather than tgt_net, resulting in the data coming for the incorrect network namespace. Fixes: cdb2f80f1c10 ("inet: use xa_array iterator to implement inet_dump_ifaddr()") Reported-by: Stéphane Graber Closes: https://github.com/lxc/incus/issues/892 Bisected-by: Stéphane Graber Signed-off-by: Alexander Mikhalitsyn Tested-by: Stéphane Graber Acked-by: Christian Brauner Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240528203030.10839-1-aleksandr.mikhalitsyn@canonical.com Signed-off-by: Jakub Kicinski --- net/ipv4/devinet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index e827da128c5f55..f3892ee9dfb33f 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1903,7 +1903,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) cb->seq = inet_base_seq(tgt_net); - for_each_netdev_dump(net, dev, ctx->ifindex) { + for_each_netdev_dump(tgt_net, dev, ctx->ifindex) { in_dev = __in_dev_get_rcu(dev); if (!in_dev) continue; From 278d65ccdadb5f0fa0ceaf7b9cc97b305cd72822 Mon Sep 17 00:00:00 2001 From: Tristram Ha Date: Tue, 28 May 2024 14:34:26 -0700 Subject: [PATCH 0360/1578] net: dsa: microchip: fix RGMII error in KSZ DSA driver The driver should return RMII interface when XMII is running in RMII mode. Fixes: 0ab7f6bf1675 ("net: dsa: microchip: ksz9477: use common xmii function") Signed-off-by: Tristram Ha Acked-by: Arun Ramadoss Acked-by: Jerry Ray Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/1716932066-3342-1-git-send-email-Tristram.Ha@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/ksz_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 1e0085cd9a9ad5..2818e24e2a51dd 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -3142,7 +3142,7 @@ phy_interface_t ksz_get_xmii(struct ksz_device *dev, int port, bool gbit) else interface = PHY_INTERFACE_MODE_MII; } else if (val == bitval[P_RMII_SEL]) { - interface = PHY_INTERFACE_MODE_RGMII; + interface = PHY_INTERFACE_MODE_RMII; } else { interface = PHY_INTERFACE_MODE_RGMII; if (data8 & P_RGMII_ID_EG_ENABLE) From bfd546a552e140b0a4c8a21527c39d6d21addb28 Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Tue, 28 May 2024 15:06:04 -0700 Subject: [PATCH 0361/1578] e1000e: move force SMBUS near the end of enable_ulp function The commit 861e8086029e ("e1000e: move force SMBUS from enable ulp function to avoid PHY loss issue") introduces a regression on PCH_MTP_I219_LM18 (PCIID: 0x8086550A). Without the referred commit, the ethernet works well after suspend and resume, but after applying the commit, the ethernet couldn't work anymore after the resume and the dmesg shows that the NIC link changes to 10Mbps (1000Mbps originally): [ 43.305084] e1000e 0000:00:1f.6 enp0s31f6: NIC Link is Up 10 Mbps Full Duplex, Flow Control: Rx/Tx Without the commit, the force SMBUS code will not be executed if "return 0" or "goto out" is executed in the enable_ulp(), and in my case, the "goto out" is executed since FWSM_FW_VALID is set. But after applying the commit, the force SMBUS code will be ran unconditionally. Here move the force SMBUS code back to enable_ulp() and put it immediately ahead of hw->phy.ops.release(hw), this could allow the longest settling time as possible for interface in this function and doesn't change the original code logic. The issue was found on a Lenovo laptop with the ethernet hw as below: 00:1f.6 Ethernet controller [0200]: Intel Corporation Device [8086:550a] (rev 20). And this patch is verified (cable plug and unplug, system suspend and resume) on Lenovo laptops with ethernet hw: [8086:550a], [8086:550b], [8086:15bb], [8086:15be], [8086:1a1f], [8086:1a1c] and [8086:0dc7]. Fixes: 861e8086029e ("e1000e: move force SMBUS from enable ulp function to avoid PHY loss issue") Signed-off-by: Hui Wang Acked-by: Vitaly Lifshits Tested-by: Naama Meir Reviewed-by: Simon Horman Reviewed-by: Paul Menzel Signed-off-by: Tony Nguyen Tested-by: Zhang Rui Signed-off-by: Jacob Keller Link: https://lore.kernel.org/r/20240528-net-2024-05-28-intel-net-fixes-v1-1-dc8593d2bbc6@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/e1000e/ich8lan.c | 22 +++++++++++++++++++++ drivers/net/ethernet/intel/e1000e/netdev.c | 18 ----------------- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index f9e94be36e97f2..2e98a2a0bead95 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -1225,6 +1225,28 @@ s32 e1000_enable_ulp_lpt_lp(struct e1000_hw *hw, bool to_sx) } release: + /* Switching PHY interface always returns MDI error + * so disable retry mechanism to avoid wasting time + */ + e1000e_disable_phy_retry(hw); + + /* Force SMBus mode in PHY */ + ret_val = e1000_read_phy_reg_hv_locked(hw, CV_SMB_CTRL, &phy_reg); + if (ret_val) { + e1000e_enable_phy_retry(hw); + hw->phy.ops.release(hw); + goto out; + } + phy_reg |= CV_SMB_CTRL_FORCE_SMBUS; + e1000_write_phy_reg_hv_locked(hw, CV_SMB_CTRL, phy_reg); + + e1000e_enable_phy_retry(hw); + + /* Force SMBus mode in MAC */ + mac_reg = er32(CTRL_EXT); + mac_reg |= E1000_CTRL_EXT_FORCE_SMBUS; + ew32(CTRL_EXT, mac_reg); + hw->phy.ops.release(hw); out: if (ret_val) diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 220d62fca55d1b..da5c59daf8ba94 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -6623,7 +6623,6 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool runtime) struct e1000_hw *hw = &adapter->hw; u32 ctrl, ctrl_ext, rctl, status, wufc; int retval = 0; - u16 smb_ctrl; /* Runtime suspend should only enable wakeup for link changes */ if (runtime) @@ -6697,23 +6696,6 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool runtime) if (retval) return retval; } - - /* Force SMBUS to allow WOL */ - /* Switching PHY interface always returns MDI error - * so disable retry mechanism to avoid wasting time - */ - e1000e_disable_phy_retry(hw); - - e1e_rphy(hw, CV_SMB_CTRL, &smb_ctrl); - smb_ctrl |= CV_SMB_CTRL_FORCE_SMBUS; - e1e_wphy(hw, CV_SMB_CTRL, smb_ctrl); - - e1000e_enable_phy_retry(hw); - - /* Force SMBus mode in MAC */ - ctrl_ext = er32(CTRL_EXT); - ctrl_ext |= E1000_CTRL_EXT_FORCE_SMBUS; - ew32(CTRL_EXT, ctrl_ext); } /* Ensure that the appropriate bits are set in LPI_CTRL From 218ed820d364ddc2b0150951e6b1a1bd1e49469d Mon Sep 17 00:00:00 2001 From: Thinh Tran Date: Tue, 28 May 2024 15:06:05 -0700 Subject: [PATCH 0362/1578] i40e: factoring out i40e_suspend/i40e_resume Two new functions, i40e_io_suspend() and i40e_io_resume(), have been introduced. These functions were factored out from the existing i40e_suspend() and i40e_resume() respectively. This factoring was done due to concerns about the logic of the I40E_SUSPENSED state, which caused the device to be unable to recover. The functions are now used in the EEH handling for device suspend/resume callbacks. The function i40e_enable_mc_magic_wake() has been moved ahead of i40e_io_suspend() to ensure it is declared before being used. Tested-by: Robert Thomas Signed-off-by: Thinh Tran Reviewed-by: Simon Horman Tested-by: Pucha Himasekhar Reddy Reviewed-by: Jacob Keller Signed-off-by: Jacob Keller Link: https://lore.kernel.org/r/20240528-net-2024-05-28-intel-net-fixes-v1-2-dc8593d2bbc6@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_main.c | 249 +++++++++++--------- 1 file changed, 135 insertions(+), 114 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 1f188c052828b4..d5f25ea304bf73 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -16334,6 +16334,139 @@ static void i40e_remove(struct pci_dev *pdev) pci_disable_device(pdev); } +/** + * i40e_enable_mc_magic_wake - enable multicast magic packet wake up + * using the mac_address_write admin q function + * @pf: pointer to i40e_pf struct + **/ +static void i40e_enable_mc_magic_wake(struct i40e_pf *pf) +{ + struct i40e_vsi *main_vsi = i40e_pf_get_main_vsi(pf); + struct i40e_hw *hw = &pf->hw; + u8 mac_addr[6]; + u16 flags = 0; + int ret; + + /* Get current MAC address in case it's an LAA */ + if (main_vsi && main_vsi->netdev) { + ether_addr_copy(mac_addr, main_vsi->netdev->dev_addr); + } else { + dev_err(&pf->pdev->dev, + "Failed to retrieve MAC address; using default\n"); + ether_addr_copy(mac_addr, hw->mac.addr); + } + + /* The FW expects the mac address write cmd to first be called with + * one of these flags before calling it again with the multicast + * enable flags. + */ + flags = I40E_AQC_WRITE_TYPE_LAA_WOL; + + if (hw->func_caps.flex10_enable && hw->partition_id != 1) + flags = I40E_AQC_WRITE_TYPE_LAA_ONLY; + + ret = i40e_aq_mac_address_write(hw, flags, mac_addr, NULL); + if (ret) { + dev_err(&pf->pdev->dev, + "Failed to update MAC address registers; cannot enable Multicast Magic packet wake up"); + return; + } + + flags = I40E_AQC_MC_MAG_EN + | I40E_AQC_WOL_PRESERVE_ON_PFR + | I40E_AQC_WRITE_TYPE_UPDATE_MC_MAG; + ret = i40e_aq_mac_address_write(hw, flags, mac_addr, NULL); + if (ret) + dev_err(&pf->pdev->dev, + "Failed to enable Multicast Magic Packet wake up\n"); +} + +/** + * i40e_io_suspend - suspend all IO operations + * @pf: pointer to i40e_pf struct + * + **/ +static int i40e_io_suspend(struct i40e_pf *pf) +{ + struct i40e_hw *hw = &pf->hw; + + set_bit(__I40E_DOWN, pf->state); + + /* Ensure service task will not be running */ + del_timer_sync(&pf->service_timer); + cancel_work_sync(&pf->service_task); + + /* Client close must be called explicitly here because the timer + * has been stopped. + */ + i40e_notify_client_of_netdev_close(pf, false); + + if (test_bit(I40E_HW_CAP_WOL_MC_MAGIC_PKT_WAKE, pf->hw.caps) && + pf->wol_en) + i40e_enable_mc_magic_wake(pf); + + /* Since we're going to destroy queues during the + * i40e_clear_interrupt_scheme() we should hold the RTNL lock for this + * whole section + */ + rtnl_lock(); + + i40e_prep_for_reset(pf); + + wr32(hw, I40E_PFPM_APM, (pf->wol_en ? I40E_PFPM_APM_APME_MASK : 0)); + wr32(hw, I40E_PFPM_WUFC, (pf->wol_en ? I40E_PFPM_WUFC_MAG_MASK : 0)); + + /* Clear the interrupt scheme and release our IRQs so that the system + * can safely hibernate even when there are a large number of CPUs. + * Otherwise hibernation might fail when mapping all the vectors back + * to CPU0. + */ + i40e_clear_interrupt_scheme(pf); + + rtnl_unlock(); + + return 0; +} + +/** + * i40e_io_resume - resume IO operations + * @pf: pointer to i40e_pf struct + * + **/ +static int i40e_io_resume(struct i40e_pf *pf) +{ + struct device *dev = &pf->pdev->dev; + int err; + + /* We need to hold the RTNL lock prior to restoring interrupt schemes, + * since we're going to be restoring queues + */ + rtnl_lock(); + + /* We cleared the interrupt scheme when we suspended, so we need to + * restore it now to resume device functionality. + */ + err = i40e_restore_interrupt_scheme(pf); + if (err) { + dev_err(dev, "Cannot restore interrupt scheme: %d\n", + err); + } + + clear_bit(__I40E_DOWN, pf->state); + i40e_reset_and_rebuild(pf, false, true); + + rtnl_unlock(); + + /* Clear suspended state last after everything is recovered */ + clear_bit(__I40E_SUSPENDED, pf->state); + + /* Restart the service task */ + mod_timer(&pf->service_timer, + round_jiffies(jiffies + pf->service_timer_period)); + + return 0; +} + /** * i40e_pci_error_detected - warning that something funky happened in PCI land * @pdev: PCI device information struct @@ -16446,53 +16579,6 @@ static void i40e_pci_error_resume(struct pci_dev *pdev) i40e_handle_reset_warning(pf, false); } -/** - * i40e_enable_mc_magic_wake - enable multicast magic packet wake up - * using the mac_address_write admin q function - * @pf: pointer to i40e_pf struct - **/ -static void i40e_enable_mc_magic_wake(struct i40e_pf *pf) -{ - struct i40e_vsi *main_vsi = i40e_pf_get_main_vsi(pf); - struct i40e_hw *hw = &pf->hw; - u8 mac_addr[6]; - u16 flags = 0; - int ret; - - /* Get current MAC address in case it's an LAA */ - if (main_vsi && main_vsi->netdev) { - ether_addr_copy(mac_addr, main_vsi->netdev->dev_addr); - } else { - dev_err(&pf->pdev->dev, - "Failed to retrieve MAC address; using default\n"); - ether_addr_copy(mac_addr, hw->mac.addr); - } - - /* The FW expects the mac address write cmd to first be called with - * one of these flags before calling it again with the multicast - * enable flags. - */ - flags = I40E_AQC_WRITE_TYPE_LAA_WOL; - - if (hw->func_caps.flex10_enable && hw->partition_id != 1) - flags = I40E_AQC_WRITE_TYPE_LAA_ONLY; - - ret = i40e_aq_mac_address_write(hw, flags, mac_addr, NULL); - if (ret) { - dev_err(&pf->pdev->dev, - "Failed to update MAC address registers; cannot enable Multicast Magic packet wake up"); - return; - } - - flags = I40E_AQC_MC_MAG_EN - | I40E_AQC_WOL_PRESERVE_ON_PFR - | I40E_AQC_WRITE_TYPE_UPDATE_MC_MAG; - ret = i40e_aq_mac_address_write(hw, flags, mac_addr, NULL); - if (ret) - dev_err(&pf->pdev->dev, - "Failed to enable Multicast Magic Packet wake up\n"); -} - /** * i40e_shutdown - PCI callback for shutting down * @pdev: PCI device information struct @@ -16552,48 +16638,11 @@ static void i40e_shutdown(struct pci_dev *pdev) static int i40e_suspend(struct device *dev) { struct i40e_pf *pf = dev_get_drvdata(dev); - struct i40e_hw *hw = &pf->hw; /* If we're already suspended, then there is nothing to do */ if (test_and_set_bit(__I40E_SUSPENDED, pf->state)) return 0; - - set_bit(__I40E_DOWN, pf->state); - - /* Ensure service task will not be running */ - del_timer_sync(&pf->service_timer); - cancel_work_sync(&pf->service_task); - - /* Client close must be called explicitly here because the timer - * has been stopped. - */ - i40e_notify_client_of_netdev_close(pf, false); - - if (test_bit(I40E_HW_CAP_WOL_MC_MAGIC_PKT_WAKE, pf->hw.caps) && - pf->wol_en) - i40e_enable_mc_magic_wake(pf); - - /* Since we're going to destroy queues during the - * i40e_clear_interrupt_scheme() we should hold the RTNL lock for this - * whole section - */ - rtnl_lock(); - - i40e_prep_for_reset(pf); - - wr32(hw, I40E_PFPM_APM, (pf->wol_en ? I40E_PFPM_APM_APME_MASK : 0)); - wr32(hw, I40E_PFPM_WUFC, (pf->wol_en ? I40E_PFPM_WUFC_MAG_MASK : 0)); - - /* Clear the interrupt scheme and release our IRQs so that the system - * can safely hibernate even when there are a large number of CPUs. - * Otherwise hibernation might fail when mapping all the vectors back - * to CPU0. - */ - i40e_clear_interrupt_scheme(pf); - - rtnl_unlock(); - - return 0; + return i40e_io_suspend(pf); } /** @@ -16603,39 +16652,11 @@ static int i40e_suspend(struct device *dev) static int i40e_resume(struct device *dev) { struct i40e_pf *pf = dev_get_drvdata(dev); - int err; /* If we're not suspended, then there is nothing to do */ if (!test_bit(__I40E_SUSPENDED, pf->state)) return 0; - - /* We need to hold the RTNL lock prior to restoring interrupt schemes, - * since we're going to be restoring queues - */ - rtnl_lock(); - - /* We cleared the interrupt scheme when we suspended, so we need to - * restore it now to resume device functionality. - */ - err = i40e_restore_interrupt_scheme(pf); - if (err) { - dev_err(dev, "Cannot restore interrupt scheme: %d\n", - err); - } - - clear_bit(__I40E_DOWN, pf->state); - i40e_reset_and_rebuild(pf, false, true); - - rtnl_unlock(); - - /* Clear suspended state last after everything is recovered */ - clear_bit(__I40E_SUSPENDED, pf->state); - - /* Restart the service task */ - mod_timer(&pf->service_timer, - round_jiffies(jiffies + pf->service_timer_period)); - - return 0; + return i40e_io_resume(pf); } static const struct pci_error_handlers i40e_err_handler = { From c80b6538d35a7a60d874c5a76c3c5a82b6a28fbb Mon Sep 17 00:00:00 2001 From: Thinh Tran Date: Tue, 28 May 2024 15:06:06 -0700 Subject: [PATCH 0363/1578] i40e: Fully suspend and resume IO operations in EEH case When EEH events occurs, the callback functions in the i40e, which are managed by the EEH driver, will completely suspend and resume all IO operations. - In the PCI error detected callback, replaced i40e_prep_for_reset() with i40e_io_suspend(). The change is to fully suspend all I/O operations - In the PCI error slot reset callback, replaced pci_enable_device_mem() with pci_enable_device(). This change enables both I/O and memory of the device. - In the PCI error resume callback, replaced i40e_handle_reset_warning() with i40e_io_resume(). This change allows the system to resume I/O operations Fixes: a5f3d2c17b07 ("powerpc/pseries/pci: Add MSI domains") Reviewed-by: Jacob Keller Tested-by: Robert Thomas Signed-off-by: Thinh Tran Reviewed-by: Simon Horman Tested-by: Pucha Himasekhar Reddy Signed-off-by: Jacob Keller Link: https://lore.kernel.org/r/20240528-net-2024-05-28-intel-net-fixes-v1-3-dc8593d2bbc6@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_main.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index d5f25ea304bf73..284c3fad5a6e4f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -11171,6 +11171,8 @@ static void i40e_reset_and_rebuild(struct i40e_pf *pf, bool reinit, ret = i40e_reset(pf); if (!ret) i40e_rebuild(pf, reinit, lock_acquired); + else + dev_err(&pf->pdev->dev, "%s: i40e_reset() FAILED", __func__); } /** @@ -16491,7 +16493,7 @@ static pci_ers_result_t i40e_pci_error_detected(struct pci_dev *pdev, /* shutdown all operations */ if (!test_bit(__I40E_SUSPENDED, pf->state)) - i40e_prep_for_reset(pf); + i40e_io_suspend(pf); /* Request a slot reset */ return PCI_ERS_RESULT_NEED_RESET; @@ -16513,7 +16515,8 @@ static pci_ers_result_t i40e_pci_error_slot_reset(struct pci_dev *pdev) u32 reg; dev_dbg(&pdev->dev, "%s\n", __func__); - if (pci_enable_device_mem(pdev)) { + /* enable I/O and memory of the device */ + if (pci_enable_device(pdev)) { dev_info(&pdev->dev, "Cannot re-enable PCI device after reset.\n"); result = PCI_ERS_RESULT_DISCONNECT; @@ -16576,7 +16579,7 @@ static void i40e_pci_error_resume(struct pci_dev *pdev) if (test_bit(__I40E_SUSPENDED, pf->state)) return; - i40e_handle_reset_warning(pf, false); + i40e_io_resume(pf); } /** From 2a6d8f2de2224ac46df94dc40f43f8b9701f6703 Mon Sep 17 00:00:00 2001 From: Paul Greenwalt Date: Tue, 28 May 2024 15:06:08 -0700 Subject: [PATCH 0364/1578] ice: fix 200G PHY types to link speed mapping Commit 24407a01e57c ("ice: Add 200G speed/phy type use") added support for 200G PHY speeds, but did not include the mapping of 200G PHY types to link speed. As a result the driver is returning UNKNOWN link speed when setting 200G ethtool advertised link modes. To fix this add 200G PHY types to link speed mapping to ice_get_link_speed_based_on_phy_type(). Fixes: 24407a01e57c ("ice: Add 200G speed/phy type use") Reviewed-by: Michal Swiatkowski Signed-off-by: Paul Greenwalt Tested-by: Pucha Himasekhar Reddy Signed-off-by: Jacob Keller Link: https://lore.kernel.org/r/20240528-net-2024-05-28-intel-net-fixes-v1-5-dc8593d2bbc6@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_common.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index 5649b257e6312e..24716a3b494cd0 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -3148,6 +3148,16 @@ ice_get_link_speed_based_on_phy_type(u64 phy_type_low, u64 phy_type_high) case ICE_PHY_TYPE_HIGH_100G_AUI2: speed_phy_type_high = ICE_AQ_LINK_SPEED_100GB; break; + case ICE_PHY_TYPE_HIGH_200G_CR4_PAM4: + case ICE_PHY_TYPE_HIGH_200G_SR4: + case ICE_PHY_TYPE_HIGH_200G_FR4: + case ICE_PHY_TYPE_HIGH_200G_LR4: + case ICE_PHY_TYPE_HIGH_200G_DR4: + case ICE_PHY_TYPE_HIGH_200G_KR4_PAM4: + case ICE_PHY_TYPE_HIGH_200G_AUI4_AOC_ACC: + case ICE_PHY_TYPE_HIGH_200G_AUI4: + speed_phy_type_high = ICE_AQ_LINK_SPEED_200GB; + break; default: speed_phy_type_high = ICE_AQ_LINK_SPEED_UNKNOWN; break; From a51c9b1c9ab2351e62933357fcad5bfad27f2400 Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Tue, 28 May 2024 15:06:11 -0700 Subject: [PATCH 0365/1578] ice: check for unregistering correct number of devlink params On module load, the ice driver checks for the lack of a specific PF capability to determine if it should reduce the number of devlink params to register. One situation when this test returns true is when the driver loads in safe mode. The same check is not present on the unload path when devlink params are unregistered. This results in the driver triggering a WARN_ON in the kernel devlink code. The current check and code path uses a reduction in the number of elements reported in the list of params. This is fragile and not good for future maintaining. Change the parameters to be held in two lists, one always registered and one dependent on the check. Add a symmetrical check in the unload path so that the correct parameters are unregistered as well. Fixes: 109eb2917284 ("ice: Add tx_scheduling_layers devlink param") CC: Lukasz Czapnik Reviewed-by: Przemek Kitszel Signed-off-by: Dave Ertman Reviewed-by: Jacob Keller Tested-by: Pucha Himasekhar Reddy Signed-off-by: Jacob Keller Link: https://lore.kernel.org/r/20240528-net-2024-05-28-intel-net-fixes-v1-8-dc8593d2bbc6@intel.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/intel/ice/devlink/devlink.c | 31 +++++++++++++------ 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/devlink/devlink.c b/drivers/net/ethernet/intel/ice/devlink/devlink.c index c4b69655cdf57d..704e9ad5144e8c 100644 --- a/drivers/net/ethernet/intel/ice/devlink/devlink.c +++ b/drivers/net/ethernet/intel/ice/devlink/devlink.c @@ -1388,7 +1388,7 @@ enum ice_param_id { ICE_DEVLINK_PARAM_ID_TX_SCHED_LAYERS, }; -static const struct devlink_param ice_devlink_params[] = { +static const struct devlink_param ice_dvl_rdma_params[] = { DEVLINK_PARAM_GENERIC(ENABLE_ROCE, BIT(DEVLINK_PARAM_CMODE_RUNTIME), ice_devlink_enable_roce_get, ice_devlink_enable_roce_set, @@ -1397,6 +1397,9 @@ static const struct devlink_param ice_devlink_params[] = { ice_devlink_enable_iw_get, ice_devlink_enable_iw_set, ice_devlink_enable_iw_validate), +}; + +static const struct devlink_param ice_dvl_sched_params[] = { DEVLINK_PARAM_DRIVER(ICE_DEVLINK_PARAM_ID_TX_SCHED_LAYERS, "tx_scheduling_layers", DEVLINK_PARAM_TYPE_U8, @@ -1464,21 +1467,31 @@ int ice_devlink_register_params(struct ice_pf *pf) { struct devlink *devlink = priv_to_devlink(pf); struct ice_hw *hw = &pf->hw; - size_t params_size; + int status; - params_size = ARRAY_SIZE(ice_devlink_params); + status = devl_params_register(devlink, ice_dvl_rdma_params, + ARRAY_SIZE(ice_dvl_rdma_params)); + if (status) + return status; - if (!hw->func_caps.common_cap.tx_sched_topo_comp_mode_en) - params_size--; + if (hw->func_caps.common_cap.tx_sched_topo_comp_mode_en) + status = devl_params_register(devlink, ice_dvl_sched_params, + ARRAY_SIZE(ice_dvl_sched_params)); - return devl_params_register(devlink, ice_devlink_params, - params_size); + return status; } void ice_devlink_unregister_params(struct ice_pf *pf) { - devl_params_unregister(priv_to_devlink(pf), ice_devlink_params, - ARRAY_SIZE(ice_devlink_params)); + struct devlink *devlink = priv_to_devlink(pf); + struct ice_hw *hw = &pf->hw; + + devl_params_unregister(devlink, ice_dvl_rdma_params, + ARRAY_SIZE(ice_dvl_rdma_params)); + + if (hw->func_caps.common_cap.tx_sched_topo_comp_mode_en) + devl_params_unregister(devlink, ice_dvl_sched_params, + ARRAY_SIZE(ice_dvl_sched_params)); } #define ICE_DEVLINK_READ_BLK_SIZE (1024 * 1024) From 2dc8b1e7177d4f49f492ce648440caf2de0c3616 Mon Sep 17 00:00:00 2001 From: Shay Agroskin Date: Tue, 28 May 2024 20:09:12 +0300 Subject: [PATCH 0366/1578] net: ena: Fix redundant device NUMA node override The driver overrides the NUMA node id of the device regardless of whether it knows its correct value (often setting it to -1 even though the node id is advertised in 'struct device'). This can lead to suboptimal configurations. This patch fixes this behavior and makes the shared memory allocation functions use the NUMA node id advertised by the underlying device. Fixes: 1738cd3ed342 ("net: ena: Add a driver for Amazon Elastic Network Adapters (ENA)") Signed-off-by: Shay Agroskin Link: https://lore.kernel.org/r/20240528170912.1204417-1-shayagr@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amazon/ena/ena_com.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c index 2d8a66ea82fab7..713a595370bffa 100644 --- a/drivers/net/ethernet/amazon/ena/ena_com.c +++ b/drivers/net/ethernet/amazon/ena/ena_com.c @@ -312,7 +312,6 @@ static int ena_com_init_io_sq(struct ena_com_dev *ena_dev, struct ena_com_io_sq *io_sq) { size_t size; - int dev_node = 0; memset(&io_sq->desc_addr, 0x0, sizeof(io_sq->desc_addr)); @@ -325,12 +324,9 @@ static int ena_com_init_io_sq(struct ena_com_dev *ena_dev, size = io_sq->desc_entry_size * io_sq->q_depth; if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) { - dev_node = dev_to_node(ena_dev->dmadev); - set_dev_node(ena_dev->dmadev, ctx->numa_node); io_sq->desc_addr.virt_addr = dma_alloc_coherent(ena_dev->dmadev, size, &io_sq->desc_addr.phys_addr, GFP_KERNEL); - set_dev_node(ena_dev->dmadev, dev_node); if (!io_sq->desc_addr.virt_addr) { io_sq->desc_addr.virt_addr = dma_alloc_coherent(ena_dev->dmadev, size, @@ -354,10 +350,7 @@ static int ena_com_init_io_sq(struct ena_com_dev *ena_dev, size = (size_t)io_sq->bounce_buf_ctrl.buffer_size * io_sq->bounce_buf_ctrl.buffers_num; - dev_node = dev_to_node(ena_dev->dmadev); - set_dev_node(ena_dev->dmadev, ctx->numa_node); io_sq->bounce_buf_ctrl.base_buffer = devm_kzalloc(ena_dev->dmadev, size, GFP_KERNEL); - set_dev_node(ena_dev->dmadev, dev_node); if (!io_sq->bounce_buf_ctrl.base_buffer) io_sq->bounce_buf_ctrl.base_buffer = devm_kzalloc(ena_dev->dmadev, size, GFP_KERNEL); @@ -397,7 +390,6 @@ static int ena_com_init_io_cq(struct ena_com_dev *ena_dev, struct ena_com_io_cq *io_cq) { size_t size; - int prev_node = 0; memset(&io_cq->cdesc_addr, 0x0, sizeof(io_cq->cdesc_addr)); @@ -409,11 +401,8 @@ static int ena_com_init_io_cq(struct ena_com_dev *ena_dev, size = io_cq->cdesc_entry_size_in_bytes * io_cq->q_depth; - prev_node = dev_to_node(ena_dev->dmadev); - set_dev_node(ena_dev->dmadev, ctx->numa_node); io_cq->cdesc_addr.virt_addr = dma_alloc_coherent(ena_dev->dmadev, size, &io_cq->cdesc_addr.phys_addr, GFP_KERNEL); - set_dev_node(ena_dev->dmadev, prev_node); if (!io_cq->cdesc_addr.virt_addr) { io_cq->cdesc_addr.virt_addr = dma_alloc_coherent(ena_dev->dmadev, size, &io_cq->cdesc_addr.phys_addr, From ed8c7fbdfe117abbef81f65428ba263118ef298a Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Thu, 30 May 2024 00:06:56 +0800 Subject: [PATCH 0367/1578] fs/file: fix the check in find_next_fd() The maximum possible return value of find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) is maxbit. This return value, multiplied by BITS_PER_LONG, gives the value of bitbit, which can never be greater than maxfd, it can only be equal to maxfd at most, so the following check 'if (bitbit > maxfd)' will never be true. Moreover, when bitbit equals maxfd, it indicates that there are no unused fds, and the function can directly return. Fix this check. Signed-off-by: Yuntao Wang Link: https://lore.kernel.org/r/20240529160656.209352-1-yuntao.wang@linux.dev Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/file.c b/fs/file.c index 8076aef9c21011..a3b72aa64f1164 100644 --- a/fs/file.c +++ b/fs/file.c @@ -486,12 +486,12 @@ struct files_struct init_files = { static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) { - unsigned int maxfd = fdt->max_fds; + unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */ unsigned int maxbit = maxfd / BITS_PER_LONG; unsigned int bitbit = start / BITS_PER_LONG; bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; - if (bitbit > maxfd) + if (bitbit >= maxfd) return maxfd; if (bitbit > start) start = bitbit; From b3dc6e8003b500861fa307e9a3400c52e78e4d3a Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Wed, 29 May 2024 17:56:33 +0800 Subject: [PATCH 0368/1578] ipvlan: Dont Use skb->sk in ipvlan_process_v{4,6}_outbound Raw packet from PF_PACKET socket ontop of an IPv6-backed ipvlan device will hit WARN_ON_ONCE() in sk_mc_loop() through sch_direct_xmit() path. WARNING: CPU: 2 PID: 0 at net/core/sock.c:775 sk_mc_loop+0x2d/0x70 Modules linked in: sch_netem ipvlan rfkill cirrus drm_shmem_helper sg drm_kms_helper CPU: 2 PID: 0 Comm: swapper/2 Kdump: loaded Not tainted 6.9.0+ #279 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 RIP: 0010:sk_mc_loop+0x2d/0x70 Code: fa 0f 1f 44 00 00 65 0f b7 15 f7 96 a3 4f 31 c0 66 85 d2 75 26 48 85 ff 74 1c RSP: 0018:ffffa9584015cd78 EFLAGS: 00010212 RAX: 0000000000000011 RBX: ffff91e585793e00 RCX: 0000000002c6a001 RDX: 0000000000000000 RSI: 0000000000000040 RDI: ffff91e589c0f000 RBP: ffff91e5855bd100 R08: 0000000000000000 R09: 3d00545216f43d00 R10: ffff91e584fdcc50 R11: 00000060dd8616f4 R12: ffff91e58132d000 R13: ffff91e584fdcc68 R14: ffff91e5869ce800 R15: ffff91e589c0f000 FS: 0000000000000000(0000) GS:ffff91e898100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f788f7c44c0 CR3: 0000000008e1a000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? __warn (kernel/panic.c:693) ? sk_mc_loop (net/core/sock.c:760) ? report_bug (lib/bug.c:201 lib/bug.c:219) ? handle_bug (arch/x86/kernel/traps.c:239) ? exc_invalid_op (arch/x86/kernel/traps.c:260 (discriminator 1)) ? asm_exc_invalid_op (./arch/x86/include/asm/idtentry.h:621) ? sk_mc_loop (net/core/sock.c:760) ip6_finish_output2 (net/ipv6/ip6_output.c:83 (discriminator 1)) ? nf_hook_slow (net/netfilter/core.c:626) ip6_finish_output (net/ipv6/ip6_output.c:222) ? __pfx_ip6_finish_output (net/ipv6/ip6_output.c:215) ipvlan_xmit_mode_l3 (drivers/net/ipvlan/ipvlan_core.c:602) ipvlan ipvlan_start_xmit (drivers/net/ipvlan/ipvlan_main.c:226) ipvlan dev_hard_start_xmit (net/core/dev.c:3594) sch_direct_xmit (net/sched/sch_generic.c:343) __qdisc_run (net/sched/sch_generic.c:416) net_tx_action (net/core/dev.c:5286) handle_softirqs (kernel/softirq.c:555) __irq_exit_rcu (kernel/softirq.c:589) sysvec_apic_timer_interrupt (arch/x86/kernel/apic/apic.c:1043) The warning triggers as this: packet_sendmsg packet_snd //skb->sk is packet sk __dev_queue_xmit __dev_xmit_skb //q->enqueue is not NULL __qdisc_run sch_direct_xmit dev_hard_start_xmit ipvlan_start_xmit ipvlan_xmit_mode_l3 //l3 mode ipvlan_process_outbound //vepa flag ipvlan_process_v6_outbound ip6_local_out __ip6_finish_output ip6_finish_output2 //multicast packet sk_mc_loop //sk->sk_family is AF_PACKET Call ip{6}_local_out() with NULL sk in ipvlan as other tunnels to fix this. Fixes: 2ad7bf363841 ("ipvlan: Initial check-in of the IPVLAN driver.") Suggested-by: Eric Dumazet Signed-off-by: Yue Haibing Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240529095633.613103-1-yuehaibing@huawei.com Signed-off-by: Paolo Abeni --- drivers/net/ipvlan/ipvlan_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index 2d5b021b4ea605..fef4eff7753a7a 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -439,7 +439,7 @@ static noinline_for_stack int ipvlan_process_v4_outbound(struct sk_buff *skb) memset(IPCB(skb), 0, sizeof(*IPCB(skb))); - err = ip_local_out(net, skb->sk, skb); + err = ip_local_out(net, NULL, skb); if (unlikely(net_xmit_eval(err))) DEV_STATS_INC(dev, tx_errors); else @@ -494,7 +494,7 @@ static int ipvlan_process_v6_outbound(struct sk_buff *skb) memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); - err = ip6_local_out(dev_net(dev), skb->sk, skb); + err = ip6_local_out(dev_net(dev), NULL, skb); if (unlikely(net_xmit_eval(err))) DEV_STATS_INC(dev, tx_errors); else From 700fe6fd093d08c6da2bda8efe00479b0e617327 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 30 May 2024 12:10:43 +0200 Subject: [PATCH 0369/1578] ALSA: seq: Fix yet another spot for system message conversion We fixed the incorrect UMP type for system messages in the recent commit, but it missed one place in system_ev_to_ump_midi1(). Fix it now. Fixes: e9e02819a98a ("ALSA: seq: Automatic conversion of UMP events") Fixes: c2bb79613fed ("ALSA: seq: Fix incorrect UMP type for system messages") Link: https://lore.kernel.org/r/20240530101044.17524-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/seq/seq_ump_convert.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/core/seq/seq_ump_convert.c b/sound/core/seq/seq_ump_convert.c index add70b4c2885de..f0f332017e662d 100644 --- a/sound/core/seq/seq_ump_convert.c +++ b/sound/core/seq/seq_ump_convert.c @@ -729,6 +729,7 @@ static int system_ev_to_ump_midi1(const struct snd_seq_event *event, union snd_ump_midi1_msg *data, unsigned char status) { + data->system.type = UMP_MSG_TYPE_SYSTEM; // override data->system.status = status; return 1; } From 13c7c941e72908b8cce5a84b45a7b5e485ca12ed Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 29 May 2024 09:35:47 -0700 Subject: [PATCH 0370/1578] netdev: add qstat for csum complete Recent commit 0cfe71f45f42 ("netdev: add queue stats") added a lot of useful stats, but only those immediately needed by virtio. Presumably virtio does not support CHECKSUM_COMPLETE, so statistic for that form of checksumming wasn't included. Other drivers will definitely need it, in fact we expect it to be needed in net-next soon (mlx5). So let's add the definition of the counter for CHECKSUM_COMPLETE to uAPI in net already, so that the counters are in a more natural order (all subsequent counters have not been present in any released kernel, yet). Signed-off-by: Jakub Kicinski Reviewed-by: Joe Damato Fixes: 0cfe71f45f42 ("netdev: add queue stats") Link: https://lore.kernel.org/r/20240529163547.3693194-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/netdev.yaml | 4 ++++ include/uapi/linux/netdev.h | 1 + tools/include/uapi/linux/netdev.h | 1 + 3 files changed, 6 insertions(+) diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 11a32373365ab0..959755be4d7f9c 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -349,6 +349,10 @@ attribute-sets: Number of packets dropped due to transient lack of resources, such as buffer space, host descriptors etc. type: uint + - + name: rx-csum-complete + doc: Number of packets that were marked as CHECKSUM_COMPLETE. + type: uint - name: rx-csum-unnecessary doc: Number of packets that were marked as CHECKSUM_UNNECESSARY. diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index a8188202413ec4..43742ac5b00da0 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -148,6 +148,7 @@ enum { NETDEV_A_QSTATS_RX_ALLOC_FAIL, NETDEV_A_QSTATS_RX_HW_DROPS, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, + NETDEV_A_QSTATS_RX_CSUM_COMPLETE, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, NETDEV_A_QSTATS_RX_CSUM_NONE, NETDEV_A_QSTATS_RX_CSUM_BAD, diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index a8188202413ec4..43742ac5b00da0 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -148,6 +148,7 @@ enum { NETDEV_A_QSTATS_RX_ALLOC_FAIL, NETDEV_A_QSTATS_RX_HW_DROPS, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, + NETDEV_A_QSTATS_RX_CSUM_COMPLETE, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, NETDEV_A_QSTATS_RX_CSUM_NONE, NETDEV_A_QSTATS_RX_CSUM_BAD, From 49cb894d567980235b6e64d5e69950ff77debd8c Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 30 May 2024 14:19:14 +0300 Subject: [PATCH 0371/1578] ASoC: SOF: ipc4-topology: Add support for NHLT with 16-bit only DMIC blob The ACPI NHLT table always had 32-bit DMIC blob even if 16-bit was also present and taken as a 'rule' which obviously got broken and there is at least one device on the market which ships with only 16-bit DMIC configuration blob. This corner case has never been supported and it is going to need topology updates for DMIC copier to support multiple formats. As for the kernel side: if the copier supports multiple formats and the preferred 32-bit DMIC blob is not found then we will try to get a 16-bit DMIC configuration and look for a 16-bit copier config. Fixes: f9209644ae76 ("ASoC: SOF: ipc4-topology: Correct DAI copier config and NHLT blob request") Link: https://github.com/thesofproject/linux/issues/4973 Signed-off-by: Peter Ujfalusi Reviewed-by: Seppo Ingalsuo Reviewed-by: Pierre-Louis Bossart Reviewed-by: Ranjani Sridharan Link: https://msgid.link/r/20240530111918.21974-2-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc4-topology.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/sound/soc/sof/ipc4-topology.c b/sound/soc/sof/ipc4-topology.c index beff1098932475..521b4dcba6012e 100644 --- a/sound/soc/sof/ipc4-topology.c +++ b/sound/soc/sof/ipc4-topology.c @@ -1483,6 +1483,8 @@ snd_sof_get_nhlt_endpoint_data(struct snd_sof_dev *sdev, struct snd_sof_dai *dai dir, dev_type); if (!cfg) { + bool get_new_blob = false; + if (format_change) { /* * The 32-bit blob was not found in NHLT table, try to @@ -1490,7 +1492,20 @@ snd_sof_get_nhlt_endpoint_data(struct snd_sof_dev *sdev, struct snd_sof_dai *dai */ bit_depth = params_width(params); format_change = false; + get_new_blob = true; + } else if (linktype == SOF_DAI_INTEL_DMIC && !single_format) { + /* + * The requested 32-bit blob (no format change for the + * blob request) was not found in NHLT table, try to + * look for 16-bit blob if the copier supports multiple + * formats + */ + bit_depth = 16; + format_change = true; + get_new_blob = true; + } + if (get_new_blob) { cfg = intel_nhlt_get_endpoint_blob(sdev->dev, ipc4_data->nhlt, dai_index, nhlt_type, bit_depth, bit_depth, @@ -1513,8 +1528,8 @@ snd_sof_get_nhlt_endpoint_data(struct snd_sof_dev *sdev, struct snd_sof_dai *dai if (format_change) { /* - * Update the params to reflect that we have loaded 32-bit blob - * instead of the 16-bit. + * Update the params to reflect that different blob was loaded + * instead of the requested bit depth (16 -> 32 or 32 -> 16). * This information is going to be used by the caller to find * matching copier format on the dai side. */ @@ -1522,7 +1537,11 @@ snd_sof_get_nhlt_endpoint_data(struct snd_sof_dev *sdev, struct snd_sof_dai *dai m = hw_param_mask(params, SNDRV_PCM_HW_PARAM_FORMAT); snd_mask_none(m); - snd_mask_set_format(m, SNDRV_PCM_FORMAT_S32_LE); + if (bit_depth == 16) + snd_mask_set_format(m, SNDRV_PCM_FORMAT_S16_LE); + else + snd_mask_set_format(m, SNDRV_PCM_FORMAT_S32_LE); + } return 0; From 2a865c9c3fb0289a95f1cb51b42d248736ff45cb Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 30 May 2024 14:19:15 +0300 Subject: [PATCH 0372/1578] ASoC: SOF: ipc4-topology: Print out the channel count in sof_ipc4_dbg_audio_format Print out the number of channels for the format explicitly instead of having the reader to understand how to interpret the ch_map and ch_cfg values. Signed-off-by: Peter Ujfalusi Reviewed-by: Pierre-Louis Bossart Reviewed-by: Ranjani Sridharan Reviewed-by: Bard Liao Link: https://msgid.link/r/20240530111918.21974-3-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc4-topology.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sound/soc/sof/ipc4-topology.c b/sound/soc/sof/ipc4-topology.c index 521b4dcba6012e..2e0234b865ad42 100644 --- a/sound/soc/sof/ipc4-topology.c +++ b/sound/soc/sof/ipc4-topology.c @@ -195,9 +195,10 @@ static void sof_ipc4_dbg_audio_format(struct device *dev, struct sof_ipc4_pin_fo for (i = 0; i < num_formats; i++) { struct sof_ipc4_audio_format *fmt = &pin_fmt[i].audio_fmt; dev_dbg(dev, - "Pin index #%d: %uHz, %ubit (ch_map %#x ch_cfg %u interleaving_style %u fmt_cfg %#x) buffer size %d\n", - pin_fmt[i].pin_index, fmt->sampling_frequency, fmt->bit_depth, fmt->ch_map, - fmt->ch_cfg, fmt->interleaving_style, fmt->fmt_cfg, + "Pin index #%d: %uHz, %ubit, %luch (ch_map %#x ch_cfg %u interleaving_style %u fmt_cfg %#x) buffer size %d\n", + pin_fmt[i].pin_index, fmt->sampling_frequency, fmt->bit_depth, + SOF_IPC4_AUDIO_FORMAT_CFG_CHANNELS_COUNT(fmt->fmt_cfg), + fmt->ch_map, fmt->ch_cfg, fmt->interleaving_style, fmt->fmt_cfg, pin_fmt[i].buffer_size); } } From 3b64fd2f83f203f5a34faed3dadf6464313f827d Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 30 May 2024 14:19:16 +0300 Subject: [PATCH 0373/1578] ASoC: SOF: ipc4-topology/pcm: Rename sof_ipc4_copier_is_single_format() Rename the sof_ipc4_copier_is_single_format() to sof_ipc4_copier_is_single_bitdepth() to clear the confusion of the use of 'format' when we are querying information on the bit depth. Format is used to describe a combination of parameters (rate, channels, sample format / bit depth). Rename the flags used to store the result at the same time. Signed-off-by: Peter Ujfalusi Reviewed-by: Pierre-Louis Bossart Reviewed-by: Bard Liao Reviewed-by: Seppo Ingalsuo Reviewed-by: Ranjani Sridharan Link: https://msgid.link/r/20240530111918.21974-4-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc4-pcm.c | 12 +++++------ sound/soc/sof/ipc4-topology.c | 40 +++++++++++++++++------------------ sound/soc/sof/ipc4-topology.h | 6 +++--- 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/sound/soc/sof/ipc4-pcm.c b/sound/soc/sof/ipc4-pcm.c index 307bee63756b5c..4df2be3d39eba0 100644 --- a/sound/soc/sof/ipc4-pcm.c +++ b/sound/soc/sof/ipc4-pcm.c @@ -650,7 +650,7 @@ static int sof_ipc4_pcm_dai_link_fixup(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *cpu_dai = snd_soc_rtd_to_cpu(rtd, 0); struct sof_ipc4_audio_format *ipc4_fmt; struct sof_ipc4_copier *ipc4_copier; - bool single_fmt = false; + bool single_bitdepth = false; u32 valid_bits = 0; int dir, ret; @@ -682,18 +682,18 @@ static int sof_ipc4_pcm_dai_link_fixup(struct snd_soc_pcm_runtime *rtd, return 0; if (dir == SNDRV_PCM_STREAM_PLAYBACK) { - if (sof_ipc4_copier_is_single_format(sdev, + if (sof_ipc4_copier_is_single_bitdepth(sdev, available_fmt->output_pin_fmts, available_fmt->num_output_formats)) { ipc4_fmt = &available_fmt->output_pin_fmts->audio_fmt; - single_fmt = true; + single_bitdepth = true; } } else { - if (sof_ipc4_copier_is_single_format(sdev, + if (sof_ipc4_copier_is_single_bitdepth(sdev, available_fmt->input_pin_fmts, available_fmt->num_input_formats)) { ipc4_fmt = &available_fmt->input_pin_fmts->audio_fmt; - single_fmt = true; + single_bitdepth = true; } } } @@ -703,7 +703,7 @@ static int sof_ipc4_pcm_dai_link_fixup(struct snd_soc_pcm_runtime *rtd, if (ret) return ret; - if (single_fmt) { + if (single_bitdepth) { snd_mask_none(fmt); valid_bits = SOF_IPC4_AUDIO_FORMAT_CFG_V_BIT_DEPTH(ipc4_fmt->fmt_cfg); dev_dbg(component->dev, "Set %s to %d bit format\n", dai->name, valid_bits); diff --git a/sound/soc/sof/ipc4-topology.c b/sound/soc/sof/ipc4-topology.c index 2e0234b865ad42..afb7908766a38e 100644 --- a/sound/soc/sof/ipc4-topology.c +++ b/sound/soc/sof/ipc4-topology.c @@ -1423,7 +1423,7 @@ static int snd_sof_get_hw_config_params(struct snd_sof_dev *sdev, struct snd_sof static int snd_sof_get_nhlt_endpoint_data(struct snd_sof_dev *sdev, struct snd_sof_dai *dai, - bool single_format, + bool single_bitdepth, struct snd_pcm_hw_params *params, u32 dai_index, u32 linktype, u8 dir, u32 **dst, u32 *len) { @@ -1446,7 +1446,7 @@ snd_sof_get_nhlt_endpoint_data(struct snd_sof_dev *sdev, struct snd_sof_dai *dai * Look for 32-bit blob first instead of 16-bit if copier * supports multiple formats */ - if (bit_depth == 16 && !single_format) { + if (bit_depth == 16 && !single_bitdepth) { dev_dbg(sdev->dev, "Looking for 32-bit blob first for DMIC\n"); format_change = true; bit_depth = 32; @@ -1494,7 +1494,7 @@ snd_sof_get_nhlt_endpoint_data(struct snd_sof_dev *sdev, struct snd_sof_dai *dai bit_depth = params_width(params); format_change = false; get_new_blob = true; - } else if (linktype == SOF_DAI_INTEL_DMIC && !single_format) { + } else if (linktype == SOF_DAI_INTEL_DMIC && !single_bitdepth) { /* * The requested 32-bit blob (no format change for the * blob request) was not found in NHLT table, try to @@ -1550,7 +1550,7 @@ snd_sof_get_nhlt_endpoint_data(struct snd_sof_dev *sdev, struct snd_sof_dai *dai #else static int snd_sof_get_nhlt_endpoint_data(struct snd_sof_dev *sdev, struct snd_sof_dai *dai, - bool single_format, + bool single_bitdepth, struct snd_pcm_hw_params *params, u32 dai_index, u32 linktype, u8 dir, u32 **dst, u32 *len) { @@ -1558,9 +1558,9 @@ snd_sof_get_nhlt_endpoint_data(struct snd_sof_dev *sdev, struct snd_sof_dai *dai } #endif -bool sof_ipc4_copier_is_single_format(struct snd_sof_dev *sdev, - struct sof_ipc4_pin_format *pin_fmts, - u32 pin_fmts_size) +bool sof_ipc4_copier_is_single_bitdepth(struct snd_sof_dev *sdev, + struct sof_ipc4_pin_format *pin_fmts, + u32 pin_fmts_size) { struct sof_ipc4_audio_format *fmt; u32 valid_bits; @@ -1591,7 +1591,7 @@ sof_ipc4_prepare_dai_copier(struct snd_sof_dev *sdev, struct snd_sof_dai *dai, struct snd_pcm_hw_params dai_params = *params; struct sof_ipc4_copier_data *copier_data; struct sof_ipc4_copier *ipc4_copier; - bool single_format; + bool single_bitdepth; int ret; ipc4_copier = dai->private; @@ -1605,12 +1605,12 @@ sof_ipc4_prepare_dai_copier(struct snd_sof_dev *sdev, struct snd_sof_dai *dai, * format lookup */ if (dir == SNDRV_PCM_STREAM_PLAYBACK) { - single_format = sof_ipc4_copier_is_single_format(sdev, + single_bitdepth = sof_ipc4_copier_is_single_bitdepth(sdev, available_fmt->output_pin_fmts, available_fmt->num_output_formats); /* Update the dai_params with the only supported format */ - if (single_format) { + if (single_bitdepth) { ret = sof_ipc4_update_hw_params(sdev, &dai_params, &available_fmt->output_pin_fmts[0].audio_fmt, BIT(SNDRV_PCM_HW_PARAM_FORMAT)); @@ -1618,12 +1618,12 @@ sof_ipc4_prepare_dai_copier(struct snd_sof_dev *sdev, struct snd_sof_dai *dai, return ret; } } else { - single_format = sof_ipc4_copier_is_single_format(sdev, + single_bitdepth = sof_ipc4_copier_is_single_bitdepth(sdev, available_fmt->input_pin_fmts, available_fmt->num_input_formats); /* Update the dai_params with the only supported format */ - if (single_format) { + if (single_bitdepth) { ret = sof_ipc4_update_hw_params(sdev, &dai_params, &available_fmt->input_pin_fmts[0].audio_fmt, BIT(SNDRV_PCM_HW_PARAM_FORMAT)); @@ -1632,7 +1632,7 @@ sof_ipc4_prepare_dai_copier(struct snd_sof_dev *sdev, struct snd_sof_dai *dai, } } - ret = snd_sof_get_nhlt_endpoint_data(sdev, dai, single_format, + ret = snd_sof_get_nhlt_endpoint_data(sdev, dai, single_bitdepth, &dai_params, ipc4_copier->dai_index, ipc4_copier->dai_type, dir, @@ -1667,7 +1667,7 @@ sof_ipc4_prepare_copier_module(struct snd_sof_widget *swidget, u32 out_ref_rate, out_ref_channels; u32 deep_buffer_dma_ms = 0; int output_fmt_index; - bool single_output_format; + bool single_output_bitdepth; int i; dev_dbg(sdev->dev, "copier %s, type %d", swidget->widget->name, swidget->id); @@ -1804,9 +1804,9 @@ sof_ipc4_prepare_copier_module(struct snd_sof_widget *swidget, return ret; /* set the reference params for output format selection */ - single_output_format = sof_ipc4_copier_is_single_format(sdev, - available_fmt->output_pin_fmts, - available_fmt->num_output_formats); + single_output_bitdepth = sof_ipc4_copier_is_single_bitdepth(sdev, + available_fmt->output_pin_fmts, + available_fmt->num_output_formats); switch (swidget->id) { case snd_soc_dapm_aif_in: case snd_soc_dapm_dai_out: @@ -1818,7 +1818,7 @@ sof_ipc4_prepare_copier_module(struct snd_sof_widget *swidget, out_ref_rate = in_fmt->sampling_frequency; out_ref_channels = SOF_IPC4_AUDIO_FORMAT_CFG_CHANNELS_COUNT(in_fmt->fmt_cfg); - if (!single_output_format) + if (!single_output_bitdepth) out_ref_valid_bits = SOF_IPC4_AUDIO_FORMAT_CFG_V_BIT_DEPTH(in_fmt->fmt_cfg); break; @@ -1827,7 +1827,7 @@ sof_ipc4_prepare_copier_module(struct snd_sof_widget *swidget, case snd_soc_dapm_dai_in: out_ref_rate = params_rate(fe_params); out_ref_channels = params_channels(fe_params); - if (!single_output_format) { + if (!single_output_bitdepth) { out_ref_valid_bits = sof_ipc4_get_valid_bits(sdev, fe_params); if (out_ref_valid_bits < 0) return out_ref_valid_bits; @@ -1845,7 +1845,7 @@ sof_ipc4_prepare_copier_module(struct snd_sof_widget *swidget, * if the output format is the same across all available output formats, choose * that as the reference. */ - if (single_output_format) { + if (single_output_bitdepth) { struct sof_ipc4_audio_format *out_fmt; out_fmt = &available_fmt->output_pin_fmts[0].audio_fmt; diff --git a/sound/soc/sof/ipc4-topology.h b/sound/soc/sof/ipc4-topology.h index 4488762f6a71ba..f4dc499c0ffe55 100644 --- a/sound/soc/sof/ipc4-topology.h +++ b/sound/soc/sof/ipc4-topology.h @@ -476,7 +476,7 @@ struct sof_ipc4_process { u32 init_config; }; -bool sof_ipc4_copier_is_single_format(struct snd_sof_dev *sdev, - struct sof_ipc4_pin_format *pin_fmts, - u32 pin_fmts_size); +bool sof_ipc4_copier_is_single_bitdepth(struct snd_sof_dev *sdev, + struct sof_ipc4_pin_format *pin_fmts, + u32 pin_fmts_size); #endif From 2fcad03eaba1b86e6b829f73a9e75e681b7f3106 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 30 May 2024 14:19:17 +0300 Subject: [PATCH 0374/1578] ASoC: SOF: ipc4-topology: Improve readability of sof_ipc4_prepare_dai_copier() Remove the duplicated code paths to check for single bit depth and to update the params with storing the parameters needed by the function and have a single code section. No functional change but the code is easier to follow. Signed-off-by: Peter Ujfalusi Reviewed-by: Pierre-Louis Bossart Reviewed-by: Bard Liao Reviewed-by: Seppo Ingalsuo Reviewed-by: Ranjani Sridharan Link: https://msgid.link/r/20240530111918.21974-5-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc4-topology.c | 42 +++++++++++++++-------------------- 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/sound/soc/sof/ipc4-topology.c b/sound/soc/sof/ipc4-topology.c index afb7908766a38e..645252789cfe5c 100644 --- a/sound/soc/sof/ipc4-topology.c +++ b/sound/soc/sof/ipc4-topology.c @@ -1590,8 +1590,10 @@ sof_ipc4_prepare_dai_copier(struct snd_sof_dev *sdev, struct snd_sof_dai *dai, struct sof_ipc4_available_audio_format *available_fmt; struct snd_pcm_hw_params dai_params = *params; struct sof_ipc4_copier_data *copier_data; + struct sof_ipc4_pin_format *pin_fmts; struct sof_ipc4_copier *ipc4_copier; bool single_bitdepth; + u32 num_pin_fmts; int ret; ipc4_copier = dai->private; @@ -1605,31 +1607,23 @@ sof_ipc4_prepare_dai_copier(struct snd_sof_dev *sdev, struct snd_sof_dai *dai, * format lookup */ if (dir == SNDRV_PCM_STREAM_PLAYBACK) { - single_bitdepth = sof_ipc4_copier_is_single_bitdepth(sdev, - available_fmt->output_pin_fmts, - available_fmt->num_output_formats); - - /* Update the dai_params with the only supported format */ - if (single_bitdepth) { - ret = sof_ipc4_update_hw_params(sdev, &dai_params, - &available_fmt->output_pin_fmts[0].audio_fmt, - BIT(SNDRV_PCM_HW_PARAM_FORMAT)); - if (ret) - return ret; - } + pin_fmts = available_fmt->output_pin_fmts; + num_pin_fmts = available_fmt->num_output_formats; } else { - single_bitdepth = sof_ipc4_copier_is_single_bitdepth(sdev, - available_fmt->input_pin_fmts, - available_fmt->num_input_formats); - - /* Update the dai_params with the only supported format */ - if (single_bitdepth) { - ret = sof_ipc4_update_hw_params(sdev, &dai_params, - &available_fmt->input_pin_fmts[0].audio_fmt, - BIT(SNDRV_PCM_HW_PARAM_FORMAT)); - if (ret) - return ret; - } + pin_fmts = available_fmt->input_pin_fmts; + num_pin_fmts = available_fmt->num_input_formats; + } + + single_bitdepth = sof_ipc4_copier_is_single_bitdepth(sdev, pin_fmts, + num_pin_fmts); + + /* Update the dai_params with the only supported format */ + if (single_bitdepth) { + ret = sof_ipc4_update_hw_params(sdev, &dai_params, + &pin_fmts[0].audio_fmt, + BIT(SNDRV_PCM_HW_PARAM_FORMAT)); + if (ret) + return ret; } ret = snd_sof_get_nhlt_endpoint_data(sdev, dai, single_bitdepth, From b65456b7b379e20ab225a4e906dc4a0c98fddd7a Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 30 May 2024 14:19:18 +0300 Subject: [PATCH 0375/1578] ASoC: SOF: ipc4-topology: Adjust the params based on DAI formats Currently we only check the bit depth value among to DAI formats, but other parameters might be constant, like number of channels and/or rate. In capture we use the fe params as a reference to find the format and blob which should be used, but in the path we can have components which can handle expanding/narrowing number of channels or do a resample. In these cases the topology is expected to have 'fixed' parameter for channels/rates/bit depth and the conversion to the fe format is going to be done within the path. In practice this patch fixes issues like: All DMIC formats are fixed four channels We have a component which converts the four channel to stereo FE is opened with 2 channel Even if we have the correct bit depth format and blob (for four channel) we will still be looking for stereo configurations, which will fail. Note: the adjustment of params have switched order with the checking of single bit depth (needed for the NHLT blob fallback support). This change is non function, just that if the sof_ipc4_narrow_params_to_format() would fail, there is no point of checking the single bit depth. Signed-off-by: Peter Ujfalusi Reviewed-by: Pierre-Louis Bossart Reviewed-by: Bard Liao Reviewed-by: Seppo Ingalsuo Reviewed-by: Ranjani Sridharan Link: https://msgid.link/r/20240530111918.21974-6-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc4-topology.c | 71 ++++++++++++++++++++++++++++------- 1 file changed, 57 insertions(+), 14 deletions(-) diff --git a/sound/soc/sof/ipc4-topology.c b/sound/soc/sof/ipc4-topology.c index 645252789cfe5c..1bea4bab958b8b 100644 --- a/sound/soc/sof/ipc4-topology.c +++ b/sound/soc/sof/ipc4-topology.c @@ -1583,6 +1583,55 @@ bool sof_ipc4_copier_is_single_bitdepth(struct snd_sof_dev *sdev, return true; } +static int +sof_ipc4_adjust_params_to_dai_format(struct snd_sof_dev *sdev, + struct snd_pcm_hw_params *params, + struct sof_ipc4_pin_format *pin_fmts, + u32 pin_fmts_size) +{ + u32 params_mask = BIT(SNDRV_PCM_HW_PARAM_RATE) | + BIT(SNDRV_PCM_HW_PARAM_CHANNELS) | + BIT(SNDRV_PCM_HW_PARAM_FORMAT); + struct sof_ipc4_audio_format *fmt; + u32 rate, channels, valid_bits; + int i; + + fmt = &pin_fmts[0].audio_fmt; + rate = fmt->sampling_frequency; + channels = SOF_IPC4_AUDIO_FORMAT_CFG_CHANNELS_COUNT(fmt->fmt_cfg); + valid_bits = SOF_IPC4_AUDIO_FORMAT_CFG_V_BIT_DEPTH(fmt->fmt_cfg); + + /* check if parameters in topology defined formats are the same */ + for (i = 1; i < pin_fmts_size; i++) { + u32 val; + + fmt = &pin_fmts[i].audio_fmt; + + if (params_mask & BIT(SNDRV_PCM_HW_PARAM_RATE)) { + val = fmt->sampling_frequency; + if (val != rate) + params_mask &= ~BIT(SNDRV_PCM_HW_PARAM_RATE); + } + if (params_mask & BIT(SNDRV_PCM_HW_PARAM_CHANNELS)) { + val = SOF_IPC4_AUDIO_FORMAT_CFG_CHANNELS_COUNT(fmt->fmt_cfg); + if (val != channels) + params_mask &= ~BIT(SNDRV_PCM_HW_PARAM_CHANNELS); + } + if (params_mask & BIT(SNDRV_PCM_HW_PARAM_FORMAT)) { + val = SOF_IPC4_AUDIO_FORMAT_CFG_V_BIT_DEPTH(fmt->fmt_cfg); + if (val != valid_bits) + params_mask &= ~BIT(SNDRV_PCM_HW_PARAM_FORMAT); + } + } + + if (params_mask) + return sof_ipc4_update_hw_params(sdev, params, + &pin_fmts[0].audio_fmt, + params_mask); + + return 0; +} + static int sof_ipc4_prepare_dai_copier(struct snd_sof_dev *sdev, struct snd_sof_dai *dai, struct snd_pcm_hw_params *params, int dir) @@ -1601,10 +1650,9 @@ sof_ipc4_prepare_dai_copier(struct snd_sof_dev *sdev, struct snd_sof_dai *dai, available_fmt = &ipc4_copier->available_fmt; /* - * If the copier on the DAI side supports only single bit depth then - * this depth (format) should be used to look for the NHLT blob (if - * needed) and in case of capture this should be used for the input - * format lookup + * Fixup the params based on the format parameters of the DAI. If any + * of the RATE, CHANNELS, bit depth is static among the formats then + * narrow the params to only allow that specific parameter value. */ if (dir == SNDRV_PCM_STREAM_PLAYBACK) { pin_fmts = available_fmt->output_pin_fmts; @@ -1614,18 +1662,13 @@ sof_ipc4_prepare_dai_copier(struct snd_sof_dev *sdev, struct snd_sof_dai *dai, num_pin_fmts = available_fmt->num_input_formats; } + ret = sof_ipc4_adjust_params_to_dai_format(sdev, &dai_params, pin_fmts, + num_pin_fmts); + if (ret) + return ret; + single_bitdepth = sof_ipc4_copier_is_single_bitdepth(sdev, pin_fmts, num_pin_fmts); - - /* Update the dai_params with the only supported format */ - if (single_bitdepth) { - ret = sof_ipc4_update_hw_params(sdev, &dai_params, - &pin_fmts[0].audio_fmt, - BIT(SNDRV_PCM_HW_PARAM_FORMAT)); - if (ret) - return ret; - } - ret = snd_sof_get_nhlt_endpoint_data(sdev, dai, single_bitdepth, &dai_params, ipc4_copier->dai_index, From 056620da899527c14cf36e5019a0decaf4cf0f79 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Mon, 20 May 2024 01:56:58 -0700 Subject: [PATCH 0376/1578] RDMA/bnxt_re: Fix the max msix vectors macro bnxt_re no longer decide the number of MSI-x vectors used by itself. Its decided by bnxt_en now. So when bnxt_en changes this value, system crash is seen. Depend on the max value reported by bnxt_en instead of using the its own macros. Fixes: 303432211324 ("bnxt_en: Remove runtime interrupt vector allocation") Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1716195418-11767-1-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 9dca451ed5221a..6974922e5609a0 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -107,8 +107,6 @@ struct bnxt_re_gsi_context { struct bnxt_re_sqp_entries *sqp_tbl; }; -#define BNXT_RE_MIN_MSIX 2 -#define BNXT_RE_MAX_MSIX 9 #define BNXT_RE_AEQ_IDX 0 #define BNXT_RE_NQ_IDX 1 #define BNXT_RE_GEN_P5_MAX_VF 64 @@ -168,7 +166,7 @@ struct bnxt_re_dev { struct bnxt_qplib_rcfw rcfw; /* NQ */ - struct bnxt_qplib_nq nq[BNXT_RE_MAX_MSIX]; + struct bnxt_qplib_nq nq[BNXT_MAX_ROCE_MSIX]; /* Device Resources */ struct bnxt_qplib_dev_attr dev_attr; From 12870ae3818e39ea65bf710f645972277b634f72 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Fri, 24 May 2024 14:29:54 -0500 Subject: [PATCH 0377/1578] powerpc/pseries/lparcfg: drop error message from guest name lookup It's not an error or exceptional situation when the hosting environment does not expose a name for the LP/guest via RTAS or the device tree. This happens with qemu when run without the '-name' option. The message also lacks a newline. Remove it. Signed-off-by: Nathan Lynch Fixes: eddaa9a40275 ("powerpc/pseries: read the lpar name from the firmware") Signed-off-by: Michael Ellerman Link: https://msgid.link/20240524-lparcfg-updates-v2-1-62e2e9d28724@linux.ibm.com --- arch/powerpc/platforms/pseries/lparcfg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index 6e7029640c0ca2..62da20f9700a97 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -371,8 +371,8 @@ static int read_dt_lpar_name(struct seq_file *m) static void read_lpar_name(struct seq_file *m) { - if (read_rtas_lpar_name(m) && read_dt_lpar_name(m)) - pr_err_once("Error can't get the LPAR name"); + if (read_rtas_lpar_name(m)) + read_dt_lpar_name(m); } #define SPLPAR_MAXLENGTH 1026*(sizeof(char)) From 2d43cc701b96f910f50915ac4c2a0cae5deb734c Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 29 May 2024 22:30:28 +1000 Subject: [PATCH 0378/1578] powerpc/uaccess: Fix build errors seen with GCC 13/14 Building ppc64le_defconfig with GCC 14 fails with assembler errors: CC fs/readdir.o /tmp/ccdQn0mD.s: Assembler messages: /tmp/ccdQn0mD.s:212: Error: operand out of domain (18 is not a multiple of 4) /tmp/ccdQn0mD.s:226: Error: operand out of domain (18 is not a multiple of 4) ... [6 lines] /tmp/ccdQn0mD.s:1699: Error: operand out of domain (18 is not a multiple of 4) A snippet of the asm shows: # ../fs/readdir.c:210: unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); ld 9,0(29) # MEM[(u64 *)name_38(D) + _88 * 1], MEM[(u64 *)name_38(D) + _88 * 1] # 210 "../fs/readdir.c" 1 1: std 9,18(8) # put_user # *__pus_addr_52, MEM[(u64 *)name_38(D) + _88 * 1] The 'std' instruction requires a 4-byte aligned displacement because it is a DS-form instruction, and as the assembler says, 18 is not a multiple of 4. A similar error is seen with GCC 13 and CONFIG_UBSAN_SIGNED_WRAP=y. The fix is to change the constraint on the memory operand to put_user(), from "m" which is a general memory reference to "YZ". The "Z" constraint is documented in the GCC manual PowerPC machine constraints, and specifies a "memory operand accessed with indexed or indirect addressing". "Y" is not documented in the manual but specifies a "memory operand for a DS-form instruction". Using both allows the compiler to generate a DS-form "std" or X-form "stdx" as appropriate. The change has to be conditional on CONFIG_PPC_KERNEL_PREFIXED because the "Y" constraint does not guarantee 4-byte alignment when prefixed instructions are enabled. Unfortunately clang doesn't support the "Y" constraint so that has to be behind an ifdef. Although the build error is only seen with GCC 13/14, that appears to just be luck. The constraint has been incorrect since it was first added. Fixes: c20beffeec3c ("powerpc/uaccess: Use flexible addressing with __put_user()/__get_user()") Cc: stable@vger.kernel.org # v5.10+ Suggested-by: Kewen Lin Signed-off-by: Michael Ellerman Link: https://msgid.link/20240529123029.146953-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/uaccess.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index de10437fd20652..4cba724c88991e 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -92,9 +92,25 @@ __pu_failed: \ : label) #endif +#ifdef CONFIG_CC_IS_CLANG +#define DS_FORM_CONSTRAINT "Z<>" +#else +#define DS_FORM_CONSTRAINT "YZ<>" +#endif + #ifdef __powerpc64__ +#ifdef CONFIG_PPC_KERNEL_PREFIXED #define __put_user_asm2_goto(x, ptr, label) \ __put_user_asm_goto(x, ptr, label, "std") +#else +#define __put_user_asm2_goto(x, addr, label) \ + asm goto ("1: std%U1%X1 %0,%1 # put_user\n" \ + EX_TABLE(1b, %l2) \ + : \ + : "r" (x), DS_FORM_CONSTRAINT (*addr) \ + : \ + : label) +#endif // CONFIG_PPC_KERNEL_PREFIXED #else /* __powerpc64__ */ #define __put_user_asm2_goto(x, addr, label) \ asm goto( \ From 50934945d54238d2d6d8db4b7c1d4c90d2696c57 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 29 May 2024 22:30:29 +1000 Subject: [PATCH 0379/1578] powerpc/uaccess: Use YZ asm constraint for ld The 'ld' instruction requires a 4-byte aligned displacement because it is a DS-form instruction. But the "m" asm constraint doesn't enforce that. Add a special case of __get_user_asm2_goto() so that the "YZ" constraint can be used for "ld". The "Z" constraint is documented in the GCC manual PowerPC machine constraints, and specifies a "memory operand accessed with indexed or indirect addressing". "Y" is not documented in the manual but specifies a "memory operand for a DS-form instruction". Using both allows the compiler to generate a DS-form "ld" or X-form "ldx" as appropriate. The change has to be conditional on CONFIG_PPC_KERNEL_PREFIXED because the "Y" constraint does not guarantee 4-byte alignment when prefixed instructions are enabled. No build errors have been reported due to this, but the possibility is there depending on compiler code generation decisions. Fixes: c20beffeec3c ("powerpc/uaccess: Use flexible addressing with __put_user()/__get_user()") Signed-off-by: Michael Ellerman Link: https://msgid.link/20240529123029.146953-2-mpe@ellerman.id.au --- arch/powerpc/include/asm/uaccess.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 4cba724c88991e..fd594bf6c6a9c5 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -181,8 +181,19 @@ do { \ #endif #ifdef __powerpc64__ +#ifdef CONFIG_PPC_KERNEL_PREFIXED #define __get_user_asm2_goto(x, addr, label) \ __get_user_asm_goto(x, addr, label, "ld") +#else +#define __get_user_asm2_goto(x, addr, label) \ + asm_goto_output( \ + "1: ld%U1%X1 %0, %1 # get_user\n" \ + EX_TABLE(1b, %l2) \ + : "=r" (x) \ + : DS_FORM_CONSTRAINT (*addr) \ + : \ + : label) +#endif // CONFIG_PPC_KERNEL_PREFIXED #else /* __powerpc64__ */ #define __get_user_asm2_goto(x, addr, label) \ asm_goto_output( \ From be2fc65d66e0406cc9d39d40becaecdf4ee765f3 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Wed, 29 May 2024 09:28:50 -0700 Subject: [PATCH 0380/1578] powerpc: Limit ARCH_HAS_KERNEL_FPU_SUPPORT to PPC64 When building a 32-bit kernel, some toolchains do not allow mixing soft float and hard float object files: LD vmlinux.o powerpc64le-unknown-linux-musl-ld: lib/test_fpu_impl.o uses hard float, arch/powerpc/kernel/udbg.o uses soft float powerpc64le-unknown-linux-musl-ld: failed to merge target specific data of file lib/test_fpu_impl.o make[2]: *** [scripts/Makefile.vmlinux_o:62: vmlinux.o] Error 1 make[1]: *** [Makefile:1152: vmlinux_o] Error 2 make: *** [Makefile:240: __sub-make] Error 2 This is not an issue when building a 64-bit kernel. To unbreak the build, limit ARCH_HAS_KERNEL_FPU_SUPPORT to 64-bit kernels. This is okay because the only real user of this option, amdgpu, was previously limited to PPC64 anyway; see commit a28e4b672f04 ("drm/amd/display: use ARCH_HAS_KERNEL_FPU_SUPPORT"). Fixes: 01db473e1aa3 ("powerpc: implement ARCH_HAS_KERNEL_FPU_SUPPORT") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202405250851.Z4daYSWG-lkp@intel.com/ Reported-by: Guenter Roeck Closes: https://lore.kernel.org/lkml/eeffaec3-df63-4e55-ab7a-064a65c00efa@roeck-us.net/ Signed-off-by: Samuel Holland Tested-by: Guenter Roeck Signed-off-by: Michael Ellerman Link: https://msgid.link/20240529162852.1209-1-samuel.holland@sifive.com --- arch/powerpc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 3c968f2f4ac447..c88c6d46a5bc01 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -137,7 +137,7 @@ config PPC select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_HUGEPD if HUGETLB_PAGE select ARCH_HAS_KCOV - select ARCH_HAS_KERNEL_FPU_SUPPORT if PPC_FPU + select ARCH_HAS_KERNEL_FPU_SUPPORT if PPC64 && PPC_FPU select ARCH_HAS_MEMBARRIER_CALLBACKS select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_MEMREMAP_COMPAT_ALIGN if PPC_64S_HASH_MMU From 0e5895ff7fab0fc05ec17daf9a568368828fa6ea Mon Sep 17 00:00:00 2001 From: Gerald Loacker Date: Wed, 29 May 2024 16:42:45 +0200 Subject: [PATCH 0381/1578] drm/panel: sitronix-st7789v: fix timing for jt240mhqs_hwt_ek_e3 panel Flickering was observed when using partial mode. Moving the vsync to the same position as used by the default sitronix-st7789v timing resolves this issue. Fixes: 0fbbe96bfa08 ("drm/panel: sitronix-st7789v: add jasonic jt240mhqs-hwt-ek-e3 support") Acked-by: Jessica Zhang Signed-off-by: Gerald Loacker Link: https://lore.kernel.org/r/20240409-bugfix-jt240mhqs_hwt_ek_e3-timing-v2-1-e4821802443d@wolfvision.net Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20240409-bugfix-jt240mhqs_hwt_ek_e3-timing-v2-1-e4821802443d@wolfvision.net --- drivers/gpu/drm/panel/panel-sitronix-st7789v.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/panel/panel-sitronix-st7789v.c b/drivers/gpu/drm/panel/panel-sitronix-st7789v.c index 88e80fe98112da..32e5c03480381e 100644 --- a/drivers/gpu/drm/panel/panel-sitronix-st7789v.c +++ b/drivers/gpu/drm/panel/panel-sitronix-st7789v.c @@ -286,9 +286,9 @@ static const struct drm_display_mode jt240mhqs_hwt_ek_e3_mode = { .hsync_end = 240 + 28 + 10, .htotal = 240 + 28 + 10 + 10, .vdisplay = 280, - .vsync_start = 280 + 8, - .vsync_end = 280 + 8 + 4, - .vtotal = 280 + 8 + 4 + 4, + .vsync_start = 280 + 48, + .vsync_end = 280 + 48 + 4, + .vtotal = 280 + 48 + 4 + 4, .width_mm = 43, .height_mm = 37, .flags = DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC, From 2ba50582634d0bfe3a333ab7575a7f0122a7cde8 Mon Sep 17 00:00:00 2001 From: Gerald Loacker Date: Wed, 29 May 2024 16:42:46 +0200 Subject: [PATCH 0382/1578] drm/panel: sitronix-st7789v: tweak timing for jt240mhqs_hwt_ek_e3 panel Use the default timing parameters to get a refresh rate of about 60 Hz for a clock of 6 MHz. Fixes: 0fbbe96bfa08 ("drm/panel: sitronix-st7789v: add jasonic jt240mhqs-hwt-ek-e3 support") Signed-off-by: Gerald Loacker Acked-by: Jessica Zhang Link: https://lore.kernel.org/r/20240409-bugfix-jt240mhqs_hwt_ek_e3-timing-v2-2-e4821802443d@wolfvision.net Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20240409-bugfix-jt240mhqs_hwt_ek_e3-timing-v2-2-e4821802443d@wolfvision.net --- drivers/gpu/drm/panel/panel-sitronix-st7789v.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/panel/panel-sitronix-st7789v.c b/drivers/gpu/drm/panel/panel-sitronix-st7789v.c index 32e5c03480381e..c7e3f1280404d0 100644 --- a/drivers/gpu/drm/panel/panel-sitronix-st7789v.c +++ b/drivers/gpu/drm/panel/panel-sitronix-st7789v.c @@ -282,9 +282,9 @@ static const struct drm_display_mode et028013dma_mode = { static const struct drm_display_mode jt240mhqs_hwt_ek_e3_mode = { .clock = 6000, .hdisplay = 240, - .hsync_start = 240 + 28, - .hsync_end = 240 + 28 + 10, - .htotal = 240 + 28 + 10 + 10, + .hsync_start = 240 + 38, + .hsync_end = 240 + 38 + 10, + .htotal = 240 + 38 + 10 + 10, .vdisplay = 280, .vsync_start = 280 + 48, .vsync_end = 280 + 48 + 4, From b62c150c3bae72ac1910dcc588f360159eb0744a Mon Sep 17 00:00:00 2001 From: Gerald Loacker Date: Wed, 29 May 2024 16:42:47 +0200 Subject: [PATCH 0383/1578] drm/panel: sitronix-st7789v: fix display size for jt240mhqs_hwt_ek_e3 panel This is a portrait mode display. Change the dimensions accordingly. Fixes: 0fbbe96bfa08 ("drm/panel: sitronix-st7789v: add jasonic jt240mhqs-hwt-ek-e3 support") Signed-off-by: Gerald Loacker Acked-by: Jessica Zhang Link: https://lore.kernel.org/r/20240409-bugfix-jt240mhqs_hwt_ek_e3-timing-v2-3-e4821802443d@wolfvision.net Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20240409-bugfix-jt240mhqs_hwt_ek_e3-timing-v2-3-e4821802443d@wolfvision.net --- drivers/gpu/drm/panel/panel-sitronix-st7789v.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/panel/panel-sitronix-st7789v.c b/drivers/gpu/drm/panel/panel-sitronix-st7789v.c index c7e3f1280404d0..e8f385b9c6182c 100644 --- a/drivers/gpu/drm/panel/panel-sitronix-st7789v.c +++ b/drivers/gpu/drm/panel/panel-sitronix-st7789v.c @@ -289,8 +289,8 @@ static const struct drm_display_mode jt240mhqs_hwt_ek_e3_mode = { .vsync_start = 280 + 48, .vsync_end = 280 + 48 + 4, .vtotal = 280 + 48 + 4 + 4, - .width_mm = 43, - .height_mm = 37, + .width_mm = 37, + .height_mm = 43, .flags = DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC, }; From 03fa18a992d5626fd7bf3557a52e826bf8b326b3 Mon Sep 17 00:00:00 2001 From: Honggang LI Date: Thu, 16 May 2024 17:50:52 +0800 Subject: [PATCH 0384/1578] RDMA/rxe: Fix data copy for IB_SEND_INLINE For RDMA Send and Write with IB_SEND_INLINE, the memory buffers specified in sge list will be placed inline in the Send Request. The data should be copied by CPU from the virtual addresses of corresponding sge list DMA addresses. Cc: stable@kernel.org Fixes: 8d7c7c0eeb74 ("RDMA: Add ib_virt_dma_to_page()") Signed-off-by: Honggang LI Link: https://lore.kernel.org/r/20240516095052.542767-1-honggangli@163.com Reviewed-by: Zhu Yanjun Reviewed-by: Li Zhijian Reviewed-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index c7d4d8ab5a0941..de6238ee4379b7 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -812,7 +812,7 @@ static void copy_inline_data_to_wqe(struct rxe_send_wqe *wqe, int i; for (i = 0; i < ibwr->num_sge; i++, sge++) { - memcpy(p, ib_virt_dma_to_page(sge->addr), sge->length); + memcpy(p, ib_virt_dma_to_ptr(sge->addr), sge->length); p += sge->length; } } From 34bf6bae3286a58762711cfbce2cf74ecd42e1b5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 May 2024 22:21:31 +0200 Subject: [PATCH 0385/1578] x86/topology/amd: Evaluate SMT in CPUID leaf 0x8000001e only on family 0x17 and greater The new AMD/HYGON topology parser evaluates the SMT information in CPUID leaf 0x8000001e unconditionally while the original code restricted it to CPUs with family 0x17 and greater. This breaks family 0x15 CPUs which advertise that leaf and have a non-zero value in the SMT section. The machine boots, but the scheduler complains loudly about the mismatch of the core IDs: WARNING: CPU: 1 PID: 0 at kernel/sched/core.c:6482 sched_cpu_starting+0x183/0x250 WARNING: CPU: 0 PID: 1 at kernel/sched/topology.c:2408 build_sched_domains+0x76b/0x12b0 Add the condition back to cure it. [ bp: Make it actually build because grandpa is not concerned with trivial stuff. :-P ] Fixes: f7fb3b2dd92c ("x86/cpu: Provide an AMD/HYGON specific topology parser") Closes: https://gitlab.archlinux.org/archlinux/packaging/packages/linux/-/issues/56 Reported-by: Tim Teichmann Reported-by: Christian Heusel Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Tim Teichmann Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/7skhx6mwe4hxiul64v6azhlxnokheorksqsdbp7qw6g2jduf6c@7b5pvomauugk --- arch/x86/kernel/cpu/topology_amd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index d419deed6a4884..7d476fa697ca53 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -84,9 +84,9 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext) /* * If leaf 0xb is available, then the domain shifts are set - * already and nothing to do here. + * already and nothing to do here. Only valid for family >= 0x17. */ - if (!has_topoext) { + if (!has_topoext && tscan->c->x86 >= 0x17) { /* * Leaf 0x80000008 set the CORE domain shift already. * Update the SMT domain, but do not propagate it. From e112311615a24e1618a591c73506571dc304eb8d Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 30 May 2024 07:23:39 -0700 Subject: [PATCH 0386/1578] io_uring/rw: Free iovec before cleaning async data kmemleak shows that there is a memory leak in io_uring read operation, where a buffer is allocated at iovec import, but never de-allocated. The memory is allocated at io_async_rw->free_iovec, but, then io_async_rw is kfreed, taking the allocated memory with it. I saw this happening when the read operation fails with -11 (EAGAIN). This is the kmemleak splat. unreferenced object 0xffff8881da591c00 (size 256): ... backtrace (crc 7a15bdee): [<00000000256f2de4>] __kmalloc+0x2d6/0x410 [<000000007a9f5fc7>] iovec_from_user.part.0+0xc6/0x160 [<00000000cecdf83a>] __import_iovec+0x50/0x220 [<00000000d1d586a2>] __io_import_iovec+0x13d/0x220 [<0000000054ee9bd2>] io_prep_rw+0x186/0x340 [<00000000a9c0372d>] io_prep_rwv+0x31/0x120 [<000000001d1170b9>] io_prep_readv+0xe/0x30 [<0000000070b8eb67>] io_submit_sqes+0x1bd/0x780 [<00000000812496d4>] __do_sys_io_uring_enter+0x3ed/0x5b0 [<0000000081499602>] do_syscall_64+0x5d/0x170 [<00000000de1c5a4d>] entry_SYSCALL_64_after_hwframe+0x76/0x7e This occurs because the async data cleanup functions are not set for read/write operations. As a result, the potentially allocated iovec in the rw async data is not freed before the async data is released, leading to a memory leak. With this following patch, kmemleak does not show the leaked memory anymore, and all liburing tests pass. Fixes: a9165b83c193 ("io_uring/rw: always setup io_async_rw for read/write requests") Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20240530142340.1248216-1-leitao@debian.org Signed-off-by: Jens Axboe --- io_uring/opdef.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 2de5cca9504eb4..2e3b7b16effb34 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -516,10 +516,12 @@ const struct io_cold_def io_cold_defs[] = { }, [IORING_OP_READ_FIXED] = { .name = "READ_FIXED", + .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_WRITE_FIXED] = { .name = "WRITE_FIXED", + .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_POLL_ADD] = { @@ -582,10 +584,12 @@ const struct io_cold_def io_cold_defs[] = { }, [IORING_OP_READ] = { .name = "READ", + .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_WRITE] = { .name = "WRITE", + .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_FADVISE] = { @@ -692,6 +696,7 @@ const struct io_cold_def io_cold_defs[] = { }, [IORING_OP_READ_MULTISHOT] = { .name = "READ_MULTISHOT", + .cleanup = io_readv_writev_cleanup, }, [IORING_OP_WAITID] = { .name = "WAITID", From 328af04b1ac279f17c09ddd5958aa6ec47131c80 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 27 May 2024 23:43:38 +0300 Subject: [PATCH 0387/1578] media: mei: csi: Put the IPU device reference The mei csi's probe function obtains a reference to the IPU device but never puts that reference. Do that now. Fixes: 33116eb12c6b ("media: ivsc: csi: Use IPU bridge") Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil --- drivers/media/pci/intel/ivsc/mei_csi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/pci/intel/ivsc/mei_csi.c b/drivers/media/pci/intel/ivsc/mei_csi.c index 89b582a221ab41..53151d01618826 100644 --- a/drivers/media/pci/intel/ivsc/mei_csi.c +++ b/drivers/media/pci/intel/ivsc/mei_csi.c @@ -677,6 +677,7 @@ static int mei_csi_probe(struct mei_cl_device *cldev, return -ENODEV; ret = ipu_bridge_init(&ipu->dev, ipu_bridge_parse_ssdb); + put_device(&ipu->dev); if (ret < 0) return ret; if (WARN_ON(!dev_fwnode(dev))) From cc864821c7e8b921ebbfb21b17c92f8b3ea3d7ff Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 27 May 2024 23:13:27 +0300 Subject: [PATCH 0388/1578] media: mei: csi: Warn less verbosely of a missing device fwnode The check for having device fwnode was meant to be a sanity check but this also happens if the ACPI DSDT has graph port nodes on sensor device(s) but not on the IVSC device. Use a more meaningful warning message to tell about this. Fixes: 33116eb12c6b ("media: ivsc: csi: Use IPU bridge") Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil --- drivers/media/pci/intel/ivsc/mei_csi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/media/pci/intel/ivsc/mei_csi.c b/drivers/media/pci/intel/ivsc/mei_csi.c index 53151d01618826..f04a89584334bf 100644 --- a/drivers/media/pci/intel/ivsc/mei_csi.c +++ b/drivers/media/pci/intel/ivsc/mei_csi.c @@ -680,8 +680,10 @@ static int mei_csi_probe(struct mei_cl_device *cldev, put_device(&ipu->dev); if (ret < 0) return ret; - if (WARN_ON(!dev_fwnode(dev))) + if (!dev_fwnode(dev)) { + dev_err(dev, "mei-csi probed without device fwnode!\n"); return -ENXIO; + } csi = devm_kzalloc(dev, sizeof(struct mei_csi), GFP_KERNEL); if (!csi) From 54880795b494e6557b47fe3cf3f21f2f54c2aa68 Mon Sep 17 00:00:00 2001 From: Bingbu Cao Date: Wed, 29 May 2024 11:44:32 +0800 Subject: [PATCH 0389/1578] media: intel/ipu6: update the maximum supported csi2 port number to 6 IPU6EP on Meteor Lake SoC supports maximum 6 csi2 ports instead of 4. Fixes: 25fedc021985 ("media: intel/ipu6: add Intel IPU6 PCI device driver") Signed-off-by: Bingbu Cao Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil --- drivers/media/pci/intel/ipu6/ipu6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/pci/intel/ipu6/ipu6.c b/drivers/media/pci/intel/ipu6/ipu6.c index f587f609259d7c..bbd646378ab3ed 100644 --- a/drivers/media/pci/intel/ipu6/ipu6.c +++ b/drivers/media/pci/intel/ipu6/ipu6.c @@ -285,7 +285,7 @@ EXPORT_SYMBOL_NS_GPL(ipu6_configure_spc, INTEL_IPU6); #define IPU6_ISYS_CSI2_NPORTS 4 #define IPU6SE_ISYS_CSI2_NPORTS 4 #define IPU6_TGL_ISYS_CSI2_NPORTS 8 -#define IPU6EP_MTL_ISYS_CSI2_NPORTS 4 +#define IPU6EP_MTL_ISYS_CSI2_NPORTS 6 static void ipu6_internal_pdata_init(struct ipu6_device *isp) { From ffb9072bce200a4d004006e8b40c366933cf517b Mon Sep 17 00:00:00 2001 From: Bingbu Cao Date: Wed, 29 May 2024 14:43:21 +0800 Subject: [PATCH 0390/1578] media: intel/ipu6: add csi2 port sanity check in notifier bound Invalid csi2 port will break the isys notifier bound ops as it is trying to access an invalid csi2 sub-device instance based on the port. It will trigger a mc warning, and it will cause the sensor driver to unbound an inexistent isys csi2 and crash. Adding a csi2 port sanity check, return error to avoid such case. Fixes: f50c4ca0a820 ("media: intel/ipu6: add the main input system driver") Signed-off-by: Bingbu Cao [Sakari Ailus: Fix spelling of "nports" field.] Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil --- drivers/media/pci/intel/ipu6/ipu6-isys.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/media/pci/intel/ipu6/ipu6-isys.c b/drivers/media/pci/intel/ipu6/ipu6-isys.c index 1998b72ac07de9..8b9b77719bb164 100644 --- a/drivers/media/pci/intel/ipu6/ipu6-isys.c +++ b/drivers/media/pci/intel/ipu6/ipu6-isys.c @@ -678,6 +678,12 @@ static int isys_notifier_bound(struct v4l2_async_notifier *notifier, container_of(asc, struct sensor_async_sd, asc); int ret; + if (s_asd->csi2.port >= isys->pdata->ipdata->csi2.nports) { + dev_err(&isys->adev->auxdev.dev, "invalid csi2 port %u\n", + s_asd->csi2.port); + return -EINVAL; + } + ret = ipu_bridge_instantiate_vcm(sd->dev); if (ret) { dev_err(&isys->adev->auxdev.dev, "instantiate vcm failed\n"); From 2a38e4ca302280fdcce370ba2bee79bac16c4587 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 17 May 2024 13:05:34 -0700 Subject: [PATCH 0391/1578] x86/cpu: Provide default cache line size if not enumerated MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tl;dr: CPUs with CPUID.80000008H but without CPUID.01H:EDX[CLFSH] will end up reporting cache_line_size()==0 and bad things happen. Fill in a default on those to avoid the problem. Long Story: The kernel dies a horrible death if c->x86_cache_alignment (aka. cache_line_size() is 0. Normally, this value is populated from c->x86_clflush_size. Right now the code is set up to get c->x86_clflush_size from two places. First, modern CPUs get it from CPUID. Old CPUs that don't have leaf 0x80000008 (or CPUID at all) just get some sane defaults from the kernel in get_cpu_address_sizes(). The vast majority of CPUs that have leaf 0x80000008 also get ->x86_clflush_size from CPUID. But there are oddballs. Intel Quark CPUs[1] and others[2] have leaf 0x80000008 but don't set CPUID.01H:EDX[CLFSH], so they skip over filling in ->x86_clflush_size: cpuid(0x00000001, &tfms, &misc, &junk, &cap0); if (cap0 & (1<<19)) c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; So they: land in get_cpu_address_sizes() and see that CPUID has level 0x80000008 and jump into the side of the if() that does not fill in c->x86_clflush_size. That assigns a 0 to c->x86_cache_alignment, and hilarity ensues in code like: buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), GFP_KERNEL); To fix this, always provide a sane value for ->x86_clflush_size. Big thanks to Andy Shevchenko for finding and reporting this and also providing a first pass at a fix. But his fix was only partial and only worked on the Quark CPUs. It would not, for instance, have worked on the QEMU config. 1. https://raw.githubusercontent.com/InstLatx64/InstLatx64/master/GenuineIntel/GenuineIntel0000590_Clanton_03_CPUID.txt 2. You can also get this behavior if you use "-cpu 486,+clzero" in QEMU. [ dhansen: remove 'vp_bits_from_cpuid' reference in changelog because bpetkov brutally murdered it recently. ] Fixes: fbf6449f84bf ("x86/sev-es: Set x86_virt_bits to the correct value straight away, instead of a two-phase approach") Reported-by: Andy Shevchenko Signed-off-by: Dave Hansen Tested-by: Andy Shevchenko Tested-by: Jörn Heusipp Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20240516173928.3960193-1-andriy.shevchenko@linux.intel.com/ Link: https://lore.kernel.org/lkml/5e31cad3-ad4d-493e-ab07-724cfbfaba44@heusipp.de/ Link: https://lore.kernel.org/all/20240517200534.8EC5F33E%40davehans-spike.ostc.intel.com --- arch/x86/kernel/cpu/common.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2b170da84f97fe..e31293c9609f50 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1075,6 +1075,10 @@ void get_cpu_address_sizes(struct cpuinfo_x86 *c) c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; + + /* Provide a sane default if not enumerated: */ + if (!c->x86_clflush_size) + c->x86_clflush_size = 32; } c->x86_cache_bits = c->x86_phys_bits; From 027a44fedd55fbdf1d45603894634acd960ad04b Mon Sep 17 00:00:00 2001 From: Peter Colberg Date: Tue, 21 May 2024 14:12:46 -0400 Subject: [PATCH 0392/1578] hwmon: (intel-m10-bmc-hwmon) Fix multiplier for N6000 board power sensor The Intel N6000 BMC outputs the board power value in milliwatt, whereas the hwmon sysfs interface must provide power values in microwatt. Fixes: e1983220ae14 ("hwmon: intel-m10-bmc-hwmon: Add N6000 sensors") Signed-off-by: Peter Colberg Reviewed-by: Matthew Gerlach Link: https://lore.kernel.org/r/20240521181246.683833-1-peter.colberg@intel.com Signed-off-by: Guenter Roeck --- drivers/hwmon/intel-m10-bmc-hwmon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/intel-m10-bmc-hwmon.c b/drivers/hwmon/intel-m10-bmc-hwmon.c index 6500ca548f9c73..ca2dff15892515 100644 --- a/drivers/hwmon/intel-m10-bmc-hwmon.c +++ b/drivers/hwmon/intel-m10-bmc-hwmon.c @@ -429,7 +429,7 @@ static const struct m10bmc_sdata n6000bmc_curr_tbl[] = { }; static const struct m10bmc_sdata n6000bmc_power_tbl[] = { - { 0x724, 0x0, 0x0, 0x0, 0x0, 1, "Board Power" }, + { 0x724, 0x0, 0x0, 0x0, 0x0, 1000, "Board Power" }, }; static const struct hwmon_channel_info * const n6000bmc_hinfo[] = { From 52a2c70c3ec555e670a34dd1ab958986451d2dd2 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 30 May 2024 08:20:14 -0700 Subject: [PATCH 0393/1578] hwmon: (shtc1) Fix property misspelling The property name is "sensirion,low-precision", not "sensicon,low-precision". Cc: Chris Ruehl Fixes: be7373b60df5 ("hwmon: shtc1: add support for device tree bindings") Signed-off-by: Guenter Roeck --- drivers/hwmon/shtc1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/shtc1.c b/drivers/hwmon/shtc1.c index 1f96e94967ee8d..439dd3dba5fc81 100644 --- a/drivers/hwmon/shtc1.c +++ b/drivers/hwmon/shtc1.c @@ -238,7 +238,7 @@ static int shtc1_probe(struct i2c_client *client) if (np) { data->setup.blocking_io = of_property_read_bool(np, "sensirion,blocking-io"); - data->setup.high_precision = !of_property_read_bool(np, "sensicon,low-precision"); + data->setup.high_precision = !of_property_read_bool(np, "sensirion,low-precision"); } else { if (client->dev.platform_data) data->setup = *(struct shtc1_platform_data *)dev->platform_data; From 41011e2de3480f9adb6420b35b67628b2d903355 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 28 May 2024 11:06:31 +0100 Subject: [PATCH 0394/1578] KVM: arm64: nv: Fix relative priorities of exceptions generated by ERETAx ERETAx can fail in multiple ways: (1) ELR_EL2 points lalaland (2) we get a PAC failure (3) SPSR_EL2 has the wrong mode (1) is easy, as we just let the CPU do its thing and deliver an Instruction Abort. However, (2) and (3) are interesting, because the PAC failure priority is way below that of the Illegal Execution State exception. Which means that if we have detected a PAC failure (and that we have FPACCOMBINE), we must be careful to give priority to the Illegal Execution State exception, should one be pending. Solving this involves hoisting the SPSR calculation earlier and testing for the IL bit before injecting the FPAC exception. In the extreme case of a ERETAx returning to an invalid mode *and* failing its PAC check, we end up with an Instruction Abort (due to the new PC being mangled by the failed Auth) *and* PSTATE.IL being set. Which matches the requirements of the architecture. Whilst we're at it, remove a stale comment that states the obvious and only confuses the reader. Fixes: 213b3d1ea161 ("KVM: arm64: nv: Handle ERETA[AB] instructions") Reviewed-by: Joey Gouly Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20240528100632.1831995-2-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/emulate-nested.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 72d733c74a382f..54090967a33567 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -2181,16 +2181,23 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu) if (forward_traps(vcpu, HCR_NV)) return; + spsr = vcpu_read_sys_reg(vcpu, SPSR_EL2); + spsr = kvm_check_illegal_exception_return(vcpu, spsr); + /* Check for an ERETAx */ esr = kvm_vcpu_get_esr(vcpu); if (esr_iss_is_eretax(esr) && !kvm_auth_eretax(vcpu, &elr)) { /* - * Oh no, ERETAx failed to authenticate. If we have - * FPACCOMBINE, deliver an exception right away. If we - * don't, then let the mangled ELR value trickle down the + * Oh no, ERETAx failed to authenticate. + * + * If we have FPACCOMBINE and we don't have a pending + * Illegal Execution State exception (which has priority + * over FPAC), deliver an exception right away. + * + * Otherwise, let the mangled ELR value trickle down the * ERET handling, and the guest will have a little surprise. */ - if (kvm_has_pauth(vcpu->kvm, FPACCOMBINE)) { + if (kvm_has_pauth(vcpu->kvm, FPACCOMBINE) && !(spsr & PSR_IL_BIT)) { esr &= ESR_ELx_ERET_ISS_ERETA; esr |= FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_FPAC); kvm_inject_nested_sync(vcpu, esr); @@ -2201,17 +2208,11 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu) preempt_disable(); kvm_arch_vcpu_put(vcpu); - spsr = __vcpu_sys_reg(vcpu, SPSR_EL2); - spsr = kvm_check_illegal_exception_return(vcpu, spsr); if (!esr_iss_is_eretax(esr)) elr = __vcpu_sys_reg(vcpu, ELR_EL2); trace_kvm_nested_eret(vcpu, elr, spsr); - /* - * Note that the current exception level is always the virtual EL2, - * since we set HCR_EL2.NV bit only when entering the virtual EL2. - */ *vcpu_pc(vcpu) = elr; *vcpu_cpsr(vcpu) = spsr; From 47eb2d68d10208e6a9e89d10b66018e8d6ca0623 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 28 May 2024 11:06:32 +0100 Subject: [PATCH 0395/1578] KVM: arm64: nv: Expose BTI and CSV_frac to a guest hypervisor Now that we expose PAC to NV guests, we can also expose BTI (as the two as joined at the hip, due to some of the PAC instructions being landing pads). While we're at it, also propagate CSV_frac, which requires no particular emulation. Fixes: f4f6a95bac49 ("KVM: arm64: nv: Advertise support for PAuth") Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20240528100632.1831995-3-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/nested.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 6813c7c7f00ab2..bae8536cbf003e 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -58,8 +58,10 @@ static u64 limit_nv_id_reg(u32 id, u64 val) break; case SYS_ID_AA64PFR1_EL1: - /* Only support SSBS */ - val &= NV_FTR(PFR1, SSBS); + /* Only support BTI, SSBS, CSV2_frac */ + val &= (NV_FTR(PFR1, BT) | + NV_FTR(PFR1, SSBS) | + NV_FTR(PFR1, CSV2_frac)); break; case SYS_ID_AA64MMFR0_EL1: From a638b0461b58aa3205cd9d5f14d6f703d795b4af Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Thu, 23 May 2024 11:43:23 +0300 Subject: [PATCH 0396/1578] riscv: prevent pt_regs corruption for secondary idle threads Top of the kernel thread stack should be reserved for pt_regs. However this is not the case for the idle threads of the secondary boot harts. Their stacks overlap with their pt_regs, so both may get corrupted. Similar issue has been fixed for the primary hart, see c7cdd96eca28 ("riscv: prevent stack corruption by reserving task_pt_regs(p) early"). However that fix was not propagated to the secondary harts. The problem has been noticed in some CPU hotplug tests with V enabled. The function smp_callin stored several registers on stack, corrupting top of pt_regs structure including status field. As a result, kernel attempted to save or restore inexistent V context. Fixes: 9a2451f18663 ("RISC-V: Avoid using per cpu array for ordered booting") Fixes: 2875fe056156 ("RISC-V: Add cpu_ops and modify default booting method") Signed-off-by: Sergey Matyukevich Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20240523084327.2013211-1-geomatsi@gmail.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cpu_ops_sbi.c | 2 +- arch/riscv/kernel/cpu_ops_spinwait.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kernel/cpu_ops_sbi.c b/arch/riscv/kernel/cpu_ops_sbi.c index 1cc7df740eddc9..e6fbaaf549562d 100644 --- a/arch/riscv/kernel/cpu_ops_sbi.c +++ b/arch/riscv/kernel/cpu_ops_sbi.c @@ -72,7 +72,7 @@ static int sbi_cpu_start(unsigned int cpuid, struct task_struct *tidle) /* Make sure tidle is updated */ smp_mb(); bdata->task_ptr = tidle; - bdata->stack_ptr = task_stack_page(tidle) + THREAD_SIZE; + bdata->stack_ptr = task_pt_regs(tidle); /* Make sure boot data is updated */ smp_mb(); hsm_data = __pa(bdata); diff --git a/arch/riscv/kernel/cpu_ops_spinwait.c b/arch/riscv/kernel/cpu_ops_spinwait.c index 613872b0a21acb..24869eb889085e 100644 --- a/arch/riscv/kernel/cpu_ops_spinwait.c +++ b/arch/riscv/kernel/cpu_ops_spinwait.c @@ -34,8 +34,7 @@ static void cpu_update_secondary_bootdata(unsigned int cpuid, /* Make sure tidle is updated */ smp_mb(); - WRITE_ONCE(__cpu_spinwait_stack_pointer[hartid], - task_stack_page(tidle) + THREAD_SIZE); + WRITE_ONCE(__cpu_spinwait_stack_pointer[hartid], task_pt_regs(tidle)); WRITE_ONCE(__cpu_spinwait_task_pointer[hartid], tidle); } From 7bed51617401dab2be930b13ed5aacf581f7c8ef Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Sun, 26 May 2024 13:01:04 +0200 Subject: [PATCH 0397/1578] riscv: enable HAVE_ARCH_HUGE_VMAP for XIP kernel HAVE_ARCH_HUGE_VMAP also works on XIP kernel, so remove its dependency on !XIP_KERNEL. This also fixes a boot problem for XIP kernel introduced by the commit in "Fixes:". This commit used huge page mapping for vmemmap, but huge page vmap was not enabled for XIP kernel. Fixes: ff172d4818ad ("riscv: Use hugepage mappings for vmemmap") Signed-off-by: Nam Cao Cc: Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20240526110104.470429-1-namcao@linutronix.de Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index b94176e25be181..0525ee2d63c716 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -106,7 +106,7 @@ config RISCV select HAS_IOPORT if MMU select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_HUGE_VMALLOC if HAVE_ARCH_HUGE_VMAP - select HAVE_ARCH_HUGE_VMAP if MMU && 64BIT && !XIP_KERNEL + select HAVE_ARCH_HUGE_VMAP if MMU && 64BIT select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL select HAVE_ARCH_JUMP_LABEL_RELATIVE if !XIP_KERNEL select HAVE_ARCH_KASAN if MMU && 64BIT From 982a7eb97be685d1129c06671aed4c26d6919af4 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Fri, 24 May 2024 11:56:00 -0700 Subject: [PATCH 0398/1578] Documentation: RISC-V: uabi: Only scalar misaligned loads are supported We're stuck supporting scalar misaligned loads in userspace because they were part of the ISA at the time we froze the uABI. That wasn't the case for vector misaligned accesses, so depending on them unconditionally is a userspace bug. All extant vector hardware traps on these misaligned accesses. Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20240524185600.5919-1-palmer@rivosinc.com Signed-off-by: Palmer Dabbelt --- Documentation/arch/riscv/uabi.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/arch/riscv/uabi.rst b/Documentation/arch/riscv/uabi.rst index 54d199dce78bf5..2b420bab0527a7 100644 --- a/Documentation/arch/riscv/uabi.rst +++ b/Documentation/arch/riscv/uabi.rst @@ -65,4 +65,6 @@ the extension, or may have deliberately removed it from the listing. Misaligned accesses ------------------- -Misaligned accesses are supported in userspace, but they may perform poorly. +Misaligned scalar accesses are supported in userspace, but they may perform +poorly. Misaligned vector accesses are only supported if the Zicclsm extension +is supported. From 1d84afaf02524d2558e8ca3ca169be2ef720380b Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Thu, 30 May 2024 16:55:46 +0200 Subject: [PATCH 0399/1578] riscv: Fix fully ordered LR/SC xchg[8|16]() implementations The fully ordered versions of xchg[8|16]() using LR/SC lack the necessary memory barriers to guarantee the order. Fix this by matching what is already implemented in the fully ordered versions of cmpxchg() using LR/SC. Suggested-by: Andrea Parri Reported-by: Andrea Parri Closes: https://lore.kernel.org/linux-riscv/ZlYbupL5XgzgA0MX@andrea/T/#u Fixes: a8ed2b7a2c13 ("riscv/cmpxchg: Implement xchg for variables of size 1 and 2") Signed-off-by: Alexandre Ghiti Reviewed-by: Andrea Parri Link: https://lore.kernel.org/r/20240530145546.394248-1-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/cmpxchg.h | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h index ddb002ed89dea0..808b4c78462e5a 100644 --- a/arch/riscv/include/asm/cmpxchg.h +++ b/arch/riscv/include/asm/cmpxchg.h @@ -10,7 +10,7 @@ #include -#define __arch_xchg_masked(prepend, append, r, p, n) \ +#define __arch_xchg_masked(sc_sfx, prepend, append, r, p, n) \ ({ \ u32 *__ptr32b = (u32 *)((ulong)(p) & ~0x3); \ ulong __s = ((ulong)(p) & (0x4 - sizeof(*p))) * BITS_PER_BYTE; \ @@ -25,7 +25,7 @@ "0: lr.w %0, %2\n" \ " and %1, %0, %z4\n" \ " or %1, %1, %z3\n" \ - " sc.w %1, %1, %2\n" \ + " sc.w" sc_sfx " %1, %1, %2\n" \ " bnez %1, 0b\n" \ append \ : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b)) \ @@ -46,7 +46,8 @@ : "memory"); \ }) -#define _arch_xchg(ptr, new, sfx, prepend, append) \ +#define _arch_xchg(ptr, new, sc_sfx, swap_sfx, prepend, \ + sc_append, swap_append) \ ({ \ __typeof__(ptr) __ptr = (ptr); \ __typeof__(*(__ptr)) __new = (new); \ @@ -55,15 +56,15 @@ switch (sizeof(*__ptr)) { \ case 1: \ case 2: \ - __arch_xchg_masked(prepend, append, \ + __arch_xchg_masked(sc_sfx, prepend, sc_append, \ __ret, __ptr, __new); \ break; \ case 4: \ - __arch_xchg(".w" sfx, prepend, append, \ + __arch_xchg(".w" swap_sfx, prepend, swap_append, \ __ret, __ptr, __new); \ break; \ case 8: \ - __arch_xchg(".d" sfx, prepend, append, \ + __arch_xchg(".d" swap_sfx, prepend, swap_append, \ __ret, __ptr, __new); \ break; \ default: \ @@ -73,16 +74,17 @@ }) #define arch_xchg_relaxed(ptr, x) \ - _arch_xchg(ptr, x, "", "", "") + _arch_xchg(ptr, x, "", "", "", "", "") #define arch_xchg_acquire(ptr, x) \ - _arch_xchg(ptr, x, "", "", RISCV_ACQUIRE_BARRIER) + _arch_xchg(ptr, x, "", "", "", \ + RISCV_ACQUIRE_BARRIER, RISCV_ACQUIRE_BARRIER) #define arch_xchg_release(ptr, x) \ - _arch_xchg(ptr, x, "", RISCV_RELEASE_BARRIER, "") + _arch_xchg(ptr, x, "", "", RISCV_RELEASE_BARRIER, "", "") #define arch_xchg(ptr, x) \ - _arch_xchg(ptr, x, ".aqrl", "", "") + _arch_xchg(ptr, x, ".rl", ".aqrl", "", RISCV_FULL_BARRIER, "") #define xchg32(ptr, x) \ ({ \ From bb195358806847217efba98de62b7decec3b371f Mon Sep 17 00:00:00 2001 From: Abhinav Kumar Date: Tue, 7 May 2024 16:04:40 -0700 Subject: [PATCH 0400/1578] drm/msm: remove python 3.9 dependency for compiling msm Since commit 5acf49119630 ("drm/msm: import gen_header.py script from Mesa"), compilation is broken on machines having python versions older than 3.9 due to dependency on argparse.BooleanOptionalAction. Switch to use simple bool for the validate flag to remove the dependency. Fixes: 5acf49119630 ("drm/msm: import gen_header.py script from Mesa") Signed-off-by: Abhinav Kumar Tested-by: Douglas Anderson Reviewed-by: Dmitry Baryshkov Signed-off-by: Thierry Reding Link: https://patchwork.freedesktop.org/patch/msgid/20240507230440.3384949-1-quic_abhinavk@quicinc.com --- drivers/gpu/drm/msm/registers/gen_header.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/registers/gen_header.py b/drivers/gpu/drm/msm/registers/gen_header.py index fc3bfdc991d27a..3926485bb197b0 100644 --- a/drivers/gpu/drm/msm/registers/gen_header.py +++ b/drivers/gpu/drm/msm/registers/gen_header.py @@ -538,7 +538,7 @@ def add_all_usages(self, reg, usages): self.variants.add(reg.domain) def do_validate(self, schemafile): - if self.validate == False: + if not self.validate: return try: @@ -948,7 +948,8 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument('--rnn', type=str, required=True) parser.add_argument('--xml', type=str, required=True) - parser.add_argument('--validate', action=argparse.BooleanOptionalAction) + parser.add_argument('--validate', default=False, action='store_true') + parser.add_argument('--no-validate', dest='validate', action='store_false') subparsers = parser.add_subparsers() subparsers.required = True From cb708ab9f584f159798b60853edcf0c8b67ce295 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Tue, 28 May 2024 19:29:38 -0700 Subject: [PATCH 0401/1578] selftests/futex: pass _GNU_SOURCE without a value to the compiler It's slightly better to set _GNU_SOURCE in the source code, but if one must do it via the compiler invocation, then the best way to do so is this: $(CC) -D_GNU_SOURCE= ...because otherwise, if this form is used: $(CC) -D_GNU_SOURCE ...then that leads the compiler to set a value, as if you had passed in: $(CC) -D_GNU_SOURCE=1 That, in turn, leads to warnings under both gcc and clang, like this: futex_requeue_pi.c:20: warning: "_GNU_SOURCE" redefined Fix this by using the "-D_GNU_SOURCE=" form. Reviewed-by: Edward Liaw Reviewed-by: Davidlohr Bueso Signed-off-by: John Hubbard Signed-off-by: Shuah Khan --- tools/testing/selftests/futex/functional/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile index a392d0917b4e55..994fa3468f170c 100644 --- a/tools/testing/selftests/futex/functional/Makefile +++ b/tools/testing/selftests/futex/functional/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 INCLUDES := -I../include -I../../ $(KHDR_INCLUDES) -CFLAGS := $(CFLAGS) -g -O2 -Wall -D_GNU_SOURCE -pthread $(INCLUDES) $(KHDR_INCLUDES) +CFLAGS := $(CFLAGS) -g -O2 -Wall -D_GNU_SOURCE= -pthread $(INCLUDES) $(KHDR_INCLUDES) LDLIBS := -lpthread -lrt LOCAL_HDRS := \ From 18414a4a2eabb0281d12d374c92874327e0e3fe3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 30 May 2024 13:35:50 -0600 Subject: [PATCH 0402/1578] io_uring/net: assign kmsg inq/flags before buffer selection syzbot reports that recv is using an uninitialized value: ===================================================== BUG: KMSAN: uninit-value in io_req_cqe_overflow io_uring/io_uring.c:810 [inline] BUG: KMSAN: uninit-value in io_req_complete_post io_uring/io_uring.c:937 [inline] BUG: KMSAN: uninit-value in io_issue_sqe+0x1f1b/0x22c0 io_uring/io_uring.c:1763 io_req_cqe_overflow io_uring/io_uring.c:810 [inline] io_req_complete_post io_uring/io_uring.c:937 [inline] io_issue_sqe+0x1f1b/0x22c0 io_uring/io_uring.c:1763 io_wq_submit_work+0xa17/0xeb0 io_uring/io_uring.c:1860 io_worker_handle_work+0xc04/0x2000 io_uring/io-wq.c:597 io_wq_worker+0x447/0x1410 io_uring/io-wq.c:651 ret_from_fork+0x6d/0x90 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Uninit was stored to memory at: io_req_set_res io_uring/io_uring.h:215 [inline] io_recv_finish+0xf10/0x1560 io_uring/net.c:861 io_recv+0x12ec/0x1ea0 io_uring/net.c:1175 io_issue_sqe+0x429/0x22c0 io_uring/io_uring.c:1751 io_wq_submit_work+0xa17/0xeb0 io_uring/io_uring.c:1860 io_worker_handle_work+0xc04/0x2000 io_uring/io-wq.c:597 io_wq_worker+0x447/0x1410 io_uring/io-wq.c:651 ret_from_fork+0x6d/0x90 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Uninit was created at: slab_post_alloc_hook mm/slub.c:3877 [inline] slab_alloc_node mm/slub.c:3918 [inline] __do_kmalloc_node mm/slub.c:4038 [inline] __kmalloc+0x6e4/0x1060 mm/slub.c:4052 kmalloc include/linux/slab.h:632 [inline] io_alloc_async_data+0xc0/0x220 io_uring/io_uring.c:1662 io_msg_alloc_async io_uring/net.c:166 [inline] io_recvmsg_prep_setup io_uring/net.c:725 [inline] io_recvmsg_prep+0xbe8/0x1a20 io_uring/net.c:806 io_init_req io_uring/io_uring.c:2135 [inline] io_submit_sqe io_uring/io_uring.c:2182 [inline] io_submit_sqes+0x1135/0x2f10 io_uring/io_uring.c:2335 __do_sys_io_uring_enter io_uring/io_uring.c:3246 [inline] __se_sys_io_uring_enter+0x40f/0x3c80 io_uring/io_uring.c:3183 __x64_sys_io_uring_enter+0x11f/0x1a0 io_uring/io_uring.c:3183 x64_sys_call+0x2c0/0x3b50 arch/x86/include/generated/asm/syscalls_64.h:427 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcf/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f which appears to be io_recv_finish() reading kmsg->msg.msg_inq to decide if it needs to set IORING_CQE_F_SOCK_NONEMPTY or not. If the recv is entered with buffer selection, but no buffer is available, then we jump error path which calls io_recv_finish() without having assigned kmsg->msg_inq. This might cause an errant setting of the NONEMPTY flag for a request get gets errored with -ENOBUFS. Reported-by: syzbot+b1647099e82b3b349fbf@syzkaller.appspotmail.com Fixes: 4a3223f7bfda ("io_uring/net: switch io_recv() to using io_async_msghdr") Signed-off-by: Jens Axboe --- io_uring/net.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 0a48596429d9f5..7c98c4d5094633 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1127,6 +1127,9 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) flags |= MSG_DONTWAIT; retry_multishot: + kmsg->msg.msg_inq = -1; + kmsg->msg.msg_flags = 0; + if (io_do_buffer_select(req)) { ret = io_recv_buf_select(req, kmsg, &len, issue_flags); if (unlikely(ret)) @@ -1134,9 +1137,6 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) sr->buf = NULL; } - kmsg->msg.msg_inq = -1; - kmsg->msg.msg_flags = 0; - if (flags & MSG_WAITALL) min_ret = iov_iter_count(&kmsg->msg.msg_iter); From 296f4ce81d08e73c22408c49f4938a85bd075e5c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 29 May 2024 09:11:36 +1000 Subject: [PATCH 0403/1578] NFS: abort nfs_atomic_open_v23 if name is too long. An attempt to open a file with a name longer than NFS3_MAXNAMLEN will trigger a WARN_ON_ONCE in encode_filename3() because nfs_atomic_open_v23() doesn't have the test on ->d_name.len that nfs_atomic_open() has. So add that test. Reported-by: James Clark Closes: https://lore.kernel.org/all/20240528105249.69200-1-james.clark@arm.com/ Fixes: 7c6c5249f061 ("NFS: add atomic_open for NFSv3 to handle O_TRUNC correctly.") Signed-off-by: NeilBrown Reviewed-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 788077a4feb9a6..2b68a14982c8f4 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2254,6 +2254,9 @@ int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry, */ int error = 0; + if (dentry->d_name.len > NFS_SERVER(dir)->namelen) + return -ENAMETOOLONG; + if (open_flags & O_CREAT) { file->f_mode |= FMODE_CREATED; error = nfs_do_create(dir, dentry, mode, open_flags); From 28568c906c1bb5f7560e18082ed7d6295860f1c2 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Wed, 29 May 2024 15:44:35 -0400 Subject: [PATCH 0404/1578] NFSv4.1 enforce rootpath check in fs_location query In commit 4ca9f31a2be66 ("NFSv4.1 test and add 4.1 trunking transport"), we introduce the ability to query the NFS server for possible trunking locations of the existing filesystem. However, we never checked the returned file system path for these alternative locations. According to the RFC, the server can say that the filesystem currently known under "fs_root" of fs_location also resides under these server locations under the following "rootpath" pathname. The client cannot handle trunking a filesystem that reside under different location under different paths other than what the main path is. This patch enforces the check that fs_root path and rootpath path in fs_location reply is the same. Fixes: 4ca9f31a2be6 ("NFSv4.1 test and add 4.1 trunking transport") Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 94c07875aa3f42..a691fa10b3e95f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4023,6 +4023,23 @@ static void test_fs_location_for_trunking(struct nfs4_fs_location *location, } } +static bool _is_same_nfs4_pathname(struct nfs4_pathname *path1, + struct nfs4_pathname *path2) +{ + int i; + + if (path1->ncomponents != path2->ncomponents) + return false; + for (i = 0; i < path1->ncomponents; i++) { + if (path1->components[i].len != path2->components[i].len) + return false; + if (memcmp(path1->components[i].data, path2->components[i].data, + path1->components[i].len)) + return false; + } + return true; +} + static int _nfs4_discover_trunking(struct nfs_server *server, struct nfs_fh *fhandle) { @@ -4056,9 +4073,13 @@ static int _nfs4_discover_trunking(struct nfs_server *server, if (status) goto out_free_3; - for (i = 0; i < locations->nlocations; i++) + for (i = 0; i < locations->nlocations; i++) { + if (!_is_same_nfs4_pathname(&locations->fs_path, + &locations->locations[i].rootpath)) + continue; test_fs_location_for_trunking(&locations->locations[i], clp, server); + } out_free_3: kfree(locations->fattr); out_free_2: From 33c94d7e3cb84f6d130678d6d59ba475a6c489cf Mon Sep 17 00:00:00 2001 From: Chen Hanxiao Date: Thu, 23 May 2024 16:47:16 +0800 Subject: [PATCH 0405/1578] SUNRPC: return proper error from gss_wrap_req_priv don't return 0 if snd_buf->len really greater than snd_buf->buflen Signed-off-by: Chen Hanxiao Fixes: 0c77668ddb4e ("SUNRPC: Introduce trace points in rpc_auth_gss.ko") Reviewed-by: Benjamin Coddington Reviewed-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index c7af0220f82f42..369310909fc985 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1875,8 +1875,10 @@ gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; maj_stat = gss_wrap(ctx->gc_gss_ctx, offset, snd_buf, inpages); /* slack space should prevent this ever happening: */ - if (unlikely(snd_buf->len > snd_buf->buflen)) + if (unlikely(snd_buf->len > snd_buf->buflen)) { + status = -EIO; goto wrap_failed; + } /* We're assuming that when GSS_S_CONTEXT_EXPIRED, the encryption was * done anyway, so it's safe to put the request on the wire: */ if (maj_stat == GSS_S_CONTEXT_EXPIRED) From 99bc9f2eb3f79a2b4296d9bf43153e1d10ca50d3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 28 May 2024 13:27:17 +1000 Subject: [PATCH 0406/1578] NFS: add barriers when testing for NFS_FSDATA_BLOCKED dentry->d_fsdata is set to NFS_FSDATA_BLOCKED while unlinking or renaming-over a file to ensure that no open succeeds while the NFS operation progressed on the server. Setting dentry->d_fsdata to NFS_FSDATA_BLOCKED is done under ->d_lock after checking the refcount is not elevated. Any attempt to open the file (through that name) will go through lookp_open() which will take ->d_lock while incrementing the refcount, we can be sure that once the new value is set, __nfs_lookup_revalidate() *will* see the new value and will block. We don't have any locking guarantee that when we set ->d_fsdata to NULL, the wait_var_event() in __nfs_lookup_revalidate() will notice. wait/wake primitives do NOT provide barriers to guarantee order. We must use smp_load_acquire() in wait_var_event() to ensure we look at an up-to-date value, and must use smp_store_release() before wake_up_var(). This patch adds those barrier functions and factors out block_revalidate() and unblock_revalidate() far clarity. There is also a hypothetical bug in that if memory allocation fails (which never happens in practice) we might leave ->d_fsdata locked. This patch adds the missing call to unblock_revalidate(). Reported-and-tested-by: Richard Kojedzinszky Closes: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1071501 Fixes: 3c59366c207e ("NFS: don't unhash dentry during unlink/rename") Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 2b68a14982c8f4..07a7be27182e50 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1803,9 +1803,10 @@ __nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags, if (parent != READ_ONCE(dentry->d_parent)) return -ECHILD; } else { - /* Wait for unlink to complete */ + /* Wait for unlink to complete - see unblock_revalidate() */ wait_var_event(&dentry->d_fsdata, - dentry->d_fsdata != NFS_FSDATA_BLOCKED); + smp_load_acquire(&dentry->d_fsdata) + != NFS_FSDATA_BLOCKED); parent = dget_parent(dentry); ret = reval(d_inode(parent), dentry, flags); dput(parent); @@ -1818,6 +1819,29 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) return __nfs_lookup_revalidate(dentry, flags, nfs_do_lookup_revalidate); } +static void block_revalidate(struct dentry *dentry) +{ + /* old devname - just in case */ + kfree(dentry->d_fsdata); + + /* Any new reference that could lead to an open + * will take ->d_lock in lookup_open() -> d_lookup(). + * Holding this lock ensures we cannot race with + * __nfs_lookup_revalidate() and removes and need + * for further barriers. + */ + lockdep_assert_held(&dentry->d_lock); + + dentry->d_fsdata = NFS_FSDATA_BLOCKED; +} + +static void unblock_revalidate(struct dentry *dentry) +{ + /* store_release ensures wait_var_event() sees the update */ + smp_store_release(&dentry->d_fsdata, NULL); + wake_up_var(&dentry->d_fsdata); +} + /* * A weaker form of d_revalidate for revalidating just the d_inode(dentry) * when we don't really care about the dentry name. This is called when a @@ -2551,15 +2575,12 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry) spin_unlock(&dentry->d_lock); goto out; } - /* old devname */ - kfree(dentry->d_fsdata); - dentry->d_fsdata = NFS_FSDATA_BLOCKED; + block_revalidate(dentry); spin_unlock(&dentry->d_lock); error = nfs_safe_remove(dentry); nfs_dentry_remove_handle_error(dir, dentry, error); - dentry->d_fsdata = NULL; - wake_up_var(&dentry->d_fsdata); + unblock_revalidate(dentry); out: trace_nfs_unlink_exit(dir, dentry, error); return error; @@ -2666,8 +2687,7 @@ nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data) { struct dentry *new_dentry = data->new_dentry; - new_dentry->d_fsdata = NULL; - wake_up_var(&new_dentry->d_fsdata); + unblock_revalidate(new_dentry); } /* @@ -2729,11 +2749,6 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (WARN_ON(new_dentry->d_flags & DCACHE_NFSFS_RENAMED) || WARN_ON(new_dentry->d_fsdata == NFS_FSDATA_BLOCKED)) goto out; - if (new_dentry->d_fsdata) { - /* old devname */ - kfree(new_dentry->d_fsdata); - new_dentry->d_fsdata = NULL; - } spin_lock(&new_dentry->d_lock); if (d_count(new_dentry) > 2) { @@ -2755,7 +2770,7 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, new_dentry = dentry; new_inode = NULL; } else { - new_dentry->d_fsdata = NFS_FSDATA_BLOCKED; + block_revalidate(new_dentry); must_unblock = true; spin_unlock(&new_dentry->d_lock); } @@ -2767,6 +2782,8 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, must_unblock ? nfs_unblock_rename : NULL); if (IS_ERR(task)) { + if (must_unblock) + unblock_revalidate(new_dentry); error = PTR_ERR(task); goto out; } From b164316808ec5de391c3e7b0148ec937d32d280d Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 30 May 2024 14:40:32 +0900 Subject: [PATCH 0407/1578] null_blk: Do not allow runt zone with zone capacity smaller then zone size A zoned device with a smaller last zone together with a zone capacity smaller than the zone size does make any sense as that does not correspond to any possible setup for a real device: 1) For ZNS and zoned UFS devices, all zones are always the same size. 2) For SMR HDDs, all zones always have the same capacity. In other words, if we have a smaller last runt zone, then this zone capacity should always be equal to the zone size. Add a check in null_init_zoned_dev() to prevent a configuration to have both a smaller zone size and a zone capacity smaller than the zone size. Signed-off-by: Damien Le Moal Reviewed-by: Niklas Cassel Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20240530054035.491497-2-dlemoal@kernel.org Signed-off-by: Jens Axboe --- drivers/block/null_blk/zoned.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index 79c8e5e99f7ff1..f118d304f31080 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -74,6 +74,17 @@ int null_init_zoned_dev(struct nullb_device *dev, return -EINVAL; } + /* + * If a smaller zone capacity was requested, do not allow a smaller last + * zone at the same time as such zone configuration does not correspond + * to any real zoned device. + */ + if (dev->zone_capacity != dev->zone_size && + dev->size & (dev->zone_size - 1)) { + pr_err("A smaller last zone is not allowed with zone capacity smaller than zone size.\n"); + return -EINVAL; + } + zone_capacity_sects = mb_to_sects(dev->zone_capacity); dev_capacity_sects = mb_to_sects(dev->size); dev->zone_size_sects = mb_to_sects(dev->zone_size); From cd6399936869b4a042dd1270078cbf2bb871a407 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 30 May 2024 14:40:33 +0900 Subject: [PATCH 0408/1578] block: Fix validation of zoned device with a runt zone Commit ecfe43b11b02 ("block: Remember zone capacity when revalidating zones") introduced checks to ensure that the capacity of the zones of a zoned device is constant for all zones. However, this check ignores the possibility that a zoned device has a smaller last zone with a size not equal to the capacity of other zones. Such device correspond in practice to an SMR drive with a smaller last zone and all zones with a capacity equal to the zone size, leading to the last zone capacity being different than the capacity of other zones. Correctly handle such device by fixing the check for the constant zone capacity in blk_revalidate_seq_zone() using the new helper function disk_zone_is_last(). This helper function is also used in blk_revalidate_zone_cb() when checking the zone size. Fixes: ecfe43b11b02 ("block: Remember zone capacity when revalidating zones") Signed-off-by: Damien Le Moal Reviewed-by: Bart Van Assche Reviewed-by: Niklas Cassel Link: https://lore.kernel.org/r/20240530054035.491497-3-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 03aa4eead39e7b..402a50a1ac4d5b 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -450,6 +450,11 @@ static inline bool disk_zone_is_conv(struct gendisk *disk, sector_t sector) return test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap); } +static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone) +{ + return zone->start + zone->len >= get_capacity(disk); +} + static bool disk_insert_zone_wplug(struct gendisk *disk, struct blk_zone_wplug *zwplug) { @@ -1693,11 +1698,13 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, /* * Remember the capacity of the first sequential zone and check - * if it is constant for all zones. + * if it is constant for all zones, ignoring the last zone as it can be + * smaller. */ if (!args->zone_capacity) args->zone_capacity = zone->capacity; - if (zone->capacity != args->zone_capacity) { + if (!disk_zone_is_last(disk, zone) && + zone->capacity != args->zone_capacity) { pr_warn("%s: Invalid variable zone capacity\n", disk->disk_name); return -ENODEV; @@ -1732,7 +1739,6 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, { struct blk_revalidate_zone_args *args = data; struct gendisk *disk = args->disk; - sector_t capacity = get_capacity(disk); sector_t zone_sectors = disk->queue->limits.chunk_sectors; int ret; @@ -1743,7 +1749,7 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, return -ENODEV; } - if (zone->start >= capacity || !zone->len) { + if (zone->start >= get_capacity(disk) || !zone->len) { pr_warn("%s: Invalid zone start %llu, length %llu\n", disk->disk_name, zone->start, zone->len); return -ENODEV; @@ -1753,7 +1759,7 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, * All zones must have the same size, with the exception on an eventual * smaller last zone. */ - if (zone->start + zone->len < capacity) { + if (!disk_zone_is_last(disk, zone)) { if (zone->len != zone_sectors) { pr_warn("%s: Invalid zoned device with non constant zone size\n", disk->disk_name); From 29459c3eaa5c6261fbe0dea7bdeb9b48d35d862a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 30 May 2024 14:40:34 +0900 Subject: [PATCH 0409/1578] block: Fix zone write plugging handling of devices with a runt zone A zoned device may have a last sequential write required zone that is smaller than other zones. However, all tests to check if a zone write plug write offset exceeds the zone capacity use the same capacity value stored in the gendisk zone_capacity field. This is incorrect for a zoned device with a last runt (smaller) zone. Add the new field last_zone_capacity to struct gendisk to store the capacity of the last zone of the device. blk_revalidate_seq_zone() and blk_revalidate_conv_zone() are both modified to get this value when disk_zone_is_last() returns true. Similarly to zone_capacity, the value is first stored using the last_zone_capacity field of struct blk_revalidate_zone_args. Once zone revalidation of all zones is done, this is used to set the gendisk last_zone_capacity field. The checks to determine if a zone is full or if a sector offset in a zone exceeds the zone capacity in disk_should_remove_zone_wplug(), disk_zone_wplug_abort_unaligned(), blk_zone_write_plug_init_request(), and blk_zone_wplug_prepare_bio() are modified to use the new helper functions disk_zone_is_full() and disk_zone_wplug_is_full(). disk_zone_is_full() uses the zone index to determine if the zone being tested is the last one of the disk and uses the either the disk zone_capacity or last_zone_capacity accordingly. Fixes: dd291d77cc90 ("block: Introduce zone write plugging") Signed-off-by: Damien Le Moal Reviewed-by: Bart Van Assche Reviewed-by: Niklas Cassel Link: https://lore.kernel.org/r/20240530054035.491497-4-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 35 +++++++++++++++++++++++++++-------- include/linux/blkdev.h | 1 + 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 402a50a1ac4d5b..52abebf56027a8 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -455,6 +455,20 @@ static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone) return zone->start + zone->len >= get_capacity(disk); } +static bool disk_zone_is_full(struct gendisk *disk, + unsigned int zno, unsigned int offset_in_zone) +{ + if (zno < disk->nr_zones - 1) + return offset_in_zone >= disk->zone_capacity; + return offset_in_zone >= disk->last_zone_capacity; +} + +static bool disk_zone_wplug_is_full(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + return disk_zone_is_full(disk, zwplug->zone_no, zwplug->wp_offset); +} + static bool disk_insert_zone_wplug(struct gendisk *disk, struct blk_zone_wplug *zwplug) { @@ -548,7 +562,7 @@ static inline bool disk_should_remove_zone_wplug(struct gendisk *disk, return false; /* We can remove zone write plugs for zones that are empty or full. */ - return !zwplug->wp_offset || zwplug->wp_offset >= disk->zone_capacity; + return !zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug); } static void disk_remove_zone_wplug(struct gendisk *disk, @@ -669,13 +683,12 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) static void disk_zone_wplug_abort_unaligned(struct gendisk *disk, struct blk_zone_wplug *zwplug) { - unsigned int zone_capacity = disk->zone_capacity; unsigned int wp_offset = zwplug->wp_offset; struct bio_list bl = BIO_EMPTY_LIST; struct bio *bio; while ((bio = bio_list_pop(&zwplug->bio_list))) { - if (wp_offset >= zone_capacity || + if (disk_zone_is_full(disk, zwplug->zone_no, wp_offset) || (bio_op(bio) != REQ_OP_ZONE_APPEND && bio_offset_from_zone_start(bio) != wp_offset)) { blk_zone_wplug_bio_io_error(zwplug, bio); @@ -914,7 +927,6 @@ void blk_zone_write_plug_init_request(struct request *req) sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req); struct request_queue *q = req->q; struct gendisk *disk = q->disk; - unsigned int zone_capacity = disk->zone_capacity; struct blk_zone_wplug *zwplug = disk_get_zone_wplug(disk, blk_rq_pos(req)); unsigned long flags; @@ -938,7 +950,7 @@ void blk_zone_write_plug_init_request(struct request *req) * into the back of the request. */ spin_lock_irqsave(&zwplug->lock, flags); - while (zwplug->wp_offset < zone_capacity) { + while (!disk_zone_wplug_is_full(disk, zwplug)) { bio = bio_list_peek(&zwplug->bio_list); if (!bio) break; @@ -984,7 +996,7 @@ static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug, * We know such BIO will fail, and that would potentially overflow our * write pointer offset beyond the end of the zone. */ - if (zwplug->wp_offset >= disk->zone_capacity) + if (disk_zone_wplug_is_full(disk, zwplug)) goto err; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { @@ -1561,6 +1573,7 @@ void disk_free_zone_resources(struct gendisk *disk) kfree(disk->conv_zones_bitmap); disk->conv_zones_bitmap = NULL; disk->zone_capacity = 0; + disk->last_zone_capacity = 0; disk->nr_zones = 0; } @@ -1605,6 +1618,7 @@ struct blk_revalidate_zone_args { unsigned long *conv_zones_bitmap; unsigned int nr_zones; unsigned int zone_capacity; + unsigned int last_zone_capacity; sector_t sector; }; @@ -1622,6 +1636,7 @@ static int disk_update_zone_resources(struct gendisk *disk, disk->nr_zones = args->nr_zones; disk->zone_capacity = args->zone_capacity; + disk->last_zone_capacity = args->last_zone_capacity; swap(disk->conv_zones_bitmap, args->conv_zones_bitmap); if (disk->conv_zones_bitmap) nr_conv_zones = bitmap_weight(disk->conv_zones_bitmap, @@ -1673,6 +1688,9 @@ static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, return -ENODEV; } + if (disk_zone_is_last(disk, zone)) + args->last_zone_capacity = zone->capacity; + if (!disk_need_zone_resources(disk)) return 0; @@ -1703,8 +1721,9 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, */ if (!args->zone_capacity) args->zone_capacity = zone->capacity; - if (!disk_zone_is_last(disk, zone) && - zone->capacity != args->zone_capacity) { + if (disk_zone_is_last(disk, zone)) { + args->last_zone_capacity = zone->capacity; + } else if (zone->capacity != args->zone_capacity) { pr_warn("%s: Invalid variable zone capacity\n", disk->disk_name); return -ENODEV; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index aefdda9f4ec711..24c36929920b76 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -186,6 +186,7 @@ struct gendisk { */ unsigned int nr_zones; unsigned int zone_capacity; + unsigned int last_zone_capacity; unsigned long *conv_zones_bitmap; unsigned int zone_wplugs_hash_bits; spinlock_t zone_wplugs_lock; From d53b681ce9ca7db5ef4ecb8d2cf465ae4a031264 Mon Sep 17 00:00:00 2001 From: Chanwoo Lee Date: Fri, 24 May 2024 10:59:04 +0900 Subject: [PATCH 0410/1578] scsi: ufs: mcq: Fix error output and clean up ufshcd_mcq_abort() An error unrelated to ufshcd_try_to_abort_task is being logged and can cause confusion. Modify ufshcd_mcq_abort() to print the result of the abort failure. For readability, return immediately instead of 'goto'. Fixes: f1304d442077 ("scsi: ufs: mcq: Added ufshcd_mcq_abort()") Signed-off-by: Chanwoo Lee Link: https://lore.kernel.org/r/20240524015904.1116005-1-cw9316.lee@samsung.com Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufs-mcq.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/ufs/core/ufs-mcq.c b/drivers/ufs/core/ufs-mcq.c index 005d63ab1f441c..8944548c30fa15 100644 --- a/drivers/ufs/core/ufs-mcq.c +++ b/drivers/ufs/core/ufs-mcq.c @@ -634,20 +634,20 @@ int ufshcd_mcq_abort(struct scsi_cmnd *cmd) struct ufshcd_lrb *lrbp = &hba->lrb[tag]; struct ufs_hw_queue *hwq; unsigned long flags; - int err = FAILED; + int err; if (!ufshcd_cmd_inflight(lrbp->cmd)) { dev_err(hba->dev, "%s: skip abort. cmd at tag %d already completed.\n", __func__, tag); - goto out; + return FAILED; } /* Skip task abort in case previous aborts failed and report failure */ if (lrbp->req_abort_skip) { dev_err(hba->dev, "%s: skip abort. tag %d failed earlier\n", __func__, tag); - goto out; + return FAILED; } hwq = ufshcd_mcq_req_to_hwq(hba, scsi_cmd_to_rq(cmd)); @@ -659,7 +659,7 @@ int ufshcd_mcq_abort(struct scsi_cmnd *cmd) */ dev_err(hba->dev, "%s: cmd found in sq. hwq=%d, tag=%d\n", __func__, hwq->id, tag); - goto out; + return FAILED; } /* @@ -667,18 +667,17 @@ int ufshcd_mcq_abort(struct scsi_cmnd *cmd) * in the completion queue either. Query the device to see if * the command is being processed in the device. */ - if (ufshcd_try_to_abort_task(hba, tag)) { + err = ufshcd_try_to_abort_task(hba, tag); + if (err) { dev_err(hba->dev, "%s: device abort failed %d\n", __func__, err); lrbp->req_abort_skip = true; - goto out; + return FAILED; } - err = SUCCESS; spin_lock_irqsave(&hwq->cq_lock, flags); if (ufshcd_cmd_inflight(lrbp->cmd)) ufshcd_release_scsi_cmd(hba, lrbp); spin_unlock_irqrestore(&hwq->cq_lock, flags); -out: - return err; + return SUCCESS; } From 935df1bd40d43c4ee91838c42a20e9af751885cc Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 29 May 2024 14:59:20 -0500 Subject: [PATCH 0411/1578] of/irq: Factor out parsing of interrupt-map parent phandle+args from of_irq_parse_raw() Factor out the parsing of interrupt-map interrupt parent phandle and its arg cells to a separate function, of_irq_parse_imap_parent(), so that it can be used in other parsing scenarios (e.g. fw_devlink). There was a refcount leak on non-matching entries when iterating thru "interrupt-map" which is fixed. Tested-by: Marc Zyngier Tested-by: Anup Patel Link: https://lore.kernel.org/r/20240529-dt-interrupt-map-fix-v2-1-ef86dc5bcd2a@kernel.org Signed-off-by: Rob Herring (Arm) --- drivers/of/irq.c | 125 ++++++++++++++++++++++++---------------- drivers/of/of_private.h | 3 + 2 files changed, 77 insertions(+), 51 deletions(-) diff --git a/drivers/of/irq.c b/drivers/of/irq.c index 174900072c18cd..462375b293e475 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -25,6 +25,8 @@ #include #include +#include "of_private.h" + /** * irq_of_parse_and_map - Parse and map an interrupt into linux virq space * @dev: Device node of the device whose interrupt is to be mapped @@ -96,6 +98,57 @@ static const char * const of_irq_imap_abusers[] = { NULL, }; +const __be32 *of_irq_parse_imap_parent(const __be32 *imap, int len, struct of_phandle_args *out_irq) +{ + u32 intsize, addrsize; + struct device_node *np; + + /* Get the interrupt parent */ + if (of_irq_workarounds & OF_IMAP_NO_PHANDLE) + np = of_node_get(of_irq_dflt_pic); + else + np = of_find_node_by_phandle(be32_to_cpup(imap)); + imap++; + + /* Check if not found */ + if (!np) { + pr_debug(" -> imap parent not found !\n"); + return NULL; + } + + /* Get #interrupt-cells and #address-cells of new parent */ + if (of_property_read_u32(np, "#interrupt-cells", + &intsize)) { + pr_debug(" -> parent lacks #interrupt-cells!\n"); + of_node_put(np); + return NULL; + } + if (of_property_read_u32(np, "#address-cells", + &addrsize)) + addrsize = 0; + + pr_debug(" -> intsize=%d, addrsize=%d\n", + intsize, addrsize); + + /* Check for malformed properties */ + if (WARN_ON(addrsize + intsize > MAX_PHANDLE_ARGS) + || (len < (addrsize + intsize))) { + of_node_put(np); + return NULL; + } + + pr_debug(" -> imaplen=%d\n", len); + + imap += addrsize + intsize; + + out_irq->np = np; + for (int i = 0; i < intsize; i++) + out_irq->args[i] = be32_to_cpup(imap - intsize + i); + out_irq->args_count = intsize; + + return imap; +} + /** * of_irq_parse_raw - Low level interrupt tree parsing * @addr: address specifier (start of "reg" property of the device) in be32 format @@ -112,12 +165,12 @@ static const char * const of_irq_imap_abusers[] = { */ int of_irq_parse_raw(const __be32 *addr, struct of_phandle_args *out_irq) { - struct device_node *ipar, *tnode, *old = NULL, *newpar = NULL; + struct device_node *ipar, *tnode, *old = NULL; __be32 initial_match_array[MAX_PHANDLE_ARGS]; const __be32 *match_array = initial_match_array; - const __be32 *tmp, *imap, *imask, dummy_imask[] = { [0 ... MAX_PHANDLE_ARGS] = cpu_to_be32(~0) }; - u32 intsize = 1, addrsize, newintsize = 0, newaddrsize = 0; - int imaplen, match, i, rc = -EINVAL; + const __be32 *tmp, dummy_imask[] = { [0 ... MAX_PHANDLE_ARGS] = cpu_to_be32(~0) }; + u32 intsize = 1, addrsize; + int i, rc = -EINVAL; #ifdef DEBUG of_print_phandle_args("of_irq_parse_raw: ", out_irq); @@ -176,6 +229,9 @@ int of_irq_parse_raw(const __be32 *addr, struct of_phandle_args *out_irq) /* Now start the actual "proper" walk of the interrupt tree */ while (ipar != NULL) { + int imaplen, match; + const __be32 *imap, *oldimap, *imask; + struct device_node *newpar; /* * Now check if cursor is an interrupt-controller and * if it is then we are done, unless there is an @@ -216,7 +272,7 @@ int of_irq_parse_raw(const __be32 *addr, struct of_phandle_args *out_irq) /* Parse interrupt-map */ match = 0; - while (imaplen > (addrsize + intsize + 1) && !match) { + while (imaplen > (addrsize + intsize + 1)) { /* Compare specifiers */ match = 1; for (i = 0; i < (addrsize + intsize); i++, imaplen--) @@ -224,48 +280,17 @@ int of_irq_parse_raw(const __be32 *addr, struct of_phandle_args *out_irq) pr_debug(" -> match=%d (imaplen=%d)\n", match, imaplen); - /* Get the interrupt parent */ - if (of_irq_workarounds & OF_IMAP_NO_PHANDLE) - newpar = of_node_get(of_irq_dflt_pic); - else - newpar = of_find_node_by_phandle(be32_to_cpup(imap)); - imap++; - --imaplen; - - /* Check if not found */ - if (newpar == NULL) { - pr_debug(" -> imap parent not found !\n"); - goto fail; - } - - if (!of_device_is_available(newpar)) - match = 0; - - /* Get #interrupt-cells and #address-cells of new - * parent - */ - if (of_property_read_u32(newpar, "#interrupt-cells", - &newintsize)) { - pr_debug(" -> parent lacks #interrupt-cells!\n"); - goto fail; - } - if (of_property_read_u32(newpar, "#address-cells", - &newaddrsize)) - newaddrsize = 0; - - pr_debug(" -> newintsize=%d, newaddrsize=%d\n", - newintsize, newaddrsize); - - /* Check for malformed properties */ - if (WARN_ON(newaddrsize + newintsize > MAX_PHANDLE_ARGS) - || (imaplen < (newaddrsize + newintsize))) { - rc = -EFAULT; + oldimap = imap; + imap = of_irq_parse_imap_parent(oldimap, imaplen, out_irq); + if (!imap) goto fail; - } - imap += newaddrsize + newintsize; - imaplen -= newaddrsize + newintsize; + match &= of_device_is_available(out_irq->np); + if (match) + break; + of_node_put(out_irq->np); + imaplen -= imap - oldimap; pr_debug(" -> imaplen=%d\n", imaplen); } if (!match) { @@ -287,11 +312,11 @@ int of_irq_parse_raw(const __be32 *addr, struct of_phandle_args *out_irq) * Successfully parsed an interrupt-map translation; copy new * interrupt specifier into the out_irq structure */ - match_array = imap - newaddrsize - newintsize; - for (i = 0; i < newintsize; i++) - out_irq->args[i] = be32_to_cpup(imap - newintsize + i); - out_irq->args_count = intsize = newintsize; - addrsize = newaddrsize; + match_array = oldimap + 1; + + newpar = out_irq->np; + intsize = out_irq->args_count; + addrsize = (imap - match_array) - intsize; if (ipar == newpar) { pr_debug("%pOF interrupt-map entry to self\n", ipar); @@ -300,7 +325,6 @@ int of_irq_parse_raw(const __be32 *addr, struct of_phandle_args *out_irq) skiplevel: /* Iterate again with new parent */ - out_irq->np = newpar; pr_debug(" -> new parent: %pOF\n", newpar); of_node_put(ipar); ipar = newpar; @@ -310,7 +334,6 @@ int of_irq_parse_raw(const __be32 *addr, struct of_phandle_args *out_irq) fail: of_node_put(ipar); - of_node_put(newpar); return rc; } diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h index 94fc0aa07af9e3..04aa2a91f851ac 100644 --- a/drivers/of/of_private.h +++ b/drivers/of/of_private.h @@ -159,6 +159,9 @@ extern void __of_sysfs_remove_bin_file(struct device_node *np, extern int of_bus_n_addr_cells(struct device_node *np); extern int of_bus_n_size_cells(struct device_node *np); +const __be32 *of_irq_parse_imap_parent(const __be32 *imap, int len, + struct of_phandle_args *out_irq); + struct bus_dma_region; #if defined(CONFIG_OF_ADDRESS) && defined(CONFIG_HAS_DMA) int of_dma_get_range(struct device_node *np, From e7985f43609c782132f8f5794ee6cc4cdb66ca75 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 29 May 2024 14:59:21 -0500 Subject: [PATCH 0412/1578] of: property: Fix fw_devlink handling of interrupt-map Commit d976c6f4b32c ("of: property: Add fw_devlink support for interrupt-map property") tried to do what it says on the tin, but failed on a couple of points: - it confuses bytes and cells. Not a huge deal, except when it comes to pointer arithmetic - it doesn't really handle anything but interrupt-maps that have their parent #address-cells set to 0 The combinations of the two leads to some serious fun on my M1 box, with plenty of WARN-ON() firing all over the shop, and amusing values being generated for interrupt specifiers. Having 2 versions of parsing code for "interrupt-map" was a bad idea. Now that the common parsing parts have been refactored into of_irq_parse_imap_parent(), rework the code here to use it instead and fix the pointer arithmetic. Note that the dependency will be a bit different than the original code when the interrupt-map points to another interrupt-map. In this case, the original code would resolve to the final interrupt controller. Now the dependency is the parent interrupt-map (which itself should have a dependency to the parent). It is possible that a node with an interrupt-map has no driver. Fixes: d976c6f4b32c ("of: property: Add fw_devlink support for interrupt-map property") Signed-off-by: Marc Zyngier Co-developed-by: Rob Herring (Arm) Cc: Saravana Kannan Tested-by: Marc Zyngier Tested-by: Anup Patel Link: https://lore.kernel.org/r/20240529-dt-interrupt-map-fix-v2-2-ef86dc5bcd2a@kernel.org Signed-off-by: Rob Herring (Arm) --- drivers/of/property.c | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/drivers/of/property.c b/drivers/of/property.c index 1c83e68f805baa..164d77cb944585 100644 --- a/drivers/of/property.c +++ b/drivers/of/property.c @@ -1306,10 +1306,10 @@ static struct device_node *parse_interrupts(struct device_node *np, static struct device_node *parse_interrupt_map(struct device_node *np, const char *prop_name, int index) { - const __be32 *imap, *imap_end, *addr; + const __be32 *imap, *imap_end; struct of_phandle_args sup_args; u32 addrcells, intcells; - int i, imaplen; + int imaplen; if (!IS_ENABLED(CONFIG_OF_IRQ)) return NULL; @@ -1322,33 +1322,23 @@ static struct device_node *parse_interrupt_map(struct device_node *np, addrcells = of_bus_n_addr_cells(np); imap = of_get_property(np, "interrupt-map", &imaplen); - if (!imap || imaplen <= (addrcells + intcells)) + imaplen /= sizeof(*imap); + if (!imap) return NULL; - imap_end = imap + imaplen; - while (imap < imap_end) { - addr = imap; - imap += addrcells; + imap_end = imap + imaplen; - sup_args.np = np; - sup_args.args_count = intcells; - for (i = 0; i < intcells; i++) - sup_args.args[i] = be32_to_cpu(imap[i]); - imap += intcells; + for (int i = 0; imap + addrcells + intcells + 1 < imap_end; i++) { + imap += addrcells + intcells; - /* - * Upon success, the function of_irq_parse_raw() returns - * interrupt controller DT node pointer in sup_args.np. - */ - if (of_irq_parse_raw(addr, &sup_args)) + imap = of_irq_parse_imap_parent(imap, imap_end - imap, &sup_args); + if (!imap) return NULL; - if (!index) + if (i == index) return sup_args.np; of_node_put(sup_args.np); - imap += sup_args.args_count + 1; - index--; } return NULL; From 0a751df4566c86e5a24f2a03290dad3d0f215692 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 30 May 2024 09:45:47 -0400 Subject: [PATCH 0413/1578] blk-throttle: Fix incorrect display of io.max Commit bf20ab538c81 ("blk-throttle: remove CONFIG_BLK_DEV_THROTTLING_LOW") attempts to revert the code change introduced by commit cd5ab1b0fcb4 ("blk-throttle: add .low interface"). However, it leaves behind the bps_conf[] and iops_conf[] fields in the throtl_grp structure which aren't set anywhere in the new blk-throttle.c code but are still being used by tg_prfill_limit() to display the limits in io.max. Now io.max always displays the following values if a block queue is used: : rbps=0 wbps=0 riops=0 wiops=0 Fix this problem by removing bps_conf[] and iops_conf[] and use bps[] and iops[] instead to complete the revert. Fixes: bf20ab538c81 ("blk-throttle: remove CONFIG_BLK_DEV_THROTTLING_LOW") Reported-by: Justin Forbes Closes: https://github.com/containers/podman/issues/22701#issuecomment-2120627789 Signed-off-by: Waiman Long Acked-by: Tejun Heo Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20240530134547.970075-1-longman@redhat.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 24 ++++++++++++------------ block/blk-throttle.h | 8 ++------ 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 0be180f9a78990..c1bf73f8c75d95 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1399,32 +1399,32 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, bps_dft = U64_MAX; iops_dft = UINT_MAX; - if (tg->bps_conf[READ] == bps_dft && - tg->bps_conf[WRITE] == bps_dft && - tg->iops_conf[READ] == iops_dft && - tg->iops_conf[WRITE] == iops_dft) + if (tg->bps[READ] == bps_dft && + tg->bps[WRITE] == bps_dft && + tg->iops[READ] == iops_dft && + tg->iops[WRITE] == iops_dft) return 0; seq_printf(sf, "%s", dname); - if (tg->bps_conf[READ] == U64_MAX) + if (tg->bps[READ] == U64_MAX) seq_printf(sf, " rbps=max"); else - seq_printf(sf, " rbps=%llu", tg->bps_conf[READ]); + seq_printf(sf, " rbps=%llu", tg->bps[READ]); - if (tg->bps_conf[WRITE] == U64_MAX) + if (tg->bps[WRITE] == U64_MAX) seq_printf(sf, " wbps=max"); else - seq_printf(sf, " wbps=%llu", tg->bps_conf[WRITE]); + seq_printf(sf, " wbps=%llu", tg->bps[WRITE]); - if (tg->iops_conf[READ] == UINT_MAX) + if (tg->iops[READ] == UINT_MAX) seq_printf(sf, " riops=max"); else - seq_printf(sf, " riops=%u", tg->iops_conf[READ]); + seq_printf(sf, " riops=%u", tg->iops[READ]); - if (tg->iops_conf[WRITE] == UINT_MAX) + if (tg->iops[WRITE] == UINT_MAX) seq_printf(sf, " wiops=max"); else - seq_printf(sf, " wiops=%u", tg->iops_conf[WRITE]); + seq_printf(sf, " wiops=%u", tg->iops[WRITE]); seq_printf(sf, "\n"); return 0; diff --git a/block/blk-throttle.h b/block/blk-throttle.h index 393c3d134b9673..4d9ef5abdf21cc 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -95,15 +95,11 @@ struct throtl_grp { bool has_rules_bps[2]; bool has_rules_iops[2]; - /* internally used bytes per second rate limits */ + /* bytes per second rate limits */ uint64_t bps[2]; - /* user configured bps limits */ - uint64_t bps_conf[2]; - /* internally used IOPS limits */ + /* IOPS limits */ unsigned int iops[2]; - /* user configured IOPS limits */ - unsigned int iops_conf[2]; /* Number of bytes dispatched in current slice */ uint64_t bytes_disp[2]; From 2d707b4e37f9b0c37b8b2392f91b04c5b63ea538 Mon Sep 17 00:00:00 2001 From: Yong-Xuan Wang Date: Mon, 15 Apr 2024 14:49:04 +0800 Subject: [PATCH 0414/1578] RISC-V: KVM: No need to use mask when hart-index-bit is 0 When the maximum hart number within groups is 1, hart-index-bit is set to 0. Consequently, there is no need to restore the hart ID from IMSIC addresses and hart-index-bit settings. Currently, QEMU and kvmtool do not pass correct hart-index-bit values when the maximum hart number is a power of 2, thereby avoiding this issue. Corresponding patches for QEMU and kvmtool will also be dispatched. Fixes: 89d01306e34d ("RISC-V: KVM: Implement device interface for AIA irqchip") Signed-off-by: Yong-Xuan Wang Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20240415064905.25184-1-yongxuan.wang@sifive.com Signed-off-by: Anup Patel --- arch/riscv/kvm/aia_device.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kvm/aia_device.c b/arch/riscv/kvm/aia_device.c index 0eb689351b7d04..5cd407c6a8e4f8 100644 --- a/arch/riscv/kvm/aia_device.c +++ b/arch/riscv/kvm/aia_device.c @@ -237,10 +237,11 @@ static gpa_t aia_imsic_ppn(struct kvm_aia *aia, gpa_t addr) static u32 aia_imsic_hart_index(struct kvm_aia *aia, gpa_t addr) { - u32 hart, group = 0; + u32 hart = 0, group = 0; - hart = (addr >> (aia->nr_guest_bits + IMSIC_MMIO_PAGE_SHIFT)) & - GENMASK_ULL(aia->nr_hart_bits - 1, 0); + if (aia->nr_hart_bits) + hart = (addr >> (aia->nr_guest_bits + IMSIC_MMIO_PAGE_SHIFT)) & + GENMASK_ULL(aia->nr_hart_bits - 1, 0); if (aia->nr_group_bits) group = (addr >> aia->nr_group_shift) & GENMASK_ULL(aia->nr_group_bits - 1, 0); From c66f3b40b17d3dfc4b6abb5efde8e71c46971821 Mon Sep 17 00:00:00 2001 From: Quan Zhou Date: Thu, 23 May 2024 10:13:34 +0800 Subject: [PATCH 0415/1578] RISC-V: KVM: Fix incorrect reg_subtype labels in kvm_riscv_vcpu_set_reg_isa_ext function In the function kvm_riscv_vcpu_set_reg_isa_ext, the original code used incorrect reg_subtype labels KVM_REG_RISCV_SBI_MULTI_EN/DIS. These have been corrected to KVM_REG_RISCV_ISA_MULTI_EN/DIS respectively. Although they are numerically equivalent, the actual processing will not result in errors, but it may lead to ambiguous code semantics. Fixes: 613029442a4b ("RISC-V: KVM: Extend ONE_REG to enable/disable multiple ISA extensions") Signed-off-by: Quan Zhou Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/ff1c6771a67d660db94372ac9aaa40f51e5e0090.1716429371.git.zhouquan@iscas.ac.cn Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_onereg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kvm/vcpu_onereg.c b/arch/riscv/kvm/vcpu_onereg.c index c676275ea0a017..62874fbca29ff5 100644 --- a/arch/riscv/kvm/vcpu_onereg.c +++ b/arch/riscv/kvm/vcpu_onereg.c @@ -724,9 +724,9 @@ static int kvm_riscv_vcpu_set_reg_isa_ext(struct kvm_vcpu *vcpu, switch (reg_subtype) { case KVM_REG_RISCV_ISA_SINGLE: return riscv_vcpu_set_isa_ext_single(vcpu, reg_num, reg_val); - case KVM_REG_RISCV_SBI_MULTI_EN: + case KVM_REG_RISCV_ISA_MULTI_EN: return riscv_vcpu_set_isa_ext_multi(vcpu, reg_num, reg_val, true); - case KVM_REG_RISCV_SBI_MULTI_DIS: + case KVM_REG_RISCV_ISA_MULTI_DIS: return riscv_vcpu_set_isa_ext_multi(vcpu, reg_num, reg_val, false); default: return -ENOENT; From 310fa3ec2859f1c094e6e9b5d2e1ca51738c409a Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 31 May 2024 09:51:07 +0200 Subject: [PATCH 0416/1578] ALSA: seq: ump: Fix swapped song position pointer data At converting between the legacy event and UMP, the parameters for MIDI Song Position Pointer are incorrectly stored. It should have been LSB -> MSB order while it stored in MSB -> LSB order. This patch corrects the ordering. Fixes: e9e02819a98a ("ALSA: seq: Automatic conversion of UMP events") Link: https://lore.kernel.org/r/20240531075110.3250-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/seq/seq_ump_convert.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/core/seq/seq_ump_convert.c b/sound/core/seq/seq_ump_convert.c index f0f332017e662d..171fb75267afae 100644 --- a/sound/core/seq/seq_ump_convert.c +++ b/sound/core/seq/seq_ump_convert.c @@ -157,7 +157,7 @@ static void ump_system_to_one_param_ev(const union snd_ump_midi1_msg *val, static void ump_system_to_songpos_ev(const union snd_ump_midi1_msg *val, struct snd_seq_event *ev) { - ev->data.control.value = (val->system.parm1 << 7) | val->system.parm2; + ev->data.control.value = (val->system.parm2 << 7) | val->system.parm1; } /* Encoders for 0xf0 - 0xff */ @@ -754,8 +754,8 @@ static int system_2p_ev_to_ump_midi1(const struct snd_seq_event *event, { data->system.type = UMP_MSG_TYPE_SYSTEM; // override data->system.status = status; - data->system.parm1 = (event->data.control.value >> 7) & 0x7f; - data->system.parm2 = event->data.control.value & 0x7f; + data->system.parm1 = event->data.control.value & 0x7f; + data->system.parm2 = (event->data.control.value >> 7) & 0x7f; return 1; } From e0eec24e2e199873f43df99ec39773ad3af2bff7 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 29 May 2024 09:39:10 +0200 Subject: [PATCH 0417/1578] memblock: make memblock_set_node() also warn about use of MAX_NUMNODES On an (old) x86 system with SRAT just covering space above 4Gb: ACPI: SRAT: Node 0 PXM 0 [mem 0x100000000-0xfffffffff] hotplug the commit referenced below leads to this NUMA configuration no longer being refused by a CONFIG_NUMA=y kernel (previously NUMA: nodes only cover 6144MB of your 8185MB e820 RAM. Not used. No NUMA configuration found Faking a node at [mem 0x0000000000000000-0x000000027fffffff] was seen in the log directly after the message quoted above), because of memblock_validate_numa_coverage() checking for NUMA_NO_NODE (only). This in turn led to memblock_alloc_range_nid()'s warning about MAX_NUMNODES triggering, followed by a NULL deref in memmap_init() when trying to access node 64's (NODE_SHIFT=6) node data. To compensate said change, make memblock_set_node() warn on and adjust a passed in value of MAX_NUMNODES, just like various other functions already do. Fixes: ff6c3d81f2e8 ("NUMA: optimize detection of memory with no node id assigned by firmware") Signed-off-by: Jan Beulich Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/1c8a058c-5365-4f27-a9f1-3aeb7fb3e7b2@suse.com Signed-off-by: Mike Rapoport (IBM) --- mm/memblock.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/memblock.c b/mm/memblock.c index d09136e040d3cc..08e9806b1cf91b 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1339,6 +1339,10 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, int start_rgn, end_rgn; int i, ret; + if (WARN_ONCE(nid == MAX_NUMNODES, + "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) + nid = NUMA_NO_NODE; + ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); if (ret) return ret; From 0a44078f2b72abcdda47581c942bd5d0468ec50b Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Thu, 30 May 2024 13:12:03 -0700 Subject: [PATCH 0418/1578] perf/x86/rapl: Add missing MODULE_DESCRIPTION() line Fix the warning from 'make C=1 W=1': WARNING: modpost: missing MODULE_DESCRIPTION() in arch/x86/events/rapl.o Signed-off-by: Jeff Johnson Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Link: https://lore.kernel.org/r/20240530-md-arch-x86-events-v1-1-e45ffa8af99f@quicinc.com --- arch/x86/events/rapl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 46e67358556031..0c5e7a7c43ac07 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -64,6 +64,7 @@ #include "perf_event.h" #include "probe.h" +MODULE_DESCRIPTION("Support Intel/AMD RAPL energy consumption counters"); MODULE_LICENSE("GPL"); /* From dc8e5dfb52d56e955ad09174330252710845b8d2 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Thu, 30 May 2024 13:42:51 -0700 Subject: [PATCH 0419/1578] perf/x86/intel: Add missing MODULE_DESCRIPTION() lines Fix the 'make W=1 C=1' warnings: WARNING: modpost: missing MODULE_DESCRIPTION() in arch/x86/events/intel/intel-uncore.o WARNING: modpost: missing MODULE_DESCRIPTION() in arch/x86/events/intel/intel-cstate.o Signed-off-by: Jeff Johnson Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Link: https://lore.kernel.org/r/20240530-md-arch-x86-events-intel-v1-1-8252194ed20a@quicinc.com --- arch/x86/events/intel/cstate.c | 1 + arch/x86/events/intel/uncore.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index e64eaa8dda5a45..9d6e8f13d13a78 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -114,6 +114,7 @@ #include "../perf_event.h" #include "../probe.h" +MODULE_DESCRIPTION("Support for Intel cstate performance events"); MODULE_LICENSE("GPL"); #define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \ diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 419c517b8594fb..c68f5b39952b0a 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -34,6 +34,7 @@ static struct event_constraint uncore_constraint_fixed = struct event_constraint uncore_constraint_empty = EVENT_CONSTRAINT(0, 0, 0); +MODULE_DESCRIPTION("Support for Intel uncore performance events"); MODULE_LICENSE("GPL"); int uncore_pcibus_to_dieid(struct pci_bus *bus) From d40605a6823577a6c40fad6fb1f10a40ea0389d7 Mon Sep 17 00:00:00 2001 From: Phil Auld Date: Thu, 30 May 2024 14:15:48 -0400 Subject: [PATCH 0420/1578] sched/x86: Export 'percpu arch_freq_scale' Commit: 7bc263840bc3 ("sched/topology: Consolidate and clean up access to a CPU's max compute capacity") removed rq->cpu_capacity_orig in favor of using arch_scale_freq_capacity() calls. Export the underlying percpu symbol on x86 so that external trace point helper modules can be made to work again. Signed-off-by: Phil Auld Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240530181548.2039216-1-pauld@redhat.com --- arch/x86/kernel/cpu/aperfmperf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index f9a8c7b7943fb2..b3fa61d45352e7 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -345,6 +345,7 @@ static DECLARE_WORK(disable_freq_invariance_work, disable_freq_invariance_workfn); DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; +EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale); static void scale_freq_tick(u64 acnt, u64 mcnt) { From 985cfe501b74f214905ab4817acee0df24627268 Mon Sep 17 00:00:00 2001 From: Aapo Vienamo Date: Fri, 24 May 2024 18:53:17 +0300 Subject: [PATCH 0421/1578] thunderbolt: debugfs: Fix margin debugfs node creation condition The margin debugfs node controls the "Enable Margin Test" field of the lane margining operations. This field selects between either low or high voltage margin values for voltage margin test or left or right timing margin values for timing margin test. According to the USB4 specification, whether or not the "Enable Margin Test" control applies, depends on the values of the "Independent High/Low Voltage Margin" or "Independent Left/Right Timing Margin" capability fields for voltage and timing margin tests respectively. The pre-existing condition enabled the debugfs node also in the case where both low/high or left/right margins are returned, which is incorrect. This change only enables the debugfs node in question, if the specific required capability values are met. Signed-off-by: Aapo Vienamo Fixes: d0f1e0c2a699 ("thunderbolt: Add support for receiver lane margining") Cc: stable@vger.kernel.org Signed-off-by: Mika Westerberg --- drivers/thunderbolt/debugfs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/thunderbolt/debugfs.c b/drivers/thunderbolt/debugfs.c index 193e9dfc983b64..70b52aac3d97d4 100644 --- a/drivers/thunderbolt/debugfs.c +++ b/drivers/thunderbolt/debugfs.c @@ -943,8 +943,9 @@ static void margining_port_init(struct tb_port *port) debugfs_create_file("run", 0600, dir, port, &margining_run_fops); debugfs_create_file("results", 0600, dir, port, &margining_results_fops); debugfs_create_file("test", 0600, dir, port, &margining_test_fops); - if (independent_voltage_margins(usb4) || - (supports_time(usb4) && independent_time_margins(usb4))) + if (independent_voltage_margins(usb4) == USB4_MARGIN_CAP_0_VOLTAGE_HL || + (supports_time(usb4) && + independent_time_margins(usb4) == USB4_MARGIN_CAP_1_TIME_LR)) debugfs_create_file("margin", 0600, dir, port, &margining_margin_fops); } From 539d33b5783804f22a62bd62ff463dfd1cef4265 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 16 May 2024 14:37:24 +0100 Subject: [PATCH 0422/1578] drm/komeda: remove unused struct 'gamma_curve_segment' 'gamma_curve_segment' looks like it has never been used. Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Dmitry Baryshkov Acked-by: Liviu Dudau Link: https://patchwork.freedesktop.org/patch/msgid/20240516133724.251750-1-linux@treblig.org Signed-off-by: Liviu Dudau --- drivers/gpu/drm/arm/display/komeda/komeda_color_mgmt.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/gpu/drm/arm/display/komeda/komeda_color_mgmt.c b/drivers/gpu/drm/arm/display/komeda/komeda_color_mgmt.c index d8e449e6ebda28..50cb8f7ee6b2ca 100644 --- a/drivers/gpu/drm/arm/display/komeda/komeda_color_mgmt.c +++ b/drivers/gpu/drm/arm/display/komeda/komeda_color_mgmt.c @@ -72,11 +72,6 @@ struct gamma_curve_sector { u32 segment_width; }; -struct gamma_curve_segment { - u32 start; - u32 end; -}; - static struct gamma_curve_sector sector_tbl[] = { { 0, 4, 4 }, { 16, 4, 4 }, From d339131bf02d4ed918415574082caf5e8af6e664 Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Fri, 31 May 2024 12:27:16 +0100 Subject: [PATCH 0423/1578] ALSA: hda: cs35l56: Fix lifecycle of codec pointer The codec should be cleared when the amp driver is unbound and when resuming it should be tested to prevent loading firmware into the device and ALSA in a partially configured system state. Signed-off-by: Simon Trimmer Link: https://lore.kernel.org/r/20240531112716.25323-1-simont@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/cs35l56_hda.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sound/pci/hda/cs35l56_hda.c b/sound/pci/hda/cs35l56_hda.c index 11b0570ff56d44..0923e2589f5f78 100644 --- a/sound/pci/hda/cs35l56_hda.c +++ b/sound/pci/hda/cs35l56_hda.c @@ -735,6 +735,8 @@ static void cs35l56_hda_unbind(struct device *dev, struct device *master, void * if (comps[cs35l56->index].dev == dev) memset(&comps[cs35l56->index], 0, sizeof(*comps)); + cs35l56->codec = NULL; + dev_dbg(cs35l56->base.dev, "Unbound\n"); } @@ -840,6 +842,9 @@ static int cs35l56_hda_system_resume(struct device *dev) cs35l56->suspended = false; + if (!cs35l56->codec) + return 0; + ret = cs35l56_is_fw_reload_needed(&cs35l56->base); dev_dbg(cs35l56->base.dev, "fw_reload_needed: %d\n", ret); if (ret > 0) { From 6386682cdc8b41319c92fbbe421953e33a28840c Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Fri, 31 May 2024 13:08:20 +0100 Subject: [PATCH 0424/1578] ALSA: hda: cs35l41: Possible null pointer dereference in cs35l41_hda_unbind() The cs35l41_hda_unbind() function clears the hda_component entry matching it's index and then dereferences the codec pointer held in the first element of the hda_component array, this is an issue when the device index was 0. Instead use the codec pointer stashed in the cs35l41_hda structure as it will still be valid. Fixes: 7cf5ce66dfda ("ALSA: hda: cs35l41: Add device_link between HDA and cs35l41_hda") Signed-off-by: Simon Trimmer Link: https://lore.kernel.org/r/20240531120820.35367-1-simont@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/cs35l41_hda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c index 6c49e5c6cd2088..d54d4d60b03ec7 100644 --- a/sound/pci/hda/cs35l41_hda.c +++ b/sound/pci/hda/cs35l41_hda.c @@ -1495,7 +1495,7 @@ static void cs35l41_hda_unbind(struct device *dev, struct device *master, void * if (comps[cs35l41->index].dev == dev) { memset(&comps[cs35l41->index], 0, sizeof(*comps)); sleep_flags = lock_system_sleep(); - device_link_remove(&comps->codec->core.dev, cs35l41->dev); + device_link_remove(&cs35l41->codec->core.dev, cs35l41->dev); unlock_system_sleep(sleep_flags); } } From 55fac50ea46f46a22a92e2139b92afaa3822ad19 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 31 May 2024 14:37:17 +0200 Subject: [PATCH 0425/1578] ALSA: seq: ump: Fix missing System Reset message handling The conversion from System Reset event to UMP was missing. Add the entry for a conversion to a proper UMP System message. Fixes: e9e02819a98a ("ALSA: seq: Automatic conversion of UMP events") Link: https://lore.kernel.org/r/20240531123718.13420-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/seq/seq_ump_convert.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/core/seq/seq_ump_convert.c b/sound/core/seq/seq_ump_convert.c index 171fb75267afae..d81f776a4c3ddd 100644 --- a/sound/core/seq/seq_ump_convert.c +++ b/sound/core/seq/seq_ump_convert.c @@ -1075,6 +1075,8 @@ static const struct seq_ev_to_ump seq_ev_ump_encoders[] = { system_ev_to_ump_midi1, system_ev_to_ump_midi2 }, { SNDRV_SEQ_EVENT_SENSING, UMP_SYSTEM_STATUS_ACTIVE_SENSING, system_ev_to_ump_midi1, system_ev_to_ump_midi2 }, + { SNDRV_SEQ_EVENT_RESET, UMP_SYSTEM_STATUS_RESET, + system_ev_to_ump_midi1, system_ev_to_ump_midi2 }, }; static const struct seq_ev_to_ump *find_ump_encoder(int type) From 86aaa7e9d641c1ad1035ed2df88b8d0b48c86b30 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 30 May 2024 23:28:17 +0200 Subject: [PATCH 0426/1578] ata: libata-core: Add ATA_HORKAGE_NOLPM for Crucial CT240BX500SSD1 Commit 7627a0edef54 ("ata: ahci: Drop low power policy board type") dropped the board_ahci_low_power board type, and instead enables LPM if: -The AHCI controller reports that it supports LPM (Partial/Slumber), and -CONFIG_SATA_MOBILE_LPM_POLICY != 0, and -The port is not defined as external in the per port PxCMD register, and -The port is not defined as hotplug capable in the per port PxCMD register. Partial and Slumber LPM states can either be initiated by HIPM or DIPM. For HIPM (host initiated power management) to get enabled, both the AHCI controller and the drive have to report that they support HIPM. For DIPM (device initiated power management) to get enabled, only the drive has to report that it supports DIPM. However, the HBA will reject device requests to enter LPM states which the HBA does not support. The problem is that Crucial CT240BX500SSD1 drives do not handle low power modes correctly. The problem was most likely not seen before because no one had used this drive with a AHCI controller with LPM enabled. Add a quirk so that we do not enable LPM for this drive, since we see command timeouts if we do (even though the drive claims to support DIPM). Fixes: 7627a0edef54 ("ata: ahci: Drop low power policy board type") Cc: stable@vger.kernel.org Reported-by: Aarrayy Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218832 Reviewed-by: Mika Westerberg Reviewed-by: Damien Le Moal Signed-off-by: Niklas Cassel --- drivers/ata/libata-core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 4f35aab81a0a38..b0ce621fe2a119 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4136,8 +4136,9 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { { "PIONEER BD-RW BDR-207M", NULL, ATA_HORKAGE_NOLPM }, { "PIONEER BD-RW BDR-205", NULL, ATA_HORKAGE_NOLPM }, - /* Crucial BX100 SSD 500GB has broken LPM support */ + /* Crucial devices with broken LPM support */ { "CT500BX100SSD1", NULL, ATA_HORKAGE_NOLPM }, + { "CT240BX500SSD1", NULL, ATA_HORKAGE_NOLPM }, /* 512GB MX100 with MU01 firmware has both queued TRIM and LPM issues */ { "Crucial_CT512MX100*", "MU01", ATA_HORKAGE_NO_NCQ_TRIM | From 473880369304cfd4445720cdd8bae4c6f1e16e60 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 30 May 2024 23:32:44 +0200 Subject: [PATCH 0427/1578] ata: libata-core: Add ATA_HORKAGE_NOLPM for AMD Radeon S3 SSD Commit 7627a0edef54 ("ata: ahci: Drop low power policy board type") dropped the board_ahci_low_power board type, and instead enables LPM if: -The AHCI controller reports that it supports LPM (Partial/Slumber), and -CONFIG_SATA_MOBILE_LPM_POLICY != 0, and -The port is not defined as external in the per port PxCMD register, and -The port is not defined as hotplug capable in the per port PxCMD register. Partial and Slumber LPM states can either be initiated by HIPM or DIPM. For HIPM (host initiated power management) to get enabled, both the AHCI controller and the drive have to report that they support HIPM. For DIPM (device initiated power management) to get enabled, only the drive has to report that it supports DIPM. However, the HBA will reject device requests to enter LPM states which the HBA does not support. The problem is that AMD Radeon S3 SSD drives do not handle low power modes correctly. The problem was most likely not seen before because no one had used this drive with a AHCI controller with LPM enabled. Add a quirk so that we do not enable LPM for this drive, since we see command timeouts if we do (even though the drive claims to support both HIPM and DIPM). Fixes: 7627a0edef54 ("ata: ahci: Drop low power policy board type") Cc: stable@vger.kernel.org Reported-by: Doru Iorgulescu Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218832 Reviewed-by: Mika Westerberg Reviewed-by: Damien Le Moal Signed-off-by: Niklas Cassel --- drivers/ata/libata-core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index b0ce621fe2a119..4ea859dc381aa5 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4156,6 +4156,9 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { ATA_HORKAGE_ZERO_AFTER_TRIM | ATA_HORKAGE_NOLPM }, + /* AMD Radeon devices with broken LPM support */ + { "R3SL240G", NULL, ATA_HORKAGE_NOLPM }, + /* These specific Samsung models/firmware-revs do not handle LPM well */ { "SAMSUNG MZMPC128HBFU-000MV", "CXM14M1Q", ATA_HORKAGE_NOLPM }, { "SAMSUNG SSD PM830 mSATA *", "CXM13D1Q", ATA_HORKAGE_NOLPM }, From 3cb648c4dd3e8dde800fb3659250ed11f2d9efa5 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 30 May 2024 23:27:04 +0200 Subject: [PATCH 0428/1578] ata: libata-core: Add ATA_HORKAGE_NOLPM for Apacer AS340 Commit 7627a0edef54 ("ata: ahci: Drop low power policy board type") dropped the board_ahci_low_power board type, and instead enables LPM if: -The AHCI controller reports that it supports LPM (Partial/Slumber), and -CONFIG_SATA_MOBILE_LPM_POLICY != 0, and -The port is not defined as external in the per port PxCMD register, and -The port is not defined as hotplug capable in the per port PxCMD register. Partial and Slumber LPM states can either be initiated by HIPM or DIPM. For HIPM (host initiated power management) to get enabled, both the AHCI controller and the drive have to report that they support HIPM. For DIPM (device initiated power management) to get enabled, only the drive has to report that it supports DIPM. However, the HBA will reject device requests to enter LPM states which the HBA does not support. The problem is that Apacer AS340 drives do not handle low power modes correctly. The problem was most likely not seen before because no one had used this drive with a AHCI controller with LPM enabled. Add a quirk so that we do not enable LPM for this drive, since we see command timeouts if we do (even though the drive claims to support DIPM). Fixes: 7627a0edef54 ("ata: ahci: Drop low power policy board type") Cc: stable@vger.kernel.org Reported-by: Tim Teichmann Closes: https://lore.kernel.org/linux-ide/87bk4pbve8.ffs@tglx/ Reviewed-by: Mika Westerberg Reviewed-by: Damien Le Moal Signed-off-by: Niklas Cassel --- drivers/ata/libata-core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 4ea859dc381aa5..e1bf8a19b3c89e 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4159,6 +4159,9 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { /* AMD Radeon devices with broken LPM support */ { "R3SL240G", NULL, ATA_HORKAGE_NOLPM }, + /* Apacer models with LPM issues */ + { "Apacer AS340*", NULL, ATA_HORKAGE_NOLPM }, + /* These specific Samsung models/firmware-revs do not handle LPM well */ { "SAMSUNG MZMPC128HBFU-000MV", "CXM14M1Q", ATA_HORKAGE_NOLPM }, { "SAMSUNG SSD PM830 mSATA *", "CXM13D1Q", ATA_HORKAGE_NOLPM }, From 0f42bdf59b4e428485aa922bef871bfa6cc505e0 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Fri, 31 May 2024 18:43:37 +0900 Subject: [PATCH 0429/1578] selftests/tracing: Fix event filter test to retry up to 10 times Commit eb50d0f250e9 ("selftests/ftrace: Choose target function for filter test from samples") choose the target function from samples, but sometimes this test failes randomly because the target function does not hit at the next time. So retry getting samples up to 10 times. Fixes: eb50d0f250e9 ("selftests/ftrace: Choose target function for filter test from samples") Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Shuah Khan --- .../test.d/filter/event-filter-function.tc | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc b/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc index 3f74c09c56b624..118247b8dd84d8 100644 --- a/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc +++ b/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc @@ -10,7 +10,6 @@ fail() { #msg } sample_events() { - echo > trace echo 1 > events/kmem/kmem_cache_free/enable echo 1 > tracing_on ls > /dev/null @@ -22,6 +21,7 @@ echo 0 > tracing_on echo 0 > events/enable echo "Get the most frequently calling function" +echo > trace sample_events target_func=`cat trace | grep -o 'call_site=\([^+]*\)' | sed 's/call_site=//' | sort | uniq -c | sort | tail -n 1 | sed 's/^[ 0-9]*//'` @@ -32,7 +32,16 @@ echo > trace echo "Test event filter function name" echo "call_site.function == $target_func" > events/kmem/kmem_cache_free/filter + +sample_events +max_retry=10 +while [ `grep kmem_cache_free trace| wc -l` -eq 0 ]; do sample_events +max_retry=$((max_retry - 1)) +if [ $max_retry -eq 0 ]; then + exit_fail +fi +done hitcnt=`grep kmem_cache_free trace| grep $target_func | wc -l` misscnt=`grep kmem_cache_free trace| grep -v $target_func | wc -l` @@ -49,7 +58,16 @@ address=`grep " ${target_func}\$" /proc/kallsyms | cut -d' ' -f1` echo "Test event filter function address" echo "call_site.function == 0x$address" > events/kmem/kmem_cache_free/filter +echo > trace +sample_events +max_retry=10 +while [ `grep kmem_cache_free trace| wc -l` -eq 0 ]; do sample_events +max_retry=$((max_retry - 1)) +if [ $max_retry -eq 0 ]; then + exit_fail +fi +done hitcnt=`grep kmem_cache_free trace| grep $target_func | wc -l` misscnt=`grep kmem_cache_free trace| grep -v $target_func | wc -l` From 88da52ccd66e65f2e63a6c35c9dff55d448ef4dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Thu, 16 May 2024 20:19:34 +0200 Subject: [PATCH 0430/1578] landlock: Fix d_parent walk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The WARN_ON_ONCE() in collect_domain_accesses() can be triggered when trying to link a root mount point. This cannot work in practice because this directory is mounted, but the VFS check is done after the call to security_path_link(). Do not use source directory's d_parent when the source directory is the mount point. Cc: Günther Noack Cc: Paul Moore Cc: stable@vger.kernel.org Reported-by: syzbot+bf4903dc7e12b18ebc87@syzkaller.appspotmail.com Fixes: b91c3e4ea756 ("landlock: Add support for file reparenting with LANDLOCK_ACCESS_FS_REFER") Closes: https://lore.kernel.org/r/000000000000553d3f0618198200@google.com Link: https://lore.kernel.org/r/20240516181935.1645983-2-mic@digikod.net [mic: Fix commit message] Signed-off-by: Mickaël Salaün --- security/landlock/fs.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/security/landlock/fs.c b/security/landlock/fs.c index 22d8b7c28074e0..7877a64cc6b87c 100644 --- a/security/landlock/fs.c +++ b/security/landlock/fs.c @@ -1110,6 +1110,7 @@ static int current_check_refer_path(struct dentry *const old_dentry, bool allow_parent1, allow_parent2; access_mask_t access_request_parent1, access_request_parent2; struct path mnt_dir; + struct dentry *old_parent; layer_mask_t layer_masks_parent1[LANDLOCK_NUM_ACCESS_FS] = {}, layer_masks_parent2[LANDLOCK_NUM_ACCESS_FS] = {}; @@ -1157,9 +1158,17 @@ static int current_check_refer_path(struct dentry *const old_dentry, mnt_dir.mnt = new_dir->mnt; mnt_dir.dentry = new_dir->mnt->mnt_root; + /* + * old_dentry may be the root of the common mount point and + * !IS_ROOT(old_dentry) at the same time (e.g. with open_tree() and + * OPEN_TREE_CLONE). We do not need to call dget(old_parent) because + * we keep a reference to old_dentry. + */ + old_parent = (old_dentry == mnt_dir.dentry) ? old_dentry : + old_dentry->d_parent; + /* new_dir->dentry is equal to new_dentry->d_parent */ - allow_parent1 = collect_domain_accesses(dom, mnt_dir.dentry, - old_dentry->d_parent, + allow_parent1 = collect_domain_accesses(dom, mnt_dir.dentry, old_parent, &layer_masks_parent1); allow_parent2 = collect_domain_accesses( dom, mnt_dir.dentry, new_dir->dentry, &layer_masks_parent2); From 0055f53aac80fd938bf7cdfad7ad414ca6c0e198 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Thu, 16 May 2024 20:19:35 +0200 Subject: [PATCH 0431/1578] selftests/landlock: Add layout1.refer_mount_root MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add tests to check error codes when linking or renaming a mount root directory. This previously triggered a kernel warning, but it is fixed with the previous commit. Cc: Günther Noack Cc: Paul Moore Link: https://lore.kernel.org/r/20240516181935.1645983-3-mic@digikod.net Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/fs_test.c | 45 ++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index 6b5a9ff88c3d71..7d063c652be164 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -35,6 +35,7 @@ * See https://sourceware.org/glibc/wiki/Synchronizing_Headers. */ #include +#include #include "common.h" @@ -47,6 +48,13 @@ int renameat2(int olddirfd, const char *oldpath, int newdirfd, } #endif +#ifndef open_tree +int open_tree(int dfd, const char *filename, unsigned int flags) +{ + return syscall(__NR_open_tree, dfd, filename, flags); +} +#endif + #ifndef RENAME_EXCHANGE #define RENAME_EXCHANGE (1 << 1) #endif @@ -2400,6 +2408,43 @@ TEST_F_FORK(layout1, refer_denied_by_default4) layer_dir_s1d1_refer); } +/* + * Tests walking through a denied root mount. + */ +TEST_F_FORK(layout1, refer_mount_root_deny) +{ + const struct landlock_ruleset_attr ruleset_attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_MAKE_DIR, + }; + int root_fd, ruleset_fd; + + /* Creates a mount object from a non-mount point. */ + set_cap(_metadata, CAP_SYS_ADMIN); + root_fd = + open_tree(AT_FDCWD, dir_s1d1, + AT_EMPTY_PATH | OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + clear_cap(_metadata, CAP_SYS_ADMIN); + ASSERT_LE(0, root_fd); + + ruleset_fd = + landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0); + ASSERT_LE(0, ruleset_fd); + + ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)); + ASSERT_EQ(0, landlock_restrict_self(ruleset_fd, 0)); + EXPECT_EQ(0, close(ruleset_fd)); + + /* Link denied by Landlock: EACCES. */ + EXPECT_EQ(-1, linkat(root_fd, ".", root_fd, "does_not_exist", 0)); + EXPECT_EQ(EACCES, errno); + + /* renameat2() always returns EBUSY. */ + EXPECT_EQ(-1, renameat2(root_fd, ".", root_fd, "does_not_exist", 0)); + EXPECT_EQ(EBUSY, errno); + + EXPECT_EQ(0, close(root_fd)); +} + TEST_F_FORK(layout1, reparent_link) { const struct rule layer1[] = { From 518549c120e671c4906f77d1802b97e9b23f673a Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 29 May 2024 18:16:56 -0500 Subject: [PATCH 0432/1578] cifs: fix creating sockets when using sfu mount options When running fstest generic/423 with sfu mount option, it was being skipped due to inability to create sockets: generic/423 [not run] cifs does not support mknod/mkfifo which can also be easily reproduced with their af_unix tool: ./src/af_unix /mnt1/socket-two bind: Operation not permitted Fix sfu mount option to allow creating and reporting sockets. Cc: stable@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/cifspdu.h | 2 +- fs/smb/client/inode.c | 4 ++++ fs/smb/client/smb2ops.c | 3 +++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h index c46d418c1c0c3e..a2072ab9e586d0 100644 --- a/fs/smb/client/cifspdu.h +++ b/fs/smb/client/cifspdu.h @@ -2574,7 +2574,7 @@ typedef struct { struct win_dev { - unsigned char type[8]; /* IntxCHR or IntxBLK or LnxFIFO*/ + unsigned char type[8]; /* IntxCHR or IntxBLK or LnxFIFO or LnxSOCK */ __le64 major; __le64 minor; } __attribute__((packed)); diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 262576573eb518..4a8aa1de95223d 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -606,6 +606,10 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path, mnr = le64_to_cpu(*(__le64 *)(pbuf+16)); fattr->cf_rdev = MKDEV(mjr, mnr); } + } else if (memcmp("LnxSOCK", pbuf, 8) == 0) { + cifs_dbg(FYI, "Socket\n"); + fattr->cf_mode |= S_IFSOCK; + fattr->cf_dtype = DT_SOCK; } else if (memcmp("IntxLNK", pbuf, 7) == 0) { cifs_dbg(FYI, "Symlink\n"); fattr->cf_mode |= S_IFLNK; diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 4ce6c3121a7ef1..c8e536540895a2 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -4997,6 +4997,9 @@ static int __cifs_sfu_make_node(unsigned int xid, struct inode *inode, pdev.major = cpu_to_le64(MAJOR(dev)); pdev.minor = cpu_to_le64(MINOR(dev)); break; + case S_IFSOCK: + strscpy(pdev.type, "LnxSOCK"); + break; case S_IFIFO: strscpy(pdev.type, "LnxFIFO"); break; From adb77bba9c664f5d120e0ffb1387e9d7408e1529 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 9 Apr 2024 19:31:52 -0700 Subject: [PATCH 0433/1578] scsi: mpt3sas: Avoid possible run-time warning with long manufacturer strings The prior strscpy() replacement of strncpy() here expected the manufacture_reply strings to be NUL-terminated, but it is possible they are not, as the code pattern here shows, e.g., edev->vendor_id being exactly 1 character larger than manufacture_reply->vendor_id, and the replaced strncpy() was copying only up to the size of the source character array. Replace this with memtostr(), which is the unambiguous way to convert a maybe not-NUL-terminated character array into a NUL-terminated string. Fixes: b7e9712a02e8 ("scsi: mpt3sas: Replace deprecated strncpy() with strscpy()") Signed-off-by: Kees Cook Tested-by: Marco Patalano Reviewed-by: Ewan D. Milne Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240410023155.2100422-3-keescook@chromium.org Signed-off-by: Kees Cook --- drivers/scsi/mpt3sas/mpt3sas_base.c | 2 +- drivers/scsi/mpt3sas/mpt3sas_transport.c | 14 +++++--------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c index 258647fc6bddb4..1320e06727df19 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.c +++ b/drivers/scsi/mpt3sas/mpt3sas_base.c @@ -4774,7 +4774,7 @@ _base_display_ioc_capabilities(struct MPT3SAS_ADAPTER *ioc) char desc[17] = {0}; u32 iounit_pg1_flags; - strscpy(desc, ioc->manu_pg0.ChipName, sizeof(desc)); + memtostr(desc, ioc->manu_pg0.ChipName); ioc_info(ioc, "%s: FWVersion(%02d.%02d.%02d.%02d), ChipRevision(0x%02x)\n", desc, (ioc->facts.FWVersion.Word & 0xFF000000) >> 24, diff --git a/drivers/scsi/mpt3sas/mpt3sas_transport.c b/drivers/scsi/mpt3sas/mpt3sas_transport.c index 76f9a91771985b..d84413b77d8499 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_transport.c +++ b/drivers/scsi/mpt3sas/mpt3sas_transport.c @@ -458,17 +458,13 @@ _transport_expander_report_manufacture(struct MPT3SAS_ADAPTER *ioc, goto out; manufacture_reply = data_out + sizeof(struct rep_manu_request); - strscpy(edev->vendor_id, manufacture_reply->vendor_id, - sizeof(edev->vendor_id)); - strscpy(edev->product_id, manufacture_reply->product_id, - sizeof(edev->product_id)); - strscpy(edev->product_rev, manufacture_reply->product_rev, - sizeof(edev->product_rev)); + memtostr(edev->vendor_id, manufacture_reply->vendor_id); + memtostr(edev->product_id, manufacture_reply->product_id); + memtostr(edev->product_rev, manufacture_reply->product_rev); edev->level = manufacture_reply->sas_format & 1; if (edev->level) { - strscpy(edev->component_vendor_id, - manufacture_reply->component_vendor_id, - sizeof(edev->component_vendor_id)); + memtostr(edev->component_vendor_id, + manufacture_reply->component_vendor_id); tmp = (u8 *)&manufacture_reply->component_id; edev->component_id = tmp[0] << 8 | tmp[1]; edev->component_revision_id = From 4e173c825b1914d5b118bbf26f0168102d56ae95 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 31 May 2024 08:54:52 -0700 Subject: [PATCH 0434/1578] mailmap: update entry for Kees Cook I'm tired of gmail breaking DKIM. Switch everything over to my @kernel.org alias instead. Signed-off-by: Kees Cook --- .mailmap | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.mailmap b/.mailmap index 43cd2995dbc29a..efd9fa867a8ecc 100644 --- a/.mailmap +++ b/.mailmap @@ -337,10 +337,11 @@ Kalyan Thota Karthikeyan Periyasamy Kathiravan T Kay Sievers -Kees Cook -Kees Cook -Kees Cook -Kees Cook +Kees Cook +Kees Cook +Kees Cook +Kees Cook +Kees Cook Keith Busch Keith Busch Kenneth W Chen From d551ce15d08114514d489fad63bd275de2aca862 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 28 May 2024 14:02:35 +0200 Subject: [PATCH 0435/1578] mailbox: zynqmp-ipi: drop irq_to_desc() call irq_to_desc() is not exported to loadable modules, so this driver now fails to link in some configurations: ERROR: modpost: "irq_to_desc" [drivers/mailbox/zynqmp-ipi-mailbox.ko] undefined! I can't see a purpose for this call, since the return value is unused and probably left over from some code refactoring. Address the link failure by just removing the line. Fixes: 6ffb1635341b ("mailbox: zynqmp: handle SGI for shared IPI") Signed-off-by: Arnd Bergmann Tested-by: Tanmay Shah Signed-off-by: Jassi Brar --- drivers/mailbox/zynqmp-ipi-mailbox.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/mailbox/zynqmp-ipi-mailbox.c b/drivers/mailbox/zynqmp-ipi-mailbox.c index 7c90bac3de216a..4acf5612487cfd 100644 --- a/drivers/mailbox/zynqmp-ipi-mailbox.c +++ b/drivers/mailbox/zynqmp-ipi-mailbox.c @@ -850,7 +850,6 @@ static int xlnx_mbox_init_sgi(struct platform_device *pdev, return ret; } - irq_to_desc(pdata->virq_sgi); irq_set_status_flags(pdata->virq_sgi, IRQ_PER_CPU); /* Setup function for the CPU hot-plug cases */ From 0c2f6d04619ec2b53ad4b0b591eafc9389786e86 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 30 May 2024 17:29:18 +0200 Subject: [PATCH 0436/1578] x86/topology/intel: Unlock CPUID before evaluating anything Intel CPUs have a MSR bit to limit CPUID enumeration to leaf two. If this bit is set by the BIOS then CPUID evaluation including topology enumeration does not work correctly as the evaluation code does not try to analyze any leaf greater than two. This went unnoticed before because the original topology code just repeated evaluation several times and managed to overwrite the initial limited information with the correct one later. The new evaluation code does it once and therefore ends up with the limited and wrong information. Cure this by unlocking CPUID right before evaluating anything which depends on the maximum CPUID leaf being greater than two instead of rereading stuff after unlock. Fixes: 22d63660c35e ("x86/cpu: Use common topology code for Intel") Reported-by: Peter Schneider Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Peter Schneider Cc: Link: https://lore.kernel.org/r/fd3f73dc-a86f-4bcf-9c60-43556a21eb42@googlemail.com --- arch/x86/kernel/cpu/common.c | 3 ++- arch/x86/kernel/cpu/cpu.h | 2 ++ arch/x86/kernel/cpu/intel.c | 25 ++++++++++++++++--------- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e31293c9609f50..d4e539d4e158cc 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1589,6 +1589,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) if (have_cpuid_p()) { cpu_detect(c); get_cpu_vendor(c); + intel_unlock_cpuid_leafs(c); get_cpu_cap(c); setup_force_cpu_cap(X86_FEATURE_CPUID); get_cpu_address_sizes(c); @@ -1748,7 +1749,7 @@ static void generic_identify(struct cpuinfo_x86 *c) cpu_detect(c); get_cpu_vendor(c); - + intel_unlock_cpuid_leafs(c); get_cpu_cap(c); get_cpu_address_sizes(c); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index ea9e07d57c8dd2..1beccefbaff9a5 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -61,9 +61,11 @@ extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state; extern void __init tsx_init(void); void tsx_ap_init(void); +void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c); #else static inline void tsx_init(void) { } static inline void tsx_ap_init(void) { } +static inline void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c) { } #endif /* CONFIG_CPU_SUP_INTEL */ extern void init_spectral_chicken(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 3c3e7e5695ba41..fdf3489d92a498 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -269,19 +269,26 @@ static void detect_tme_early(struct cpuinfo_x86 *c) c->x86_phys_bits -= keyid_bits; } +void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return; + + if (c->x86 < 6 || (c->x86 == 6 && c->x86_model < 0xd)) + return; + + /* + * The BIOS can have limited CPUID to leaf 2, which breaks feature + * enumeration. Unlock it and update the maximum leaf info. + */ + if (msr_clear_bit(MSR_IA32_MISC_ENABLE, MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) + c->cpuid_level = cpuid_eax(0); +} + static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; - /* Unmask CPUID levels if masked: */ - if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { - if (msr_clear_bit(MSR_IA32_MISC_ENABLE, - MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) { - c->cpuid_level = cpuid_eax(0); - get_cpu_cap(c); - } - } - if ((c->x86 == 0xf && c->x86_model >= 0x03) || (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); From 7dc3bfcb4c9cc58970fff6aaa48172cb224d85aa Mon Sep 17 00:00:00 2001 From: Chunguang Xu Date: Fri, 31 May 2024 17:24:21 +0800 Subject: [PATCH 0437/1578] nvme-fabrics: use reserved tag for reg read/write command In some scenarios, if too many commands are issued by nvme command in the same time by user tasks, this may exhaust all tags of admin_q. If a reset (nvme reset or IO timeout) occurs before these commands finish, reconnect routine may fail to update nvme regs due to insufficient tags, which will cause kernel hang forever. In order to workaround this issue, maybe we can let reg_read32()/reg_read64()/reg_write32() use reserved tags. This maybe safe for nvmf: 1. For the disable ctrl path, we will not issue connect command 2. For the enable ctrl / fw activate path, since connect and reg_xx() are called serially. So the reserved tags may still be enough while reg_xx() use reserved tags. Signed-off-by: Chunguang Xu Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/fabrics.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index c6ad2148c2e04d..ceb9c0ed3120b0 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -180,7 +180,7 @@ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) cmd.prop_get.offset = cpu_to_le32(off); ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, - NVME_QID_ANY, 0); + NVME_QID_ANY, NVME_SUBMIT_RESERVED); if (ret >= 0) *val = le64_to_cpu(res.u64); @@ -226,7 +226,7 @@ int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) cmd.prop_get.offset = cpu_to_le32(off); ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, - NVME_QID_ANY, 0); + NVME_QID_ANY, NVME_SUBMIT_RESERVED); if (ret >= 0) *val = le64_to_cpu(res.u64); @@ -271,7 +271,7 @@ int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val) cmd.prop_set.value = cpu_to_le64(val); ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, NULL, NULL, 0, - NVME_QID_ANY, 0); + NVME_QID_ANY, NVME_SUBMIT_RESERVED); if (unlikely(ret)) dev_err(ctrl->device, "Property Set error: %d, offset %#x\n", From 32c75ad4a79259609ee19f749832bc2d99bbdd13 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Fri, 31 May 2024 13:07:53 -0700 Subject: [PATCH 0438/1578] selftests/futex: don't redefine .PHONY targets (all, clean) The .PHONY targets "all" and "clean" are both already defined in the file that is included in the very next line: ../lib.mk. Remove this duplicate code. Reviewed-by: Davidlohr Bueso Signed-off-by: John Hubbard Signed-off-by: Shuah Khan --- tools/testing/selftests/futex/Makefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/testing/selftests/futex/Makefile b/tools/testing/selftests/futex/Makefile index 11e157d7533b8f..78ab2cd111f608 100644 --- a/tools/testing/selftests/futex/Makefile +++ b/tools/testing/selftests/futex/Makefile @@ -3,8 +3,6 @@ SUBDIRS := functional TEST_PROGS := run.sh -.PHONY: all clean - include ../lib.mk all: From 4bf15b1c657d22d1d70173e43264e4606dfe75ff Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Fri, 31 May 2024 13:07:54 -0700 Subject: [PATCH 0439/1578] selftests/futex: don't pass a const char* to asprintf(3) When building with clang, via: make LLVM=1 -C tools/testing/selftests ...clang issues this warning: futex_requeue_pi.c:403:17: warning: passing 'const char **' to parameter of type 'char **' discards qualifiers in nested pointer types [-Wincompatible-pointer-types-discards-qualifiers] This warning fires because test_name is passed into asprintf(3), which then changes it. Fix this by simply removing the const qualifier. This is a local automatic variable in a very short function, so there is not much need to use the compiler to enforce const-ness at this scope. [1] https://lore.kernel.org/all/20240329-selftests-libmk-llvm-rfc-v1-1-2f9ed7d1c49f@valentinobst.de/ Fixes: f17d8a87ecb5 ("selftests: fuxex: Report a unique test name per run of futex_requeue_pi") Reviewed-by: Davidlohr Bueso Signed-off-by: John Hubbard Signed-off-by: Shuah Khan --- tools/testing/selftests/futex/functional/futex_requeue_pi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi.c b/tools/testing/selftests/futex/functional/futex_requeue_pi.c index 7f3ca5c78df129..215c6cb539b4ab 100644 --- a/tools/testing/selftests/futex/functional/futex_requeue_pi.c +++ b/tools/testing/selftests/futex/functional/futex_requeue_pi.c @@ -360,7 +360,7 @@ int unit_test(int broadcast, long lock, int third_party_owner, long timeout_ns) int main(int argc, char *argv[]) { - const char *test_name; + char *test_name; int c, ret; while ((c = getopt(argc, argv, "bchlot:v:")) != -1) { From 99a6087dfdc65303d26ab5fba2dacd8931b82b08 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 31 May 2024 11:57:07 -0700 Subject: [PATCH 0440/1578] kunit/fortify: Remove __kmalloc_node() test __kmalloc_node() is considered an "internal" function to the Slab, so drop it from explicit testing. Link: https://lore.kernel.org/r/20240531185703.work.588-kees@kernel.org Signed-off-by: Kees Cook --- lib/fortify_kunit.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/lib/fortify_kunit.c b/lib/fortify_kunit.c index 39da5b3bc649c2..f9cc467334ce3d 100644 --- a/lib/fortify_kunit.c +++ b/lib/fortify_kunit.c @@ -235,9 +235,6 @@ static void fortify_test_alloc_size_##allocator##_dynamic(struct kunit *test) \ kmalloc_array_node(alloc_size, 1, gfp, NUMA_NO_NODE), \ kfree(p)); \ checker(expected_size, __kmalloc(alloc_size, gfp), \ - kfree(p)); \ - checker(expected_size, \ - __kmalloc_node(alloc_size, gfp, NUMA_NO_NODE), \ kfree(p)); \ \ orig = kmalloc(alloc_size, gfp); \ From b1a1fdd7096dd2d67911b07f8118ff113d815db4 Mon Sep 17 00:00:00 2001 From: Weiwen Hu Date: Thu, 30 May 2024 14:16:46 +0800 Subject: [PATCH 0441/1578] nvme: fix nvme_pr_* status code parsing Fix the parsing if extra status bits (e.g. MORE) is present. Fixes: 7fb42780d06c ("nvme: Convert NVMe errors to PR errors") Signed-off-by: Weiwen Hu Signed-off-by: Keith Busch --- drivers/nvme/host/pr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/pr.c b/drivers/nvme/host/pr.c index e05571b2a1b0c9..8fa1ffcdaed489 100644 --- a/drivers/nvme/host/pr.c +++ b/drivers/nvme/host/pr.c @@ -77,7 +77,7 @@ static int nvme_sc_to_pr_err(int nvme_sc) if (nvme_is_path_error(nvme_sc)) return PR_STS_PATH_FAILED; - switch (nvme_sc) { + switch (nvme_sc & 0x7ff) { case NVME_SC_SUCCESS: return PR_STS_SUCCESS; case NVME_SC_RESERVATION_CONFLICT: From b7c5e64fecfa88764791679cca4786ac65de739e Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 29 May 2024 22:52:30 -0600 Subject: [PATCH 0442/1578] vfio: Create vfio_fs_type with inode per device By linking all the device fds we provide to userspace to an address space through a new pseudo fs, we can use tools like unmap_mapping_range() to zap all vmas associated with a device. Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240530045236.1005864-2-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- drivers/vfio/device_cdev.c | 7 ++++++ drivers/vfio/group.c | 7 ++++++ drivers/vfio/vfio_main.c | 44 ++++++++++++++++++++++++++++++++++++++ include/linux/vfio.h | 1 + 4 files changed, 59 insertions(+) diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c index e75da0a70d1f83..bb1817bd4ff319 100644 --- a/drivers/vfio/device_cdev.c +++ b/drivers/vfio/device_cdev.c @@ -39,6 +39,13 @@ int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep) filep->private_data = df; + /* + * Use the pseudo fs inode on the device to link all mmaps + * to the same address space, allowing us to unmap all vmas + * associated to this device using unmap_mapping_range(). + */ + filep->f_mapping = device->inode->i_mapping; + return 0; err_put_registration: diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c index 610a429c619125..ded364588d2973 100644 --- a/drivers/vfio/group.c +++ b/drivers/vfio/group.c @@ -286,6 +286,13 @@ static struct file *vfio_device_open_file(struct vfio_device *device) */ filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE); + /* + * Use the pseudo fs inode on the device to link all mmaps + * to the same address space, allowing us to unmap all vmas + * associated to this device using unmap_mapping_range(). + */ + filep->f_mapping = device->inode->i_mapping; + if (device->group->type == VFIO_NO_IOMMU) dev_warn(device->dev, "vfio-noiommu device opened by user " "(%s:%d)\n", current->comm, task_pid_nr(current)); diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index e97d796a54fbaf..a5a62d9d963f72 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -22,8 +22,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -43,9 +45,13 @@ #define DRIVER_AUTHOR "Alex Williamson " #define DRIVER_DESC "VFIO - User Level meta-driver" +#define VFIO_MAGIC 0x5646494f /* "VFIO" */ + static struct vfio { struct class *device_class; struct ida device_ida; + struct vfsmount *vfs_mount; + int fs_count; } vfio; #ifdef CONFIG_VFIO_NOIOMMU @@ -186,6 +192,8 @@ static void vfio_device_release(struct device *dev) if (device->ops->release) device->ops->release(device); + iput(device->inode); + simple_release_fs(&vfio.vfs_mount, &vfio.fs_count); kvfree(device); } @@ -228,6 +236,34 @@ struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev, } EXPORT_SYMBOL_GPL(_vfio_alloc_device); +static int vfio_fs_init_fs_context(struct fs_context *fc) +{ + return init_pseudo(fc, VFIO_MAGIC) ? 0 : -ENOMEM; +} + +static struct file_system_type vfio_fs_type = { + .name = "vfio", + .owner = THIS_MODULE, + .init_fs_context = vfio_fs_init_fs_context, + .kill_sb = kill_anon_super, +}; + +static struct inode *vfio_fs_inode_new(void) +{ + struct inode *inode; + int ret; + + ret = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count); + if (ret) + return ERR_PTR(ret); + + inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb); + if (IS_ERR(inode)) + simple_release_fs(&vfio.vfs_mount, &vfio.fs_count); + + return inode; +} + /* * Initialize a vfio_device so it can be registered to vfio core. */ @@ -246,6 +282,11 @@ static int vfio_init_device(struct vfio_device *device, struct device *dev, init_completion(&device->comp); device->dev = dev; device->ops = ops; + device->inode = vfio_fs_inode_new(); + if (IS_ERR(device->inode)) { + ret = PTR_ERR(device->inode); + goto out_inode; + } if (ops->init) { ret = ops->init(device); @@ -260,6 +301,9 @@ static int vfio_init_device(struct vfio_device *device, struct device *dev, return 0; out_uninit: + iput(device->inode); + simple_release_fs(&vfio.vfs_mount, &vfio.fs_count); +out_inode: vfio_release_device_set(device); ida_free(&vfio.device_ida, device->index); return ret; diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 8b1a2982040914..000a6cab2d318a 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -64,6 +64,7 @@ struct vfio_device { struct completion comp; struct iommufd_access *iommufd_access; void (*put_kvm)(struct kvm *kvm); + struct inode *inode; #if IS_ENABLED(CONFIG_IOMMUFD) struct iommufd_device *iommufd_device; u8 iommufd_attached:1; From aac6db75a9fc2c7a6f73e152df8f15101dda38e6 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 29 May 2024 22:52:31 -0600 Subject: [PATCH 0443/1578] vfio/pci: Use unmap_mapping_range() With the vfio device fd tied to the address space of the pseudo fs inode, we can use the mm to track all vmas that might be mmap'ing device BARs, which removes our vma_list and all the complicated lock ordering necessary to manually zap each related vma. Note that we can no longer store the pfn in vm_pgoff if we want to use unmap_mapping_range() to zap a selective portion of the device fd corresponding to BAR mappings. This also converts our mmap fault handler to use vmf_insert_pfn() because we no longer have a vma_list to avoid the concurrency problem with io_remap_pfn_range(). The goal is to eventually use the vm_ops huge_fault handler to avoid the additional faulting overhead, but vmf_insert_pfn_{pmd,pud}() need to learn about pfnmaps first. Also, Jason notes that a race exists between unmap_mapping_range() and the fops mmap callback if we were to call io_remap_pfn_range() to populate the vma on mmap. Specifically, mmap_region() does call_mmap() before it does vma_link_file() which gives a window where the vma is populated but invisible to unmap_mapping_range(). Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240530045236.1005864-3-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 264 +++++++------------------------ include/linux/vfio_pci_core.h | 2 - 2 files changed, 55 insertions(+), 211 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 80cae87fff36ee..db31c27bf78bce 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1610,100 +1610,20 @@ ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *bu } EXPORT_SYMBOL_GPL(vfio_pci_core_write); -/* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */ -static int vfio_pci_zap_and_vma_lock(struct vfio_pci_core_device *vdev, bool try) +static void vfio_pci_zap_bars(struct vfio_pci_core_device *vdev) { - struct vfio_pci_mmap_vma *mmap_vma, *tmp; + struct vfio_device *core_vdev = &vdev->vdev; + loff_t start = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR0_REGION_INDEX); + loff_t end = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_ROM_REGION_INDEX); + loff_t len = end - start; - /* - * Lock ordering: - * vma_lock is nested under mmap_lock for vm_ops callback paths. - * The memory_lock semaphore is used by both code paths calling - * into this function to zap vmas and the vm_ops.fault callback - * to protect the memory enable state of the device. - * - * When zapping vmas we need to maintain the mmap_lock => vma_lock - * ordering, which requires using vma_lock to walk vma_list to - * acquire an mm, then dropping vma_lock to get the mmap_lock and - * reacquiring vma_lock. This logic is derived from similar - * requirements in uverbs_user_mmap_disassociate(). - * - * mmap_lock must always be the top-level lock when it is taken. - * Therefore we can only hold the memory_lock write lock when - * vma_list is empty, as we'd need to take mmap_lock to clear - * entries. vma_list can only be guaranteed empty when holding - * vma_lock, thus memory_lock is nested under vma_lock. - * - * This enables the vm_ops.fault callback to acquire vma_lock, - * followed by memory_lock read lock, while already holding - * mmap_lock without risk of deadlock. - */ - while (1) { - struct mm_struct *mm = NULL; - - if (try) { - if (!mutex_trylock(&vdev->vma_lock)) - return 0; - } else { - mutex_lock(&vdev->vma_lock); - } - while (!list_empty(&vdev->vma_list)) { - mmap_vma = list_first_entry(&vdev->vma_list, - struct vfio_pci_mmap_vma, - vma_next); - mm = mmap_vma->vma->vm_mm; - if (mmget_not_zero(mm)) - break; - - list_del(&mmap_vma->vma_next); - kfree(mmap_vma); - mm = NULL; - } - if (!mm) - return 1; - mutex_unlock(&vdev->vma_lock); - - if (try) { - if (!mmap_read_trylock(mm)) { - mmput(mm); - return 0; - } - } else { - mmap_read_lock(mm); - } - if (try) { - if (!mutex_trylock(&vdev->vma_lock)) { - mmap_read_unlock(mm); - mmput(mm); - return 0; - } - } else { - mutex_lock(&vdev->vma_lock); - } - list_for_each_entry_safe(mmap_vma, tmp, - &vdev->vma_list, vma_next) { - struct vm_area_struct *vma = mmap_vma->vma; - - if (vma->vm_mm != mm) - continue; - - list_del(&mmap_vma->vma_next); - kfree(mmap_vma); - - zap_vma_ptes(vma, vma->vm_start, - vma->vm_end - vma->vm_start); - } - mutex_unlock(&vdev->vma_lock); - mmap_read_unlock(mm); - mmput(mm); - } + unmap_mapping_range(core_vdev->inode->i_mapping, start, len, true); } void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev) { - vfio_pci_zap_and_vma_lock(vdev, false); down_write(&vdev->memory_lock); - mutex_unlock(&vdev->vma_lock); + vfio_pci_zap_bars(vdev); } u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev) @@ -1725,99 +1645,41 @@ void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 c up_write(&vdev->memory_lock); } -/* Caller holds vma_lock */ -static int __vfio_pci_add_vma(struct vfio_pci_core_device *vdev, - struct vm_area_struct *vma) -{ - struct vfio_pci_mmap_vma *mmap_vma; - - mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL_ACCOUNT); - if (!mmap_vma) - return -ENOMEM; - - mmap_vma->vma = vma; - list_add(&mmap_vma->vma_next, &vdev->vma_list); - - return 0; -} - -/* - * Zap mmaps on open so that we can fault them in on access and therefore - * our vma_list only tracks mappings accessed since last zap. - */ -static void vfio_pci_mmap_open(struct vm_area_struct *vma) -{ - zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); -} - -static void vfio_pci_mmap_close(struct vm_area_struct *vma) +static unsigned long vma_to_pfn(struct vm_area_struct *vma) { struct vfio_pci_core_device *vdev = vma->vm_private_data; - struct vfio_pci_mmap_vma *mmap_vma; + int index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); + u64 pgoff; - mutex_lock(&vdev->vma_lock); - list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { - if (mmap_vma->vma == vma) { - list_del(&mmap_vma->vma_next); - kfree(mmap_vma); - break; - } - } - mutex_unlock(&vdev->vma_lock); + pgoff = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + + return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff; } static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct vfio_pci_core_device *vdev = vma->vm_private_data; - struct vfio_pci_mmap_vma *mmap_vma; - vm_fault_t ret = VM_FAULT_NOPAGE; + unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff; + vm_fault_t ret = VM_FAULT_SIGBUS; - mutex_lock(&vdev->vma_lock); - down_read(&vdev->memory_lock); + pfn = vma_to_pfn(vma); - /* - * Memory region cannot be accessed if the low power feature is engaged - * or memory access is disabled. - */ - if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) { - ret = VM_FAULT_SIGBUS; - goto up_out; - } + down_read(&vdev->memory_lock); - /* - * We populate the whole vma on fault, so we need to test whether - * the vma has already been mapped, such as for concurrent faults - * to the same vma. io_remap_pfn_range() will trigger a BUG_ON if - * we ask it to fill the same range again. - */ - list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { - if (mmap_vma->vma == vma) - goto up_out; - } + if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) + goto out_disabled; - if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, - vma->vm_end - vma->vm_start, - vma->vm_page_prot)) { - ret = VM_FAULT_SIGBUS; - zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); - goto up_out; - } + ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff); - if (__vfio_pci_add_vma(vdev, vma)) { - ret = VM_FAULT_OOM; - zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); - } - -up_out: +out_disabled: up_read(&vdev->memory_lock); - mutex_unlock(&vdev->vma_lock); + return ret; } static const struct vm_operations_struct vfio_pci_mmap_ops = { - .open = vfio_pci_mmap_open, - .close = vfio_pci_mmap_close, .fault = vfio_pci_mmap_fault, }; @@ -1880,11 +1742,12 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma vma->vm_private_data = vdev; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff; + vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); /* - * See remap_pfn_range(), called from vfio_pci_fault() but we can't - * change vm_flags within the fault handler. Set them now. + * Set vm_flags now, they should not be changed in the fault handler. + * We want the same flags and page protection (decrypted above) as + * io_remap_pfn_range() would set. * * VM_ALLOW_ANY_UNCACHED: The VMA flag is implemented for ARM64, * allowing KVM stage 2 device mapping attributes to use Normal-NC @@ -2202,8 +2065,6 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev) mutex_init(&vdev->ioeventfds_lock); INIT_LIST_HEAD(&vdev->dummy_resources_list); INIT_LIST_HEAD(&vdev->ioeventfds_list); - mutex_init(&vdev->vma_lock); - INIT_LIST_HEAD(&vdev->vma_list); INIT_LIST_HEAD(&vdev->sriov_pfs_item); init_rwsem(&vdev->memory_lock); xa_init(&vdev->ctx); @@ -2219,7 +2080,6 @@ void vfio_pci_core_release_dev(struct vfio_device *core_vdev) mutex_destroy(&vdev->igate); mutex_destroy(&vdev->ioeventfds_lock); - mutex_destroy(&vdev->vma_lock); kfree(vdev->region); kfree(vdev->pm_save); } @@ -2497,26 +2357,15 @@ static int vfio_pci_dev_set_pm_runtime_get(struct vfio_device_set *dev_set) return ret; } -/* - * We need to get memory_lock for each device, but devices can share mmap_lock, - * therefore we need to zap and hold the vma_lock for each device, and only then - * get each memory_lock. - */ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, struct vfio_pci_group_info *groups, struct iommufd_ctx *iommufd_ctx) { - struct vfio_pci_core_device *cur_mem; - struct vfio_pci_core_device *cur_vma; - struct vfio_pci_core_device *cur; + struct vfio_pci_core_device *vdev; struct pci_dev *pdev; - bool is_mem = true; int ret; mutex_lock(&dev_set->lock); - cur_mem = list_first_entry(&dev_set->device_list, - struct vfio_pci_core_device, - vdev.dev_set_list); pdev = vfio_pci_dev_set_resettable(dev_set); if (!pdev) { @@ -2533,7 +2382,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, if (ret) goto err_unlock; - list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) { + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) { bool owned; /* @@ -2557,38 +2406,38 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, * Otherwise, reset is not allowed. */ if (iommufd_ctx) { - int devid = vfio_iommufd_get_dev_id(&cur_vma->vdev, + int devid = vfio_iommufd_get_dev_id(&vdev->vdev, iommufd_ctx); owned = (devid > 0 || devid == -ENOENT); } else { - owned = vfio_dev_in_groups(&cur_vma->vdev, groups); + owned = vfio_dev_in_groups(&vdev->vdev, groups); } if (!owned) { ret = -EINVAL; - goto err_undo; + break; } /* - * Locking multiple devices is prone to deadlock, runaway and - * unwind if we hit contention. + * Take the memory write lock for each device and zap BAR + * mappings to prevent the user accessing the device while in + * reset. Locking multiple devices is prone to deadlock, + * runaway and unwind if we hit contention. */ - if (!vfio_pci_zap_and_vma_lock(cur_vma, true)) { + if (!down_write_trylock(&vdev->memory_lock)) { ret = -EBUSY; - goto err_undo; + break; } + + vfio_pci_zap_bars(vdev); } - cur_vma = NULL; - list_for_each_entry(cur_mem, &dev_set->device_list, vdev.dev_set_list) { - if (!down_write_trylock(&cur_mem->memory_lock)) { - ret = -EBUSY; - goto err_undo; - } - mutex_unlock(&cur_mem->vma_lock); + if (!list_entry_is_head(vdev, + &dev_set->device_list, vdev.dev_set_list)) { + vdev = list_prev_entry(vdev, vdev.dev_set_list); + goto err_undo; } - cur_mem = NULL; /* * The pci_reset_bus() will reset all the devices in the bus. @@ -2599,25 +2448,22 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, * cause the PCI config space reset without restoring the original * state (saved locally in 'vdev->pm_save'). */ - list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) - vfio_pci_set_power_state(cur, PCI_D0); + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) + vfio_pci_set_power_state(vdev, PCI_D0); ret = pci_reset_bus(pdev); + vdev = list_last_entry(&dev_set->device_list, + struct vfio_pci_core_device, vdev.dev_set_list); + err_undo: - list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { - if (cur == cur_mem) - is_mem = false; - if (cur == cur_vma) - break; - if (is_mem) - up_write(&cur->memory_lock); - else - mutex_unlock(&cur->vma_lock); - } + list_for_each_entry_from_reverse(vdev, &dev_set->device_list, + vdev.dev_set_list) + up_write(&vdev->memory_lock); + + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) + pm_runtime_put(&vdev->pdev->dev); - list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) - pm_runtime_put(&cur->pdev->dev); err_unlock: mutex_unlock(&dev_set->lock); return ret; diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index a2c8b8bba71195..f87067438ed48f 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -93,8 +93,6 @@ struct vfio_pci_core_device { struct list_head sriov_pfs_item; struct vfio_pci_core_device *sriov_pf_core_dev; struct notifier_block nb; - struct mutex vma_lock; - struct list_head vma_list; struct rw_semaphore memory_lock; }; From 62da3acd28955e7299babebdfcb14243b789e773 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 May 2024 15:32:18 -0700 Subject: [PATCH 0444/1578] selftests/bpf: fix inet_csk_accept prototype in test_sk_storage_tracing.c Recent kernel change ([0]) changed inet_csk_accept() prototype. Adapt progs/test_sk_storage_tracing.c to take that into account. [0] 92ef0fd55ac8 ("net: change proto and proto_ops accept type") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240528223218.3445297-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c b/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c index 02e718f06e0f5c..40531e56776e42 100644 --- a/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c +++ b/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c @@ -84,7 +84,7 @@ int BPF_PROG(trace_tcp_connect, struct sock *sk) } SEC("fexit/inet_csk_accept") -int BPF_PROG(inet_csk_accept, struct sock *sk, int flags, int *err, bool kern, +int BPF_PROG(inet_csk_accept, struct sock *sk, struct proto_accept_arg *arg, struct sock *accepted_sk) { set_task_info(accepted_sk); From aeb8fe0283d4d3b0f27a87c5f5c938e7324f7d8f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 31 May 2024 21:45:00 +0200 Subject: [PATCH 0445/1578] bpf: Fix bpf_session_cookie BTF_ID in special_kfunc_set list The bpf_session_cookie is unavailable for !CONFIG_FPROBE as reported by Sebastian [1]. To fix that we remove CONFIG_FPROBE ifdef for session kfuncs, which is fine, because there's filter for session programs. Then based on bpf_trace.o dependency: obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o we add bpf_session_cookie BTF_ID in special_kfunc_set list dependency on CONFIG_BPF_EVENTS. [1] https://lore.kernel.org/bpf/20240531071557.MvfIqkn7@linutronix.de/T/#m71c6d5ec71db2967288cb79acedc15cc5dbfeec5 Reported-by: Sebastian Andrzej Siewior Tested-by: Sebastian Andrzej Siewior Suggested-by: Alexei Starovoitov Fixes: 5c919acef8514 ("bpf: Add support for kprobe session cookie") Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20240531194500.2967187-1-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++++ kernel/trace/bpf_trace.c | 2 -- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 48f3a9acdef3d4..36ef8e96787ed5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11128,7 +11128,11 @@ BTF_ID(func, bpf_iter_css_task_new) #else BTF_ID_UNUSED #endif +#ifdef CONFIG_BPF_EVENTS BTF_ID(func, bpf_session_cookie) +#else +BTF_ID_UNUSED +#endif static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6249dac6170183..d1daeab1bbc141 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3517,7 +3517,6 @@ static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx) } #endif /* CONFIG_UPROBES */ -#ifdef CONFIG_FPROBE __bpf_kfunc_start_defs(); __bpf_kfunc bool bpf_session_is_return(void) @@ -3566,4 +3565,3 @@ static int __init bpf_kprobe_multi_kfuncs_init(void) } late_initcall(bpf_kprobe_multi_kfuncs_init); -#endif From 7d0b3953f6d832daec10a0d76e2d4db405768a8b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 29 May 2024 16:12:12 -0700 Subject: [PATCH 0446/1578] libbpf: don't close(-1) in multi-uprobe feature detector Guard close(link_fd) with extra link_fd >= 0 check to prevent close(-1). Detected by Coverity static analysis. Fixes: 04d939a2ab22 ("libbpf: detect broken PID filtering logic for multi-uprobe") Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20240529231212.768828-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/features.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/features.c b/tools/lib/bpf/features.c index 3df0125ed5fa7a..50befe125ddc55 100644 --- a/tools/lib/bpf/features.c +++ b/tools/lib/bpf/features.c @@ -393,7 +393,8 @@ static int probe_uprobe_multi_link(int token_fd) err = -errno; /* close() can clobber errno */ if (link_fd >= 0 || err != -EBADF) { - close(link_fd); + if (link_fd >= 0) + close(link_fd); close(prog_fd); return 0; } From 955af6355ddfe35140f9706a635838212a32513b Mon Sep 17 00:00:00 2001 From: Tobias Jakobi Date: Fri, 31 May 2024 15:43:07 -0700 Subject: [PATCH 0447/1578] Input: i8042 - add Ayaneo Kun to i8042 quirk table See the added comment for details. Also fix a typo in the quirk's define. Signed-off-by: Tobias Jakobi Link: https://lore.kernel.org/r/20240531190100.3874731-1-tjakobi@math.uni-bielefeld.de Signed-off-by: Dmitry Torokhov --- drivers/input/serio/i8042-acpipnpio.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/input/serio/i8042-acpipnpio.h b/drivers/input/serio/i8042-acpipnpio.h index dfc6c581873b7d..5b50475ec41402 100644 --- a/drivers/input/serio/i8042-acpipnpio.h +++ b/drivers/input/serio/i8042-acpipnpio.h @@ -76,7 +76,7 @@ static inline void i8042_write_command(int val) #define SERIO_QUIRK_PROBE_DEFER BIT(5) #define SERIO_QUIRK_RESET_ALWAYS BIT(6) #define SERIO_QUIRK_RESET_NEVER BIT(7) -#define SERIO_QUIRK_DIECT BIT(8) +#define SERIO_QUIRK_DIRECT BIT(8) #define SERIO_QUIRK_DUMBKBD BIT(9) #define SERIO_QUIRK_NOLOOP BIT(10) #define SERIO_QUIRK_NOTIMEOUT BIT(11) @@ -1332,6 +1332,20 @@ static const struct dmi_system_id i8042_dmi_quirk_table[] __initconst = { .driver_data = (void *)(SERIO_QUIRK_NOMUX | SERIO_QUIRK_RESET_ALWAYS | SERIO_QUIRK_NOLOOP | SERIO_QUIRK_NOPNP) }, + { + /* + * The Ayaneo Kun is a handheld device where some the buttons + * are handled by an AT keyboard. The keyboard is usually + * detected as raw, but sometimes, usually after a cold boot, + * it is detected as translated. Make sure that the keyboard + * is always in raw mode. + */ + .matches = { + DMI_EXACT_MATCH(DMI_BOARD_VENDOR, "AYANEO"), + DMI_MATCH(DMI_BOARD_NAME, "KUN"), + }, + .driver_data = (void *)(SERIO_QUIRK_DIRECT) + }, { } }; @@ -1655,7 +1669,7 @@ static void __init i8042_check_quirks(void) if (quirks & SERIO_QUIRK_RESET_NEVER) i8042_reset = I8042_RESET_NEVER; } - if (quirks & SERIO_QUIRK_DIECT) + if (quirks & SERIO_QUIRK_DIRECT) i8042_direct = true; if (quirks & SERIO_QUIRK_DUMBKBD) i8042_dumbkbd = true; From 7bc4244c882a7d7d79f4afefc50893244eb11d07 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 1 Jun 2024 07:28:21 +0200 Subject: [PATCH 0448/1578] Revert "VT: Use macros to define ioctls" This reverts commit 8c467f3300591a206fa8dcc6988d768910799872. Turns out this breaks many architectures as the vt ioctls do not all match up everywhere due to historical reasons, so the original commit is invalid for many values. Reported-by: Nick Bowler Reported-by: Arnd Bergmann Reported-by: Jiri Slaby Reported-by: Christian Zigotzky Reported-by: Michael Ellerman Cc: Al Viro Cc: Alexey Gladkov Link: https://lore.kernel.org/r/ad4e561c-1d49-4f25-882c-7a36c6b1b5c0@draconx.ca Link: https://lore.kernel.org/r/0da9785e-ba44-4718-9d08-4e96c1ba7ab2@kernel.org Link: https://lore.kernel.org/all/34d848f4-670b-4493-bf21-130ef862521b@xenosoft.de/ Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/kd.h | 96 ++++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 49 deletions(-) diff --git a/include/uapi/linux/kd.h b/include/uapi/linux/kd.h index 8ddb2219a84bdf..6b384065c01356 100644 --- a/include/uapi/linux/kd.h +++ b/include/uapi/linux/kd.h @@ -5,61 +5,60 @@ #include /* 0x4B is 'K', to avoid collision with termios and vt */ -#define KD_IOCTL_BASE 'K' -#define GIO_FONT _IO(KD_IOCTL_BASE, 0x60) /* gets font in expanded form */ -#define PIO_FONT _IO(KD_IOCTL_BASE, 0x61) /* use font in expanded form */ +#define GIO_FONT 0x4B60 /* gets font in expanded form */ +#define PIO_FONT 0x4B61 /* use font in expanded form */ -#define GIO_FONTX _IO(KD_IOCTL_BASE, 0x6B) /* get font using struct consolefontdesc */ -#define PIO_FONTX _IO(KD_IOCTL_BASE, 0x6C) /* set font using struct consolefontdesc */ +#define GIO_FONTX 0x4B6B /* get font using struct consolefontdesc */ +#define PIO_FONTX 0x4B6C /* set font using struct consolefontdesc */ struct consolefontdesc { unsigned short charcount; /* characters in font (256 or 512) */ unsigned short charheight; /* scan lines per character (1-32) */ char __user *chardata; /* font data in expanded form */ }; -#define PIO_FONTRESET _IO(KD_IOCTL_BASE, 0x6D) /* reset to default font */ +#define PIO_FONTRESET 0x4B6D /* reset to default font */ -#define GIO_CMAP _IO(KD_IOCTL_BASE, 0x70) /* gets colour palette on VGA+ */ -#define PIO_CMAP _IO(KD_IOCTL_BASE, 0x71) /* sets colour palette on VGA+ */ +#define GIO_CMAP 0x4B70 /* gets colour palette on VGA+ */ +#define PIO_CMAP 0x4B71 /* sets colour palette on VGA+ */ -#define KIOCSOUND _IO(KD_IOCTL_BASE, 0x2F) /* start sound generation (0 for off) */ -#define KDMKTONE _IO(KD_IOCTL_BASE, 0x30) /* generate tone */ +#define KIOCSOUND 0x4B2F /* start sound generation (0 for off) */ +#define KDMKTONE 0x4B30 /* generate tone */ -#define KDGETLED _IO(KD_IOCTL_BASE, 0x31) /* return current led state */ -#define KDSETLED _IO(KD_IOCTL_BASE, 0x32) /* set led state [lights, not flags] */ +#define KDGETLED 0x4B31 /* return current led state */ +#define KDSETLED 0x4B32 /* set led state [lights, not flags] */ #define LED_SCR 0x01 /* scroll lock led */ #define LED_NUM 0x02 /* num lock led */ #define LED_CAP 0x04 /* caps lock led */ -#define KDGKBTYPE _IO(KD_IOCTL_BASE, 0x33) /* get keyboard type */ +#define KDGKBTYPE 0x4B33 /* get keyboard type */ #define KB_84 0x01 #define KB_101 0x02 /* this is what we always answer */ #define KB_OTHER 0x03 -#define KDADDIO _IO(KD_IOCTL_BASE, 0x34) /* add i/o port as valid */ -#define KDDELIO _IO(KD_IOCTL_BASE, 0x35) /* del i/o port as valid */ -#define KDENABIO _IO(KD_IOCTL_BASE, 0x36) /* enable i/o to video board */ -#define KDDISABIO _IO(KD_IOCTL_BASE, 0x37) /* disable i/o to video board */ +#define KDADDIO 0x4B34 /* add i/o port as valid */ +#define KDDELIO 0x4B35 /* del i/o port as valid */ +#define KDENABIO 0x4B36 /* enable i/o to video board */ +#define KDDISABIO 0x4B37 /* disable i/o to video board */ -#define KDSETMODE _IO(KD_IOCTL_BASE, 0x3A) /* set text/graphics mode */ +#define KDSETMODE 0x4B3A /* set text/graphics mode */ #define KD_TEXT 0x00 #define KD_GRAPHICS 0x01 #define KD_TEXT0 0x02 /* obsolete */ #define KD_TEXT1 0x03 /* obsolete */ -#define KDGETMODE _IO(KD_IOCTL_BASE, 0x3B) /* get current mode */ +#define KDGETMODE 0x4B3B /* get current mode */ -#define KDMAPDISP _IO(KD_IOCTL_BASE, 0x3C) /* map display into address space */ -#define KDUNMAPDISP _IO(KD_IOCTL_BASE, 0x3D) /* unmap display from address space */ +#define KDMAPDISP 0x4B3C /* map display into address space */ +#define KDUNMAPDISP 0x4B3D /* unmap display from address space */ typedef char scrnmap_t; #define E_TABSZ 256 -#define GIO_SCRNMAP _IO(KD_IOCTL_BASE, 0x40) /* get screen mapping from kernel */ -#define PIO_SCRNMAP _IO(KD_IOCTL_BASE, 0x41) /* put screen mapping table in kernel */ -#define GIO_UNISCRNMAP _IO(KD_IOCTL_BASE, 0x69) /* get full Unicode screen mapping */ -#define PIO_UNISCRNMAP _IO(KD_IOCTL_BASE, 0x6A) /* set full Unicode screen mapping */ +#define GIO_SCRNMAP 0x4B40 /* get screen mapping from kernel */ +#define PIO_SCRNMAP 0x4B41 /* put screen mapping table in kernel */ +#define GIO_UNISCRNMAP 0x4B69 /* get full Unicode screen mapping */ +#define PIO_UNISCRNMAP 0x4B6A /* set full Unicode screen mapping */ -#define GIO_UNIMAP _IO(KD_IOCTL_BASE, 0x66) /* get unicode-to-font mapping from kernel */ +#define GIO_UNIMAP 0x4B66 /* get unicode-to-font mapping from kernel */ struct unipair { unsigned short unicode; unsigned short fontpos; @@ -68,8 +67,8 @@ struct unimapdesc { unsigned short entry_ct; struct unipair __user *entries; }; -#define PIO_UNIMAP _IO(KD_IOCTL_BASE, 0x67) /* put unicode-to-font mapping in kernel */ -#define PIO_UNIMAPCLR _IO(KD_IOCTL_BASE, 0x68) /* clear table, possibly advise hash algorithm */ +#define PIO_UNIMAP 0x4B67 /* put unicode-to-font mapping in kernel */ +#define PIO_UNIMAPCLR 0x4B68 /* clear table, possibly advise hash algorithm */ struct unimapinit { unsigned short advised_hashsize; /* 0 if no opinion */ unsigned short advised_hashstep; /* 0 if no opinion */ @@ -84,19 +83,19 @@ struct unimapinit { #define K_MEDIUMRAW 0x02 #define K_UNICODE 0x03 #define K_OFF 0x04 -#define KDGKBMODE _IO(KD_IOCTL_BASE, 0x44) /* gets current keyboard mode */ -#define KDSKBMODE _IO(KD_IOCTL_BASE, 0x45) /* sets current keyboard mode */ +#define KDGKBMODE 0x4B44 /* gets current keyboard mode */ +#define KDSKBMODE 0x4B45 /* sets current keyboard mode */ #define K_METABIT 0x03 #define K_ESCPREFIX 0x04 -#define KDGKBMETA _IO(KD_IOCTL_BASE, 0x62) /* gets meta key handling mode */ -#define KDSKBMETA _IO(KD_IOCTL_BASE, 0x63) /* sets meta key handling mode */ +#define KDGKBMETA 0x4B62 /* gets meta key handling mode */ +#define KDSKBMETA 0x4B63 /* sets meta key handling mode */ #define K_SCROLLLOCK 0x01 #define K_NUMLOCK 0x02 #define K_CAPSLOCK 0x04 -#define KDGKBLED _IO(KD_IOCTL_BASE, 0x64) /* get led flags (not lights) */ -#define KDSKBLED _IO(KD_IOCTL_BASE, 0x65) /* set led flags (not lights) */ +#define KDGKBLED 0x4B64 /* get led flags (not lights) */ +#define KDSKBLED 0x4B65 /* set led flags (not lights) */ struct kbentry { unsigned char kb_table; @@ -108,15 +107,15 @@ struct kbentry { #define K_ALTTAB 0x02 #define K_ALTSHIFTTAB 0x03 -#define KDGKBENT _IO(KD_IOCTL_BASE, 0x46) /* gets one entry in translation table */ -#define KDSKBENT _IO(KD_IOCTL_BASE, 0x47) /* sets one entry in translation table */ +#define KDGKBENT 0x4B46 /* gets one entry in translation table */ +#define KDSKBENT 0x4B47 /* sets one entry in translation table */ struct kbsentry { unsigned char kb_func; unsigned char kb_string[512]; }; -#define KDGKBSENT _IO(KD_IOCTL_BASE, 0x48) /* gets one function key string entry */ -#define KDSKBSENT _IO(KD_IOCTL_BASE, 0x49) /* sets one function key string entry */ +#define KDGKBSENT 0x4B48 /* gets one function key string entry */ +#define KDSKBSENT 0x4B49 /* sets one function key string entry */ struct kbdiacr { unsigned char diacr, base, result; @@ -125,8 +124,8 @@ struct kbdiacrs { unsigned int kb_cnt; /* number of entries in following array */ struct kbdiacr kbdiacr[256]; /* MAX_DIACR from keyboard.h */ }; -#define KDGKBDIACR _IO(KD_IOCTL_BASE, 0x4A) /* read kernel accent table */ -#define KDSKBDIACR _IO(KD_IOCTL_BASE, 0x4B) /* write kernel accent table */ +#define KDGKBDIACR 0x4B4A /* read kernel accent table */ +#define KDSKBDIACR 0x4B4B /* write kernel accent table */ struct kbdiacruc { unsigned int diacr, base, result; @@ -135,16 +134,16 @@ struct kbdiacrsuc { unsigned int kb_cnt; /* number of entries in following array */ struct kbdiacruc kbdiacruc[256]; /* MAX_DIACR from keyboard.h */ }; -#define KDGKBDIACRUC _IO(KD_IOCTL_BASE, 0xFA) /* read kernel accent table - UCS */ -#define KDSKBDIACRUC _IO(KD_IOCTL_BASE, 0xFB) /* write kernel accent table - UCS */ +#define KDGKBDIACRUC 0x4BFA /* read kernel accent table - UCS */ +#define KDSKBDIACRUC 0x4BFB /* write kernel accent table - UCS */ struct kbkeycode { unsigned int scancode, keycode; }; -#define KDGETKEYCODE _IO(KD_IOCTL_BASE, 0x4C) /* read kernel keycode table entry */ -#define KDSETKEYCODE _IO(KD_IOCTL_BASE, 0x4D) /* write kernel keycode table entry */ +#define KDGETKEYCODE 0x4B4C /* read kernel keycode table entry */ +#define KDSETKEYCODE 0x4B4D /* write kernel keycode table entry */ -#define KDSIGACCEPT _IO(KD_IOCTL_BASE, 0x4E) /* accept kbd generated signals */ +#define KDSIGACCEPT 0x4B4E /* accept kbd generated signals */ struct kbd_repeat { int delay; /* in msec; <= 0: don't change */ @@ -152,11 +151,10 @@ struct kbd_repeat { /* earlier this field was misnamed "rate" */ }; -#define KDKBDREP _IO(KD_IOCTL_BASE, 0x52) /* set keyboard delay/repeat rate; - * actually used values are returned - */ +#define KDKBDREP 0x4B52 /* set keyboard delay/repeat rate; + * actually used values are returned */ -#define KDFONTOP _IO(KD_IOCTL_BASE, 0x72) /* font operations */ +#define KDFONTOP 0x4B72 /* font operations */ struct console_font_op { unsigned int op; /* operation code KD_FONT_OP_* */ From ebfb5e8fc8b45040b979f4bf0012a01d0abac8d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9?= Date: Tue, 28 May 2024 16:20:28 +0200 Subject: [PATCH 0449/1578] Revert "wifi: wilc1000: convert list management to RCU" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit f236464f1db7bea80075e6e31ac70dc6eb80547f Commit f236464f1db7 ("wifi: wilc1000: convert list management to RCU") replaced SRCU with RCU, aiming to simplify RCU usage in the driver. No documentation or commit history hinted about why SRCU has been preferred in original design, so it has been assumed to be safe to do this conversion. Unfortunately, some static analyzers raised warnings, confirmed by runtime checker, not long after the merge. At least three different issues arose when switching to RCU: - wilc_wlan_txq_filter_dup_tcp_ack is executed in a RCU read critical section yet calls wait_for_completion_timeout - wilc_wfi_init_mon_interface calls kmalloc and register_netdevice while manipulating a vif retrieved from vif list - set_channel sends command to chip (and so, also waits for a completion) while holding a vif retrieved from vif list (so, in RCU read critical section) Some of those issues are not trivial to fix and would need bigger driver rework. Fix those issues by reverting the SRCU to RCU conversion commit Reported-by: Dan Carpenter Closes: https://lore.kernel.org/linux-wireless/3b46ec7c-baee-49fd-b760-3bc12fb12eaf@moroto.mountain/ Fixes: f236464f1db7 ("wifi: wilc1000: convert list management to RCU") Signed-off-by: Alexis Lothoré Signed-off-by: Kalle Valo Link: https://msgid.link/20240528-wilc_revert_srcu_to_rcu-v1-1-bce096e0798c@bootlin.com --- .../wireless/microchip/wilc1000/cfg80211.c | 41 ++++++++++-------- drivers/net/wireless/microchip/wilc1000/hif.c | 15 ++++--- .../net/wireless/microchip/wilc1000/netdev.c | 43 +++++++++++-------- .../net/wireless/microchip/wilc1000/netdev.h | 5 ++- .../net/wireless/microchip/wilc1000/wlan.c | 5 ++- 5 files changed, 64 insertions(+), 45 deletions(-) diff --git a/drivers/net/wireless/microchip/wilc1000/cfg80211.c b/drivers/net/wireless/microchip/wilc1000/cfg80211.c index 7d9fb9f2d52799..089102ed9ae51b 100644 --- a/drivers/net/wireless/microchip/wilc1000/cfg80211.c +++ b/drivers/net/wireless/microchip/wilc1000/cfg80211.c @@ -237,11 +237,12 @@ static int set_channel(struct wiphy *wiphy, struct wilc_vif *vif; u32 channelnum; int result; + int srcu_idx; - rcu_read_lock(); + srcu_idx = srcu_read_lock(&wl->srcu); vif = wilc_get_wl_to_vif(wl); if (IS_ERR(vif)) { - rcu_read_unlock(); + srcu_read_unlock(&wl->srcu, srcu_idx); return PTR_ERR(vif); } @@ -252,7 +253,7 @@ static int set_channel(struct wiphy *wiphy, if (result) netdev_err(vif->ndev, "Error in setting channel\n"); - rcu_read_unlock(); + srcu_read_unlock(&wl->srcu, srcu_idx); return result; } @@ -805,8 +806,9 @@ static int set_wiphy_params(struct wiphy *wiphy, u32 changed) struct wilc *wl = wiphy_priv(wiphy); struct wilc_vif *vif; struct wilc_priv *priv; + int srcu_idx; - rcu_read_lock(); + srcu_idx = srcu_read_lock(&wl->srcu); vif = wilc_get_wl_to_vif(wl); if (IS_ERR(vif)) goto out; @@ -861,7 +863,7 @@ static int set_wiphy_params(struct wiphy *wiphy, u32 changed) netdev_err(priv->dev, "Error in setting WIPHY PARAMS\n"); out: - rcu_read_unlock(); + srcu_read_unlock(&wl->srcu, srcu_idx); return ret; } @@ -1537,19 +1539,20 @@ static struct wireless_dev *add_virtual_intf(struct wiphy *wiphy, if (type == NL80211_IFTYPE_MONITOR) { struct net_device *ndev; + int srcu_idx; - rcu_read_lock(); + srcu_idx = srcu_read_lock(&wl->srcu); vif = wilc_get_vif_from_type(wl, WILC_AP_MODE); if (!vif) { vif = wilc_get_vif_from_type(wl, WILC_GO_MODE); if (!vif) { - rcu_read_unlock(); + srcu_read_unlock(&wl->srcu, srcu_idx); goto validate_interface; } } if (vif->monitor_flag) { - rcu_read_unlock(); + srcu_read_unlock(&wl->srcu, srcu_idx); goto validate_interface; } @@ -1557,12 +1560,12 @@ static struct wireless_dev *add_virtual_intf(struct wiphy *wiphy, if (ndev) { vif->monitor_flag = 1; } else { - rcu_read_unlock(); + srcu_read_unlock(&wl->srcu, srcu_idx); return ERR_PTR(-EINVAL); } wdev = &vif->priv.wdev; - rcu_read_unlock(); + srcu_read_unlock(&wl->srcu, srcu_idx); return wdev; } @@ -1610,7 +1613,7 @@ static int del_virtual_intf(struct wiphy *wiphy, struct wireless_dev *wdev) list_del_rcu(&vif->list); wl->vif_num--; mutex_unlock(&wl->vif_mutex); - synchronize_rcu(); + synchronize_srcu(&wl->srcu); return 0; } @@ -1635,23 +1638,25 @@ static void wilc_set_wakeup(struct wiphy *wiphy, bool enabled) { struct wilc *wl = wiphy_priv(wiphy); struct wilc_vif *vif; + int srcu_idx; - rcu_read_lock(); + srcu_idx = srcu_read_lock(&wl->srcu); vif = wilc_get_wl_to_vif(wl); if (IS_ERR(vif)) { - rcu_read_unlock(); + srcu_read_unlock(&wl->srcu, srcu_idx); return; } netdev_info(vif->ndev, "cfg set wake up = %d\n", enabled); wilc_set_wowlan_trigger(vif, enabled); - rcu_read_unlock(); + srcu_read_unlock(&wl->srcu, srcu_idx); } static int set_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, enum nl80211_tx_power_setting type, int mbm) { int ret; + int srcu_idx; s32 tx_power = MBM_TO_DBM(mbm); struct wilc *wl = wiphy_priv(wiphy); struct wilc_vif *vif; @@ -1659,10 +1664,10 @@ static int set_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, if (!wl->initialized) return -EIO; - rcu_read_lock(); + srcu_idx = srcu_read_lock(&wl->srcu); vif = wilc_get_wl_to_vif(wl); if (IS_ERR(vif)) { - rcu_read_unlock(); + srcu_read_unlock(&wl->srcu, srcu_idx); return -EINVAL; } @@ -1674,7 +1679,7 @@ static int set_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, ret = wilc_set_tx_power(vif, tx_power); if (ret) netdev_err(vif->ndev, "Failed to set tx power\n"); - rcu_read_unlock(); + srcu_read_unlock(&wl->srcu, srcu_idx); return ret; } @@ -1757,6 +1762,7 @@ static void wlan_init_locks(struct wilc *wl) init_completion(&wl->cfg_event); init_completion(&wl->sync_event); init_completion(&wl->txq_thread_started); + init_srcu_struct(&wl->srcu); } void wlan_deinit_locks(struct wilc *wilc) @@ -1767,6 +1773,7 @@ void wlan_deinit_locks(struct wilc *wilc) mutex_destroy(&wilc->txq_add_to_head_cs); mutex_destroy(&wilc->vif_mutex); mutex_destroy(&wilc->deinit_lock); + cleanup_srcu_struct(&wilc->srcu); } int wilc_cfg80211_init(struct wilc **wilc, struct device *dev, int io_type, diff --git a/drivers/net/wireless/microchip/wilc1000/hif.c b/drivers/net/wireless/microchip/wilc1000/hif.c index 919de6ffb8217c..3925ca653e807d 100644 --- a/drivers/net/wireless/microchip/wilc1000/hif.c +++ b/drivers/net/wireless/microchip/wilc1000/hif.c @@ -1570,11 +1570,12 @@ void wilc_network_info_received(struct wilc *wilc, u8 *buffer, u32 length) struct host_if_drv *hif_drv; struct host_if_msg *msg; struct wilc_vif *vif; + int srcu_idx; int result; int id; id = get_unaligned_le32(&buffer[length - 4]); - rcu_read_lock(); + srcu_idx = srcu_read_lock(&wilc->srcu); vif = wilc_get_vif_from_idx(wilc, id); if (!vif) goto out; @@ -1606,7 +1607,7 @@ void wilc_network_info_received(struct wilc *wilc, u8 *buffer, u32 length) kfree(msg); } out: - rcu_read_unlock(); + srcu_read_unlock(&wilc->srcu, srcu_idx); } void wilc_gnrl_async_info_received(struct wilc *wilc, u8 *buffer, u32 length) @@ -1614,13 +1615,14 @@ void wilc_gnrl_async_info_received(struct wilc *wilc, u8 *buffer, u32 length) struct host_if_drv *hif_drv; struct host_if_msg *msg; struct wilc_vif *vif; + int srcu_idx; int result; int id; mutex_lock(&wilc->deinit_lock); id = get_unaligned_le32(&buffer[length - 4]); - rcu_read_lock(); + srcu_idx = srcu_read_lock(&wilc->srcu); vif = wilc_get_vif_from_idx(wilc, id); if (!vif) goto out; @@ -1647,7 +1649,7 @@ void wilc_gnrl_async_info_received(struct wilc *wilc, u8 *buffer, u32 length) kfree(msg); } out: - rcu_read_unlock(); + srcu_read_unlock(&wilc->srcu, srcu_idx); mutex_unlock(&wilc->deinit_lock); } @@ -1655,11 +1657,12 @@ void wilc_scan_complete_received(struct wilc *wilc, u8 *buffer, u32 length) { struct host_if_drv *hif_drv; struct wilc_vif *vif; + int srcu_idx; int result; int id; id = get_unaligned_le32(&buffer[length - 4]); - rcu_read_lock(); + srcu_idx = srcu_read_lock(&wilc->srcu); vif = wilc_get_vif_from_idx(wilc, id); if (!vif) goto out; @@ -1684,7 +1687,7 @@ void wilc_scan_complete_received(struct wilc *wilc, u8 *buffer, u32 length) } } out: - rcu_read_unlock(); + srcu_read_unlock(&wilc->srcu, srcu_idx); } int wilc_remain_on_channel(struct wilc_vif *vif, u64 cookie, u16 chan, diff --git a/drivers/net/wireless/microchip/wilc1000/netdev.c b/drivers/net/wireless/microchip/wilc1000/netdev.c index 73f56f7b002bf3..710e29bea56058 100644 --- a/drivers/net/wireless/microchip/wilc1000/netdev.c +++ b/drivers/net/wireless/microchip/wilc1000/netdev.c @@ -127,28 +127,30 @@ void wilc_wlan_set_bssid(struct net_device *wilc_netdev, const u8 *bssid, int wilc_wlan_get_num_conn_ifcs(struct wilc *wilc) { + int srcu_idx; u8 ret_val = 0; struct wilc_vif *vif; - rcu_read_lock(); + srcu_idx = srcu_read_lock(&wilc->srcu); wilc_for_each_vif(wilc, vif) { if (!is_zero_ether_addr(vif->bssid)) ret_val++; } - rcu_read_unlock(); + srcu_read_unlock(&wilc->srcu, srcu_idx); return ret_val; } static void wilc_wake_tx_queues(struct wilc *wl) { + int srcu_idx; struct wilc_vif *ifc; - rcu_read_lock(); + srcu_idx = srcu_read_lock(&wl->srcu); wilc_for_each_vif(wl, ifc) { if (ifc->mac_opened && netif_queue_stopped(ifc->ndev)) netif_wake_queue(ifc->ndev); } - rcu_read_unlock(); + srcu_read_unlock(&wl->srcu, srcu_idx); } static int wilc_txq_task(void *vp) @@ -653,6 +655,7 @@ static int wilc_set_mac_addr(struct net_device *dev, void *p) struct sockaddr *addr = (struct sockaddr *)p; unsigned char mac_addr[ETH_ALEN]; struct wilc_vif *tmp_vif; + int srcu_idx; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; @@ -664,19 +667,19 @@ static int wilc_set_mac_addr(struct net_device *dev, void *p) /* Verify MAC Address is not already in use: */ - rcu_read_lock(); + srcu_idx = srcu_read_lock(&wilc->srcu); wilc_for_each_vif(wilc, tmp_vif) { wilc_get_mac_address(tmp_vif, mac_addr); if (ether_addr_equal(addr->sa_data, mac_addr)) { if (vif != tmp_vif) { - rcu_read_unlock(); + srcu_read_unlock(&wilc->srcu, srcu_idx); return -EADDRNOTAVAIL; } - rcu_read_unlock(); + srcu_read_unlock(&wilc->srcu, srcu_idx); return 0; } } - rcu_read_unlock(); + srcu_read_unlock(&wilc->srcu, srcu_idx); result = wilc_set_mac_address(vif, (u8 *)addr->sa_data); if (result) @@ -764,14 +767,15 @@ netdev_tx_t wilc_mac_xmit(struct sk_buff *skb, struct net_device *ndev) wilc_tx_complete); if (queue_count > FLOW_CONTROL_UPPER_THRESHOLD) { + int srcu_idx; struct wilc_vif *vif; - rcu_read_lock(); + srcu_idx = srcu_read_lock(&wilc->srcu); wilc_for_each_vif(wilc, vif) { if (vif->mac_opened) netif_stop_queue(vif->ndev); } - rcu_read_unlock(); + srcu_read_unlock(&wilc->srcu, srcu_idx); } return NETDEV_TX_OK; @@ -815,12 +819,13 @@ void wilc_frmw_to_host(struct wilc *wilc, u8 *buff, u32 size, unsigned int frame_len = 0; struct wilc_vif *vif; struct sk_buff *skb; + int srcu_idx; int stats; if (!wilc) return; - rcu_read_lock(); + srcu_idx = srcu_read_lock(&wilc->srcu); wilc_netdev = get_if_handler(wilc, buff); if (!wilc_netdev) goto out; @@ -848,14 +853,15 @@ void wilc_frmw_to_host(struct wilc *wilc, u8 *buff, u32 size, netdev_dbg(wilc_netdev, "netif_rx ret value is: %d\n", stats); } out: - rcu_read_unlock(); + srcu_read_unlock(&wilc->srcu, srcu_idx); } void wilc_wfi_mgmt_rx(struct wilc *wilc, u8 *buff, u32 size, bool is_auth) { + int srcu_idx; struct wilc_vif *vif; - rcu_read_lock(); + srcu_idx = srcu_read_lock(&wilc->srcu); wilc_for_each_vif(wilc, vif) { struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buff; u16 type = le16_to_cpup((__le16 *)buff); @@ -876,7 +882,7 @@ void wilc_wfi_mgmt_rx(struct wilc *wilc, u8 *buff, u32 size, bool is_auth) if (vif->monitor_flag) wilc_wfi_monitor_rx(wilc->monitor_dev, buff, size); } - rcu_read_unlock(); + srcu_read_unlock(&wilc->srcu, srcu_idx); } static const struct net_device_ops wilc_netdev_ops = { @@ -906,7 +912,7 @@ void wilc_netdev_cleanup(struct wilc *wilc) list_del_rcu(&vif->list); wilc->vif_num--; mutex_unlock(&wilc->vif_mutex); - synchronize_rcu(); + synchronize_srcu(&wilc->srcu); if (vif->ndev) unregister_netdev(vif->ndev); } @@ -925,15 +931,16 @@ static u8 wilc_get_available_idx(struct wilc *wl) { int idx = 0; struct wilc_vif *vif; + int srcu_idx; - rcu_read_lock(); + srcu_idx = srcu_read_lock(&wl->srcu); wilc_for_each_vif(wl, vif) { if (vif->idx == 0) idx = 1; else idx = 0; } - rcu_read_unlock(); + srcu_read_unlock(&wl->srcu, srcu_idx); return idx; } @@ -983,7 +990,7 @@ struct wilc_vif *wilc_netdev_ifc_init(struct wilc *wl, const char *name, list_add_tail_rcu(&vif->list, &wl->vif_list); wl->vif_num += 1; mutex_unlock(&wl->vif_mutex); - synchronize_rcu(); + synchronize_srcu(&wl->srcu); return vif; diff --git a/drivers/net/wireless/microchip/wilc1000/netdev.h b/drivers/net/wireless/microchip/wilc1000/netdev.h index eecee3973d6a42..5937d6d45695f4 100644 --- a/drivers/net/wireless/microchip/wilc1000/netdev.h +++ b/drivers/net/wireless/microchip/wilc1000/netdev.h @@ -32,8 +32,8 @@ #define wilc_for_each_vif(w, v) \ struct wilc *_w = w; \ - list_for_each_entry_rcu(v, &_w->vif_list, list, \ - rcu_read_lock_held()) + list_for_each_entry_srcu(v, &_w->vif_list, list, \ + srcu_read_lock_held(&_w->srcu)) struct wilc_wfi_stats { unsigned long rx_packets; @@ -220,6 +220,7 @@ struct wilc { /* protect vif list */ struct mutex vif_mutex; + struct srcu_struct srcu; u8 open_ifcs; /* protect head of transmit queue */ diff --git a/drivers/net/wireless/microchip/wilc1000/wlan.c b/drivers/net/wireless/microchip/wilc1000/wlan.c index 37c32d17856ea7..a9e872a7b2c38b 100644 --- a/drivers/net/wireless/microchip/wilc1000/wlan.c +++ b/drivers/net/wireless/microchip/wilc1000/wlan.c @@ -712,6 +712,7 @@ int wilc_wlan_handle_txq(struct wilc *wilc, u32 *txq_count) u32 *vmm_table = wilc->vmm_table; u8 ac_pkt_num_to_chip[NQUEUES] = {0, 0, 0, 0}; const struct wilc_hif_func *func; + int srcu_idx; u8 *txb = wilc->tx_buffer; struct wilc_vif *vif; @@ -723,10 +724,10 @@ int wilc_wlan_handle_txq(struct wilc *wilc, u32 *txq_count) mutex_lock(&wilc->txq_add_to_head_cs); - rcu_read_lock(); + srcu_idx = srcu_read_lock(&wilc->srcu); wilc_for_each_vif(wilc, vif) wilc_wlan_txq_filter_dup_tcp_ack(vif->ndev); - rcu_read_unlock(); + srcu_read_unlock(&wilc->srcu, srcu_idx); for (ac = 0; ac < NQUEUES; ac++) tqe_q[ac] = wilc_wlan_txq_get_first(wilc, ac); From 3596717a6fbd54c64e97e085a9f77ed511ff59f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9?= Date: Tue, 28 May 2024 16:20:29 +0200 Subject: [PATCH 0450/1578] Revert "wifi: wilc1000: set atomic flag on kmemdup in srcu critical section" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 35aee01ff43d7eb6c2caa0b94e7cc6c45baeeab7 Commit 35aee01ff43d ("wifi: wilc1000: set atomic flag on kmemdup in srcu critical section") was preparatory to the SRCU to RCU conversion done by commit f236464f1db7 ("wifi: wilc1000: convert list management to RCU"). This conversion brought issues and so has been reverted, so the atomic flag is not needed anymore. Signed-off-by: Alexis Lothoré Signed-off-by: Kalle Valo Link: https://msgid.link/20240528-wilc_revert_srcu_to_rcu-v1-2-bce096e0798c@bootlin.com --- drivers/net/wireless/microchip/wilc1000/hif.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/microchip/wilc1000/hif.c b/drivers/net/wireless/microchip/wilc1000/hif.c index 3925ca653e807d..f1085ccb7eedc0 100644 --- a/drivers/net/wireless/microchip/wilc1000/hif.c +++ b/drivers/net/wireless/microchip/wilc1000/hif.c @@ -1594,7 +1594,7 @@ void wilc_network_info_received(struct wilc *wilc, u8 *buffer, u32 length) msg->body.net_info.rssi = buffer[8]; msg->body.net_info.mgmt = kmemdup(&buffer[9], msg->body.net_info.frame_len, - GFP_ATOMIC); + GFP_KERNEL); if (!msg->body.net_info.mgmt) { kfree(msg); goto out; From 596c195680dceb34e6b994ff5571331d5dba8299 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9?= Date: Tue, 28 May 2024 16:20:30 +0200 Subject: [PATCH 0451/1578] wifi: wilc1000: document SRCU usage instead of SRCU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit f236464f1db7 ("wifi: wilc1000: convert list management to RCU") attempted to convert SRCU to RCU usage, assuming it was not really needed. The runtime issues that arose after merging it showed that there are code paths involving sleeping functions, and removing those would need some heavier driver rework. Add some documentation about SRCU need to make sure that any future developer do not miss some use cases if tempted to convert back again to RCU. Signed-off-by: Alexis Lothoré Signed-off-by: Kalle Valo Link: https://msgid.link/20240528-wilc_revert_srcu_to_rcu-v1-3-bce096e0798c@bootlin.com --- drivers/net/wireless/microchip/wilc1000/netdev.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/wireless/microchip/wilc1000/netdev.h b/drivers/net/wireless/microchip/wilc1000/netdev.h index 5937d6d45695f4..fde8610a9c84ba 100644 --- a/drivers/net/wireless/microchip/wilc1000/netdev.h +++ b/drivers/net/wireless/microchip/wilc1000/netdev.h @@ -220,6 +220,13 @@ struct wilc { /* protect vif list */ struct mutex vif_mutex; + /* Sleepable RCU struct to manipulate vif list. Sleepable version is + * needed over the classic RCU version because the driver's current + * design involves some sleeping code while manipulating a vif + * retrieved from vif list (so in a SRCU critical section), like: + * - sending commands to the chip, using info from retrieved vif + * - registering a new monitoring net device + */ struct srcu_struct srcu; u8 open_ifcs; From 40cecacabc460f5074398753feb9ed7d43e8dfa6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 28 May 2024 14:23:08 +0200 Subject: [PATCH 0452/1578] wifi: mt76: mt7615: add missing chanctx ops Here's another one I missed during the initial conversion, fix that. Cc: stable@vger.kernel.org Reported-by: Rene Petersen Fixes: 0a44dfc07074 ("wifi: mac80211: simplify non-chanctx drivers") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218895 Signed-off-by: Johannes Berg Signed-off-by: Kalle Valo Link: https://msgid.link/20240528142308.3f7db1821e68.I531135d7ad76331a50244d6d5288e14aa9668390@changeid --- drivers/net/wireless/mediatek/mt76/mt7615/main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/main.c b/drivers/net/wireless/mediatek/mt76/mt7615/main.c index 0971c164b57e92..c27acaf0eb1cf7 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/main.c @@ -1326,6 +1326,10 @@ static void mt7615_set_rekey_data(struct ieee80211_hw *hw, #endif /* CONFIG_PM */ const struct ieee80211_ops mt7615_ops = { + .add_chanctx = ieee80211_emulate_add_chanctx, + .remove_chanctx = ieee80211_emulate_remove_chanctx, + .change_chanctx = ieee80211_emulate_change_chanctx, + .switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx, .tx = mt7615_tx, .start = mt7615_start, .stop = mt7615_stop, From 819bda58e77bb67974f94dc1aa11b0556b6f6889 Mon Sep 17 00:00:00 2001 From: Bitterblue Smith Date: Wed, 29 May 2024 20:19:47 +0300 Subject: [PATCH 0453/1578] wifi: rtlwifi: Ignore IEEE80211_CONF_CHANGE_RETRY_LIMITS Since commit 0a44dfc07074 ("wifi: mac80211: simplify non-chanctx drivers") ieee80211_hw_config() is no longer called with changed = ~0. rtlwifi relied on ~0 in order to ignore the default retry limits of 4/7, preferring 48/48 in station mode and 7/7 in AP/IBSS. RTL8192DU has a lot of packet loss with the default limits from mac80211. Fix it by ignoring IEEE80211_CONF_CHANGE_RETRY_LIMITS completely, because it's the simplest solution. Link: https://lore.kernel.org/linux-wireless/cedd13d7691f4692b2a2fa5a24d44a22@realtek.com/ Cc: stable@vger.kernel.org # 6.9.x Signed-off-by: Bitterblue Smith Acked-by: Ping-Ke Shih Signed-off-by: Kalle Valo Link: https://msgid.link/1fabb8e4-adf3-47ae-8462-8aea963bc2a5@gmail.com --- drivers/net/wireless/realtek/rtlwifi/core.c | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/core.c b/drivers/net/wireless/realtek/rtlwifi/core.c index 2e60a6991ca166..42b7db12b1bd41 100644 --- a/drivers/net/wireless/realtek/rtlwifi/core.c +++ b/drivers/net/wireless/realtek/rtlwifi/core.c @@ -633,21 +633,6 @@ static int rtl_op_config(struct ieee80211_hw *hw, u32 changed) } } - if (changed & IEEE80211_CONF_CHANGE_RETRY_LIMITS) { - rtl_dbg(rtlpriv, COMP_MAC80211, DBG_LOUD, - "IEEE80211_CONF_CHANGE_RETRY_LIMITS %x\n", - hw->conf.long_frame_max_tx_count); - /* brought up everything changes (changed == ~0) indicates first - * open, so use our default value instead of that of wiphy. - */ - if (changed != ~0) { - mac->retry_long = hw->conf.long_frame_max_tx_count; - mac->retry_short = hw->conf.long_frame_max_tx_count; - rtlpriv->cfg->ops->set_hw_reg(hw, HW_VAR_RETRY_LIMIT, - (u8 *)(&hw->conf.long_frame_max_tx_count)); - } - } - if (changed & IEEE80211_CONF_CHANGE_CHANNEL && !rtlpriv->proximity.proxim_on) { struct ieee80211_channel *channel = hw->conf.chandef.chan; From 5fc16fa5f13b3c06fdb959ef262050bd810416a2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 1 Jun 2024 12:25:35 -0600 Subject: [PATCH 0454/1578] io_uring: check for non-NULL file pointer in io_file_can_poll() In earlier kernels, it was possible to trigger a NULL pointer dereference off the forced async preparation path, if no file had been assigned. The trace leading to that looks as follows: BUG: kernel NULL pointer dereference, address: 00000000000000b0 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP CPU: 67 PID: 1633 Comm: buf-ring-invali Not tainted 6.8.0-rc3+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS unknown 2/2/2022 RIP: 0010:io_buffer_select+0xc3/0x210 Code: 00 00 48 39 d1 0f 82 ae 00 00 00 48 81 4b 48 00 00 01 00 48 89 73 70 0f b7 50 0c 66 89 53 42 85 ed 0f 85 d2 00 00 00 48 8b 13 <48> 8b 92 b0 00 00 00 48 83 7a 40 00 0f 84 21 01 00 00 4c 8b 20 5b RSP: 0018:ffffb7bec38c7d88 EFLAGS: 00010246 RAX: ffff97af2be61000 RBX: ffff97af234f1700 RCX: 0000000000000040 RDX: 0000000000000000 RSI: ffff97aecfb04820 RDI: ffff97af234f1700 RBP: 0000000000000000 R08: 0000000000200030 R09: 0000000000000020 R10: ffffb7bec38c7dc8 R11: 000000000000c000 R12: ffffb7bec38c7db8 R13: ffff97aecfb05800 R14: ffff97aecfb05800 R15: ffff97af2be5e000 FS: 00007f852f74b740(0000) GS:ffff97b1eeec0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000000b0 CR3: 000000016deab005 CR4: 0000000000370ef0 Call Trace: ? __die+0x1f/0x60 ? page_fault_oops+0x14d/0x420 ? do_user_addr_fault+0x61/0x6a0 ? exc_page_fault+0x6c/0x150 ? asm_exc_page_fault+0x22/0x30 ? io_buffer_select+0xc3/0x210 __io_import_iovec+0xb5/0x120 io_readv_prep_async+0x36/0x70 io_queue_sqe_fallback+0x20/0x260 io_submit_sqes+0x314/0x630 __do_sys_io_uring_enter+0x339/0xbc0 ? __do_sys_io_uring_register+0x11b/0xc50 ? vm_mmap_pgoff+0xce/0x160 do_syscall_64+0x5f/0x180 entry_SYSCALL_64_after_hwframe+0x46/0x4e RIP: 0033:0x55e0a110a67e Code: ba cc 00 00 00 45 31 c0 44 0f b6 92 d0 00 00 00 31 d2 41 b9 08 00 00 00 41 83 e2 01 41 c1 e2 04 41 09 c2 b8 aa 01 00 00 0f 05 90 89 30 eb a9 0f 1f 40 00 48 8b 42 20 8b 00 a8 06 75 af 85 f6 because the request is marked forced ASYNC and has a bad file fd, and hence takes the forced async prep path. Current kernels with the request async prep cleaned up can no longer hit this issue, but for ease of backporting, let's add this safety check in here too as it really doesn't hurt. For both cases, this will inevitably end with a CQE posted with -EBADF. Cc: stable@vger.kernel.org Fixes: a76c0b31eef5 ("io_uring: commit non-pollable provided mapped buffers upfront") Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 624ca9076a50be..726e6367af4d37 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -433,7 +433,7 @@ static inline bool io_file_can_poll(struct io_kiocb *req) { if (req->flags & REQ_F_CAN_POLL) return true; - if (file_can_poll(req->file)) { + if (req->file && file_can_poll(req->file)) { req->flags |= REQ_F_CAN_POLL; return true; } From 30636258a7c9174be44ddb2318bae4f66d4beab0 Mon Sep 17 00:00:00 2001 From: Heng Qi Date: Thu, 30 May 2024 11:41:43 +0800 Subject: [PATCH 0455/1578] virtio_net: fix missing lock protection on control_buf access Refactored the handling of control_buf to be within the cvq_lock critical section, mitigating race conditions between reading device responses and new command submissions. Fixes: 6f45ab3e0409 ("virtio_net: Add a lock for the command VQ.") Signed-off-by: Heng Qi Reviewed-by: Hariprasad Kelam Acked-by: Jason Wang Reviewed-by: Xuan Zhuo Link: https://lore.kernel.org/r/20240530034143.19579-1-hengqi@linux.alibaba.com Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 4a802c0ea2cbc3..1ea8e6a24286ef 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2686,6 +2686,7 @@ static bool virtnet_send_command_reply(struct virtnet_info *vi, u8 class, u8 cmd { struct scatterlist *sgs[5], hdr, stat; u32 out_num = 0, tmp, in_num = 0; + bool ok; int ret; /* Caller should know better */ @@ -2731,8 +2732,9 @@ static bool virtnet_send_command_reply(struct virtnet_info *vi, u8 class, u8 cmd } unlock: + ok = vi->ctrl->status == VIRTIO_NET_OK; mutex_unlock(&vi->cvq_lock); - return vi->ctrl->status == VIRTIO_NET_OK; + return ok; } static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, From 7679935b8bdf8d2bc298b04ed579d1ed78e81907 Mon Sep 17 00:00:00 2001 From: Peter Geis Date: Wed, 29 May 2024 14:56:35 -0400 Subject: [PATCH 0456/1578] MAINTAINERS: remove Peter Geis The Motorcomm PHY driver is now maintained by the OEM. The driver has expanded far beyond my original purpose, and I do not have the hardware to test against the new portions of it. Therefore I am removing myself as a maintainer of the driver. Signed-off-by: Peter Geis Link: https://lore.kernel.org/r/20240529185635.538072-1-pgwipeout@gmail.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index ba231d2f3a4e5c..f7b8025e3c27c3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15239,7 +15239,6 @@ F: drivers/staging/most/ F: include/linux/most.h MOTORCOMM PHY DRIVER -M: Peter Geis M: Frank L: netdev@vger.kernel.org S: Maintained From 89e281ebff72e6d37dce2df0e142b2909dafb267 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 29 May 2024 21:08:14 -0700 Subject: [PATCH 0457/1578] ethtool: init tsinfo stats if requested Statistic values should be set to ETHTOOL_STAT_NOT_SET even if the device doesn't support statistics. Otherwise zeros will be returned as if they are proper values: host# ethtool -I -T lo Time stamping parameters for lo: Capabilities: software-transmit software-receive software-system-clock PTP Hardware Clock: none Hardware Transmit Timestamp Modes: none Hardware Receive Filter Modes: none Statistics: tx_pkts: 0 tx_lost: 0 tx_err: 0 Fixes: 0e9c127729be ("ethtool: add interface to read Tx hardware timestamping statistics") Suggested-by: Jakub Kicinski Signed-off-by: Vadim Fedorenko Reviewed-by: Rahul Rameshbabu Link: https://lore.kernel.org/r/20240530040814.1014446-1-vadfed@meta.com Signed-off-by: Jakub Kicinski --- net/ethtool/tsinfo.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ethtool/tsinfo.c b/net/ethtool/tsinfo.c index be2755c8d8fde7..57d496287e5237 100644 --- a/net/ethtool/tsinfo.c +++ b/net/ethtool/tsinfo.c @@ -38,11 +38,11 @@ static int tsinfo_prepare_data(const struct ethnl_req_info *req_base, ret = ethnl_ops_begin(dev); if (ret < 0) return ret; - if (req_base->flags & ETHTOOL_FLAG_STATS && - dev->ethtool_ops->get_ts_stats) { + if (req_base->flags & ETHTOOL_FLAG_STATS) { ethtool_stats_init((u64 *)&data->stats, sizeof(data->stats) / sizeof(u64)); - dev->ethtool_ops->get_ts_stats(dev, &data->stats); + if (dev->ethtool_ops->get_ts_stats) + dev->ethtool_ops->get_ts_stats(dev, &data->stats); } ret = __ethtool_get_ts_info(dev, &data->ts_info); ethnl_ops_complete(dev); From 9e0945b1901c9eed4fbee3b8a3870487b2bdc936 Mon Sep 17 00:00:00 2001 From: Heng Qi Date: Tue, 28 May 2024 21:41:15 +0800 Subject: [PATCH 0458/1578] virtio_net: fix possible dim status unrecoverable When the dim worker is scheduled, if it no longer needs to issue commands, dim may not be able to return to the working state later. For example, the following single queue scenario: 1. The dim worker of rxq0 is scheduled, and the dim status is changed to DIM_APPLY_NEW_PROFILE; 2. dim is disabled or parameters have not been modified; 3. virtnet_rx_dim_work exits directly; Then, even if net_dim is invoked again, it cannot work because the state is not restored to DIM_START_MEASURE. Fixes: 6208799553a8 ("virtio-net: support rx netdim") Signed-off-by: Heng Qi Reviewed-by: Jiri Pirko Acked-by: Michael S. Tsirkin Reviewed-by: Xuan Zhuo Link: https://lore.kernel.org/r/20240528134116.117426-2-hengqi@linux.alibaba.com Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 1ea8e6a24286ef..774a292a609061 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -4419,9 +4419,9 @@ static void virtnet_rx_dim_work(struct work_struct *work) if (err) pr_debug("%s: Failed to send dim parameters on rxq%d\n", dev->name, qnum); - dim->state = DIM_START_MEASURE; } out: + dim->state = DIM_START_MEASURE; mutex_unlock(&rq->dim_lock); } From d1f0bd01bc58f35b5353ad9dbe5f7249a8f3368e Mon Sep 17 00:00:00 2001 From: Heng Qi Date: Tue, 28 May 2024 21:41:16 +0800 Subject: [PATCH 0459/1578] virtio_net: fix a spurious deadlock issue When the following snippet is run, lockdep will report a deadlock[1]. /* Acquire all queues dim_locks */ for (i = 0; i < vi->max_queue_pairs; i++) mutex_lock(&vi->rq[i].dim_lock); There's no deadlock here because the vq locks are always taken in the same order, but lockdep can not figure it out. So refactoring the code to alleviate the problem. [1] ======================================================== WARNING: possible recursive locking detected 6.9.0-rc7+ #319 Not tainted -------------------------------------------- ethtool/962 is trying to acquire lock: but task is already holding lock: other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&vi->rq[i].dim_lock); lock(&vi->rq[i].dim_lock); *** DEADLOCK *** May be due to missing lock nesting notation 3 locks held by ethtool/962: #0: ffffffff82dbaab0 (cb_lock){++++}-{3:3}, at: genl_rcv+0x19/0x40 #1: ffffffff82dad0a8 (rtnl_mutex){+.+.}-{3:3}, at: ethnl_default_set_doit+0xbe/0x1e0 stack backtrace: CPU: 6 PID: 962 Comm: ethtool Not tainted 6.9.0-rc7+ #319 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x79/0xb0 check_deadlock+0x130/0x220 __lock_acquire+0x861/0x990 lock_acquire.part.0+0x72/0x1d0 ? lock_acquire+0xf8/0x130 __mutex_lock+0x71/0xd50 virtnet_set_coalesce+0x151/0x190 __ethnl_set_coalesce.isra.0+0x3f8/0x4d0 ethnl_set_coalesce+0x34/0x90 ethnl_default_set_doit+0xdd/0x1e0 genl_family_rcv_msg_doit+0xdc/0x130 genl_family_rcv_msg+0x154/0x230 ? __pfx_ethnl_default_set_doit+0x10/0x10 genl_rcv_msg+0x4b/0xa0 ? __pfx_genl_rcv_msg+0x10/0x10 netlink_rcv_skb+0x5a/0x110 genl_rcv+0x28/0x40 netlink_unicast+0x1af/0x280 netlink_sendmsg+0x20e/0x460 __sys_sendto+0x1fe/0x210 ? find_held_lock+0x2b/0x80 ? do_user_addr_fault+0x3a2/0x8a0 ? __lock_release+0x5e/0x160 ? do_user_addr_fault+0x3a2/0x8a0 ? lock_release+0x72/0x140 ? do_user_addr_fault+0x3a7/0x8a0 __x64_sys_sendto+0x29/0x30 do_syscall_64+0x78/0x180 entry_SYSCALL_64_after_hwframe+0x76/0x7e Fixes: 4d4ac2ececd3 ("virtio_net: Add a lock for per queue RX coalesce") Signed-off-by: Heng Qi Acked-by: Michael S. Tsirkin Reviewed-by: Xuan Zhuo Acked-by: Jason Wang Link: https://lore.kernel.org/r/20240528134116.117426-3-hengqi@linux.alibaba.com Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 774a292a609061..61a57d134544f9 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -4259,7 +4259,6 @@ static int virtnet_send_rx_notf_coal_cmds(struct virtnet_info *vi, struct virtio_net_ctrl_coal_rx *coal_rx __free(kfree) = NULL; bool rx_ctrl_dim_on = !!ec->use_adaptive_rx_coalesce; struct scatterlist sgs_rx; - int ret = 0; int i; if (rx_ctrl_dim_on && !virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) @@ -4269,27 +4268,27 @@ static int virtnet_send_rx_notf_coal_cmds(struct virtnet_info *vi, ec->rx_max_coalesced_frames != vi->intr_coal_rx.max_packets)) return -EINVAL; - /* Acquire all queues dim_locks */ - for (i = 0; i < vi->max_queue_pairs; i++) - mutex_lock(&vi->rq[i].dim_lock); - if (rx_ctrl_dim_on && !vi->rx_dim_enabled) { vi->rx_dim_enabled = true; - for (i = 0; i < vi->max_queue_pairs; i++) + for (i = 0; i < vi->max_queue_pairs; i++) { + mutex_lock(&vi->rq[i].dim_lock); vi->rq[i].dim_enabled = true; - goto unlock; + mutex_unlock(&vi->rq[i].dim_lock); + } + return 0; } coal_rx = kzalloc(sizeof(*coal_rx), GFP_KERNEL); - if (!coal_rx) { - ret = -ENOMEM; - goto unlock; - } + if (!coal_rx) + return -ENOMEM; if (!rx_ctrl_dim_on && vi->rx_dim_enabled) { vi->rx_dim_enabled = false; - for (i = 0; i < vi->max_queue_pairs; i++) + for (i = 0; i < vi->max_queue_pairs; i++) { + mutex_lock(&vi->rq[i].dim_lock); vi->rq[i].dim_enabled = false; + mutex_unlock(&vi->rq[i].dim_lock); + } } /* Since the per-queue coalescing params can be set, @@ -4302,22 +4301,19 @@ static int virtnet_send_rx_notf_coal_cmds(struct virtnet_info *vi, if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_NOTF_COAL, VIRTIO_NET_CTRL_NOTF_COAL_RX_SET, - &sgs_rx)) { - ret = -EINVAL; - goto unlock; - } + &sgs_rx)) + return -EINVAL; vi->intr_coal_rx.max_usecs = ec->rx_coalesce_usecs; vi->intr_coal_rx.max_packets = ec->rx_max_coalesced_frames; for (i = 0; i < vi->max_queue_pairs; i++) { + mutex_lock(&vi->rq[i].dim_lock); vi->rq[i].intr_coal.max_usecs = ec->rx_coalesce_usecs; vi->rq[i].intr_coal.max_packets = ec->rx_max_coalesced_frames; - } -unlock: - for (i = vi->max_queue_pairs - 1; i >= 0; i--) mutex_unlock(&vi->rq[i].dim_lock); + } - return ret; + return 0; } static int virtnet_send_notf_coal_cmds(struct virtnet_info *vi, From 3c34fb0bd4a4237592c5ecb5b2e2531900c55774 Mon Sep 17 00:00:00 2001 From: Lars Kellogg-Stedman Date: Wed, 29 May 2024 17:02:43 -0400 Subject: [PATCH 0460/1578] ax25: Fix refcount imbalance on inbound connections When releasing a socket in ax25_release(), we call netdev_put() to decrease the refcount on the associated ax.25 device. However, the execution path for accepting an incoming connection never calls netdev_hold(). This imbalance leads to refcount errors, and ultimately to kernel crashes. A typical call trace for the above situation will start with one of the following errors: refcount_t: decrement hit 0; leaking memory. refcount_t: underflow; use-after-free. And will then have a trace like: Call Trace: ? show_regs+0x64/0x70 ? __warn+0x83/0x120 ? refcount_warn_saturate+0xb2/0x100 ? report_bug+0x158/0x190 ? prb_read_valid+0x20/0x30 ? handle_bug+0x3e/0x70 ? exc_invalid_op+0x1c/0x70 ? asm_exc_invalid_op+0x1f/0x30 ? refcount_warn_saturate+0xb2/0x100 ? refcount_warn_saturate+0xb2/0x100 ax25_release+0x2ad/0x360 __sock_release+0x35/0xa0 sock_close+0x19/0x20 [...] On reboot (or any attempt to remove the interface), the kernel gets stuck in an infinite loop: unregister_netdevice: waiting for ax0 to become free. Usage count = 0 This patch corrects these issues by ensuring that we call netdev_hold() and ax25_dev_hold() for new connections in ax25_accept(). This makes the logic leading to ax25_accept() match the logic for ax25_bind(): in both cases we increment the refcount, which is ultimately decremented in ax25_release(). Fixes: 9fd75b66b8f6 ("ax25: Fix refcount leaks caused by ax25_cb_del()") Signed-off-by: Lars Kellogg-Stedman Tested-by: Duoming Zhou Tested-by: Dan Cross Tested-by: Chris Maness Reviewed-by: Dan Carpenter Link: https://lore.kernel.org/r/20240529210242.3346844-2-lars@oddbit.com Signed-off-by: Jakub Kicinski --- net/ax25/af_ax25.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 8077cf2ee44803..d6f9fae06a9d81 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1378,8 +1378,10 @@ static int ax25_accept(struct socket *sock, struct socket *newsock, { struct sk_buff *skb; struct sock *newsk; + ax25_dev *ax25_dev; DEFINE_WAIT(wait); struct sock *sk; + ax25_cb *ax25; int err = 0; if (sock->state != SS_UNCONNECTED) @@ -1434,6 +1436,10 @@ static int ax25_accept(struct socket *sock, struct socket *newsock, kfree_skb(skb); sk_acceptq_removed(sk); newsock->state = SS_CONNECTED; + ax25 = sk_to_ax25(newsk); + ax25_dev = ax25->ax25_dev; + netdev_hold(ax25_dev->dev, &ax25->dev_tracker, GFP_ATOMIC); + ax25_dev_hold(ax25_dev); out: release_sock(sk); From 166fcf86cd34e15c7f383eda4642d7a212393008 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Thu, 30 May 2024 13:17:33 +0800 Subject: [PATCH 0461/1578] ax25: Replace kfree() in ax25_dev_free() with ax25_dev_put() The object "ax25_dev" is managed by reference counting. Thus it should not be directly released by kfree(), replace with ax25_dev_put(). Fixes: d01ffb9eee4a ("ax25: add refcount in ax25_dev to avoid UAF bugs") Suggested-by: Dan Carpenter Signed-off-by: Duoming Zhou Reviewed-by: Dan Carpenter Link: https://lore.kernel.org/r/20240530051733.11416-1-duoming@zju.edu.cn Signed-off-by: Jakub Kicinski --- net/ax25/ax25_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index 742d7c68e7e7e9..9efd6690b34436 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -196,7 +196,7 @@ void __exit ax25_dev_free(void) list_for_each_entry_safe(s, n, &ax25_dev_list, list) { netdev_put(s->dev, &s->dev_tracker); list_del(&s->list); - kfree(s); + ax25_dev_put(s); } spin_unlock_bh(&ax25_dev_lock); } From 8105378c0c02309221b63d15cfe8fe92e7fe1434 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Thu, 30 May 2024 11:27:17 +0800 Subject: [PATCH 0462/1578] net: rps: fix error when CONFIG_RFS_ACCEL is off John Sperbeck reported that if we turn off CONFIG_RFS_ACCEL, the 'head' is not defined, which will trigger compile error. So I move the 'head' out of the CONFIG_RFS_ACCEL scope. Fixes: 84b6823cd96b ("net: rps: protect last_qtail with rps_input_queue_tail_save() helper") Reported-by: John Sperbeck Closes: https://lore.kernel.org/all/20240529203421.2432481-1-jsperbeck@google.com/ Signed-off-by: Jason Xing Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240530032717.57787-1-kerneljasonxing@gmail.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index e1bb6d7856d922..4d4de9008f6f3b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4516,12 +4516,13 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *rflow, u16 next_cpu) { if (next_cpu < nr_cpu_ids) { + u32 head; #ifdef CONFIG_RFS_ACCEL struct netdev_rx_queue *rxqueue; struct rps_dev_flow_table *flow_table; struct rps_dev_flow *old_rflow; - u32 flow_id, head; u16 rxq_index; + u32 flow_id; int rc; /* Should we steer this flow to a different hardware queue? */ From e85e271dec0270982afed84f70dc37703fcc1d52 Mon Sep 17 00:00:00 2001 From: DelphineCCChiu Date: Wed, 29 May 2024 14:58:55 +0800 Subject: [PATCH 0463/1578] net/ncsi: Fix the multi thread manner of NCSI driver Currently NCSI driver will send several NCSI commands back to back without waiting the response of previous NCSI command or timeout in some state when NIC have multi channel. This operation against the single thread manner defined by NCSI SPEC(section 6.3.2.3 in DSP0222_1.1.1) According to NCSI SPEC(section 6.2.13.1 in DSP0222_1.1.1), we should probe one channel at a time by sending NCSI commands (Clear initial state, Get version ID, Get capabilities...), than repeat this steps until the max number of channels which we got from NCSI command (Get capabilities) has been probed. Fixes: e6f44ed6d04d ("net/ncsi: Package and channel management") Signed-off-by: DelphineCCChiu Link: https://lore.kernel.org/r/20240529065856.825241-1-delphine_cc_chiu@wiwynn.com Signed-off-by: Jakub Kicinski --- net/ncsi/internal.h | 2 ++ net/ncsi/ncsi-manage.c | 73 +++++++++++++++++++++--------------------- net/ncsi/ncsi-rsp.c | 4 ++- 3 files changed, 41 insertions(+), 38 deletions(-) diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 374412ed780b6e..ef0f8f73826f53 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -325,6 +325,7 @@ struct ncsi_dev_priv { spinlock_t lock; /* Protect the NCSI device */ unsigned int package_probe_id;/* Current ID during probe */ unsigned int package_num; /* Number of packages */ + unsigned int channel_probe_id;/* Current cahnnel ID during probe */ struct list_head packages; /* List of packages */ struct ncsi_channel *hot_channel; /* Channel was ever active */ struct ncsi_request requests[256]; /* Request table */ @@ -343,6 +344,7 @@ struct ncsi_dev_priv { bool multi_package; /* Enable multiple packages */ bool mlx_multi_host; /* Enable multi host Mellanox */ u32 package_whitelist; /* Packages to configure */ + unsigned char channel_count; /* Num of channels to probe */ }; struct ncsi_cmd_arg { diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 745c788f1d1dfc..5ecf611c882009 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -510,17 +510,19 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp) break; case ncsi_dev_state_suspend_gls: - ndp->pending_req_num = np->channel_num; + ndp->pending_req_num = 1; nca.type = NCSI_PKT_CMD_GLS; nca.package = np->id; + nca.channel = ndp->channel_probe_id; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + ndp->channel_probe_id++; - nd->state = ncsi_dev_state_suspend_dcnt; - NCSI_FOR_EACH_CHANNEL(np, nc) { - nca.channel = nc->id; - ret = ncsi_xmit_cmd(&nca); - if (ret) - goto error; + if (ndp->channel_probe_id == ndp->channel_count) { + ndp->channel_probe_id = 0; + nd->state = ncsi_dev_state_suspend_dcnt; } break; @@ -1345,7 +1347,6 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) { struct ncsi_dev *nd = &ndp->ndev; struct ncsi_package *np; - struct ncsi_channel *nc; struct ncsi_cmd_arg nca; unsigned char index; int ret; @@ -1423,23 +1424,6 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) nd->state = ncsi_dev_state_probe_cis; break; - case ncsi_dev_state_probe_cis: - ndp->pending_req_num = NCSI_RESERVED_CHANNEL; - - /* Clear initial state */ - nca.type = NCSI_PKT_CMD_CIS; - nca.package = ndp->active_package->id; - for (index = 0; index < NCSI_RESERVED_CHANNEL; index++) { - nca.channel = index; - ret = ncsi_xmit_cmd(&nca); - if (ret) - goto error; - } - - nd->state = ncsi_dev_state_probe_gvi; - if (IS_ENABLED(CONFIG_NCSI_OEM_CMD_KEEP_PHY)) - nd->state = ncsi_dev_state_probe_keep_phy; - break; case ncsi_dev_state_probe_keep_phy: ndp->pending_req_num = 1; @@ -1452,14 +1436,17 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) nd->state = ncsi_dev_state_probe_gvi; break; + case ncsi_dev_state_probe_cis: case ncsi_dev_state_probe_gvi: case ncsi_dev_state_probe_gc: case ncsi_dev_state_probe_gls: np = ndp->active_package; - ndp->pending_req_num = np->channel_num; + ndp->pending_req_num = 1; - /* Retrieve version, capability or link status */ - if (nd->state == ncsi_dev_state_probe_gvi) + /* Clear initial state Retrieve version, capability or link status */ + if (nd->state == ncsi_dev_state_probe_cis) + nca.type = NCSI_PKT_CMD_CIS; + else if (nd->state == ncsi_dev_state_probe_gvi) nca.type = NCSI_PKT_CMD_GVI; else if (nd->state == ncsi_dev_state_probe_gc) nca.type = NCSI_PKT_CMD_GC; @@ -1467,19 +1454,29 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) nca.type = NCSI_PKT_CMD_GLS; nca.package = np->id; - NCSI_FOR_EACH_CHANNEL(np, nc) { - nca.channel = nc->id; - ret = ncsi_xmit_cmd(&nca); - if (ret) - goto error; - } + nca.channel = ndp->channel_probe_id; - if (nd->state == ncsi_dev_state_probe_gvi) + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + + if (nd->state == ncsi_dev_state_probe_cis) { + nd->state = ncsi_dev_state_probe_gvi; + if (IS_ENABLED(CONFIG_NCSI_OEM_CMD_KEEP_PHY) && ndp->channel_probe_id == 0) + nd->state = ncsi_dev_state_probe_keep_phy; + } else if (nd->state == ncsi_dev_state_probe_gvi) { nd->state = ncsi_dev_state_probe_gc; - else if (nd->state == ncsi_dev_state_probe_gc) + } else if (nd->state == ncsi_dev_state_probe_gc) { nd->state = ncsi_dev_state_probe_gls; - else + } else { + nd->state = ncsi_dev_state_probe_cis; + ndp->channel_probe_id++; + } + + if (ndp->channel_probe_id == ndp->channel_count) { + ndp->channel_probe_id = 0; nd->state = ncsi_dev_state_probe_dp; + } break; case ncsi_dev_state_probe_dp: ndp->pending_req_num = 1; @@ -1780,6 +1777,7 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, ndp->requests[i].ndp = ndp; timer_setup(&ndp->requests[i].timer, ncsi_request_timeout, 0); } + ndp->channel_count = NCSI_RESERVED_CHANNEL; spin_lock_irqsave(&ncsi_dev_lock, flags); list_add_tail_rcu(&ndp->node, &ncsi_dev_list); @@ -1813,6 +1811,7 @@ int ncsi_start_dev(struct ncsi_dev *nd) if (!(ndp->flags & NCSI_DEV_PROBED)) { ndp->package_probe_id = 0; + ndp->channel_probe_id = 0; nd->state = ncsi_dev_state_probe; schedule_work(&ndp->work); return 0; diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index bee290d0f48b6f..e28be33bdf2c48 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -795,12 +795,13 @@ static int ncsi_rsp_handler_gc(struct ncsi_request *nr) struct ncsi_rsp_gc_pkt *rsp; struct ncsi_dev_priv *ndp = nr->ndp; struct ncsi_channel *nc; + struct ncsi_package *np; size_t size; /* Find the channel */ rsp = (struct ncsi_rsp_gc_pkt *)skb_network_header(nr->rsp); ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, - NULL, &nc); + &np, &nc); if (!nc) return -ENODEV; @@ -835,6 +836,7 @@ static int ncsi_rsp_handler_gc(struct ncsi_request *nr) */ nc->vlan_filter.bitmap = U64_MAX; nc->vlan_filter.n_vids = rsp->vlan_cnt; + np->ndp->channel_count = rsp->channel_cnt; return 0; } From 33700a0c9b562700c28d31360a5f04508f459a45 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Wed, 29 May 2024 18:29:32 +0100 Subject: [PATCH 0464/1578] net/tcp: Don't consider TCP_CLOSE in TCP_AO_ESTABLISHED TCP_CLOSE may or may not have current/rnext keys and should not be considered "established". The fast-path for TCP_CLOSE is SKB_DROP_REASON_TCP_CLOSE. This is what tcp_rcv_state_process() does anyways. Add an early drop path to not spend any time verifying segment signatures for sockets in TCP_CLOSE state. Cc: stable@vger.kernel.org # v6.7 Fixes: 0a3a809089eb ("net/tcp: Verify inbound TCP-AO signed segments") Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://lore.kernel.org/r/20240529-tcp_ao-sk_state-v1-1-d69b5d323c52@gmail.com Signed-off-by: Jakub Kicinski --- include/net/tcp_ao.h | 7 ++++--- net/ipv4/tcp_ao.c | 13 +++++++++---- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h index 471e177362b4c0..5d8e9ed2c0056f 100644 --- a/include/net/tcp_ao.h +++ b/include/net/tcp_ao.h @@ -86,7 +86,8 @@ static inline int tcp_ao_sizeof_key(const struct tcp_ao_key *key) struct tcp_ao_info { /* List of tcp_ao_key's */ struct hlist_head head; - /* current_key and rnext_key aren't maintained on listen sockets. + /* current_key and rnext_key are maintained on sockets + * in TCP_AO_ESTABLISHED states. * Their purpose is to cache keys on established connections, * saving needless lookups. Never dereference any of them from * listen sockets. @@ -201,9 +202,9 @@ struct tcp6_ao_context { }; struct tcp_sigpool; +/* Established states are fast-path and there always is current_key/rnext_key */ #define TCP_AO_ESTABLISHED (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | \ - TCPF_CLOSE | TCPF_CLOSE_WAIT | \ - TCPF_LAST_ACK | TCPF_CLOSING) + TCPF_CLOSE_WAIT | TCPF_LAST_ACK | TCPF_CLOSING) int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb, struct tcp_ao_key *key, struct tcphdr *th, diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index 781b67a525719a..37c42b63ff9934 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -933,6 +933,7 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, struct tcp_ao_key *key; __be32 sisn, disn; u8 *traffic_key; + int state; u32 sne = 0; info = rcu_dereference(tcp_sk(sk)->ao_info); @@ -948,8 +949,9 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, disn = 0; } + state = READ_ONCE(sk->sk_state); /* Fast-path */ - if (likely((1 << sk->sk_state) & TCP_AO_ESTABLISHED)) { + if (likely((1 << state) & TCP_AO_ESTABLISHED)) { enum skb_drop_reason err; struct tcp_ao_key *current_key; @@ -988,6 +990,9 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, return SKB_NOT_DROPPED_YET; } + if (unlikely(state == TCP_CLOSE)) + return SKB_DROP_REASON_TCP_CLOSE; + /* Lookup key based on peer address and keyid. * current_key and rnext_key must not be used on tcp listen * sockets as otherwise: @@ -1001,7 +1006,7 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, if (th->syn && !th->ack) goto verify_hash; - if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV)) { + if ((1 << state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV)) { /* Make the initial syn the likely case here */ if (unlikely(req)) { sne = tcp_ao_compute_sne(0, tcp_rsk(req)->rcv_isn, @@ -1018,14 +1023,14 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, /* no way to figure out initial sisn/disn - drop */ return SKB_DROP_REASON_TCP_FLAGS; } - } else if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { + } else if ((1 << state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { disn = info->lisn; if (th->syn || th->rst) sisn = th->seq; else sisn = info->risn; } else { - WARN_ONCE(1, "TCP-AO: Unexpected sk_state %d", sk->sk_state); + WARN_ONCE(1, "TCP-AO: Unexpected sk_state %d", state); return SKB_DROP_REASON_TCP_AOFAILURE; } verify_hash: From d7bd473632d07f8a54655c270c0940cc3671c548 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Maneyrol Date: Wed, 29 May 2024 15:47:17 +0000 Subject: [PATCH 0465/1578] iio: imu: inv_icm42600: stabilized timestamp in interrupt Use IRQF_ONESHOT flag to ensure the timestamp is not updated in the hard handler during the thread handler. And compute and use the effective watermark value that correspond to this first timestamp. This way we can ensure the timestamp is always corresponding to the value used by the timestamping mechanism. Otherwise, it is possible that between FIFO count read and FIFO processing the timestamp is overwritten in the hard handler. Fixes: ec74ae9fd37c ("iio: imu: inv_icm42600: add accurate timestamping") Cc: stable@vger.kernel.org Signed-off-by: Jean-Baptiste Maneyrol Link: https://lore.kernel.org/r/20240529154717.651863-1-inv.git-commit@tdk.com Signed-off-by: Jonathan Cameron --- .../imu/inv_icm42600/inv_icm42600_buffer.c | 19 +++++++++++++++++-- .../imu/inv_icm42600/inv_icm42600_buffer.h | 2 ++ .../iio/imu/inv_icm42600/inv_icm42600_core.c | 1 + 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/iio/imu/inv_icm42600/inv_icm42600_buffer.c b/drivers/iio/imu/inv_icm42600/inv_icm42600_buffer.c index 63b85ec88c131f..a8cf74c84c3c45 100644 --- a/drivers/iio/imu/inv_icm42600/inv_icm42600_buffer.c +++ b/drivers/iio/imu/inv_icm42600/inv_icm42600_buffer.c @@ -222,10 +222,15 @@ int inv_icm42600_buffer_update_watermark(struct inv_icm42600_state *st) latency_accel = period_accel * wm_accel; /* 0 value for watermark means that the sensor is turned off */ + if (wm_gyro == 0 && wm_accel == 0) + return 0; + if (latency_gyro == 0) { watermark = wm_accel; + st->fifo.watermark.eff_accel = wm_accel; } else if (latency_accel == 0) { watermark = wm_gyro; + st->fifo.watermark.eff_gyro = wm_gyro; } else { /* compute the smallest latency that is a multiple of both */ if (latency_gyro <= latency_accel) @@ -241,6 +246,13 @@ int inv_icm42600_buffer_update_watermark(struct inv_icm42600_state *st) watermark = latency / period; if (watermark < 1) watermark = 1; + /* update effective watermark */ + st->fifo.watermark.eff_gyro = latency / period_gyro; + if (st->fifo.watermark.eff_gyro < 1) + st->fifo.watermark.eff_gyro = 1; + st->fifo.watermark.eff_accel = latency / period_accel; + if (st->fifo.watermark.eff_accel < 1) + st->fifo.watermark.eff_accel = 1; } /* compute watermark value in bytes */ @@ -514,7 +526,7 @@ int inv_icm42600_buffer_fifo_parse(struct inv_icm42600_state *st) /* handle gyroscope timestamp and FIFO data parsing */ if (st->fifo.nb.gyro > 0) { ts = &gyro_st->ts; - inv_sensors_timestamp_interrupt(ts, st->fifo.nb.gyro, + inv_sensors_timestamp_interrupt(ts, st->fifo.watermark.eff_gyro, st->timestamp.gyro); ret = inv_icm42600_gyro_parse_fifo(st->indio_gyro); if (ret) @@ -524,7 +536,7 @@ int inv_icm42600_buffer_fifo_parse(struct inv_icm42600_state *st) /* handle accelerometer timestamp and FIFO data parsing */ if (st->fifo.nb.accel > 0) { ts = &accel_st->ts; - inv_sensors_timestamp_interrupt(ts, st->fifo.nb.accel, + inv_sensors_timestamp_interrupt(ts, st->fifo.watermark.eff_accel, st->timestamp.accel); ret = inv_icm42600_accel_parse_fifo(st->indio_accel); if (ret) @@ -577,6 +589,9 @@ int inv_icm42600_buffer_init(struct inv_icm42600_state *st) unsigned int val; int ret; + st->fifo.watermark.eff_gyro = 1; + st->fifo.watermark.eff_accel = 1; + /* * Default FIFO configuration (bits 7 to 5) * - use invalid value diff --git a/drivers/iio/imu/inv_icm42600/inv_icm42600_buffer.h b/drivers/iio/imu/inv_icm42600/inv_icm42600_buffer.h index 8b85ee333bf8f6..f6c85daf42b00b 100644 --- a/drivers/iio/imu/inv_icm42600/inv_icm42600_buffer.h +++ b/drivers/iio/imu/inv_icm42600/inv_icm42600_buffer.h @@ -32,6 +32,8 @@ struct inv_icm42600_fifo { struct { unsigned int gyro; unsigned int accel; + unsigned int eff_gyro; + unsigned int eff_accel; } watermark; size_t count; struct { diff --git a/drivers/iio/imu/inv_icm42600/inv_icm42600_core.c b/drivers/iio/imu/inv_icm42600/inv_icm42600_core.c index 96116a68ab291b..62fdae530334af 100644 --- a/drivers/iio/imu/inv_icm42600/inv_icm42600_core.c +++ b/drivers/iio/imu/inv_icm42600/inv_icm42600_core.c @@ -537,6 +537,7 @@ static int inv_icm42600_irq_init(struct inv_icm42600_state *st, int irq, if (ret) return ret; + irq_type |= IRQF_ONESHOT; return devm_request_threaded_irq(dev, irq, inv_icm42600_irq_timestamp, inv_icm42600_irq_handler, irq_type, "inv_icm42600", st); From 245f3b149e6cc3ac6ee612cdb7042263bfc9e73c Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Maneyrol Date: Mon, 27 May 2024 21:00:08 +0000 Subject: [PATCH 0466/1578] iio: imu: inv_icm42600: delete unneeded update watermark call Update watermark will be done inside the hwfifo_set_watermark callback just after the update_scan_mode. It is useless to do it here. Fixes: 7f85e42a6c54 ("iio: imu: inv_icm42600: add buffer support in iio devices") Cc: stable@vger.kernel.org Signed-off-by: Jean-Baptiste Maneyrol Link: https://lore.kernel.org/r/20240527210008.612932-1-inv.git-commit@tdk.com Signed-off-by: Jonathan Cameron --- drivers/iio/imu/inv_icm42600/inv_icm42600_accel.c | 4 ---- drivers/iio/imu/inv_icm42600/inv_icm42600_gyro.c | 4 ---- 2 files changed, 8 deletions(-) diff --git a/drivers/iio/imu/inv_icm42600/inv_icm42600_accel.c b/drivers/iio/imu/inv_icm42600/inv_icm42600_accel.c index 83d8504ebfff43..4b25666936146a 100644 --- a/drivers/iio/imu/inv_icm42600/inv_icm42600_accel.c +++ b/drivers/iio/imu/inv_icm42600/inv_icm42600_accel.c @@ -130,10 +130,6 @@ static int inv_icm42600_accel_update_scan_mode(struct iio_dev *indio_dev, /* update data FIFO write */ inv_sensors_timestamp_apply_odr(ts, 0, 0, 0); ret = inv_icm42600_buffer_set_fifo_en(st, fifo_en | st->fifo.en); - if (ret) - goto out_unlock; - - ret = inv_icm42600_buffer_update_watermark(st); out_unlock: mutex_unlock(&st->lock); diff --git a/drivers/iio/imu/inv_icm42600/inv_icm42600_gyro.c b/drivers/iio/imu/inv_icm42600/inv_icm42600_gyro.c index e6f8de80128c4f..938af5b640b00f 100644 --- a/drivers/iio/imu/inv_icm42600/inv_icm42600_gyro.c +++ b/drivers/iio/imu/inv_icm42600/inv_icm42600_gyro.c @@ -130,10 +130,6 @@ static int inv_icm42600_gyro_update_scan_mode(struct iio_dev *indio_dev, /* update data FIFO write */ inv_sensors_timestamp_apply_odr(ts, 0, 0, 0); ret = inv_icm42600_buffer_set_fifo_en(st, fifo_en | st->fifo.en); - if (ret) - goto out_unlock; - - ret = inv_icm42600_buffer_update_watermark(st); out_unlock: mutex_unlock(&st->lock); From c3f38fa61af77b49866b006939479069cd451173 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 2 Jun 2024 15:44:56 -0700 Subject: [PATCH 0467/1578] Linux 6.10-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f975b639632809..7f921ae547f116 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 10 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Baby Opossum Posse # *DOCUMENTATION* From 08f0fa5d6aa9488f752eb5410e32636f143b3d8e Mon Sep 17 00:00:00 2001 From: Joao Paulo Goncalves Date: Tue, 7 May 2024 11:35:55 -0300 Subject: [PATCH 0468/1578] arm64: dts: freescale: imx8mm-verdin: Fix GPU speed The GPU clock was reduced on iMX8MM SOC device tree to prevent boards that don't support GPU overdrive from being out of specification. However, this caused a regression in GPU speed for the Verdin iMX8MM, which does support GPU overdrive. This patch fixes this by enabling overdrive mode in the SOM dtsi. Fixes: 1f794d3eed53 ("arm64: dts: imx8mm: Reduce GPU to nominal speed") Signed-off-by: Joao Paulo Goncalves Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi index 4768b05fd7659e..0d9abca5882182 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi @@ -6,6 +6,7 @@ #include #include #include "imx8mm.dtsi" +#include "imx8mm-overdrive.dtsi" / { chosen { From 6c3ca6654a74dd396bc477839ba8d9792eced441 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Mon, 3 Jun 2024 15:45:53 +0800 Subject: [PATCH 0469/1578] LoongArch: Remove CONFIG_ACPI_TABLE_UPGRADE in platform_init() Both acpi_table_upgrade() and acpi_boot_table_init() are defined as empty functions under !CONFIG_ACPI_TABLE_UPGRADE and !CONFIG_ACPI in include/linux/acpi.h, there are no implicit declaration errors with various configs. #ifdef CONFIG_ACPI_TABLE_UPGRADE void acpi_table_upgrade(void); #else static inline void acpi_table_upgrade(void) { } #endif #ifdef CONFIG_ACPI ... void acpi_boot_table_init (void); ... #else /* !CONFIG_ACPI */ ... static inline void acpi_boot_table_init(void) { } ... #endif /* !CONFIG_ACPI */ As Huacai suggested, CONFIG_ACPI_TABLE_UPGRADE is ugly and not necessary here, just remove it. At the same time, just keep CONFIG_ACPI to prevent potential build errors in future, and give a signal to indicate the code is ACPI-specific. For the same reason, we also put acpi_table_upgrade() under CONFIG_ACPI. Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- arch/loongarch/kernel/setup.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index 60e0fe97f61a31..89ad756aeeed20 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -351,10 +351,8 @@ void __init platform_init(void) arch_reserve_vmcore(); arch_reserve_crashkernel(); -#ifdef CONFIG_ACPI_TABLE_UPGRADE - acpi_table_upgrade(); -#endif #ifdef CONFIG_ACPI + acpi_table_upgrade(); acpi_gbl_use_default_register_widths = false; acpi_boot_table_init(); #endif From b56f67a6c748bb009f313f91651c8020d2338d63 Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Mon, 3 Jun 2024 15:45:53 +0800 Subject: [PATCH 0470/1578] LoongArch: Fix built-in DTB detection fdt_check_header(__dtb_start) will always success because kernel provides a dummy dtb, and by coincidence __dtb_start clashed with entry of this dummy dtb. The consequence is fdt passed from firmware will never be taken. Fix by trying to utilise __dtb_start only when CONFIG_BUILTIN_DTB is enabled. Cc: stable@vger.kernel.org Fixes: 7b937cc243e5 ("of: Create of_root if no dtb provided by firmware") Signed-off-by: Jiaxun Yang Signed-off-by: Huacai Chen --- arch/loongarch/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index 89ad756aeeed20..3d048f1be14388 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -282,7 +282,7 @@ static void __init fdt_setup(void) return; /* Prefer to use built-in dtb, checking its legality first. */ - if (!fdt_check_header(__dtb_start)) + if (IS_ENABLED(CONFIG_BUILTIN_DTB) && !fdt_check_header(__dtb_start)) fdt_pointer = __dtb_start; else fdt_pointer = efi_fdt_pointer(); /* Fallback to firmware dtb */ From 3de9c42d02a79a5e09bbee7a4421ddc00cfd5c6d Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Mon, 3 Jun 2024 15:45:53 +0800 Subject: [PATCH 0471/1578] LoongArch: Add all CPUs enabled by fdt to NUMA node 0 NUMA enabled kernel on FDT based machine fails to boot because CPUs are all in NUMA_NO_NODE and mm subsystem won't accept that. Fix by adding them to default NUMA node at FDT parsing phase and move numa_add_cpu(0) to a later point. Cc: stable@vger.kernel.org Fixes: 88d4d957edc7 ("LoongArch: Add FDT booting support from efi system table") Signed-off-by: Jiaxun Yang Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/numa.h | 1 + arch/loongarch/kernel/smp.c | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/loongarch/include/asm/numa.h b/arch/loongarch/include/asm/numa.h index 27f319b4986257..b5f9de9f102e44 100644 --- a/arch/loongarch/include/asm/numa.h +++ b/arch/loongarch/include/asm/numa.h @@ -56,6 +56,7 @@ extern int early_cpu_to_node(int cpu); static inline void early_numa_add_cpu(int cpuid, s16 node) { } static inline void numa_add_cpu(unsigned int cpu) { } static inline void numa_remove_cpu(unsigned int cpu) { } +static inline void set_cpuid_to_node(int cpuid, s16 node) { } static inline int early_cpu_to_node(int cpu) { diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c index 0dfe2388ef413b..1436d2465939b2 100644 --- a/arch/loongarch/kernel/smp.c +++ b/arch/loongarch/kernel/smp.c @@ -273,7 +273,6 @@ static void __init fdt_smp_setup(void) if (cpuid == loongson_sysconf.boot_cpu_id) { cpu = 0; - numa_add_cpu(cpu); } else { cpu = cpumask_next_zero(-1, cpu_present_mask); } @@ -283,6 +282,9 @@ static void __init fdt_smp_setup(void) set_cpu_present(cpu, true); __cpu_number_map[cpuid] = cpu; __cpu_logical_map[cpu] = cpuid; + + early_numa_add_cpu(cpu, 0); + set_cpuid_to_node(cpuid, 0); } loongson_sysconf.nr_cpus = num_processors; @@ -468,6 +470,7 @@ void smp_prepare_boot_cpu(void) set_cpu_possible(0, true); set_cpu_online(0, true); set_my_cpu_offset(per_cpu_offset(0)); + numa_add_cpu(0); rr_node = first_node(node_online_map); for_each_possible_cpu(cpu) { From beb2800074c15362cf9f6c7301120910046d6556 Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Mon, 3 Jun 2024 15:45:53 +0800 Subject: [PATCH 0472/1578] LoongArch: Fix entry point in kernel image header Currently kernel entry in head.S is in DMW address range, firmware is instructed to jump to this address after loading the kernel image. However kernel should not make any assumption on firmware's DMW setting, thus the entry point should be a physical address falls into direct translation region. Fix by converting entry address to physical and amend entry calculation logic in libstub accordingly. BTW, use ABSOLUTE() to calculate variables to make Clang/LLVM happy. Cc: stable@vger.kernel.org Signed-off-by: Jiaxun Yang Signed-off-by: Huacai Chen --- arch/loongarch/kernel/head.S | 2 +- arch/loongarch/kernel/vmlinux.lds.S | 10 ++++++---- drivers/firmware/efi/libstub/loongarch.c | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/loongarch/kernel/head.S b/arch/loongarch/kernel/head.S index c4f7de2e28054c..4677ea8fa8e98c 100644 --- a/arch/loongarch/kernel/head.S +++ b/arch/loongarch/kernel/head.S @@ -22,7 +22,7 @@ _head: .word MZ_MAGIC /* "MZ", MS-DOS header */ .org 0x8 - .dword kernel_entry /* Kernel entry point */ + .dword _kernel_entry /* Kernel entry point (physical address) */ .dword _kernel_asize /* Kernel image effective size */ .quad PHYS_LINK_KADDR /* Kernel image load offset from start of RAM */ .org 0x38 /* 0x20 ~ 0x37 reserved */ diff --git a/arch/loongarch/kernel/vmlinux.lds.S b/arch/loongarch/kernel/vmlinux.lds.S index e8e97dbf9ca40f..3c7595342730ed 100644 --- a/arch/loongarch/kernel/vmlinux.lds.S +++ b/arch/loongarch/kernel/vmlinux.lds.S @@ -6,6 +6,7 @@ #define PAGE_SIZE _PAGE_SIZE #define RO_EXCEPTION_TABLE_ALIGN 4 +#define PHYSADDR_MASK 0xffffffffffff /* 48-bit */ /* * Put .bss..swapper_pg_dir as the first thing in .bss. This will @@ -142,10 +143,11 @@ SECTIONS #ifdef CONFIG_EFI_STUB /* header symbols */ - _kernel_asize = _end - _text; - _kernel_fsize = _edata - _text; - _kernel_vsize = _end - __initdata_begin; - _kernel_rsize = _edata - __initdata_begin; + _kernel_entry = ABSOLUTE(kernel_entry & PHYSADDR_MASK); + _kernel_asize = ABSOLUTE(_end - _text); + _kernel_fsize = ABSOLUTE(_edata - _text); + _kernel_vsize = ABSOLUTE(_end - __initdata_begin); + _kernel_rsize = ABSOLUTE(_edata - __initdata_begin); #endif .gptab.sdata : { diff --git a/drivers/firmware/efi/libstub/loongarch.c b/drivers/firmware/efi/libstub/loongarch.c index 684c9354637c65..d0ef93551c44f6 100644 --- a/drivers/firmware/efi/libstub/loongarch.c +++ b/drivers/firmware/efi/libstub/loongarch.c @@ -41,7 +41,7 @@ static efi_status_t exit_boot_func(struct efi_boot_memmap *map, void *priv) unsigned long __weak kernel_entry_address(unsigned long kernel_addr, efi_loaded_image_t *image) { - return *(unsigned long *)(kernel_addr + 8) - VMLINUX_LOAD_ADDRESS + kernel_addr; + return *(unsigned long *)(kernel_addr + 8) - PHYSADDR(VMLINUX_LOAD_ADDRESS) + kernel_addr; } efi_status_t efi_boot_kernel(void *handle, efi_loaded_image_t *image, From 1098efd299ffe9c8af818425338c7f6c4f930a98 Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Mon, 3 Jun 2024 15:45:53 +0800 Subject: [PATCH 0473/1578] LoongArch: Override higher address bits in JUMP_VIRT_ADDR In JUMP_VIRT_ADDR we are performing an or calculation on address value directly from pcaddi. This will only work if we are currently running from direct 1:1 mapping addresses or firmware's DMW is configured exactly same as kernel. Still, we should not rely on such assumption. Fix by overriding higher bits in address comes from pcaddi, so we can get rid of or operator. Cc: stable@vger.kernel.org Signed-off-by: Jiaxun Yang Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/stackframe.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/loongarch/include/asm/stackframe.h b/arch/loongarch/include/asm/stackframe.h index 45b507a7b06fca..d9eafd3ee3d1e8 100644 --- a/arch/loongarch/include/asm/stackframe.h +++ b/arch/loongarch/include/asm/stackframe.h @@ -42,7 +42,7 @@ .macro JUMP_VIRT_ADDR temp1 temp2 li.d \temp1, CACHE_BASE pcaddi \temp2, 0 - or \temp1, \temp1, \temp2 + bstrins.d \temp1, \temp2, (DMW_PABITS - 1), 0 jirl zero, \temp1, 0xc .endm From eb36e520f4f1b690fd776f15cbac452f82ff7bfa Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Mon, 3 Jun 2024 15:45:53 +0800 Subject: [PATCH 0474/1578] LoongArch: Fix GMAC's phy-mode definitions in dts The GMAC of Loongson chips cannot insert the correct 1.5-2ns delay. So we need the PHY to insert internal delays for both transmit and receive data lines from/to the PHY device. Fix this by changing the "phy-mode" from "rgmii" to "rgmii-id" in dts. Signed-off-by: Huacai Chen --- arch/loongarch/boot/dts/loongson-2k0500-ref.dts | 4 ++-- arch/loongarch/boot/dts/loongson-2k1000-ref.dts | 4 ++-- arch/loongarch/boot/dts/loongson-2k2000-ref.dts | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/loongarch/boot/dts/loongson-2k0500-ref.dts b/arch/loongarch/boot/dts/loongson-2k0500-ref.dts index 8aefb0c1267229..a34734a6c3ce80 100644 --- a/arch/loongarch/boot/dts/loongson-2k0500-ref.dts +++ b/arch/loongarch/boot/dts/loongson-2k0500-ref.dts @@ -44,14 +44,14 @@ &gmac0 { status = "okay"; - phy-mode = "rgmii"; + phy-mode = "rgmii-id"; bus_id = <0x0>; }; &gmac1 { status = "okay"; - phy-mode = "rgmii"; + phy-mode = "rgmii-id"; bus_id = <0x1>; }; diff --git a/arch/loongarch/boot/dts/loongson-2k1000-ref.dts b/arch/loongarch/boot/dts/loongson-2k1000-ref.dts index 8463fe035386e4..23cf26cc3e5f19 100644 --- a/arch/loongarch/boot/dts/loongson-2k1000-ref.dts +++ b/arch/loongarch/boot/dts/loongson-2k1000-ref.dts @@ -43,7 +43,7 @@ &gmac0 { status = "okay"; - phy-mode = "rgmii"; + phy-mode = "rgmii-id"; phy-handle = <&phy0>; mdio { compatible = "snps,dwmac-mdio"; @@ -58,7 +58,7 @@ &gmac1 { status = "okay"; - phy-mode = "rgmii"; + phy-mode = "rgmii-id"; phy-handle = <&phy1>; mdio { compatible = "snps,dwmac-mdio"; diff --git a/arch/loongarch/boot/dts/loongson-2k2000-ref.dts b/arch/loongarch/boot/dts/loongson-2k2000-ref.dts index 74b99bd234cc38..ea9e6985d0e9fc 100644 --- a/arch/loongarch/boot/dts/loongson-2k2000-ref.dts +++ b/arch/loongarch/boot/dts/loongson-2k2000-ref.dts @@ -92,7 +92,7 @@ &gmac2 { status = "okay"; - phy-mode = "rgmii"; + phy-mode = "rgmii-id"; phy-handle = <&phy2>; mdio { compatible = "snps,dwmac-mdio"; From 537a350d14321c8cca5efbf0a33a404fec3a9f9e Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 4 May 2024 21:57:30 +0200 Subject: [PATCH 0475/1578] batman-adv: Don't accept TT entries for out-of-spec VIDs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The internal handling of VLAN IDs in batman-adv is only specified for following encodings: * VLAN is used - bit 15 is 1 - bit 11 - bit 0 is the VLAN ID (0-4095) - remaining bits are 0 * No VLAN is used - bit 15 is 0 - remaining bits are 0 batman-adv was only preparing new translation table entries (based on its soft interface information) using this encoding format. But the receive path was never checking if entries in the roam or TT TVLVs were also following this encoding. It was therefore possible to create more than the expected maximum of 4096 + 1 entries in the originator VLAN list. Simply by setting the "remaining bits" to "random" values in corresponding TVLV. Cc: stable@vger.kernel.org Fixes: 7ea7b4a14275 ("batman-adv: make the TT CRC logic VLAN specific") Reported-by: Linus Lüssing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/originator.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index ac74f6ead62d5e..8f6dd2c6ee41d6 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -131,6 +132,29 @@ batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node, return vlan; } +/** + * batadv_vlan_id_valid() - check if vlan id is in valid batman-adv encoding + * @vid: the VLAN identifier + * + * Return: true when either no vlan is set or if VLAN is in correct range, + * false otherwise + */ +static bool batadv_vlan_id_valid(unsigned short vid) +{ + unsigned short non_vlan = vid & ~(BATADV_VLAN_HAS_TAG | VLAN_VID_MASK); + + if (vid == 0) + return true; + + if (!(vid & BATADV_VLAN_HAS_TAG)) + return false; + + if (non_vlan) + return false; + + return true; +} + /** * batadv_orig_node_vlan_new() - search and possibly create an orig_node_vlan * object @@ -149,6 +173,9 @@ batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node, { struct batadv_orig_node_vlan *vlan; + if (!batadv_vlan_id_valid(vid)) + return NULL; + spin_lock_bh(&orig_node->vlan_list_lock); /* first look if an object for this vid already exists */ From 629f2b4e05225e53125aaf7ff0b87d5d53897128 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Tue, 28 May 2024 11:08:32 +0800 Subject: [PATCH 0476/1578] drm/panel: sitronix-st7789v: Add check for of_drm_get_panel_orientation Add check for the return value of of_drm_get_panel_orientation() and return the error if it fails in order to catch the error. Fixes: b27c0f6d208d ("drm/panel: sitronix-st7789v: add panel orientation support") Signed-off-by: Chen Ni Reviewed-by: Michael Riesch Acked-by: Jessica Zhang Link: https://lore.kernel.org/r/20240528030832.2529471-1-nichen@iscas.ac.cn Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20240528030832.2529471-1-nichen@iscas.ac.cn --- drivers/gpu/drm/panel/panel-sitronix-st7789v.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/panel/panel-sitronix-st7789v.c b/drivers/gpu/drm/panel/panel-sitronix-st7789v.c index e8f385b9c6182c..28bfc48a912729 100644 --- a/drivers/gpu/drm/panel/panel-sitronix-st7789v.c +++ b/drivers/gpu/drm/panel/panel-sitronix-st7789v.c @@ -643,7 +643,9 @@ static int st7789v_probe(struct spi_device *spi) if (ret) return dev_err_probe(dev, ret, "Failed to get backlight\n"); - of_drm_get_panel_orientation(spi->dev.of_node, &ctx->orientation); + ret = of_drm_get_panel_orientation(spi->dev.of_node, &ctx->orientation); + if (ret) + return dev_err_probe(&spi->dev, ret, "Failed to get orientation\n"); drm_panel_add(&ctx->panel); From 38a38f5a36da9820680d413972cb733349400532 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 25 May 2024 21:38:53 +0200 Subject: [PATCH 0477/1578] Input: silead - Always support 10 fingers When support for Silead touchscreens was orginal added some touchscreens with older firmware versions only supported 5 fingers and this was made the default requiring the setting of a "silead,max-fingers=10" uint32 device-property for all touchscreen models which do support 10 fingers. There are very few models with the old 5 finger fw, so in practice the setting of the "silead,max-fingers=10" is boilerplate which needs to be copy and pasted to every touchscreen config. Reporting that 10 fingers are supported on devices which only support 5 fingers doesn't cause any problems for userspace in practice, since at max 4 finger gestures are supported anyways. Drop the max_fingers configuration and simply always assume 10 fingers. Signed-off-by: Hans de Goede Acked-by: Dmitry Torokhov Link: https://lore.kernel.org/r/20240525193854.39130-2-hdegoede@redhat.com --- drivers/input/touchscreen/silead.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/drivers/input/touchscreen/silead.c b/drivers/input/touchscreen/silead.c index bbd366dcb69af9..6a42b27c459901 100644 --- a/drivers/input/touchscreen/silead.c +++ b/drivers/input/touchscreen/silead.c @@ -71,7 +71,6 @@ struct silead_ts_data { struct regulator_bulk_data regulators[2]; char fw_name[64]; struct touchscreen_properties prop; - u32 max_fingers; u32 chip_id; struct input_mt_pos pos[SILEAD_MAX_FINGERS]; int slots[SILEAD_MAX_FINGERS]; @@ -136,7 +135,7 @@ static int silead_ts_request_input_dev(struct silead_ts_data *data) touchscreen_parse_properties(data->input, true, &data->prop); silead_apply_efi_fw_min_max(data); - input_mt_init_slots(data->input, data->max_fingers, + input_mt_init_slots(data->input, SILEAD_MAX_FINGERS, INPUT_MT_DIRECT | INPUT_MT_DROP_UNUSED | INPUT_MT_TRACK); @@ -256,10 +255,10 @@ static void silead_ts_read_data(struct i2c_client *client) return; } - if (buf[0] > data->max_fingers) { + if (buf[0] > SILEAD_MAX_FINGERS) { dev_warn(dev, "More touches reported then supported %d > %d\n", - buf[0], data->max_fingers); - buf[0] = data->max_fingers; + buf[0], SILEAD_MAX_FINGERS); + buf[0] = SILEAD_MAX_FINGERS; } if (silead_ts_handle_pen_data(data, buf)) @@ -315,7 +314,6 @@ static void silead_ts_read_data(struct i2c_client *client) static int silead_ts_init(struct i2c_client *client) { - struct silead_ts_data *data = i2c_get_clientdata(client); int error; error = i2c_smbus_write_byte_data(client, SILEAD_REG_RESET, @@ -325,7 +323,7 @@ static int silead_ts_init(struct i2c_client *client) usleep_range(SILEAD_CMD_SLEEP_MIN, SILEAD_CMD_SLEEP_MAX); error = i2c_smbus_write_byte_data(client, SILEAD_REG_TOUCH_NR, - data->max_fingers); + SILEAD_MAX_FINGERS); if (error) goto i2c_write_err; usleep_range(SILEAD_CMD_SLEEP_MIN, SILEAD_CMD_SLEEP_MAX); @@ -591,13 +589,6 @@ static void silead_ts_read_props(struct i2c_client *client) const char *str; int error; - error = device_property_read_u32(dev, "silead,max-fingers", - &data->max_fingers); - if (error) { - dev_dbg(dev, "Max fingers read error %d\n", error); - data->max_fingers = 5; /* Most devices handle up-to 5 fingers */ - } - error = device_property_read_string(dev, "firmware-name", &str); if (!error) snprintf(data->fw_name, sizeof(data->fw_name), From 84b26f509c1b9c8b3c3c63d75912701f151fd148 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 25 May 2024 21:38:54 +0200 Subject: [PATCH 0478/1578] platform/x86: touchscreen_dmi: Drop "silead,max-fingers" property The silead touchscreen driver now defaults to 10 fingers, so it is no longer necessary to have a "silead,max-fingers=10" property for each silead touchscreen model. Drop this property from all the configs. Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240525193854.39130-3-hdegoede@redhat.com --- drivers/platform/x86/touchscreen_dmi.c | 57 -------------------------- 1 file changed, 57 deletions(-) diff --git a/drivers/platform/x86/touchscreen_dmi.c b/drivers/platform/x86/touchscreen_dmi.c index 2d9ca2292ea19d..3f626584f2c3fb 100644 --- a/drivers/platform/x86/touchscreen_dmi.c +++ b/drivers/platform/x86/touchscreen_dmi.c @@ -34,7 +34,6 @@ static const struct property_entry archos_101_cesium_educ_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-y", 1280), PROPERTY_ENTRY_BOOL("touchscreen-inverted-y"), PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-archos-101-cesium-educ.fw"), { } @@ -49,7 +48,6 @@ static const struct property_entry bush_bush_windows_tablet_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-x", 1850), PROPERTY_ENTRY_U32("touchscreen-size-y", 1280), PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-bush-bush-windows-tablet.fw"), { } @@ -79,7 +77,6 @@ static const struct property_entry chuwi_hi8_air_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-y", 1148), PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl3676-chuwi-hi8-air.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), { } }; @@ -95,7 +92,6 @@ static const struct property_entry chuwi_hi8_pro_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-y", 1148), PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl3680-chuwi-hi8-pro.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -123,7 +119,6 @@ static const struct property_entry chuwi_hi10_air_props[] = { PROPERTY_ENTRY_U32("touchscreen-fuzz-x", 5), PROPERTY_ENTRY_U32("touchscreen-fuzz-y", 4), PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-chuwi-hi10-air.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -139,7 +134,6 @@ static const struct property_entry chuwi_hi10_plus_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-x", 1908), PROPERTY_ENTRY_U32("touchscreen-size-y", 1270), PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-chuwi-hi10plus.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), PROPERTY_ENTRY_BOOL("silead,pen-supported"), PROPERTY_ENTRY_U32("silead,pen-resolution-x", 8), @@ -171,7 +165,6 @@ static const struct property_entry chuwi_hi10_pro_props[] = { PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-chuwi-hi10-pro.fw"), PROPERTY_ENTRY_U32_ARRAY("silead,efi-fw-min-max", chuwi_hi10_pro_efi_min_max), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), PROPERTY_ENTRY_BOOL("silead,pen-supported"), PROPERTY_ENTRY_U32("silead,pen-resolution-x", 8), @@ -201,7 +194,6 @@ static const struct property_entry chuwi_hibook_props[] = { PROPERTY_ENTRY_BOOL("touchscreen-inverted-y"), PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-chuwi-hibook.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -227,7 +219,6 @@ static const struct property_entry chuwi_vi8_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-y", 1140), PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl3676-chuwi-vi8.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -255,7 +246,6 @@ static const struct property_entry chuwi_vi10_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-x", 1858), PROPERTY_ENTRY_U32("touchscreen-size-y", 1280), PROPERTY_ENTRY_STRING("firmware-name", "gsl3680-chuwi-vi10.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -271,7 +261,6 @@ static const struct property_entry chuwi_surbook_mini_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-x", 2040), PROPERTY_ENTRY_U32("touchscreen-size-y", 1524), PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-chuwi-surbook-mini.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("touchscreen-inverted-y"), { } }; @@ -289,7 +278,6 @@ static const struct property_entry connect_tablet9_props[] = { PROPERTY_ENTRY_BOOL("touchscreen-inverted-y"), PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-connect-tablet9.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), { } }; @@ -306,7 +294,6 @@ static const struct property_entry csl_panther_tab_hd_props[] = { PROPERTY_ENTRY_BOOL("touchscreen-inverted-y"), PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-csl-panther-tab-hd.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), { } }; @@ -322,7 +309,6 @@ static const struct property_entry cube_iwork8_air_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-y", 896), PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl3670-cube-iwork8-air.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), { } }; @@ -346,7 +332,6 @@ static const struct property_entry cube_knote_i1101_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-x", 1961), PROPERTY_ENTRY_U32("touchscreen-size-y", 1513), PROPERTY_ENTRY_STRING("firmware-name", "gsl3692-cube-knote-i1101.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -360,7 +345,6 @@ static const struct property_entry dexp_ursus_7w_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-x", 890), PROPERTY_ENTRY_U32("touchscreen-size-y", 630), PROPERTY_ENTRY_STRING("firmware-name", "gsl1686-dexp-ursus-7w.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -376,7 +360,6 @@ static const struct property_entry dexp_ursus_kx210i_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-x", 1720), PROPERTY_ENTRY_U32("touchscreen-size-y", 1137), PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-dexp-ursus-kx210i.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -391,7 +374,6 @@ static const struct property_entry digma_citi_e200_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-y", 1500), PROPERTY_ENTRY_BOOL("touchscreen-inverted-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl1686-digma_citi_e200.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -450,7 +432,6 @@ static const struct property_entry irbis_tw90_props[] = { PROPERTY_ENTRY_BOOL("touchscreen-inverted-y"), PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl3680-irbis_tw90.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -466,7 +447,6 @@ static const struct property_entry irbis_tw118_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-x", 1960), PROPERTY_ENTRY_U32("touchscreen-size-y", 1510), PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-irbis-tw118.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), { } }; @@ -483,7 +463,6 @@ static const struct property_entry itworks_tw891_props[] = { PROPERTY_ENTRY_BOOL("touchscreen-inverted-y"), PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl3670-itworks-tw891.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), { } }; @@ -496,7 +475,6 @@ static const struct property_entry jumper_ezpad_6_pro_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-x", 1980), PROPERTY_ENTRY_U32("touchscreen-size-y", 1500), PROPERTY_ENTRY_STRING("firmware-name", "gsl3692-jumper-ezpad-6-pro.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -511,7 +489,6 @@ static const struct property_entry jumper_ezpad_6_pro_b_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-y", 1500), PROPERTY_ENTRY_STRING("firmware-name", "gsl3692-jumper-ezpad-6-pro-b.fw"), PROPERTY_ENTRY_BOOL("touchscreen-inverted-y"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -527,7 +504,6 @@ static const struct property_entry jumper_ezpad_6_m4_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-x", 1950), PROPERTY_ENTRY_U32("touchscreen-size-y", 1525), PROPERTY_ENTRY_STRING("firmware-name", "gsl3692-jumper-ezpad-6-m4.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -544,7 +520,6 @@ static const struct property_entry jumper_ezpad_7_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-y", 1526), PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl3680-jumper-ezpad-7.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,stuck-controller-bug"), { } }; @@ -561,7 +536,6 @@ static const struct property_entry jumper_ezpad_mini3_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-y", 1138), PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl3676-jumper-ezpad-mini3.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), { } }; @@ -578,7 +552,6 @@ static const struct property_entry mpman_converter9_props[] = { PROPERTY_ENTRY_BOOL("touchscreen-inverted-y"), PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-mpman-converter9.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), { } }; @@ -594,7 +567,6 @@ static const struct property_entry mpman_mpwin895cl_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-y", 1150), PROPERTY_ENTRY_BOOL("touchscreen-inverted-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl3680-mpman-mpwin895cl.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -611,7 +583,6 @@ static const struct property_entry myria_my8307_props[] = { PROPERTY_ENTRY_BOOL("touchscreen-inverted-y"), PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-myria-my8307.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -628,7 +599,6 @@ static const struct property_entry onda_obook_20_plus_props[] = { PROPERTY_ENTRY_BOOL("touchscreen-inverted-y"), PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl3676-onda-obook-20-plus.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -645,7 +615,6 @@ static const struct property_entry onda_v80_plus_v3_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-y", 1140), PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl3676-onda-v80-plus-v3.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -669,7 +638,6 @@ static const struct property_entry onda_v820w_32g_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-y", 1140), PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-onda-v820w-32g.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -687,7 +655,6 @@ static const struct property_entry onda_v891_v5_props[] = { PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl3676-onda-v891-v5.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -703,7 +670,6 @@ static const struct property_entry onda_v891w_v1_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-x", 1676), PROPERTY_ENTRY_U32("touchscreen-size-y", 1130), PROPERTY_ENTRY_STRING("firmware-name", "gsl3680-onda-v891w-v1.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -720,7 +686,6 @@ static const struct property_entry onda_v891w_v3_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-y", 1135), PROPERTY_ENTRY_BOOL("touchscreen-inverted-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl3676-onda-v891w-v3.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -759,7 +724,6 @@ static const struct property_entry pipo_w11_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-x", 1984), PROPERTY_ENTRY_U32("touchscreen-size-y", 1532), PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-pipo-w11.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -775,7 +739,6 @@ static const struct property_entry positivo_c4128b_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-x", 1915), PROPERTY_ENTRY_U32("touchscreen-size-y", 1269), PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-positivo-c4128b.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), { } }; @@ -791,7 +754,6 @@ static const struct property_entry pov_mobii_wintab_p800w_v20_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-y", 1146), PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl3680-pov-mobii-wintab-p800w-v20.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -808,7 +770,6 @@ static const struct property_entry pov_mobii_wintab_p800w_v21_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-y", 1148), PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl3692-pov-mobii-wintab-p800w.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -825,7 +786,6 @@ static const struct property_entry pov_mobii_wintab_p1006w_v10_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-y", 1520), PROPERTY_ENTRY_BOOL("touchscreen-inverted-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl3692-pov-mobii-wintab-p1006w-v10.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -842,7 +802,6 @@ static const struct property_entry predia_basic_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-y", 1144), PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl3680-predia-basic.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -859,7 +818,6 @@ static const struct property_entry rca_cambio_w101_v2_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-y", 874), PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-rca-cambio-w101-v2.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), { } }; @@ -874,7 +832,6 @@ static const struct property_entry rwc_nanote_p8_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-y", 1140), PROPERTY_ENTRY_BOOL("touchscreen-inverted-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-rwc-nanote-p8.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), { } }; @@ -890,7 +847,6 @@ static const struct property_entry schneider_sct101ctm_props[] = { PROPERTY_ENTRY_BOOL("touchscreen-inverted-y"), PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-schneider-sct101ctm.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -906,7 +862,6 @@ static const struct property_entry globalspace_solt_ivw116_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-x", 1723), PROPERTY_ENTRY_U32("touchscreen-size-y", 1077), PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-globalspace-solt-ivw116.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -923,7 +878,6 @@ static const struct property_entry techbite_arc_11_6_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-y", 1270), PROPERTY_ENTRY_BOOL("touchscreen-inverted-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-techbite-arc-11-6.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), { } }; @@ -939,7 +893,6 @@ static const struct property_entry teclast_tbook11_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-y", 1264), PROPERTY_ENTRY_BOOL("touchscreen-inverted-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl3692-teclast-tbook11.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -965,7 +918,6 @@ static const struct property_entry teclast_x16_plus_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-y", 1264), PROPERTY_ENTRY_BOOL("touchscreen-inverted-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl3692-teclast-x16-plus.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -988,7 +940,6 @@ static const struct property_entry teclast_x3_plus_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-x", 1980), PROPERTY_ENTRY_U32("touchscreen-size-y", 1500), PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-teclast-x3-plus.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -1004,7 +955,6 @@ static const struct property_entry teclast_x98plus2_props[] = { PROPERTY_ENTRY_BOOL("touchscreen-inverted-x"), PROPERTY_ENTRY_BOOL("touchscreen-inverted-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl1686-teclast_x98plus2.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), { } }; @@ -1018,7 +968,6 @@ static const struct property_entry trekstor_primebook_c11_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-y", 1530), PROPERTY_ENTRY_BOOL("touchscreen-inverted-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-trekstor-primebook-c11.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -1032,7 +981,6 @@ static const struct property_entry trekstor_primebook_c13_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-x", 2624), PROPERTY_ENTRY_U32("touchscreen-size-y", 1920), PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-trekstor-primebook-c13.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -1046,7 +994,6 @@ static const struct property_entry trekstor_primetab_t13b_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-x", 2500), PROPERTY_ENTRY_U32("touchscreen-size-y", 1900), PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-trekstor-primetab-t13b.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), PROPERTY_ENTRY_BOOL("touchscreen-inverted-y"), { } @@ -1074,7 +1021,6 @@ static const struct property_entry trekstor_surftab_twin_10_1_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-y", 1280), PROPERTY_ENTRY_U32("touchscreen-inverted-y", 1), PROPERTY_ENTRY_STRING("firmware-name", "gsl3670-surftab-twin-10-1-st10432-8.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -1090,7 +1036,6 @@ static const struct property_entry trekstor_surftab_wintron70_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-x", 884), PROPERTY_ENTRY_U32("touchscreen-size-y", 632), PROPERTY_ENTRY_STRING("firmware-name", "gsl1686-surftab-wintron70-st70416-6.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -1107,7 +1052,6 @@ static const struct property_entry viglen_connect_10_props[] = { PROPERTY_ENTRY_U32("touchscreen-fuzz-y", 6), PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), PROPERTY_ENTRY_STRING("firmware-name", "gsl3680-viglen-connect-10.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; @@ -1121,7 +1065,6 @@ static const struct property_entry vinga_twizzle_j116_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-x", 1920), PROPERTY_ENTRY_U32("touchscreen-size-y", 1280), PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-vinga-twizzle_j116.fw"), - PROPERTY_ENTRY_U32("silead,max-fingers", 10), PROPERTY_ENTRY_BOOL("silead,home-button"), { } }; From 55624db051c8bdd39acbb1f35774c3f7b97c07b8 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Sun, 2 Jun 2024 11:58:00 +0300 Subject: [PATCH 0479/1578] platform/x86: touchscreen_dmi: Use 2-argument strscpy() Use 2-argument strscpy(), which is not only shorter but also provides an additional check that destination buffer is an array. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240602090244.1666360-8-andy.shevchenko@gmail.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/touchscreen_dmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/touchscreen_dmi.c b/drivers/platform/x86/touchscreen_dmi.c index 3f626584f2c3fb..f74af0a689f20c 100644 --- a/drivers/platform/x86/touchscreen_dmi.c +++ b/drivers/platform/x86/touchscreen_dmi.c @@ -1850,7 +1850,7 @@ static int __init ts_parse_props(char *str) u32 u32val; int i, ret; - strscpy(orig_str, str, sizeof(orig_str)); + strscpy(orig_str, str); /* * str is part of the static_command_line from init/main.c and poking From 078fc56f5c1787a9272373742f817b320c046d1a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 28 May 2024 13:59:33 +0200 Subject: [PATCH 0480/1578] platform/x86: yt2-1380: add CONFIG_EXTCON dependency This driver uses the extcon subsystem and fails to build when it cannot call into that subsystem: x86_64-linux-ld: vmlinux.o: in function `yt2_1380_fc_worker': lenovo-yoga-tab2-pro-1380-fastcharger.c:(.text+0xa9d819): undefined reference to `extcon_get_state' x86_64-linux-ld: lenovo-yoga-tab2-pro-1380-fastcharger.c:(.text+0xa9d853): undefined reference to `extcon_get_state' x86_64-linux-ld: vmlinux.o: in function `yt2_1380_fc_serdev_probe': lenovo-yoga-tab2-pro-1380-fastcharger.c:(.text+0xa9da22): undefined reference to `extcon_get_extcon_dev' x86_64-linux-ld: lenovo-yoga-tab2-pro-1380-fastcharger.c:(.text+0xa9dc0c): undefined reference to `devm_extcon_register_notifier_all' Add a Kconfig dependency to make it it always builds correctly. Fixes: b2ed33e8d486 ("platform/x86: Add lenovo-yoga-tab2-pro-1380-fastcharger driver") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20240528115940.3169455-1-arnd@kernel.org Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 1953317541eab8..665fa952498659 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -136,6 +136,7 @@ config YOGABOOK config YT2_1380 tristate "Lenovo Yoga Tablet 2 1380 fast charge driver" depends on SERIAL_DEV_BUS + depends on EXTCON depends on ACPI help Say Y here to enable support for the custom fast charging protocol From 1981b296f858010eae409548fd297659b2cc570e Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Tue, 28 May 2024 22:49:02 +0200 Subject: [PATCH 0481/1578] platform/x86: dell-smbios: Fix wrong token data in sysfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When reading token data from sysfs on my Inspiron 3505, the token locations and values are wrong. This happens because match_attribute() blindly assumes that all entries in da_tokens have an associated entry in token_attrs. This however is not true as soon as da_tokens[] contains zeroed token entries. Those entries are being skipped when initialising token_attrs, breaking the core assumption of match_attribute(). Fix this by defining an extra struct for each pair of token attributes and use container_of() to retrieve token information. Tested on a Dell Inspiron 3050. Fixes: 33b9ca1e53b4 ("platform/x86: dell-smbios: Add a sysfs interface for SMBIOS tokens") Signed-off-by: Armin Wolf Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20240528204903.445546-1-W_Armin@gmx.de Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/dell/dell-smbios-base.c | 92 ++++++++------------ 1 file changed, 36 insertions(+), 56 deletions(-) diff --git a/drivers/platform/x86/dell/dell-smbios-base.c b/drivers/platform/x86/dell/dell-smbios-base.c index e61bfaf8b5c48f..86b95206cb1bd6 100644 --- a/drivers/platform/x86/dell/dell-smbios-base.c +++ b/drivers/platform/x86/dell/dell-smbios-base.c @@ -11,6 +11,7 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -25,11 +26,16 @@ static u32 da_supported_commands; static int da_num_tokens; static struct platform_device *platform_device; static struct calling_interface_token *da_tokens; -static struct device_attribute *token_location_attrs; -static struct device_attribute *token_value_attrs; +static struct token_sysfs_data *token_entries; static struct attribute **token_attrs; static DEFINE_MUTEX(smbios_mutex); +struct token_sysfs_data { + struct device_attribute location_attr; + struct device_attribute value_attr; + struct calling_interface_token *token; +}; + struct smbios_device { struct list_head list; struct device *device; @@ -416,47 +422,26 @@ static void __init find_tokens(const struct dmi_header *dm, void *dummy) } } -static int match_attribute(struct device *dev, - struct device_attribute *attr) -{ - int i; - - for (i = 0; i < da_num_tokens * 2; i++) { - if (!token_attrs[i]) - continue; - if (strcmp(token_attrs[i]->name, attr->attr.name) == 0) - return i/2; - } - dev_dbg(dev, "couldn't match: %s\n", attr->attr.name); - return -EINVAL; -} - static ssize_t location_show(struct device *dev, struct device_attribute *attr, char *buf) { - int i; + struct token_sysfs_data *data = container_of(attr, struct token_sysfs_data, location_attr); if (!capable(CAP_SYS_ADMIN)) return -EPERM; - i = match_attribute(dev, attr); - if (i > 0) - return sysfs_emit(buf, "%08x", da_tokens[i].location); - return 0; + return sysfs_emit(buf, "%08x", data->token->location); } static ssize_t value_show(struct device *dev, struct device_attribute *attr, char *buf) { - int i; + struct token_sysfs_data *data = container_of(attr, struct token_sysfs_data, value_attr); if (!capable(CAP_SYS_ADMIN)) return -EPERM; - i = match_attribute(dev, attr); - if (i > 0) - return sysfs_emit(buf, "%08x", da_tokens[i].value); - return 0; + return sysfs_emit(buf, "%08x", data->token->value); } static struct attribute_group smbios_attribute_group = { @@ -473,22 +458,15 @@ static int build_tokens_sysfs(struct platform_device *dev) { char *location_name; char *value_name; - size_t size; int ret; int i, j; - /* (number of tokens + 1 for null terminated */ - size = sizeof(struct device_attribute) * (da_num_tokens + 1); - token_location_attrs = kzalloc(size, GFP_KERNEL); - if (!token_location_attrs) + token_entries = kcalloc(da_num_tokens, sizeof(*token_entries), GFP_KERNEL); + if (!token_entries) return -ENOMEM; - token_value_attrs = kzalloc(size, GFP_KERNEL); - if (!token_value_attrs) - goto out_allocate_value; /* need to store both location and value + terminator*/ - size = sizeof(struct attribute *) * ((2 * da_num_tokens) + 1); - token_attrs = kzalloc(size, GFP_KERNEL); + token_attrs = kcalloc((2 * da_num_tokens) + 1, sizeof(*token_attrs), GFP_KERNEL); if (!token_attrs) goto out_allocate_attrs; @@ -496,27 +474,32 @@ static int build_tokens_sysfs(struct platform_device *dev) /* skip empty */ if (da_tokens[i].tokenID == 0) continue; + + token_entries[i].token = &da_tokens[i]; + /* add location */ location_name = kasprintf(GFP_KERNEL, "%04x_location", da_tokens[i].tokenID); if (location_name == NULL) goto out_unwind_strings; - sysfs_attr_init(&token_location_attrs[i].attr); - token_location_attrs[i].attr.name = location_name; - token_location_attrs[i].attr.mode = 0444; - token_location_attrs[i].show = location_show; - token_attrs[j++] = &token_location_attrs[i].attr; + + sysfs_attr_init(&token_entries[i].location_attr.attr); + token_entries[i].location_attr.attr.name = location_name; + token_entries[i].location_attr.attr.mode = 0444; + token_entries[i].location_attr.show = location_show; + token_attrs[j++] = &token_entries[i].location_attr.attr; /* add value */ value_name = kasprintf(GFP_KERNEL, "%04x_value", da_tokens[i].tokenID); if (value_name == NULL) goto loop_fail_create_value; - sysfs_attr_init(&token_value_attrs[i].attr); - token_value_attrs[i].attr.name = value_name; - token_value_attrs[i].attr.mode = 0444; - token_value_attrs[i].show = value_show; - token_attrs[j++] = &token_value_attrs[i].attr; + + sysfs_attr_init(&token_entries[i].value_attr.attr); + token_entries[i].value_attr.attr.name = value_name; + token_entries[i].value_attr.attr.mode = 0444; + token_entries[i].value_attr.show = value_show; + token_attrs[j++] = &token_entries[i].value_attr.attr; continue; loop_fail_create_value: @@ -532,14 +515,12 @@ static int build_tokens_sysfs(struct platform_device *dev) out_unwind_strings: while (i--) { - kfree(token_location_attrs[i].attr.name); - kfree(token_value_attrs[i].attr.name); + kfree(token_entries[i].location_attr.attr.name); + kfree(token_entries[i].value_attr.attr.name); } kfree(token_attrs); out_allocate_attrs: - kfree(token_value_attrs); -out_allocate_value: - kfree(token_location_attrs); + kfree(token_entries); return -ENOMEM; } @@ -551,12 +532,11 @@ static void free_group(struct platform_device *pdev) sysfs_remove_group(&pdev->dev.kobj, &smbios_attribute_group); for (i = 0; i < da_num_tokens; i++) { - kfree(token_location_attrs[i].attr.name); - kfree(token_value_attrs[i].attr.name); + kfree(token_entries[i].location_attr.attr.name); + kfree(token_entries[i].value_attr.attr.name); } kfree(token_attrs); - kfree(token_value_attrs); - kfree(token_location_attrs); + kfree(token_entries); } static int __init dell_smbios_init(void) From 306aec7eea8c83e8212f4dd6e5b358c508e3e466 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Tue, 28 May 2024 22:49:03 +0200 Subject: [PATCH 0482/1578] platform/x86: dell-smbios: Simplify error handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the allocation of value_name fails, the error handling code uses two gotos for error handling, which is not necessary. Simplify the error handling in this case by only using a single goto. Tested on a Dell Inspiron 3505. Signed-off-by: Armin Wolf Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20240528204903.445546-2-W_Armin@gmx.de Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/dell/dell-smbios-base.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/platform/x86/dell/dell-smbios-base.c b/drivers/platform/x86/dell/dell-smbios-base.c index 86b95206cb1bd6..b562ed99ec4e79 100644 --- a/drivers/platform/x86/dell/dell-smbios-base.c +++ b/drivers/platform/x86/dell/dell-smbios-base.c @@ -492,19 +492,16 @@ static int build_tokens_sysfs(struct platform_device *dev) /* add value */ value_name = kasprintf(GFP_KERNEL, "%04x_value", da_tokens[i].tokenID); - if (value_name == NULL) - goto loop_fail_create_value; + if (!value_name) { + kfree(location_name); + goto out_unwind_strings; + } sysfs_attr_init(&token_entries[i].value_attr.attr); token_entries[i].value_attr.attr.name = value_name; token_entries[i].value_attr.attr.mode = 0444; token_entries[i].value_attr.show = value_show; token_attrs[j++] = &token_entries[i].value_attr.attr; - continue; - -loop_fail_create_value: - kfree(location_name); - goto out_unwind_strings; } smbios_attribute_group.attrs = token_attrs; From 77f1972bdcf7513293e8bbe376b9fe837310ee9c Mon Sep 17 00:00:00 2001 From: Suma Hegde Date: Mon, 3 Jun 2024 08:15:12 +0000 Subject: [PATCH 0483/1578] platform/x86/amd/hsmp: Check HSMP support on AMD family of processors HSMP interface is supported only on few x86 processors from AMD. Accessing HSMP registers on rest of the platforms might cause unexpected behaviour. So add a check. Also unavailability of this interface on rest of the processors is not an error. Hence, use pr_info() instead of the pr_err() to log the message. Signed-off-by: Suma Hegde Reviewed-by: Naveen Krishna Chatradhi Link: https://lore.kernel.org/r/20240603081512.142909-1-suma.hegde@amd.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/amd/hsmp.c | 50 ++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/drivers/platform/x86/amd/hsmp.c b/drivers/platform/x86/amd/hsmp.c index d84ea66eecc6b6..8fcf38eed7f00e 100644 --- a/drivers/platform/x86/amd/hsmp.c +++ b/drivers/platform/x86/amd/hsmp.c @@ -907,16 +907,44 @@ static int hsmp_plat_dev_register(void) return ret; } +/* + * This check is only needed for backward compatibility of previous platforms. + * All new platforms are expected to support ACPI based probing. + */ +static bool legacy_hsmp_support(void) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return false; + + switch (boot_cpu_data.x86) { + case 0x19: + switch (boot_cpu_data.x86_model) { + case 0x00 ... 0x1F: + case 0x30 ... 0x3F: + case 0x90 ... 0x9F: + case 0xA0 ... 0xAF: + return true; + default: + return false; + } + case 0x1A: + switch (boot_cpu_data.x86_model) { + case 0x00 ... 0x1F: + return true; + default: + return false; + } + default: + return false; + } + + return false; +} + static int __init hsmp_plt_init(void) { int ret = -ENODEV; - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD || boot_cpu_data.x86 < 0x19) { - pr_err("HSMP is not supported on Family:%x model:%x\n", - boot_cpu_data.x86, boot_cpu_data.x86_model); - return ret; - } - /* * amd_nb_num() returns number of SMN/DF interfaces present in the system * if we have N SMN/DF interfaces that ideally means N sockets @@ -930,7 +958,15 @@ static int __init hsmp_plt_init(void) return ret; if (!plat_dev.is_acpi_device) { - ret = hsmp_plat_dev_register(); + if (legacy_hsmp_support()) { + /* Not ACPI device, but supports HSMP, register a plat_dev */ + ret = hsmp_plat_dev_register(); + } else { + /* Not ACPI, Does not support HSMP */ + pr_info("HSMP is not supported on Family:%x model:%x\n", + boot_cpu_data.x86, boot_cpu_data.x86_model); + ret = -ENODEV; + } if (ret) platform_driver_unregister(&amd_hsmp_driver); } From 0110c4b110477bb1f19b0d02361846be7ab08300 Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Mon, 27 May 2024 13:41:13 +0530 Subject: [PATCH 0484/1578] irqchip/riscv-intc: Prevent memory leak when riscv_intc_init_common() fails When riscv_intc_init_common() fails, the firmware node allocated is not freed. Add the missing free(). Fixes: 7023b9d83f03 ("irqchip/riscv-intc: Add ACPI support") Signed-off-by: Sunil V L Signed-off-by: Thomas Gleixner Reviewed-by: Anup Patel Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240527081113.616189-1-sunilvl@ventanamicro.com --- drivers/irqchip/irq-riscv-intc.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-riscv-intc.c b/drivers/irqchip/irq-riscv-intc.c index 9e71c442881412..4f3a12383a1e41 100644 --- a/drivers/irqchip/irq-riscv-intc.c +++ b/drivers/irqchip/irq-riscv-intc.c @@ -253,8 +253,9 @@ IRQCHIP_DECLARE(andes, "andestech,cpu-intc", riscv_intc_init); static int __init riscv_intc_acpi_init(union acpi_subtable_headers *header, const unsigned long end) { - struct fwnode_handle *fn; struct acpi_madt_rintc *rintc; + struct fwnode_handle *fn; + int rc; rintc = (struct acpi_madt_rintc *)header; @@ -273,7 +274,11 @@ static int __init riscv_intc_acpi_init(union acpi_subtable_headers *header, return -ENOMEM; } - return riscv_intc_init_common(fn, &riscv_intc_chip); + rc = riscv_intc_init_common(fn, &riscv_intc_chip); + if (rc) + irq_domain_free_fwnode(fn); + + return rc; } IRQCHIP_ACPI_DECLARE(riscv_intc, ACPI_MADT_TYPE_RINTC, NULL, From 6149db4997f582e958da675092f21c666e3b67b7 Mon Sep 17 00:00:00 2001 From: Tristram Ha Date: Tue, 28 May 2024 19:20:23 -0700 Subject: [PATCH 0485/1578] net: phy: micrel: fix KSZ9477 PHY issues after suspend/resume When the PHY is powered up after powered down most of the registers are reset, so the PHY setup code needs to be done again. In addition the interrupt register will need to be setup again so that link status indication works again. Fixes: 26dd2974c5b5 ("net: phy: micrel: Move KSZ9477 errata fixes to PHY driver") Signed-off-by: Tristram Ha Signed-off-by: David S. Miller --- drivers/net/phy/micrel.c | 62 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 6 deletions(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 2b8f8b7f1517cc..8c20cf93753066 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -1939,7 +1939,7 @@ static const struct ksz9477_errata_write ksz9477_errata_writes[] = { {0x1c, 0x20, 0xeeee}, }; -static int ksz9477_config_init(struct phy_device *phydev) +static int ksz9477_phy_errata(struct phy_device *phydev) { int err; int i; @@ -1967,16 +1967,30 @@ static int ksz9477_config_init(struct phy_device *phydev) return err; } + err = genphy_restart_aneg(phydev); + if (err) + return err; + + return err; +} + +static int ksz9477_config_init(struct phy_device *phydev) +{ + int err; + + /* Only KSZ9897 family of switches needs this fix. */ + if ((phydev->phy_id & 0xf) == 1) { + err = ksz9477_phy_errata(phydev); + if (err) + return err; + } + /* According to KSZ9477 Errata DS80000754C (Module 4) all EEE modes * in this switch shall be regarded as broken. */ if (phydev->dev_flags & MICREL_NO_EEE) phydev->eee_broken_modes = -1; - err = genphy_restart_aneg(phydev); - if (err) - return err; - return kszphy_config_init(phydev); } @@ -2085,6 +2099,42 @@ static int kszphy_resume(struct phy_device *phydev) return 0; } +static int ksz9477_resume(struct phy_device *phydev) +{ + int ret; + + /* No need to initialize registers if not powered down. */ + ret = phy_read(phydev, MII_BMCR); + if (ret < 0) + return ret; + if (!(ret & BMCR_PDOWN)) + return 0; + + genphy_resume(phydev); + + /* After switching from power-down to normal mode, an internal global + * reset is automatically generated. Wait a minimum of 1 ms before + * read/write access to the PHY registers. + */ + usleep_range(1000, 2000); + + /* Only KSZ9897 family of switches needs this fix. */ + if ((phydev->phy_id & 0xf) == 1) { + ret = ksz9477_phy_errata(phydev); + if (ret) + return ret; + } + + /* Enable PHY Interrupts */ + if (phy_interrupt_is_valid(phydev)) { + phydev->interrupts = PHY_INTERRUPT_ENABLED; + if (phydev->drv->config_intr) + phydev->drv->config_intr(phydev); + } + + return 0; +} + static int kszphy_probe(struct phy_device *phydev) { const struct kszphy_type *type = phydev->drv->driver_data; @@ -5493,7 +5543,7 @@ static struct phy_driver ksphy_driver[] = { .config_intr = kszphy_config_intr, .handle_interrupt = kszphy_handle_interrupt, .suspend = genphy_suspend, - .resume = genphy_resume, + .resume = ksz9477_resume, .get_features = ksz9477_get_features, } }; From e306a894bd511804ba9db7c00ca9cc05b55df1f2 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Wed, 29 May 2024 14:54:56 -0700 Subject: [PATCH 0486/1578] irqchip/sifive-plic: Chain to parent IRQ after handlers are ready Now that the PLIC uses a platform driver, the driver is probed later in the boot process, where interrupts from peripherals might already be pending. As a result, plic_handle_irq() may be called as early as the call to irq_set_chained_handler() completes. But this call happens before the per-context handler is completely set up, so there is a window where plic_handle_irq() can see incomplete per-context state and crash. Avoid this by delaying the call to irq_set_chained_handler() until all handlers from all PLICs are initialized. Fixes: 8ec99b033147 ("irqchip/sifive-plic: Convert PLIC driver into a platform driver") Reported-by: Geert Uytterhoeven Signed-off-by: Samuel Holland Signed-off-by: Thomas Gleixner Tested-by: Geert Uytterhoeven Reviewed-by: Anup Patel Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240529215458.937817-1-samuel.holland@sifive.com Closes: https://lore.kernel.org/r/CAMuHMdVYFFR7K5SbHBLY-JHhb7YpgGMS_hnRWm8H0KD-wBo+4A@mail.gmail.com/ --- drivers/irqchip/irq-sifive-plic.c | 34 +++++++++++++++---------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c index 8fb183ced1e778..9e22f7e378f502 100644 --- a/drivers/irqchip/irq-sifive-plic.c +++ b/drivers/irqchip/irq-sifive-plic.c @@ -85,7 +85,7 @@ struct plic_handler { struct plic_priv *priv; }; static int plic_parent_irq __ro_after_init; -static bool plic_cpuhp_setup_done __ro_after_init; +static bool plic_global_setup_done __ro_after_init; static DEFINE_PER_CPU(struct plic_handler, plic_handlers); static int plic_irq_set_type(struct irq_data *d, unsigned int type); @@ -487,10 +487,8 @@ static int plic_probe(struct platform_device *pdev) unsigned long plic_quirks = 0; struct plic_handler *handler; u32 nr_irqs, parent_hwirq; - struct irq_domain *domain; struct plic_priv *priv; irq_hw_number_t hwirq; - bool cpuhp_setup; if (is_of_node(dev->fwnode)) { const struct of_device_id *id; @@ -549,14 +547,6 @@ static int plic_probe(struct platform_device *pdev) continue; } - /* Find parent domain and register chained handler */ - domain = irq_find_matching_fwnode(riscv_get_intc_hwnode(), DOMAIN_BUS_ANY); - if (!plic_parent_irq && domain) { - plic_parent_irq = irq_create_mapping(domain, RV_IRQ_EXT); - if (plic_parent_irq) - irq_set_chained_handler(plic_parent_irq, plic_handle_irq); - } - /* * When running in M-mode we need to ignore the S-mode handler. * Here we assume it always comes later, but that might be a @@ -597,25 +587,35 @@ static int plic_probe(struct platform_device *pdev) goto fail_cleanup_contexts; /* - * We can have multiple PLIC instances so setup cpuhp state + * We can have multiple PLIC instances so setup global state * and register syscore operations only once after context * handlers of all online CPUs are initialized. */ - if (!plic_cpuhp_setup_done) { - cpuhp_setup = true; + if (!plic_global_setup_done) { + struct irq_domain *domain; + bool global_setup = true; + for_each_online_cpu(cpu) { handler = per_cpu_ptr(&plic_handlers, cpu); if (!handler->present) { - cpuhp_setup = false; + global_setup = false; break; } } - if (cpuhp_setup) { + + if (global_setup) { + /* Find parent domain and register chained handler */ + domain = irq_find_matching_fwnode(riscv_get_intc_hwnode(), DOMAIN_BUS_ANY); + if (domain) + plic_parent_irq = irq_create_mapping(domain, RV_IRQ_EXT); + if (plic_parent_irq) + irq_set_chained_handler(plic_parent_irq, plic_handle_irq); + cpuhp_setup_state(CPUHP_AP_IRQ_SIFIVE_PLIC_STARTING, "irqchip/sifive/plic:starting", plic_starting_cpu, plic_dying_cpu); register_syscore_ops(&plic_irq_syscore_ops); - plic_cpuhp_setup_done = true; + plic_global_setup_done = true; } } From e2d8ea0a066a6db51f31efd2710057271d685d2e Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 29 Apr 2024 00:49:35 +0000 Subject: [PATCH 0487/1578] soundwire: fix usages of device_get_named_child_node() The documentation for device_get_named_child_node() mentions this important point: " The caller is responsible for calling fwnode_handle_put() on the returned fwnode pointer. " Add fwnode_handle_put() to avoid leaked references. Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20240429004935.2400191-1-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- drivers/soundwire/amd_manager.c | 3 +++ drivers/soundwire/intel_auxdevice.c | 6 +++++- drivers/soundwire/mipi_disco.c | 30 +++++++++++++++++++++++------ 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/drivers/soundwire/amd_manager.c b/drivers/soundwire/amd_manager.c index 20d94bcfc9b4fb..795e223f7e5c29 100644 --- a/drivers/soundwire/amd_manager.c +++ b/drivers/soundwire/amd_manager.c @@ -571,6 +571,9 @@ static int sdw_master_read_amd_prop(struct sdw_bus *bus) amd_manager->wake_en_mask = wake_en_mask; fwnode_property_read_u32(link, "amd-sdw-power-mode", &power_mode_mask); amd_manager->power_mode_mask = power_mode_mask; + + fwnode_handle_put(link); + return 0; } diff --git a/drivers/soundwire/intel_auxdevice.c b/drivers/soundwire/intel_auxdevice.c index 17cf27e6ea738c..18517121cc8981 100644 --- a/drivers/soundwire/intel_auxdevice.c +++ b/drivers/soundwire/intel_auxdevice.c @@ -155,8 +155,10 @@ static int sdw_master_read_intel_prop(struct sdw_bus *bus) SDW_MASTER_QUIRKS_CLEAR_INITIAL_PARITY; intel_prop = devm_kzalloc(bus->dev, sizeof(*intel_prop), GFP_KERNEL); - if (!intel_prop) + if (!intel_prop) { + fwnode_handle_put(link); return -ENOMEM; + } /* initialize with hardware defaults, in case the properties are not found */ intel_prop->doaise = 0x1; @@ -184,6 +186,8 @@ static int sdw_master_read_intel_prop(struct sdw_bus *bus) intel_prop->dodse, intel_prop->dods); + fwnode_handle_put(link); + return 0; } diff --git a/drivers/soundwire/mipi_disco.c b/drivers/soundwire/mipi_disco.c index 55a9c51c84c19a..e5d9df26d4dc95 100644 --- a/drivers/soundwire/mipi_disco.c +++ b/drivers/soundwire/mipi_disco.c @@ -66,8 +66,10 @@ int sdw_master_read_prop(struct sdw_bus *bus) prop->clk_freq = devm_kcalloc(bus->dev, prop->num_clk_freq, sizeof(*prop->clk_freq), GFP_KERNEL); - if (!prop->clk_freq) + if (!prop->clk_freq) { + fwnode_handle_put(link); return -ENOMEM; + } fwnode_property_read_u32_array(link, "mipi-sdw-clock-frequencies-supported", @@ -92,8 +94,10 @@ int sdw_master_read_prop(struct sdw_bus *bus) prop->clk_gears = devm_kcalloc(bus->dev, prop->num_clk_gears, sizeof(*prop->clk_gears), GFP_KERNEL); - if (!prop->clk_gears) + if (!prop->clk_gears) { + fwnode_handle_put(link); return -ENOMEM; + } fwnode_property_read_u32_array(link, "mipi-sdw-supported-clock-gears", @@ -116,6 +120,8 @@ int sdw_master_read_prop(struct sdw_bus *bus) fwnode_property_read_u32(link, "mipi-sdw-command-error-threshold", &prop->err_threshold); + fwnode_handle_put(link); + return 0; } EXPORT_SYMBOL(sdw_master_read_prop); @@ -197,8 +203,10 @@ static int sdw_slave_read_dpn(struct sdw_slave *slave, dpn[i].num_words, sizeof(*dpn[i].words), GFP_KERNEL); - if (!dpn[i].words) + if (!dpn[i].words) { + fwnode_handle_put(node); return -ENOMEM; + } fwnode_property_read_u32_array(node, "mipi-sdw-port-wordlength-configs", @@ -236,8 +244,10 @@ static int sdw_slave_read_dpn(struct sdw_slave *slave, dpn[i].num_channels, sizeof(*dpn[i].channels), GFP_KERNEL); - if (!dpn[i].channels) + if (!dpn[i].channels) { + fwnode_handle_put(node); return -ENOMEM; + } fwnode_property_read_u32_array(node, "mipi-sdw-channel-number-list", @@ -251,8 +261,10 @@ static int sdw_slave_read_dpn(struct sdw_slave *slave, dpn[i].num_ch_combinations, sizeof(*dpn[i].ch_combinations), GFP_KERNEL); - if (!dpn[i].ch_combinations) + if (!dpn[i].ch_combinations) { + fwnode_handle_put(node); return -ENOMEM; + } fwnode_property_read_u32_array(node, "mipi-sdw-channel-combination-list", @@ -274,6 +286,8 @@ static int sdw_slave_read_dpn(struct sdw_slave *slave, /* TODO: Read audio mode */ + fwnode_handle_put(node); + i++; } @@ -348,10 +362,14 @@ int sdw_slave_read_prop(struct sdw_slave *slave) prop->dp0_prop = devm_kzalloc(&slave->dev, sizeof(*prop->dp0_prop), GFP_KERNEL); - if (!prop->dp0_prop) + if (!prop->dp0_prop) { + fwnode_handle_put(port); return -ENOMEM; + } sdw_slave_read_dp0(slave, port, prop->dp0_prop); + + fwnode_handle_put(port); } /* From 8c219e52ca4d9a67cd6a7074e91bf29b55edc075 Mon Sep 17 00:00:00 2001 From: Gregor Herburger Date: Thu, 30 May 2024 12:19:59 +0200 Subject: [PATCH 0488/1578] gpio: tqmx86: fix typo in Kconfig label Fix description for GPIO_TQMX86 from QTMX86 to TQMx86. Fixes: b868db94a6a7 ("gpio: tqmx86: Add GPIO from for this IO controller") Signed-off-by: Gregor Herburger Signed-off-by: Matthias Schiffer Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/e0e38c9944ad6d281d9a662a45d289b88edc808e.1717063994.git.matthias.schiffer@ew.tq-group.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 3dbddec0702818..1c28a48915bb22 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -1576,7 +1576,7 @@ config GPIO_TPS68470 are "output only" GPIOs. config GPIO_TQMX86 - tristate "TQ-Systems QTMX86 GPIO" + tristate "TQ-Systems TQMx86 GPIO" depends on MFD_TQMX86 || COMPILE_TEST depends on HAS_IOPORT_MAP select GPIOLIB_IRQCHIP From 9d6a811b522ba558bcb4ec01d12e72a0af8e9f6e Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Thu, 30 May 2024 12:20:00 +0200 Subject: [PATCH 0489/1578] gpio: tqmx86: introduce shadow register for GPIO output value The TQMx86 GPIO controller uses the same register address for input and output data. Reading the register will always return current inputs rather than the previously set outputs (regardless of the current direction setting). Therefore, using a RMW pattern does not make sense when setting output values. Instead, the previously set output register value needs to be stored as a shadow register. As there is no reliable way to get the current output values from the hardware, also initialize all channels to 0, to ensure that stored and actual output values match. This should usually not have any effect in practise, as the TQMx86 UEFI sets all outputs to 0 during boot. Also prepare for extension of the driver to more than 8 GPIOs by using DECLARE_BITMAP. Fixes: b868db94a6a7 ("gpio: tqmx86: Add GPIO from for this IO controller") Signed-off-by: Matthias Schiffer Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/d0555933becd45fa92a85675d26e4d59343ddc01.1717063994.git.matthias.schiffer@ew.tq-group.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-tqmx86.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/gpio/gpio-tqmx86.c b/drivers/gpio/gpio-tqmx86.c index 3a28c1f273c396..b7e2dbbdc4ebe8 100644 --- a/drivers/gpio/gpio-tqmx86.c +++ b/drivers/gpio/gpio-tqmx86.c @@ -6,6 +6,7 @@ * Vadim V.Vlasov */ +#include #include #include #include @@ -38,6 +39,7 @@ struct tqmx86_gpio_data { void __iomem *io_base; int irq; raw_spinlock_t spinlock; + DECLARE_BITMAP(output, TQMX86_NGPIO); u8 irq_type[TQMX86_NGPI]; }; @@ -64,15 +66,10 @@ static void tqmx86_gpio_set(struct gpio_chip *chip, unsigned int offset, { struct tqmx86_gpio_data *gpio = gpiochip_get_data(chip); unsigned long flags; - u8 val; raw_spin_lock_irqsave(&gpio->spinlock, flags); - val = tqmx86_gpio_read(gpio, TQMX86_GPIOD); - if (value) - val |= BIT(offset); - else - val &= ~BIT(offset); - tqmx86_gpio_write(gpio, val, TQMX86_GPIOD); + __assign_bit(offset, gpio->output, value); + tqmx86_gpio_write(gpio, bitmap_get_value8(gpio->output, 0), TQMX86_GPIOD); raw_spin_unlock_irqrestore(&gpio->spinlock, flags); } @@ -277,6 +274,13 @@ static int tqmx86_gpio_probe(struct platform_device *pdev) tqmx86_gpio_write(gpio, (u8)~TQMX86_DIR_INPUT_MASK, TQMX86_GPIODD); + /* + * Reading the previous output state is not possible with TQMx86 hardware. + * Initialize all outputs to 0 to have a defined state that matches the + * shadow register. + */ + tqmx86_gpio_write(gpio, 0, TQMX86_GPIOD); + chip = &gpio->chip; chip->label = "gpio-tqmx86"; chip->owner = THIS_MODULE; From 08af509efdf8dad08e972b48de0e2c2a7919ea8b Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Thu, 30 May 2024 12:20:01 +0200 Subject: [PATCH 0490/1578] gpio: tqmx86: store IRQ trigger type and unmask status separately irq_set_type() should not implicitly unmask the IRQ. All accesses to the interrupt configuration register are moved to a new helper tqmx86_gpio_irq_config(). We also introduce the new rule that accessing irq_type must happen while locked, which will become significant for fixing EDGE_BOTH handling. Fixes: b868db94a6a7 ("gpio: tqmx86: Add GPIO from for this IO controller") Signed-off-by: Matthias Schiffer Link: https://lore.kernel.org/r/6aa4f207f77cb58ef64ffb947e91949b0f753ccd.1717063994.git.matthias.schiffer@ew.tq-group.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-tqmx86.c | 48 ++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/drivers/gpio/gpio-tqmx86.c b/drivers/gpio/gpio-tqmx86.c index b7e2dbbdc4ebe8..7e428c872a2579 100644 --- a/drivers/gpio/gpio-tqmx86.c +++ b/drivers/gpio/gpio-tqmx86.c @@ -29,15 +29,19 @@ #define TQMX86_GPIIC 3 /* GPI Interrupt Configuration Register */ #define TQMX86_GPIIS 4 /* GPI Interrupt Status Register */ +#define TQMX86_GPII_NONE 0 #define TQMX86_GPII_FALLING BIT(0) #define TQMX86_GPII_RISING BIT(1) #define TQMX86_GPII_MASK (BIT(0) | BIT(1)) #define TQMX86_GPII_BITS 2 +/* Stored in irq_type with GPII bits */ +#define TQMX86_INT_UNMASKED BIT(2) struct tqmx86_gpio_data { struct gpio_chip chip; void __iomem *io_base; int irq; + /* Lock must be held for accessing output and irq_type fields */ raw_spinlock_t spinlock; DECLARE_BITMAP(output, TQMX86_NGPIO); u8 irq_type[TQMX86_NGPI]; @@ -104,21 +108,32 @@ static int tqmx86_gpio_get_direction(struct gpio_chip *chip, return GPIO_LINE_DIRECTION_OUT; } +static void tqmx86_gpio_irq_config(struct tqmx86_gpio_data *gpio, int offset) + __must_hold(&gpio->spinlock) +{ + u8 type = TQMX86_GPII_NONE, gpiic; + + if (gpio->irq_type[offset] & TQMX86_INT_UNMASKED) + type = gpio->irq_type[offset] & TQMX86_GPII_MASK; + + gpiic = tqmx86_gpio_read(gpio, TQMX86_GPIIC); + gpiic &= ~(TQMX86_GPII_MASK << (offset * TQMX86_GPII_BITS)); + gpiic |= type << (offset * TQMX86_GPII_BITS); + tqmx86_gpio_write(gpio, gpiic, TQMX86_GPIIC); +} + static void tqmx86_gpio_irq_mask(struct irq_data *data) { unsigned int offset = (data->hwirq - TQMX86_NGPO); struct tqmx86_gpio_data *gpio = gpiochip_get_data( irq_data_get_irq_chip_data(data)); unsigned long flags; - u8 gpiic, mask; - - mask = TQMX86_GPII_MASK << (offset * TQMX86_GPII_BITS); raw_spin_lock_irqsave(&gpio->spinlock, flags); - gpiic = tqmx86_gpio_read(gpio, TQMX86_GPIIC); - gpiic &= ~mask; - tqmx86_gpio_write(gpio, gpiic, TQMX86_GPIIC); + gpio->irq_type[offset] &= ~TQMX86_INT_UNMASKED; + tqmx86_gpio_irq_config(gpio, offset); raw_spin_unlock_irqrestore(&gpio->spinlock, flags); + gpiochip_disable_irq(&gpio->chip, irqd_to_hwirq(data)); } @@ -128,16 +143,12 @@ static void tqmx86_gpio_irq_unmask(struct irq_data *data) struct tqmx86_gpio_data *gpio = gpiochip_get_data( irq_data_get_irq_chip_data(data)); unsigned long flags; - u8 gpiic, mask; - - mask = TQMX86_GPII_MASK << (offset * TQMX86_GPII_BITS); gpiochip_enable_irq(&gpio->chip, irqd_to_hwirq(data)); + raw_spin_lock_irqsave(&gpio->spinlock, flags); - gpiic = tqmx86_gpio_read(gpio, TQMX86_GPIIC); - gpiic &= ~mask; - gpiic |= gpio->irq_type[offset] << (offset * TQMX86_GPII_BITS); - tqmx86_gpio_write(gpio, gpiic, TQMX86_GPIIC); + gpio->irq_type[offset] |= TQMX86_INT_UNMASKED; + tqmx86_gpio_irq_config(gpio, offset); raw_spin_unlock_irqrestore(&gpio->spinlock, flags); } @@ -148,7 +159,7 @@ static int tqmx86_gpio_irq_set_type(struct irq_data *data, unsigned int type) unsigned int offset = (data->hwirq - TQMX86_NGPO); unsigned int edge_type = type & IRQF_TRIGGER_MASK; unsigned long flags; - u8 new_type, gpiic; + u8 new_type; switch (edge_type) { case IRQ_TYPE_EDGE_RISING: @@ -164,13 +175,10 @@ static int tqmx86_gpio_irq_set_type(struct irq_data *data, unsigned int type) return -EINVAL; /* not supported */ } - gpio->irq_type[offset] = new_type; - raw_spin_lock_irqsave(&gpio->spinlock, flags); - gpiic = tqmx86_gpio_read(gpio, TQMX86_GPIIC); - gpiic &= ~((TQMX86_GPII_MASK) << (offset * TQMX86_GPII_BITS)); - gpiic |= new_type << (offset * TQMX86_GPII_BITS); - tqmx86_gpio_write(gpio, gpiic, TQMX86_GPIIC); + gpio->irq_type[offset] &= ~TQMX86_GPII_MASK; + gpio->irq_type[offset] |= new_type; + tqmx86_gpio_irq_config(gpio, offset); raw_spin_unlock_irqrestore(&gpio->spinlock, flags); return 0; From 90dd7de4ef7ba584823dfbeba834c2919a4bb55b Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Thu, 30 May 2024 12:20:02 +0200 Subject: [PATCH 0491/1578] gpio: tqmx86: fix broken IRQ_TYPE_EDGE_BOTH interrupt type The TQMx86 GPIO controller only supports falling and rising edge triggers, but not both. Fix this by implementing a software both-edge mode that toggles the edge type after every interrupt. Fixes: b868db94a6a7 ("gpio: tqmx86: Add GPIO from for this IO controller") Co-developed-by: Gregor Herburger Signed-off-by: Gregor Herburger Signed-off-by: Matthias Schiffer Link: https://lore.kernel.org/r/515324f0491c4d44f4ef49f170354aca002d81ef.1717063994.git.matthias.schiffer@ew.tq-group.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-tqmx86.c | 46 ++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/drivers/gpio/gpio-tqmx86.c b/drivers/gpio/gpio-tqmx86.c index 7e428c872a2579..f2e7e8754d95d6 100644 --- a/drivers/gpio/gpio-tqmx86.c +++ b/drivers/gpio/gpio-tqmx86.c @@ -32,6 +32,10 @@ #define TQMX86_GPII_NONE 0 #define TQMX86_GPII_FALLING BIT(0) #define TQMX86_GPII_RISING BIT(1) +/* Stored in irq_type as a trigger type, but not actually valid as a register + * value, so the name doesn't use "GPII" + */ +#define TQMX86_INT_BOTH (BIT(0) | BIT(1)) #define TQMX86_GPII_MASK (BIT(0) | BIT(1)) #define TQMX86_GPII_BITS 2 /* Stored in irq_type with GPII bits */ @@ -113,9 +117,15 @@ static void tqmx86_gpio_irq_config(struct tqmx86_gpio_data *gpio, int offset) { u8 type = TQMX86_GPII_NONE, gpiic; - if (gpio->irq_type[offset] & TQMX86_INT_UNMASKED) + if (gpio->irq_type[offset] & TQMX86_INT_UNMASKED) { type = gpio->irq_type[offset] & TQMX86_GPII_MASK; + if (type == TQMX86_INT_BOTH) + type = tqmx86_gpio_get(&gpio->chip, offset + TQMX86_NGPO) + ? TQMX86_GPII_FALLING + : TQMX86_GPII_RISING; + } + gpiic = tqmx86_gpio_read(gpio, TQMX86_GPIIC); gpiic &= ~(TQMX86_GPII_MASK << (offset * TQMX86_GPII_BITS)); gpiic |= type << (offset * TQMX86_GPII_BITS); @@ -169,7 +179,7 @@ static int tqmx86_gpio_irq_set_type(struct irq_data *data, unsigned int type) new_type = TQMX86_GPII_FALLING; break; case IRQ_TYPE_EDGE_BOTH: - new_type = TQMX86_GPII_FALLING | TQMX86_GPII_RISING; + new_type = TQMX86_INT_BOTH; break; default: return -EINVAL; /* not supported */ @@ -189,8 +199,8 @@ static void tqmx86_gpio_irq_handler(struct irq_desc *desc) struct gpio_chip *chip = irq_desc_get_handler_data(desc); struct tqmx86_gpio_data *gpio = gpiochip_get_data(chip); struct irq_chip *irq_chip = irq_desc_get_chip(desc); - unsigned long irq_bits; - int i = 0; + unsigned long irq_bits, flags; + int i; u8 irq_status; chained_irq_enter(irq_chip, desc); @@ -199,6 +209,34 @@ static void tqmx86_gpio_irq_handler(struct irq_desc *desc) tqmx86_gpio_write(gpio, irq_status, TQMX86_GPIIS); irq_bits = irq_status; + + raw_spin_lock_irqsave(&gpio->spinlock, flags); + for_each_set_bit(i, &irq_bits, TQMX86_NGPI) { + /* + * Edge-both triggers are implemented by flipping the edge + * trigger after each interrupt, as the controller only supports + * either rising or falling edge triggers, but not both. + * + * Internally, the TQMx86 GPIO controller has separate status + * registers for rising and falling edge interrupts. GPIIC + * configures which bits from which register are visible in the + * interrupt status register GPIIS and defines what triggers the + * parent IRQ line. Writing to GPIIS always clears both rising + * and falling interrupt flags internally, regardless of the + * currently configured trigger. + * + * In consequence, we can cleanly implement the edge-both + * trigger in software by first clearing the interrupt and then + * setting the new trigger based on the current GPIO input in + * tqmx86_gpio_irq_config() - even if an edge arrives between + * reading the input and setting the trigger, we will have a new + * interrupt pending. + */ + if ((gpio->irq_type[i] & TQMX86_GPII_MASK) == TQMX86_INT_BOTH) + tqmx86_gpio_irq_config(gpio, i); + } + raw_spin_unlock_irqrestore(&gpio->spinlock, flags); + for_each_set_bit(i, &irq_bits, TQMX86_NGPI) generic_handle_domain_irq(gpio->chip.irq.domain, i + TQMX86_NGPO); From 4a77c3dead97339478c7422eb07bf4bf63577008 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 2 Jun 2024 18:15:25 -0400 Subject: [PATCH 0492/1578] SUNRPC: Fix loop termination condition in gss_free_in_token_pages() The in_token->pages[] array is not NULL terminated. This results in the following KASAN splat: KASAN: maybe wild-memory-access in range [0x04a2013400000008-0x04a201340000000f] Fixes: bafa6b4d95d9 ("SUNRPC: Fix gss_free_in_token_pages()") Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- net/sunrpc/auth_gss/svcauth_gss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 96ab50eda9c2eb..73a90ad873fb9d 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1069,7 +1069,7 @@ static int gss_read_proxy_verf(struct svc_rqst *rqstp, goto out_denied_free; pages = DIV_ROUND_UP(inlen, PAGE_SIZE); - in_token->pages = kcalloc(pages, sizeof(struct page *), GFP_KERNEL); + in_token->pages = kcalloc(pages + 1, sizeof(struct page *), GFP_KERNEL); if (!in_token->pages) goto out_denied_free; in_token->page_base = 0; From 3d117494e2a88b9c1e8ad41bbbf2cf453a73620e Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 3 Jun 2024 14:23:44 +0800 Subject: [PATCH 0493/1578] cachefiles: remove unneeded include of close_fd() has been killed, let's get rid of unneeded as Al Viro pointed out [1]. [1] https://lore.kernel.org/r/20240603034055.GI1629371@ZenIV Suggested-by: Al Viro Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20240603062344.818290-1-hsiangkao@linux.alibaba.com Signed-off-by: Christian Brauner --- fs/cachefiles/ondemand.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index 58bd80956c5a64..bce005f2b45633 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0-or-later -#include #include #include #include "internal.h" From c6144a21169fe7d0d70f1a0dae6f6301e5918d30 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 3 Jun 2024 22:43:11 +0900 Subject: [PATCH 0494/1578] tomoyo: update project links TOMOYO project has moved to SourceForge.net . Signed-off-by: Tetsuo Handa --- Documentation/admin-guide/LSM/tomoyo.rst | 35 ++++++++---------------- MAINTAINERS | 2 +- security/tomoyo/Kconfig | 2 +- security/tomoyo/common.c | 2 +- 4 files changed, 14 insertions(+), 27 deletions(-) diff --git a/Documentation/admin-guide/LSM/tomoyo.rst b/Documentation/admin-guide/LSM/tomoyo.rst index 4bc9c2b4da6f39..bdb2c2e2a1b26b 100644 --- a/Documentation/admin-guide/LSM/tomoyo.rst +++ b/Documentation/admin-guide/LSM/tomoyo.rst @@ -9,8 +9,8 @@ TOMOYO is a name-based MAC extension (LSM module) for the Linux kernel. LiveCD-based tutorials are available at -http://tomoyo.sourceforge.jp/1.8/ubuntu12.04-live.html -http://tomoyo.sourceforge.jp/1.8/centos6-live.html +https://tomoyo.sourceforge.net/1.8/ubuntu12.04-live.html +https://tomoyo.sourceforge.net/1.8/centos6-live.html Though these tutorials use non-LSM version of TOMOYO, they are useful for you to know what TOMOYO is. @@ -21,45 +21,32 @@ How to enable TOMOYO? Build the kernel with ``CONFIG_SECURITY_TOMOYO=y`` and pass ``security=tomoyo`` on kernel's command line. -Please see http://tomoyo.osdn.jp/2.5/ for details. +Please see https://tomoyo.sourceforge.net/2.6/ for details. Where is documentation? ======================= User <-> Kernel interface documentation is available at -https://tomoyo.osdn.jp/2.5/policy-specification/index.html . +https://tomoyo.sourceforge.net/2.6/policy-specification/index.html . Materials we prepared for seminars and symposiums are available at -https://osdn.jp/projects/tomoyo/docs/?category_id=532&language_id=1 . +https://sourceforge.net/projects/tomoyo/files/docs/ . Below lists are chosen from three aspects. What is TOMOYO? TOMOYO Linux Overview - https://osdn.jp/projects/tomoyo/docs/lca2009-takeda.pdf + https://sourceforge.net/projects/tomoyo/files/docs/lca2009-takeda.pdf TOMOYO Linux: pragmatic and manageable security for Linux - https://osdn.jp/projects/tomoyo/docs/freedomhectaipei-tomoyo.pdf + https://sourceforge.net/projects/tomoyo/files/docs/freedomhectaipei-tomoyo.pdf TOMOYO Linux: A Practical Method to Understand and Protect Your Own Linux Box - https://osdn.jp/projects/tomoyo/docs/PacSec2007-en-no-demo.pdf + https://sourceforge.net/projects/tomoyo/files/docs/PacSec2007-en-no-demo.pdf What can TOMOYO do? Deep inside TOMOYO Linux - https://osdn.jp/projects/tomoyo/docs/lca2009-kumaneko.pdf + https://sourceforge.net/projects/tomoyo/files/docs/lca2009-kumaneko.pdf The role of "pathname based access control" in security. - https://osdn.jp/projects/tomoyo/docs/lfj2008-bof.pdf + https://sourceforge.net/projects/tomoyo/files/docs/lfj2008-bof.pdf History of TOMOYO? Realities of Mainlining - https://osdn.jp/projects/tomoyo/docs/lfj2008.pdf - -What is future plan? -==================== - -We believe that inode based security and name based security are complementary -and both should be used together. But unfortunately, so far, we cannot enable -multiple LSM modules at the same time. We feel sorry that you have to give up -SELinux/SMACK/AppArmor etc. when you want to use TOMOYO. - -We hope that LSM becomes stackable in future. Meanwhile, you can use non-LSM -version of TOMOYO, available at http://tomoyo.osdn.jp/1.8/ . -LSM version of TOMOYO is a subset of non-LSM version of TOMOYO. We are planning -to port non-LSM version's functionalities to LSM versions. + https://sourceforge.net/projects/tomoyo/files/docs/lfj2008.pdf diff --git a/MAINTAINERS b/MAINTAINERS index 8754ac2c259dc9..4154db83a172b2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22679,7 +22679,7 @@ L: tomoyo-users-en@lists.osdn.me (subscribers-only, for users in English) L: tomoyo-dev@lists.osdn.me (subscribers-only, for developers in Japanese) L: tomoyo-users@lists.osdn.me (subscribers-only, for users in Japanese) S: Maintained -W: https://tomoyo.osdn.jp/ +W: https://tomoyo.sourceforge.net/ F: security/tomoyo/ TOPSTAR LAPTOP EXTRAS DRIVER diff --git a/security/tomoyo/Kconfig b/security/tomoyo/Kconfig index fad75be5f381d6..1e0dd1a6d0b0e9 100644 --- a/security/tomoyo/Kconfig +++ b/security/tomoyo/Kconfig @@ -10,7 +10,7 @@ config SECURITY_TOMOYO help This selects TOMOYO Linux, pathname-based access control. Required userspace tools and further information may be - found at . + found at . If you are unsure how to answer this question, answer N. config SECURITY_TOMOYO_MAX_ACCEPT_ENTRY diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c index ea3140d510ecbf..5c7b059a332aac 100644 --- a/security/tomoyo/common.c +++ b/security/tomoyo/common.c @@ -2787,7 +2787,7 @@ void tomoyo_check_profile(void) else continue; pr_err("Userland tools for TOMOYO 2.6 must be installed and policy must be initialized.\n"); - pr_err("Please see https://tomoyo.osdn.jp/2.6/ for more information.\n"); + pr_err("Please see https://tomoyo.sourceforge.net/2.6/ for more information.\n"); panic("STOP!"); } tomoyo_read_unlock(idx); From 5314e84c33e7ad61df5203df540626ac59f9dcd9 Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Mon, 27 May 2024 10:20:35 +0300 Subject: [PATCH 0495/1578] phy: qcom-qmp: qserdes-txrx: Add missing registers offsets Currently, the x1e80100 uses pure V6 register offsets for DP part of the combo PHY. This hasn't been an issue because external DP is not yet enabled on any of the boards yet. But in order to enabled it, all these new V6 N4 register offsets are needed. So add them. Fixes: 762c3565f3c8 ("phy: qcom-qmp: qserdes-txrx: Add V6 N4 register offsets") Co-developed-by: Kuogee Hsieh Signed-off-by: Kuogee Hsieh Signed-off-by: Abel Vesa Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20240527-x1e80100-phy-qualcomm-combo-fix-dp-v1-1-be8a0b882117@linaro.org Signed-off-by: Vinod Koul --- .../phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v6_n4.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v6_n4.h b/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v6_n4.h index a814ad11af071b..d37cc0d4fd365c 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v6_n4.h +++ b/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v6_n4.h @@ -6,11 +6,24 @@ #ifndef QCOM_PHY_QMP_QSERDES_TXRX_V6_N4_H_ #define QCOM_PHY_QMP_QSERDES_TXRX_V6_N4_H_ +#define QSERDES_V6_N4_TX_CLKBUF_ENABLE 0x08 +#define QSERDES_V6_N4_TX_TX_EMP_POST1_LVL 0x0c +#define QSERDES_V6_N4_TX_TX_DRV_LVL 0x14 +#define QSERDES_V6_N4_TX_RESET_TSYNC_EN 0x1c +#define QSERDES_V6_N4_TX_PRE_STALL_LDO_BOOST_EN 0x20 #define QSERDES_V6_N4_TX_RES_CODE_LANE_OFFSET_TX 0x30 #define QSERDES_V6_N4_TX_RES_CODE_LANE_OFFSET_RX 0x34 +#define QSERDES_V6_N4_TX_TRANSCEIVER_BIAS_EN 0x48 +#define QSERDES_V6_N4_TX_HIGHZ_DRVR_EN 0x4c +#define QSERDES_V6_N4_TX_TX_POL_INV 0x50 +#define QSERDES_V6_N4_TX_PARRATE_REC_DETECT_IDLE_EN 0x54 #define QSERDES_V6_N4_TX_LANE_MODE_1 0x78 #define QSERDES_V6_N4_TX_LANE_MODE_2 0x7c #define QSERDES_V6_N4_TX_LANE_MODE_3 0x80 +#define QSERDES_V6_N4_TX_TRAN_DRVR_EMP_EN 0xac +#define QSERDES_V6_N4_TX_TX_BAND 0xd8 +#define QSERDES_V6_N4_TX_INTERFACE_SELECT 0xe4 +#define QSERDES_V6_N4_TX_VMODE_CTRL1 0xb0 #define QSERDES_V6_N4_RX_UCDR_FO_GAIN_RATE2 0x8 #define QSERDES_V6_N4_RX_UCDR_SO_GAIN_RATE2 0x18 From 99bf89626335bbec71d8461f0faec88551440850 Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Mon, 27 May 2024 10:20:36 +0300 Subject: [PATCH 0496/1578] phy: qcom-qmp: pcs: Add missing v6 N4 register offsets The new X1E80100 SoC bumps up the HW version of QMP phy to v6 N4 for combo USB and DP PHY. Currently, the X1E80100 uses the pure V6 PCS register offsets, which are different. Add the offsets so the mentioned platform can be fixed later on. Add the new PCS offsets in a dedicated header file. Fixes: d7b3579f84f7 ("phy: qcom-qmp-combo: Add x1e80100 USB/DP combo phys") Co-developed-by: Kuogee Hsieh Signed-off-by: Kuogee Hsieh Signed-off-by: Abel Vesa Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20240527-x1e80100-phy-qualcomm-combo-fix-dp-v1-2-be8a0b882117@linaro.org Signed-off-by: Vinod Koul --- drivers/phy/qualcomm/phy-qcom-qmp-pcs-v6-n4.h | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-pcs-v6-n4.h diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-pcs-v6-n4.h b/drivers/phy/qualcomm/phy-qcom-qmp-pcs-v6-n4.h new file mode 100644 index 00000000000000..b3024714dab4ec --- /dev/null +++ b/drivers/phy/qualcomm/phy-qcom-qmp-pcs-v6-n4.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023, Linaro Limited + */ + +#ifndef QCOM_PHY_QMP_PCS_V6_N4_H_ +#define QCOM_PHY_QMP_PCS_V6_N4_H_ + +/* Only for QMP V6 N4 PHY - USB/PCIe PCS registers */ +#define QPHY_V6_N4_PCS_SW_RESET 0x000 +#define QPHY_V6_N4_PCS_PCS_STATUS1 0x014 +#define QPHY_V6_N4_PCS_POWER_DOWN_CONTROL 0x040 +#define QPHY_V6_N4_PCS_START_CONTROL 0x044 +#define QPHY_V6_N4_PCS_POWER_STATE_CONFIG1 0x090 +#define QPHY_V6_N4_PCS_LOCK_DETECT_CONFIG1 0x0c4 +#define QPHY_V6_N4_PCS_LOCK_DETECT_CONFIG2 0x0c8 +#define QPHY_V6_N4_PCS_LOCK_DETECT_CONFIG3 0x0cc +#define QPHY_V6_N4_PCS_LOCK_DETECT_CONFIG6 0x0d8 +#define QPHY_V6_N4_PCS_REFGEN_REQ_CONFIG1 0x0dc +#define QPHY_V6_N4_PCS_RX_SIGDET_LVL 0x188 +#define QPHY_V6_N4_PCS_RCVR_DTCT_DLY_P1U2_L 0x190 +#define QPHY_V6_N4_PCS_RCVR_DTCT_DLY_P1U2_H 0x194 +#define QPHY_V6_N4_PCS_RATE_SLEW_CNTRL1 0x198 +#define QPHY_V6_N4_PCS_RX_CONFIG 0x1b0 +#define QPHY_V6_N4_PCS_ALIGN_DETECT_CONFIG1 0x1c0 +#define QPHY_V6_N4_PCS_ALIGN_DETECT_CONFIG2 0x1c4 +#define QPHY_V6_N4_PCS_PCS_TX_RX_CONFIG 0x1d0 +#define QPHY_V6_N4_PCS_EQ_CONFIG1 0x1dc +#define QPHY_V6_N4_PCS_EQ_CONFIG2 0x1e0 +#define QPHY_V6_N4_PCS_EQ_CONFIG5 0x1ec + +#endif From 163c1a356a847ab4767200fd4a45b3f8e4ddc900 Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Mon, 27 May 2024 10:20:37 +0300 Subject: [PATCH 0497/1578] phy: qcom: qmp-combo: Switch from V6 to V6 N4 register offsets Currently, none of the X1E80100 supported boards upstream have enabled DP. As for USB, the reason it is not broken when it's obvious that the offsets are wrong is because the only difference with respect to USB is the difference in register name. The V6 uses QPHY_V6_PCS_CDR_RESET_TIME while V6 N4 uses QPHY_V6_N4_PCS_RX_CONFIG. Now, in order for the DP to work, the DP serdes tables need to be added as they have different values for V6 N4 when compared to V6 ones, even though they use the same V6 offsets. While at it, switch swing and pre-emphasis tables to V6 as well. Fixes: d7b3579f84f7 ("phy: qcom-qmp-combo: Add x1e80100 USB/DP combo phys") Co-developed-by: Kuogee Hsieh Signed-off-by: Kuogee Hsieh Signed-off-by: Abel Vesa Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20240527-x1e80100-phy-qualcomm-combo-fix-dp-v1-3-be8a0b882117@linaro.org Signed-off-by: Vinod Koul --- drivers/phy/qualcomm/phy-qcom-qmp-combo.c | 189 ++++++++++++++++++---- drivers/phy/qualcomm/phy-qcom-qmp.h | 2 + 2 files changed, 162 insertions(+), 29 deletions(-) diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-combo.c b/drivers/phy/qualcomm/phy-qcom-qmp-combo.c index 7f999e8a433d89..7b00945f7191d7 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp-combo.c +++ b/drivers/phy/qualcomm/phy-qcom-qmp-combo.c @@ -187,6 +187,31 @@ static const unsigned int qmp_v6_usb3phy_regs_layout[QPHY_LAYOUT_SIZE] = { [QPHY_TX_TRANSCEIVER_BIAS_EN] = QSERDES_V6_TX_TRANSCEIVER_BIAS_EN, }; +static const unsigned int qmp_v6_n4_usb3phy_regs_layout[QPHY_LAYOUT_SIZE] = { + [QPHY_SW_RESET] = QPHY_V6_N4_PCS_SW_RESET, + [QPHY_START_CTRL] = QPHY_V6_N4_PCS_START_CONTROL, + [QPHY_PCS_STATUS] = QPHY_V6_N4_PCS_PCS_STATUS1, + [QPHY_PCS_POWER_DOWN_CONTROL] = QPHY_V6_N4_PCS_POWER_DOWN_CONTROL, + + /* In PCS_USB */ + [QPHY_PCS_AUTONOMOUS_MODE_CTRL] = QPHY_V6_PCS_USB3_AUTONOMOUS_MODE_CTRL, + [QPHY_PCS_LFPS_RXTERM_IRQ_CLEAR] = QPHY_V6_PCS_USB3_LFPS_RXTERM_IRQ_CLEAR, + + [QPHY_COM_RESETSM_CNTRL] = QSERDES_V6_COM_RESETSM_CNTRL, + [QPHY_COM_C_READY_STATUS] = QSERDES_V6_COM_C_READY_STATUS, + [QPHY_COM_CMN_STATUS] = QSERDES_V6_COM_CMN_STATUS, + [QPHY_COM_BIAS_EN_CLKBUFLR_EN] = QSERDES_V6_COM_PLL_BIAS_EN_CLK_BUFLR_EN, + + [QPHY_DP_PHY_STATUS] = QSERDES_V6_DP_PHY_STATUS, + [QPHY_DP_PHY_VCO_DIV] = QSERDES_V6_DP_PHY_VCO_DIV, + + [QPHY_TX_TX_POL_INV] = QSERDES_V6_N4_TX_TX_POL_INV, + [QPHY_TX_TX_DRV_LVL] = QSERDES_V6_N4_TX_TX_DRV_LVL, + [QPHY_TX_TX_EMP_POST1_LVL] = QSERDES_V6_N4_TX_TX_EMP_POST1_LVL, + [QPHY_TX_HIGHZ_DRVR_EN] = QSERDES_V6_N4_TX_HIGHZ_DRVR_EN, + [QPHY_TX_TRANSCEIVER_BIAS_EN] = QSERDES_V6_N4_TX_TRANSCEIVER_BIAS_EN, +}; + static const struct qmp_phy_init_tbl qmp_v3_usb3_serdes_tbl[] = { QMP_PHY_INIT_CFG(QSERDES_V3_COM_PLL_IVCO, 0x07), QMP_PHY_INIT_CFG(QSERDES_V3_COM_SYSCLK_EN_SEL, 0x14), @@ -997,6 +1022,31 @@ static const struct qmp_phy_init_tbl qmp_v6_dp_serdes_tbl[] = { QMP_PHY_INIT_CFG(QSERDES_V6_COM_CORE_CLK_EN, 0x0f), }; +static const struct qmp_phy_init_tbl qmp_v6_n4_dp_serdes_tbl[] = { + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SVS_MODE_CLK_SEL, 0x15), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SYSCLK_EN_SEL, 0x3b), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SYS_CLK_CTRL, 0x02), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_CLK_ENABLE1, 0x0c), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SYSCLK_BUF_ENABLE, 0x06), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_CLK_SELECT, 0x30), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_IVCO, 0x07), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_CCTRL_MODE0, 0x36), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_RCTRL_MODE0, 0x16), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_CP_CTRL_MODE0, 0x06), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DEC_START_MODE0, 0x34), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DIV_FRAC_START1_MODE0, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DIV_FRAC_START2_MODE0, 0xc0), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_CMN_CONFIG_1, 0x12), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_INTEGLOOP_GAIN0_MODE0, 0x3f), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_INTEGLOOP_GAIN1_MODE0, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_VCO_TUNE_MAP, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_BG_TIMER, 0x0a), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_CORE_CLK_DIV_MODE0, 0x14), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_VCO_TUNE_CTRL, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_BIAS_EN_CLK_BUFLR_EN, 0x17), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_CORE_CLK_EN, 0x0f), +}; + static const struct qmp_phy_init_tbl qmp_v6_dp_tx_tbl[] = { QMP_PHY_INIT_CFG(QSERDES_V6_TX_VMODE_CTRL1, 0x40), QMP_PHY_INIT_CFG(QSERDES_V6_TX_PRE_STALL_LDO_BOOST_EN, 0x30), @@ -1011,6 +1061,19 @@ static const struct qmp_phy_init_tbl qmp_v6_dp_tx_tbl[] = { QMP_PHY_INIT_CFG(QSERDES_V6_TX_TX_BAND, 0x4), }; +static const struct qmp_phy_init_tbl qmp_v6_n4_dp_tx_tbl[] = { + QMP_PHY_INIT_CFG(QSERDES_V6_N4_TX_VMODE_CTRL1, 0x40), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_TX_PRE_STALL_LDO_BOOST_EN, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_TX_INTERFACE_SELECT, 0xff), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_TX_CLKBUF_ENABLE, 0x0f), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_TX_RESET_TSYNC_EN, 0x03), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_TX_TRAN_DRVR_EMP_EN, 0x0f), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_TX_PARRATE_REC_DETECT_IDLE_EN, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_TX_RES_CODE_LANE_OFFSET_TX, 0x11), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_TX_RES_CODE_LANE_OFFSET_RX, 0x11), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_TX_TX_BAND, 0x1), +}; + static const struct qmp_phy_init_tbl qmp_v6_dp_serdes_tbl_rbr[] = { QMP_PHY_INIT_CFG(QSERDES_V6_COM_HSCLK_SEL_1, 0x05), QMP_PHY_INIT_CFG(QSERDES_V6_COM_DEC_START_MODE0, 0x34), @@ -1059,6 +1122,74 @@ static const struct qmp_phy_init_tbl qmp_v6_dp_serdes_tbl_hbr3[] = { QMP_PHY_INIT_CFG(QSERDES_V6_COM_BIN_VCOCAL_CMP_CODE2_MODE0, 0x0c), }; +static const struct qmp_phy_init_tbl qmp_v6_n4_dp_serdes_tbl_rbr[] = { + QMP_PHY_INIT_CFG(QSERDES_V6_COM_HSCLK_SEL_1, 0x05), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DEC_START_MODE0, 0x34), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP_EN, 0x04), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DIV_FRAC_START3_MODE0, 0x0b), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP1_MODE0, 0x37), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP2_MODE0, 0x04), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_BIN_VCOCAL_CMP_CODE1_MODE0, 0x71), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_BIN_VCOCAL_CMP_CODE2_MODE0, 0x0c), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_EN_CENTER, 0x01), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_ADJ_PER1, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_PER1, 0x6b), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_PER2, 0x02), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_STEP_SIZE1_MODE0, 0x92), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_STEP_SIZE2_MODE0, 0x01), +}; + +static const struct qmp_phy_init_tbl qmp_v6_n4_dp_serdes_tbl_hbr[] = { + QMP_PHY_INIT_CFG(QSERDES_V6_COM_HSCLK_SEL_1, 0x03), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DEC_START_MODE0, 0x34), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP_EN, 0x08), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DIV_FRAC_START3_MODE0, 0x0b), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP1_MODE0, 0x07), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP2_MODE0, 0x07), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_BIN_VCOCAL_CMP_CODE1_MODE0, 0x71), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_BIN_VCOCAL_CMP_CODE2_MODE0, 0x0c), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_EN_CENTER, 0x01), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_ADJ_PER1, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_PER1, 0x6b), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_PER2, 0x02), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_STEP_SIZE1_MODE0, 0x92), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_STEP_SIZE2_MODE0, 0x01), +}; + +static const struct qmp_phy_init_tbl qmp_v6_n4_dp_serdes_tbl_hbr2[] = { + QMP_PHY_INIT_CFG(QSERDES_V6_COM_HSCLK_SEL_1, 0x01), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DEC_START_MODE0, 0x46), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP_EN, 0x08), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DIV_FRAC_START3_MODE0, 0x05), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP1_MODE0, 0x0f), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP2_MODE0, 0x0e), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_BIN_VCOCAL_CMP_CODE1_MODE0, 0x97), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_BIN_VCOCAL_CMP_CODE2_MODE0, 0x10), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_EN_CENTER, 0x01), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_ADJ_PER1, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_PER1, 0x6b), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_PER2, 0x02), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_STEP_SIZE1_MODE0, 0x18), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_STEP_SIZE2_MODE0, 0x02), +}; + +static const struct qmp_phy_init_tbl qmp_v6_n4_dp_serdes_tbl_hbr3[] = { + QMP_PHY_INIT_CFG(QSERDES_V6_COM_HSCLK_SEL_1, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DEC_START_MODE0, 0x34), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP_EN, 0x08), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DIV_FRAC_START3_MODE0, 0x0b), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP1_MODE0, 0x17), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP2_MODE0, 0x15), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_BIN_VCOCAL_CMP_CODE1_MODE0, 0x71), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_BIN_VCOCAL_CMP_CODE2_MODE0, 0x0c), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_EN_CENTER, 0x01), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_ADJ_PER1, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_PER1, 0x6b), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_PER2, 0x02), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_STEP_SIZE1_MODE0, 0x92), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_STEP_SIZE2_MODE0, 0x01), +}; + static const struct qmp_phy_init_tbl sc8280xp_usb43dp_serdes_tbl[] = { QMP_PHY_INIT_CFG(QSERDES_V5_COM_SSC_EN_CENTER, 0x01), QMP_PHY_INIT_CFG(QSERDES_V5_COM_SSC_PER1, 0x31), @@ -1273,20 +1404,20 @@ static const struct qmp_phy_init_tbl x1e80100_usb43dp_rx_tbl[] = { }; static const struct qmp_phy_init_tbl x1e80100_usb43dp_pcs_tbl[] = { - QMP_PHY_INIT_CFG(QPHY_V6_PCS_RCVR_DTCT_DLY_P1U2_L, 0xe7), - QMP_PHY_INIT_CFG(QPHY_V6_PCS_RCVR_DTCT_DLY_P1U2_H, 0x03), - QMP_PHY_INIT_CFG(QPHY_V6_PCS_LOCK_DETECT_CONFIG1, 0xc4), - QMP_PHY_INIT_CFG(QPHY_V6_PCS_LOCK_DETECT_CONFIG2, 0x89), - QMP_PHY_INIT_CFG(QPHY_V6_PCS_LOCK_DETECT_CONFIG3, 0x20), - QMP_PHY_INIT_CFG(QPHY_V6_PCS_LOCK_DETECT_CONFIG6, 0x13), - QMP_PHY_INIT_CFG(QPHY_V6_PCS_REFGEN_REQ_CONFIG1, 0x21), - QMP_PHY_INIT_CFG(QPHY_V6_PCS_RX_SIGDET_LVL, 0x55), - QMP_PHY_INIT_CFG(QPHY_V6_PCS_CDR_RESET_TIME, 0x0a), - QMP_PHY_INIT_CFG(QPHY_V6_PCS_ALIGN_DETECT_CONFIG1, 0xd4), - QMP_PHY_INIT_CFG(QPHY_V6_PCS_ALIGN_DETECT_CONFIG2, 0x30), - QMP_PHY_INIT_CFG(QPHY_V6_PCS_PCS_TX_RX_CONFIG, 0x0c), - QMP_PHY_INIT_CFG(QPHY_V6_PCS_EQ_CONFIG1, 0x4b), - QMP_PHY_INIT_CFG(QPHY_V6_PCS_EQ_CONFIG5, 0x10), + QMP_PHY_INIT_CFG(QPHY_V6_N4_PCS_RCVR_DTCT_DLY_P1U2_L, 0xe7), + QMP_PHY_INIT_CFG(QPHY_V6_N4_PCS_RCVR_DTCT_DLY_P1U2_H, 0x03), + QMP_PHY_INIT_CFG(QPHY_V6_N4_PCS_LOCK_DETECT_CONFIG1, 0xc4), + QMP_PHY_INIT_CFG(QPHY_V6_N4_PCS_LOCK_DETECT_CONFIG2, 0x89), + QMP_PHY_INIT_CFG(QPHY_V6_N4_PCS_LOCK_DETECT_CONFIG3, 0x20), + QMP_PHY_INIT_CFG(QPHY_V6_N4_PCS_LOCK_DETECT_CONFIG6, 0x13), + QMP_PHY_INIT_CFG(QPHY_V6_N4_PCS_REFGEN_REQ_CONFIG1, 0x21), + QMP_PHY_INIT_CFG(QPHY_V6_N4_PCS_RX_SIGDET_LVL, 0x55), + QMP_PHY_INIT_CFG(QPHY_V6_N4_PCS_RX_CONFIG, 0x0a), + QMP_PHY_INIT_CFG(QPHY_V6_N4_PCS_ALIGN_DETECT_CONFIG1, 0xd4), + QMP_PHY_INIT_CFG(QPHY_V6_N4_PCS_ALIGN_DETECT_CONFIG2, 0x30), + QMP_PHY_INIT_CFG(QPHY_V6_N4_PCS_PCS_TX_RX_CONFIG, 0x0c), + QMP_PHY_INIT_CFG(QPHY_V6_N4_PCS_EQ_CONFIG1, 0x4b), + QMP_PHY_INIT_CFG(QPHY_V6_N4_PCS_EQ_CONFIG5, 0x10), }; static const struct qmp_phy_init_tbl x1e80100_usb43dp_pcs_usb_tbl[] = { @@ -1794,22 +1925,22 @@ static const struct qmp_phy_cfg x1e80100_usb3dpphy_cfg = { .pcs_usb_tbl = x1e80100_usb43dp_pcs_usb_tbl, .pcs_usb_tbl_num = ARRAY_SIZE(x1e80100_usb43dp_pcs_usb_tbl), - .dp_serdes_tbl = qmp_v6_dp_serdes_tbl, - .dp_serdes_tbl_num = ARRAY_SIZE(qmp_v6_dp_serdes_tbl), - .dp_tx_tbl = qmp_v6_dp_tx_tbl, - .dp_tx_tbl_num = ARRAY_SIZE(qmp_v6_dp_tx_tbl), + .dp_serdes_tbl = qmp_v6_n4_dp_serdes_tbl, + .dp_serdes_tbl_num = ARRAY_SIZE(qmp_v6_n4_dp_serdes_tbl), + .dp_tx_tbl = qmp_v6_n4_dp_tx_tbl, + .dp_tx_tbl_num = ARRAY_SIZE(qmp_v6_n4_dp_tx_tbl), - .serdes_tbl_rbr = qmp_v6_dp_serdes_tbl_rbr, - .serdes_tbl_rbr_num = ARRAY_SIZE(qmp_v6_dp_serdes_tbl_rbr), - .serdes_tbl_hbr = qmp_v6_dp_serdes_tbl_hbr, - .serdes_tbl_hbr_num = ARRAY_SIZE(qmp_v6_dp_serdes_tbl_hbr), - .serdes_tbl_hbr2 = qmp_v6_dp_serdes_tbl_hbr2, - .serdes_tbl_hbr2_num = ARRAY_SIZE(qmp_v6_dp_serdes_tbl_hbr2), - .serdes_tbl_hbr3 = qmp_v6_dp_serdes_tbl_hbr3, - .serdes_tbl_hbr3_num = ARRAY_SIZE(qmp_v6_dp_serdes_tbl_hbr3), + .serdes_tbl_rbr = qmp_v6_n4_dp_serdes_tbl_rbr, + .serdes_tbl_rbr_num = ARRAY_SIZE(qmp_v6_n4_dp_serdes_tbl_rbr), + .serdes_tbl_hbr = qmp_v6_n4_dp_serdes_tbl_hbr, + .serdes_tbl_hbr_num = ARRAY_SIZE(qmp_v6_n4_dp_serdes_tbl_hbr), + .serdes_tbl_hbr2 = qmp_v6_n4_dp_serdes_tbl_hbr2, + .serdes_tbl_hbr2_num = ARRAY_SIZE(qmp_v6_n4_dp_serdes_tbl_hbr2), + .serdes_tbl_hbr3 = qmp_v6_n4_dp_serdes_tbl_hbr3, + .serdes_tbl_hbr3_num = ARRAY_SIZE(qmp_v6_n4_dp_serdes_tbl_hbr3), - .swing_hbr_rbr = &qmp_dp_v5_voltage_swing_hbr_rbr, - .pre_emphasis_hbr_rbr = &qmp_dp_v5_pre_emphasis_hbr_rbr, + .swing_hbr_rbr = &qmp_dp_v6_voltage_swing_hbr_rbr, + .pre_emphasis_hbr_rbr = &qmp_dp_v6_pre_emphasis_hbr_rbr, .swing_hbr3_hbr2 = &qmp_dp_v5_voltage_swing_hbr3_hbr2, .pre_emphasis_hbr3_hbr2 = &qmp_dp_v5_pre_emphasis_hbr3_hbr2, @@ -1822,7 +1953,7 @@ static const struct qmp_phy_cfg x1e80100_usb3dpphy_cfg = { .num_resets = ARRAY_SIZE(msm8996_usb3phy_reset_l), .vreg_list = qmp_phy_vreg_l, .num_vregs = ARRAY_SIZE(qmp_phy_vreg_l), - .regs = qmp_v45_usb3phy_regs_layout, + .regs = qmp_v6_n4_usb3phy_regs_layout, }; static const struct qmp_phy_cfg sm6350_usb3dpphy_cfg = { diff --git a/drivers/phy/qualcomm/phy-qcom-qmp.h b/drivers/phy/qualcomm/phy-qcom-qmp.h index d10b8f653c4b23..d0f41e4aaa855f 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp.h +++ b/drivers/phy/qualcomm/phy-qcom-qmp.h @@ -46,6 +46,8 @@ #include "phy-qcom-qmp-pcs-v6.h" +#include "phy-qcom-qmp-pcs-v6-n4.h" + #include "phy-qcom-qmp-pcs-v6_20.h" #include "phy-qcom-qmp-pcs-v7.h" From 994af1825a2aa286f4903ff64a1c7378b52defe6 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 25 Apr 2024 13:52:01 +0200 Subject: [PATCH 0498/1578] riscv: fix overlap of allocated page and PTR_ERR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On riscv32, it is possible for the last page in virtual address space (0xfffff000) to be allocated. This page overlaps with PTR_ERR, so that shouldn't happen. There is already some code to ensure memblock won't allocate the last page. However, buddy allocator is left unchecked. Fix this by reserving physical memory that would be mapped at virtual addresses greater than 0xfffff000. Reported-by: Björn Töpel Closes: https://lore.kernel.org/linux-riscv/878r1ibpdn.fsf@all.your.base.are.belong.to.us Fixes: 76d2a0493a17 ("RISC-V: Init and Halt Code") Signed-off-by: Nam Cao Cc: Tested-by: Björn Töpel Reviewed-by: Björn Töpel Reviewed-by: Mike Rapoport (IBM) Link: https://lore.kernel.org/r/20240425115201.3044202-1-namcao@linutronix.de Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index e3218d65f21d5a..e3405e4b99af50 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -250,18 +250,19 @@ static void __init setup_bootmem(void) kernel_map.va_pa_offset = PAGE_OFFSET - phys_ram_base; /* - * memblock allocator is not aware of the fact that last 4K bytes of - * the addressable memory can not be mapped because of IS_ERR_VALUE - * macro. Make sure that last 4k bytes are not usable by memblock - * if end of dram is equal to maximum addressable memory. For 64-bit - * kernel, this problem can't happen here as the end of the virtual - * address space is occupied by the kernel mapping then this check must - * be done as soon as the kernel mapping base address is determined. + * Reserve physical address space that would be mapped to virtual + * addresses greater than (void *)(-PAGE_SIZE) because: + * - This memory would overlap with ERR_PTR + * - This memory belongs to high memory, which is not supported + * + * This is not applicable to 64-bit kernel, because virtual addresses + * after (void *)(-PAGE_SIZE) are not linearly mapped: they are + * occupied by kernel mapping. Also it is unrealistic for high memory + * to exist on 64-bit platforms. */ if (!IS_ENABLED(CONFIG_64BIT)) { - max_mapped_addr = __pa(~(ulong)0); - if (max_mapped_addr == (phys_ram_end - 1)) - memblock_set_current_limit(max_mapped_addr - 4096); + max_mapped_addr = __va_to_pa_nodebug(-PAGE_SIZE); + memblock_reserve(max_mapped_addr, (phys_addr_t)-max_mapped_addr); } min_low_pfn = PFN_UP(phys_ram_base); From e2c79b4c5c4d83520abb570ca633ded09621c0a6 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Thu, 30 May 2024 09:44:51 -0700 Subject: [PATCH 0499/1578] Revert "riscv: mm: accelerate pagefault when badaccess" I accidentally picked up an earlier version of this patch, which had already landed via mm. The patch I picked up contains a bug, which I kept as I thought it was a fix. So let's just revert it. This reverts commit 4c6c0020427a4547845a83f7e4d6085e16c3e24f. Fixes: 4c6c0020427a ("riscv: mm: accelerate pagefault when badaccess") Reviewed-by: Kefeng Wang Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20240530164451.21336-1-palmer@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/fault.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index b3fcf7d67efba1..5224f373380225 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -293,8 +293,8 @@ void handle_page_fault(struct pt_regs *regs) if (unlikely(access_error(cause, vma))) { vma_end_read(vma); count_vm_vma_lock_event(VMA_LOCK_SUCCESS); - tsk->thread.bad_cause = SEGV_ACCERR; - bad_area_nosemaphore(regs, code, addr); + tsk->thread.bad_cause = cause; + bad_area_nosemaphore(regs, SEGV_ACCERR, addr); return; } From e0e8e4bce61cac674fdabd85d070e7bab1634a8b Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Mon, 3 Jun 2024 10:32:23 +0300 Subject: [PATCH 0500/1578] ASoC: SOF: Intel: hda-dai: skip tlv for dspless mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sof_ipc4_dma_config_tlv{} is for Audio DSP firmware only. Don't set it in dspless mode. Fixes: 17386cb1b48b ("ASoC: SOF: Intel: hda-dai: set dma_stream_channel_map device") Signed-off-by: Bard Liao Reviewed-by: Pierre-Louis Bossart Reviewed-by: Péter Ujfalusi Signed-off-by: Peter Ujfalusi Link: https://msgid.link/r/20240603073224.14726-2-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/hda-dai.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/soc/sof/intel/hda-dai.c b/sound/soc/sof/intel/hda-dai.c index ce675c22a5ab8f..a2b6dbcfa9187a 100644 --- a/sound/soc/sof/intel/hda-dai.c +++ b/sound/soc/sof/intel/hda-dai.c @@ -525,6 +525,9 @@ int sdw_hda_dai_hw_params(struct snd_pcm_substream *substream, return ret; } + if (sdev->dspless_mode_selected) + return 0; + ipc4_copier = widget_to_copier(w); dma_config_tlv = &ipc4_copier->dma_config_tlv[cpu_dai_id]; dma_config = &dma_config_tlv->dma_config; From 3b06e137089fc0beb5ffa6a869de9a93df984072 Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Mon, 3 Jun 2024 10:32:24 +0300 Subject: [PATCH 0501/1578] ASoC: SOF: Intel: hda-dai: remove skip_tlv label MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We just return 0 after the skip_tlv label. No need to use a label. Signed-off-by: Bard Liao Reviewed-by: Pierre-Louis Bossart Reviewed-by: Péter Ujfalusi Signed-off-by: Peter Ujfalusi Link: https://msgid.link/r/20240603073224.14726-3-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/hda-dai.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sound/soc/sof/intel/hda-dai.c b/sound/soc/sof/intel/hda-dai.c index a2b6dbcfa9187a..c61d298ea6b3a1 100644 --- a/sound/soc/sof/intel/hda-dai.c +++ b/sound/soc/sof/intel/hda-dai.c @@ -379,7 +379,7 @@ static int non_hda_dai_hw_params_data(struct snd_pcm_substream *substream, sdev = widget_to_sdev(w); if (sdev->dspless_mode_selected) - goto skip_tlv; + return 0; /* get stream_id */ hext_stream = ops->get_hext_stream(sdev, cpu_dai, substream); @@ -423,7 +423,6 @@ static int non_hda_dai_hw_params_data(struct snd_pcm_substream *substream, dma_config->dma_stream_channel_map.device_count = 1; dma_config->dma_priv_config_size = 0; -skip_tlv: return 0; } From d3cb3516f2540e6c384eef96b4ffeb49425175ed Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 31 May 2024 01:30:54 +0300 Subject: [PATCH 0502/1578] MAINTAINERS: copy linux-arm-msm for sound/qcom changes Not having linux-arm-msm@ in cc for audio-related changes for Qualcomm platforms means that interested parties can easily miss the patches. Add corresponding L: entry so that linux-arm-msm ML gets CC'ed for audio patches too. Signed-off-by: Dmitry Baryshkov Acked-by: Srinivas Kandagatla Link: https://msgid.link/r/20240531-asoc-qcom-cc-lamsm-v1-1-f026ad618496@linaro.org Signed-off-by: Mark Brown --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 8754ac2c259dc9..451c1aa5af3cac 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18212,6 +18212,7 @@ QCOM AUDIO (ASoC) DRIVERS M: Srinivas Kandagatla M: Banajit Goswami L: alsa-devel@alsa-project.org (moderated for non-subscribers) +L: linux-arm-msm@vger.kernel.org S: Supported F: Documentation/devicetree/bindings/soc/qcom/qcom,apr* F: Documentation/devicetree/bindings/sound/qcom,* From a73a83021ae136ab6b0d08eb196d84b1d02814e9 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Sun, 2 Jun 2024 09:30:13 -0700 Subject: [PATCH 0503/1578] ASoC: mxs: add missing MODULE_DESCRIPTION() macro make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in sound/soc/mxs/snd-soc-mxs-pcm.o Add the missing invocation of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Link: https://msgid.link/r/20240602-md-snd-soc-mxs-pcm-v1-1-1e663d11328d@quicinc.com Signed-off-by: Mark Brown --- sound/soc/mxs/mxs-pcm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/mxs/mxs-pcm.c b/sound/soc/mxs/mxs-pcm.c index df2e4be992d2ee..9bb08cadeb183e 100644 --- a/sound/soc/mxs/mxs-pcm.c +++ b/sound/soc/mxs/mxs-pcm.c @@ -43,4 +43,5 @@ int mxs_pcm_platform_register(struct device *dev) } EXPORT_SYMBOL_GPL(mxs_pcm_platform_register); +MODULE_DESCRIPTION("MXS ASoC PCM driver"); MODULE_LICENSE("GPL"); From 7478e15bcc16cbc0fa1b8c431163bf651033c088 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Sun, 2 Jun 2024 10:00:27 -0700 Subject: [PATCH 0504/1578] ASoC: fsl: add missing MODULE_DESCRIPTION() macro make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in sound/soc/fsl/imx-pcm-dma.o Add the missing invocation of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Link: https://msgid.link/r/20240602-md-snd-fsl-imx-pcm-dma-v1-1-e7efc33c6bf3@quicinc.com Signed-off-by: Mark Brown --- sound/soc/fsl/imx-pcm-dma.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/fsl/imx-pcm-dma.c b/sound/soc/fsl/imx-pcm-dma.c index 14e94270911cfb..4fa208d6a03221 100644 --- a/sound/soc/fsl/imx-pcm-dma.c +++ b/sound/soc/fsl/imx-pcm-dma.c @@ -50,4 +50,5 @@ int imx_pcm_dma_init(struct platform_device *pdev) } EXPORT_SYMBOL_GPL(imx_pcm_dma_init); +MODULE_DESCRIPTION("Freescale i.MX PCM DMA interface"); MODULE_LICENSE("GPL"); From 968c974c08106fcf911d8d390d0f049af855d348 Mon Sep 17 00:00:00 2001 From: Jack Yu Date: Mon, 3 Jun 2024 10:47:16 +0000 Subject: [PATCH 0505/1578] ASoC: rt722-sdca-sdw: add silence detection register as volatile Including silence detection register as volatile. Signed-off-by: Jack Yu Link: https://msgid.link/r/c66a6bd6d220426793096b42baf85437@realtek.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt722-sdca-sdw.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/soc/codecs/rt722-sdca-sdw.c b/sound/soc/codecs/rt722-sdca-sdw.c index b33da2215adea7..f73ee3bf90f5e6 100644 --- a/sound/soc/codecs/rt722-sdca-sdw.c +++ b/sound/soc/codecs/rt722-sdca-sdw.c @@ -68,6 +68,7 @@ static bool rt722_sdca_mbq_readable_register(struct device *dev, unsigned int re case 0x200007f: case 0x2000082 ... 0x200008e: case 0x2000090 ... 0x2000094: + case 0x3110000: case 0x5300000 ... 0x5300002: case 0x5400002: case 0x5600000 ... 0x5600007: @@ -125,6 +126,7 @@ static bool rt722_sdca_mbq_volatile_register(struct device *dev, unsigned int re case 0x2000067: case 0x2000084: case 0x2000086: + case 0x3110000: return true; default: return false; From 2317dc2c22cc353b699c7d1db47b2fe91f54055c Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 29 May 2024 12:19:01 +0200 Subject: [PATCH 0506/1578] bpf, devmap: Remove unnecessary if check in for loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The iterator variable dst cannot be NULL and the if check can be removed. Remove it and fix the following Coccinelle/coccicheck warning reported by itnull.cocci: ERROR: iterator variable bound on line 762 cannot be NULL Signed-off-by: Thorsten Blum Signed-off-by: Daniel Borkmann Reviewed-by: Toke Høiland-Jørgensen Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20240529101900.103913-2-thorsten.blum@toblux.com --- kernel/bpf/devmap.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 4e2cdbb5629f22..7f3b34452243c8 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -760,9 +760,6 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, for (i = 0; i < dtab->n_buckets; i++) { head = dev_map_index_hash(dtab, i); hlist_for_each_entry_safe(dst, next, head, index_hlist) { - if (!dst) - continue; - if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; From 93c1800b3799f17375989b0daf76497dd3e80922 Mon Sep 17 00:00:00 2001 From: David Kaplan Date: Sun, 2 Jun 2024 13:19:09 -0500 Subject: [PATCH 0507/1578] x86/kexec: Fix bug with call depth tracking The call to cc_platform_has() triggers a fault and system crash if call depth tracking is active because the GS segment has been reset by load_segments() and GS_BASE is now 0 but call depth tracking uses per-CPU variables to operate. Call cc_platform_has() earlier in the function when GS is still valid. [ bp: Massage. ] Fixes: 5d8213864ade ("x86/retbleed: Add SKL return thunk") Signed-off-by: David Kaplan Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Cc: Link: https://lore.kernel.org/r/20240603083036.637-1-bp@kernel.org --- arch/x86/kernel/machine_kexec_64.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index b180d8e497c317..cc0f7f70b17ba3 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -295,8 +295,15 @@ void machine_kexec_cleanup(struct kimage *image) void machine_kexec(struct kimage *image) { unsigned long page_list[PAGES_NR]; - void *control_page; + unsigned int host_mem_enc_active; int save_ftrace_enabled; + void *control_page; + + /* + * This must be done before load_segments() since if call depth tracking + * is used then GS must be valid to make any function calls. + */ + host_mem_enc_active = cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT); #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) @@ -358,7 +365,7 @@ void machine_kexec(struct kimage *image) (unsigned long)page_list, image->start, image->preserve_context, - cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)); + host_mem_enc_active); #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) From 1e24c31351787e24b7eebe84866bd55fd62a0aef Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 31 May 2024 16:00:04 -0700 Subject: [PATCH 0508/1578] cpufreq: intel_pstate: Fix unchecked HWP MSR access Fix unchecked MSR access error for processors with no HWP support. On such processors, maximum frequency can be changed by the system firmware using ACPI event ACPI_PROCESSOR_NOTIFY_HIGEST_PERF_CHANGED. This results in accessing HWP MSR 0x771. Call Trace: generic_exec_single+0x58/0x120 smp_call_function_single+0xbf/0x110 rdmsrl_on_cpu+0x46/0x60 intel_pstate_get_hwp_cap+0x1b/0x70 intel_pstate_update_limits+0x2a/0x60 acpi_processor_notify+0xb7/0x140 acpi_ev_notify_dispatch+0x3b/0x60 HWP MSR 0x771 can be only read on a CPU which supports HWP and enabled. Hence intel_pstate_get_hwp_cap() can only be called when hwp_active is true. Reported-by: Sebastian Andrzej Siewior Closes: https://lore.kernel.org/linux-pm/20240529155740.Hq2Hw7be@linutronix.de/ Fixes: e8217b4bece3 ("cpufreq: intel_pstate: Update the maximum CPU frequency consistently") Tested-by: Sebastian Andrzej Siewior Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 4b986c044741ed..65d3f79104bd50 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1153,7 +1153,8 @@ static void intel_pstate_update_policies(void) static void __intel_pstate_update_max_freq(struct cpudata *cpudata, struct cpufreq_policy *policy) { - intel_pstate_get_hwp_cap(cpudata); + if (hwp_active) + intel_pstate_get_hwp_cap(cpudata); policy->cpuinfo.max_freq = READ_ONCE(global.no_turbo) ? cpudata->pstate.max_freq : cpudata->pstate.turbo_freq; From 2884dc7d08d98a89d8d65121524bb7533183a63a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sun, 2 Jun 2024 11:27:03 -0700 Subject: [PATCH 0509/1578] bpf: Fix a potential use-after-free in bpf_link_free() After commit 1a80dbcb2dba, bpf_link can be freed by link->ops->dealloc_deferred, but the code still tests and uses link->ops->dealloc afterward, which leads to a use-after-free as reported by syzbot. Actually, one of them should be sufficient, so just call one of them instead of both. Also add a WARN_ON() in case of any problematic implementation. Fixes: 1a80dbcb2dba ("bpf: support deferring bpf_link dealloc to after RCU grace period") Reported-by: syzbot+1989ee16d94720836244@syzkaller.appspotmail.com Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20240602182703.207276-1-xiyou.wangcong@gmail.com --- kernel/bpf/syscall.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2222c3ff88e7fd..f45ed6adc092af 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2998,6 +2998,7 @@ static int bpf_obj_get(const union bpf_attr *attr) void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog) { + WARN_ON(ops->dealloc && ops->dealloc_deferred); atomic64_set(&link->refcnt, 1); link->type = type; link->id = 0; @@ -3056,16 +3057,17 @@ static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu) /* bpf_link_free is guaranteed to be called from process context */ static void bpf_link_free(struct bpf_link *link) { + const struct bpf_link_ops *ops = link->ops; bool sleepable = false; bpf_link_free_id(link->id); if (link->prog) { sleepable = link->prog->sleepable; /* detach BPF program, clean up used resources */ - link->ops->release(link); + ops->release(link); bpf_prog_put(link->prog); } - if (link->ops->dealloc_deferred) { + if (ops->dealloc_deferred) { /* schedule BPF link deallocation; if underlying BPF program * is sleepable, we need to first wait for RCU tasks trace * sync, then go through "classic" RCU grace period @@ -3074,9 +3076,8 @@ static void bpf_link_free(struct bpf_link *link) call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp); else call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp); - } - if (link->ops->dealloc) - link->ops->dealloc(link); + } else if (ops->dealloc) + ops->dealloc(link); } static void bpf_link_put_deferred(struct work_struct *work) From b97e8a2f7130a4b30d1502003095833d16c028b3 Mon Sep 17 00:00:00 2001 From: Hagar Hemdan Date: Fri, 31 May 2024 16:21:44 +0000 Subject: [PATCH 0510/1578] irqchip/gic-v3-its: Fix potential race condition in its_vlpi_prop_update() its_vlpi_prop_update() calls lpi_write_config() which obtains the mapping information for a VLPI without lock held. So it could race with its_vlpi_unmap(). Since all calls from its_irq_set_vcpu_affinity() require the same lock to be held, hoist the locking there instead of sprinkling the locking all over the place. This bug was discovered using Coverity Static Analysis Security Testing (SAST) by Synopsys, Inc. [ tglx: Use guard() instead of goto ] Fixes: 015ec0386ab6 ("irqchip/gic-v3-its: Add VLPI configuration handling") Suggested-by: Marc Zyngier Signed-off-by: Hagar Hemdan Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240531162144.28650-1-hagarhem@amazon.com --- drivers/irqchip/irq-gic-v3-its.c | 44 +++++++++----------------------- 1 file changed, 12 insertions(+), 32 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 40ebf1726393ce..3c755d5dad6e62 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -1846,28 +1846,22 @@ static int its_vlpi_map(struct irq_data *d, struct its_cmd_info *info) { struct its_device *its_dev = irq_data_get_irq_chip_data(d); u32 event = its_get_event_id(d); - int ret = 0; if (!info->map) return -EINVAL; - raw_spin_lock(&its_dev->event_map.vlpi_lock); - if (!its_dev->event_map.vm) { struct its_vlpi_map *maps; maps = kcalloc(its_dev->event_map.nr_lpis, sizeof(*maps), GFP_ATOMIC); - if (!maps) { - ret = -ENOMEM; - goto out; - } + if (!maps) + return -ENOMEM; its_dev->event_map.vm = info->map->vm; its_dev->event_map.vlpi_maps = maps; } else if (its_dev->event_map.vm != info->map->vm) { - ret = -EINVAL; - goto out; + return -EINVAL; } /* Get our private copy of the mapping information */ @@ -1899,46 +1893,32 @@ static int its_vlpi_map(struct irq_data *d, struct its_cmd_info *info) its_dev->event_map.nr_vlpis++; } -out: - raw_spin_unlock(&its_dev->event_map.vlpi_lock); - return ret; + return 0; } static int its_vlpi_get(struct irq_data *d, struct its_cmd_info *info) { struct its_device *its_dev = irq_data_get_irq_chip_data(d); struct its_vlpi_map *map; - int ret = 0; - - raw_spin_lock(&its_dev->event_map.vlpi_lock); map = get_vlpi_map(d); - if (!its_dev->event_map.vm || !map) { - ret = -EINVAL; - goto out; - } + if (!its_dev->event_map.vm || !map) + return -EINVAL; /* Copy our mapping information to the incoming request */ *info->map = *map; -out: - raw_spin_unlock(&its_dev->event_map.vlpi_lock); - return ret; + return 0; } static int its_vlpi_unmap(struct irq_data *d) { struct its_device *its_dev = irq_data_get_irq_chip_data(d); u32 event = its_get_event_id(d); - int ret = 0; - - raw_spin_lock(&its_dev->event_map.vlpi_lock); - if (!its_dev->event_map.vm || !irqd_is_forwarded_to_vcpu(d)) { - ret = -EINVAL; - goto out; - } + if (!its_dev->event_map.vm || !irqd_is_forwarded_to_vcpu(d)) + return -EINVAL; /* Drop the virtual mapping */ its_send_discard(its_dev, event); @@ -1962,9 +1942,7 @@ static int its_vlpi_unmap(struct irq_data *d) kfree(its_dev->event_map.vlpi_maps); } -out: - raw_spin_unlock(&its_dev->event_map.vlpi_lock); - return ret; + return 0; } static int its_vlpi_prop_update(struct irq_data *d, struct its_cmd_info *info) @@ -1992,6 +1970,8 @@ static int its_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu_info) if (!is_v4(its_dev->its)) return -EINVAL; + guard(raw_spinlock_irq)(&its_dev->event_map.vlpi_lock); + /* Unmap request? */ if (!info) return its_vlpi_unmap(d); From 27bd5fdc24c0d5d1306f968ef24105c4577242b0 Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Fri, 31 May 2024 04:46:42 +0000 Subject: [PATCH 0511/1578] KVM: SEV-ES: Prevent MSR access post VMSA encryption KVM currently allows userspace to read/write MSRs even after the VMSA is encrypted. This can cause unintentional issues if MSR access has side- effects. For ex, while migrating a guest, userspace could attempt to migrate MSR_IA32_DEBUGCTLMSR and end up unintentionally disabling LBRV on the target. Fix this by preventing access to those MSRs which are context switched via the VMSA, once the VMSA is encrypted. Suggested-by: Sean Christopherson Signed-off-by: Nikunj A Dadhania Signed-off-by: Ravi Bangoria Message-ID: <20240531044644.768-2-ravi.bangoria@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 1d3c8be39479b3..f265361f4518bd 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2822,10 +2822,24 @@ static int svm_get_msr_feature(struct kvm_msr_entry *msr) return 0; } +static bool +sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + return sev_es_guest(vcpu->kvm) && + vcpu->arch.guest_state_protected && + svm_msrpm_offset(msr_info->index) != MSR_INVALID && + !msr_write_intercepted(vcpu, msr_info->index); +} + static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_svm *svm = to_svm(vcpu); + if (sev_es_prevent_msr_access(vcpu, msr_info)) { + msr_info->data = 0; + return -EINVAL; + } + switch (msr_info->index) { case MSR_AMD64_TSC_RATIO: if (!msr_info->host_initiated && @@ -2976,6 +2990,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) u32 ecx = msr->index; u64 data = msr->data; + + if (sev_es_prevent_msr_access(vcpu, msr)) + return -EINVAL; + switch (ecx) { case MSR_AMD64_TSC_RATIO: From d922056215617eedfbdbc29fe49953423686fe5e Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Fri, 31 May 2024 04:46:43 +0000 Subject: [PATCH 0512/1578] KVM: SEV-ES: Disallow SEV-ES guests when X86_FEATURE_LBRV is absent As documented in APM[1], LBR Virtualization must be enabled for SEV-ES guests. So, prevent SEV-ES guests when LBRV support is missing. [1]: AMD64 Architecture Programmer's Manual Pub. 40332, Rev. 4.07 - June 2023, Vol 2, 15.35.2 Enabling SEV-ES. https://bugzilla.kernel.org/attachment.cgi?id=304653 Fixes: 376c6d285017 ("KVM: SVM: Provide support for SEV-ES vCPU creation/loading") Signed-off-by: Ravi Bangoria Message-ID: <20240531044644.768-3-ravi.bangoria@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 6 ++++++ arch/x86/kvm/svm/svm.c | 16 +++++++--------- arch/x86/kvm/svm/svm.h | 1 + 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0623cfaa7bb0ee..8b52bbba02c0fc 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2406,6 +2406,12 @@ void __init sev_hardware_setup(void) if (!boot_cpu_has(X86_FEATURE_SEV_ES)) goto out; + if (!lbrv) { + WARN_ONCE(!boot_cpu_has(X86_FEATURE_LBRV), + "LBRV must be present for SEV-ES support"); + goto out; + } + /* Has the system been allocated ASIDs for SEV-ES? */ if (min_sev_asid == 1) goto out; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index f265361f4518bd..223a551bf44e45 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -215,7 +215,7 @@ int vgif = true; module_param(vgif, int, 0444); /* enable/disable LBR virtualization */ -static int lbrv = true; +int lbrv = true; module_param(lbrv, int, 0444); static int tsc_scaling = true; @@ -5294,6 +5294,12 @@ static __init int svm_hardware_setup(void) nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS); + if (lbrv) { + if (!boot_cpu_has(X86_FEATURE_LBRV)) + lbrv = false; + else + pr_info("LBR virtualization supported\n"); + } /* * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which * may be modified by svm_adjust_mmio_mask()), as well as nrips. @@ -5347,14 +5353,6 @@ static __init int svm_hardware_setup(void) svm_x86_ops.set_vnmi_pending = NULL; } - - if (lbrv) { - if (!boot_cpu_has(X86_FEATURE_LBRV)) - lbrv = false; - else - pr_info("LBR virtualization supported\n"); - } - if (!enable_pmu) pr_info("PMU virtualization is disabled\n"); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index be57213cd29593..59bbb8122b7509 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -39,6 +39,7 @@ extern int vgif; extern bool intercept_smi; extern bool x2avic_enabled; extern bool vnmi; +extern int lbrv; /* * Clean bits in VMCB. From b7e4be0a224fe5c6be30c1c8bdda8d2317ad6ba4 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Fri, 31 May 2024 04:46:44 +0000 Subject: [PATCH 0513/1578] KVM: SEV-ES: Delegate LBR virtualization to the processor As documented in APM[1], LBR Virtualization must be enabled for SEV-ES guests. Although KVM currently enforces LBRV for SEV-ES guests, there are multiple issues with it: o MSR_IA32_DEBUGCTLMSR is still intercepted. Since MSR_IA32_DEBUGCTLMSR interception is used to dynamically toggle LBRV for performance reasons, this can be fatal for SEV-ES guests. For ex SEV-ES guest on Zen3: [guest ~]# wrmsr 0x1d9 0x4 KVM: entry failed, hardware error 0xffffffff EAX=00000004 EBX=00000000 ECX=000001d9 EDX=00000000 Fix this by never intercepting MSR_IA32_DEBUGCTLMSR for SEV-ES guests. No additional save/restore logic is required since MSR_IA32_DEBUGCTLMSR is of swap type A. o KVM will disable LBRV if userspace sets MSR_IA32_DEBUGCTLMSR before the VMSA is encrypted. Fix this by moving LBRV enablement code post VMSA encryption. [1]: AMD64 Architecture Programmer's Manual Pub. 40332, Rev. 4.07 - June 2023, Vol 2, 15.35.2 Enabling SEV-ES. https://bugzilla.kernel.org/attachment.cgi?id=304653 Fixes: 376c6d285017 ("KVM: SVM: Provide support for SEV-ES vCPU creation/loading") Co-developed-by: Nikunj A Dadhania Signed-off-by: Nikunj A Dadhania Signed-off-by: Ravi Bangoria Message-ID: <20240531044644.768-4-ravi.bangoria@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 13 ++++++++----- arch/x86/kvm/svm/svm.c | 8 +++++++- arch/x86/kvm/svm/svm.h | 3 ++- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 8b52bbba02c0fc..95095a233a459a 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -779,6 +779,14 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, */ fpstate_set_confidential(&vcpu->arch.guest_fpu); vcpu->arch.guest_state_protected = true; + + /* + * SEV-ES guest mandates LBR Virtualization to be _always_ ON. Enable it + * only after setting guest_state_protected because KVM_SET_MSRS allows + * dynamic toggling of LBRV (for performance reason) on write access to + * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set. + */ + svm_enable_lbrv(vcpu); return 0; } @@ -3222,7 +3230,6 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) struct kvm_vcpu *vcpu = &svm->vcpu; svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE; - svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; /* * An SEV-ES guest requires a VMSA area that is a separate from the @@ -3274,10 +3281,6 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) /* Clear intercepts on selected MSRs */ set_msr_interception(vcpu, svm->msrpm, MSR_EFER, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_CR_PAT, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); } void sev_init_vmcb(struct vcpu_svm *svm) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 223a551bf44e45..296c524988f953 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -99,6 +99,7 @@ static const struct svm_direct_access_msrs { { .index = MSR_IA32_SPEC_CTRL, .always = false }, { .index = MSR_IA32_PRED_CMD, .always = false }, { .index = MSR_IA32_FLUSH_CMD, .always = false }, + { .index = MSR_IA32_DEBUGCTLMSR, .always = false }, { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, { .index = MSR_IA32_LASTINTFROMIP, .always = false }, @@ -990,7 +991,7 @@ void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb) vmcb_mark_dirty(to_vmcb, VMCB_LBR); } -static void svm_enable_lbrv(struct kvm_vcpu *vcpu) +void svm_enable_lbrv(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1000,6 +1001,9 @@ static void svm_enable_lbrv(struct kvm_vcpu *vcpu) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); + if (sev_es_guest(vcpu->kvm)) + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_DEBUGCTLMSR, 1, 1); + /* Move the LBR msrs to the vmcb02 so that the guest can see them. */ if (is_guest_mode(vcpu)) svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr); @@ -1009,6 +1013,8 @@ static void svm_disable_lbrv(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm); + svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 59bbb8122b7509..0f1472690b593a 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -30,7 +30,7 @@ #define IOPM_SIZE PAGE_SIZE * 3 #define MSRPM_SIZE PAGE_SIZE * 2 -#define MAX_DIRECT_ACCESS_MSRS 47 +#define MAX_DIRECT_ACCESS_MSRS 48 #define MSRPM_OFFSETS 32 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; @@ -553,6 +553,7 @@ u32 *svm_vcpu_alloc_msrpm(void); void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm); void svm_vcpu_free_msrpm(u32 *msrpm); void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb); +void svm_enable_lbrv(struct kvm_vcpu *vcpu); void svm_update_lbrv(struct kvm_vcpu *vcpu); int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); From 89a58812c47f1823191e9d0b08b53df2dd304ae2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 21 May 2024 18:03:04 -0700 Subject: [PATCH 0514/1578] KVM: x86: Drop support for hand tuning APIC timer advancement from userspace Remove support for specifying a static local APIC timer advancement value, and instead present a read-only boolean parameter to let userspace enable or disable KVM's dynamic APIC timer advancement. Realistically, it's all but impossible for userspace to specify an advancement that is more precise than what KVM's adaptive tuning can provide. E.g. a static value needs to be tuned for the exact hardware and kernel, and if KVM is using hrtimers, likely requires additional tuning for the exact configuration of the entire system. Dropping support for a userspace provided value also fixes several flaws in the interface. E.g. KVM interprets a negative value other than -1 as a large advancement, toggling between a negative and positive value yields unpredictable behavior as vCPUs will switch from dynamic to static advancement, changing the advancement in the middle of VM creation can result in different values for vCPUs within a VM, etc. Those flaws are mostly fixable, but there's almost no justification for taking on yet more complexity (it's minimal complexity, but still non-zero). The only arguments against using KVM's adaptive tuning is if a setup needs a higher maximum, or if the adjustments are too reactive, but those are arguments for letting userspace control the absolute max advancement and the granularity of each adjustment, e.g. similar to how KVM provides knobs for halt polling. Link: https://lore.kernel.org/all/20240520115334.852510-1-zhoushuling@huawei.com Cc: Shuling Zhou Cc: Marcelo Tosatti Signed-off-by: Sean Christopherson Message-ID: <20240522010304.1650603-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 39 +++++++++++++++++++++------------------ arch/x86/kvm/lapic.h | 2 +- arch/x86/kvm/x86.c | 11 +---------- 3 files changed, 23 insertions(+), 29 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ebf41023be3829..acd7d48100a1d3 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -59,7 +59,17 @@ #define MAX_APIC_VECTOR 256 #define APIC_VECTORS_PER_REG 32 -static bool lapic_timer_advance_dynamic __read_mostly; +/* + * Enable local APIC timer advancement (tscdeadline mode only) with adaptive + * tuning. When enabled, KVM programs the host timer event to fire early, i.e. + * before the deadline expires, to account for the delay between taking the + * VM-Exit (to inject the guest event) and the subsequent VM-Enter to resume + * the guest, i.e. so that the interrupt arrives in the guest with minimal + * latency relative to the deadline programmed by the guest. + */ +static bool lapic_timer_advance __read_mostly = true; +module_param(lapic_timer_advance, bool, 0444); + #define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 /* clock cycles */ #define LAPIC_TIMER_ADVANCE_ADJUST_MAX 10000 /* clock cycles */ #define LAPIC_TIMER_ADVANCE_NS_INIT 1000 @@ -1854,16 +1864,14 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); - if (lapic_timer_advance_dynamic) { - adjust_lapic_timer_advance(vcpu, guest_tsc - tsc_deadline); - /* - * If the timer fired early, reread the TSC to account for the - * overhead of the above adjustment to avoid waiting longer - * than is necessary. - */ - if (guest_tsc < tsc_deadline) - guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); - } + adjust_lapic_timer_advance(vcpu, guest_tsc - tsc_deadline); + + /* + * If the timer fired early, reread the TSC to account for the overhead + * of the above adjustment to avoid waiting longer than is necessary. + */ + if (guest_tsc < tsc_deadline) + guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); if (guest_tsc < tsc_deadline) __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc); @@ -2812,7 +2820,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) return HRTIMER_NORESTART; } -int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) +int kvm_create_lapic(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic; @@ -2845,13 +2853,8 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); apic->lapic_timer.timer.function = apic_timer_fn; - if (timer_advance_ns == -1) { + if (lapic_timer_advance) apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; - lapic_timer_advance_dynamic = true; - } else { - apic->lapic_timer.timer_advance_ns = timer_advance_ns; - lapic_timer_advance_dynamic = false; - } /* * Stuff the APIC ENABLE bit in lieu of temporarily incrementing diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 0a0ea4b5dd8ce7..a69e706b9080ac 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -85,7 +85,7 @@ struct kvm_lapic { struct dest_map; -int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns); +int kvm_create_lapic(struct kvm_vcpu *vcpu); void kvm_free_lapic(struct kvm_vcpu *vcpu); int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 082ac6d95a3a08..8c9e4281d978d7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -164,15 +164,6 @@ module_param(kvmclock_periodic_sync, bool, 0444); static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, 0644); -/* - * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables - * adaptive tuning starting from default advancement of 1000ns. '0' disables - * advancement entirely. Any other value is used as-is and disables adaptive - * tuning, i.e. allows privileged userspace to set an exact advancement time. - */ -static int __read_mostly lapic_timer_advance_ns = -1; -module_param(lapic_timer_advance_ns, int, 0644); - static bool __read_mostly vector_hashing = true; module_param(vector_hashing, bool, 0444); @@ -12169,7 +12160,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) if (r < 0) return r; - r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); + r = kvm_create_lapic(vcpu); if (r < 0) goto fail_mmu_destroy; From d6283b160a12010b2113cc64726a3c9eda13dc5f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 29 May 2024 10:38:04 -0300 Subject: [PATCH 0515/1578] tools headers uapi: Sync linux/stat.h with the kernel sources to pick STATX_SUBVOL To pick the changes from: 2a82bb02941fb53d ("statx: stx_subvol") This silences this perf build warning: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/stat.h include/uapi/linux/stat.h Cc: Adrian Hunter Cc: Christian Brauner Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Kent Overstreet Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/ZlnK2Fmx_gahzwZI@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/stat.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/stat.h b/tools/include/uapi/linux/stat.h index 2f2ee82d55175d..67626d53531664 100644 --- a/tools/include/uapi/linux/stat.h +++ b/tools/include/uapi/linux/stat.h @@ -126,8 +126,9 @@ struct statx { __u64 stx_mnt_id; __u32 stx_dio_mem_align; /* Memory buffer alignment for direct I/O */ __u32 stx_dio_offset_align; /* File offset alignment for direct I/O */ + __u64 stx_subvol; /* Subvolume identifier */ /* 0xa0 */ - __u64 __spare3[12]; /* Spare space for future expansion */ + __u64 __spare3[11]; /* Spare space for future expansion */ /* 0x100 */ }; @@ -155,6 +156,7 @@ struct statx { #define STATX_MNT_ID 0x00001000U /* Got stx_mnt_id */ #define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */ #define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */ +#define STATX_SUBVOL 0x00008000U /* Want/got stx_subvol */ #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ From 18befe4a28403f599115c5ae753cc7f5157af8b7 Mon Sep 17 00:00:00 2001 From: Dumitru Ceclan Date: Thu, 30 May 2024 15:07:52 +0300 Subject: [PATCH 0516/1578] iio: adc: ad7173: Clear append status bit The previous value of the append status bit was not cleared before setting the new value. This caused the bit to remain set after enabling buffered mode for multiple channels and not permit further buffered reads from a single channel after the fact. Fixes: 76a1e6a42802 ("iio: adc: ad7173: add AD7173 driver") Signed-off-by: Dumitru Ceclan Link: https://lore.kernel.org/r/20240530-ad7173-fixes-v3-4-b85f33079e18@analog.com Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ad7173.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iio/adc/ad7173.c b/drivers/iio/adc/ad7173.c index 048127bf979a96..e2f17292409cb3 100644 --- a/drivers/iio/adc/ad7173.c +++ b/drivers/iio/adc/ad7173.c @@ -543,6 +543,7 @@ static int ad7173_append_status(struct ad_sigma_delta *sd, bool append) unsigned int interface_mode = st->interface_mode; int ret; + interface_mode &= ~AD7173_INTERFACE_DATA_STAT; interface_mode |= AD7173_INTERFACE_DATA_STAT_EN(append); ret = ad_sd_write_reg(&st->sd, AD7173_REG_INTERFACE_MODE, 2, interface_mode); if (ret) From 182bc496dc63ca03986e3c393166477f4a4c2742 Mon Sep 17 00:00:00 2001 From: Dumitru Ceclan Date: Thu, 30 May 2024 15:07:53 +0300 Subject: [PATCH 0517/1578] iio: adc: ad7173: Fix sampling frequency setting This patch fixes two issues regarding the sampling frequency setting: -The attribute was set as per device, not per channel. As such, when setting the sampling frequency, the configuration was always done for the slot 0, and the correct configuration was applied on the next channel configuration call by the LRU mechanism. -The LRU implementation does not take into account external settings of the slot registers. When setting the sampling frequency directly to a slot register in write_raw(), there is no guarantee that other channels were not also using that slot and now incorrectly retain their config as live. Set the sampling frequency attribute as separate in the channel templates. Do not set the sampling directly to the slot register in write_raw(), just mark the config as not live and let the LRU mechanism handle it. As the reg variable is no longer used, remove it. Fixes: 76a1e6a42802 ("iio: adc: ad7173: add AD7173 driver") Signed-off-by: Dumitru Ceclan Link: https://lore.kernel.org/r/20240530-ad7173-fixes-v3-5-b85f33079e18@analog.com Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ad7173.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/drivers/iio/adc/ad7173.c b/drivers/iio/adc/ad7173.c index e2f17292409cb3..b26d4575e2569d 100644 --- a/drivers/iio/adc/ad7173.c +++ b/drivers/iio/adc/ad7173.c @@ -717,7 +717,7 @@ static int ad7173_write_raw(struct iio_dev *indio_dev, { struct ad7173_state *st = iio_priv(indio_dev); struct ad7173_channel_config *cfg; - unsigned int freq, i, reg; + unsigned int freq, i; int ret; ret = iio_device_claim_direct_mode(indio_dev); @@ -733,16 +733,7 @@ static int ad7173_write_raw(struct iio_dev *indio_dev, cfg = &st->channels[chan->address].cfg; cfg->odr = i; - - if (!cfg->live) - break; - - ret = ad_sd_read_reg(&st->sd, AD7173_REG_FILTER(cfg->cfg_slot), 2, ®); - if (ret) - break; - reg &= ~AD7173_FILTER_ODR0_MASK; - reg |= FIELD_PREP(AD7173_FILTER_ODR0_MASK, i); - ret = ad_sd_write_reg(&st->sd, AD7173_REG_FILTER(cfg->cfg_slot), 2, reg); + cfg->live = false; break; default: @@ -804,8 +795,7 @@ static const struct iio_chan_spec ad7173_channel_template = { .type = IIO_VOLTAGE, .indexed = 1, .info_mask_separate = BIT(IIO_CHAN_INFO_RAW) | - BIT(IIO_CHAN_INFO_SCALE), - .info_mask_shared_by_all = BIT(IIO_CHAN_INFO_SAMP_FREQ), + BIT(IIO_CHAN_INFO_SCALE) | BIT(IIO_CHAN_INFO_SAMP_FREQ), .scan_type = { .sign = 'u', .realbits = 24, @@ -819,8 +809,8 @@ static const struct iio_chan_spec ad7173_temp_iio_channel_template = { .channel = AD7173_AIN_TEMP_POS, .channel2 = AD7173_AIN_TEMP_NEG, .info_mask_separate = BIT(IIO_CHAN_INFO_RAW) | - BIT(IIO_CHAN_INFO_SCALE) | BIT(IIO_CHAN_INFO_OFFSET), - .info_mask_shared_by_all = BIT(IIO_CHAN_INFO_SAMP_FREQ), + BIT(IIO_CHAN_INFO_SCALE) | BIT(IIO_CHAN_INFO_OFFSET) | + BIT(IIO_CHAN_INFO_SAMP_FREQ), .scan_type = { .sign = 'u', .realbits = 24, From 8844ed0a6e063acf7173b231021b2d301e31ded9 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Maneyrol Date: Mon, 27 May 2024 15:01:17 +0000 Subject: [PATCH 0518/1578] iio: imu: inv_mpu6050: stabilized timestamping in interrupt Use IRQ ONESHOT flag to ensure the timestamp is not updated in the hard handler during the thread handler. And use a fixed value of 1 sample that correspond to this first timestamp. This way we can ensure the timestamp is always corresponding to the value used by the timestamping mechanism. Otherwise, it is possible that between FIFO count read and FIFO processing the timestamp is overwritten in the hard handler. Fixes: 111e1abd0045 ("iio: imu: inv_mpu6050: use the common inv_sensors timestamp module") Cc: stable@vger.kernel.org Signed-off-by: Jean-Baptiste Maneyrol Link: https://lore.kernel.org/r/20240527150117.608792-1-inv.git-commit@tdk.com Signed-off-by: Jonathan Cameron --- drivers/iio/imu/inv_mpu6050/inv_mpu_ring.c | 4 ++-- drivers/iio/imu/inv_mpu6050/inv_mpu_trigger.c | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/iio/imu/inv_mpu6050/inv_mpu_ring.c b/drivers/iio/imu/inv_mpu6050/inv_mpu_ring.c index 0dc0f22a558279..3d3b27f28c9d1c 100644 --- a/drivers/iio/imu/inv_mpu6050/inv_mpu_ring.c +++ b/drivers/iio/imu/inv_mpu6050/inv_mpu_ring.c @@ -100,8 +100,8 @@ irqreturn_t inv_mpu6050_read_fifo(int irq, void *p) goto end_session; /* Each FIFO data contains all sensors, so same number for FIFO and sensor data */ fifo_period = NSEC_PER_SEC / INV_MPU6050_DIVIDER_TO_FIFO_RATE(st->chip_config.divider); - inv_sensors_timestamp_interrupt(&st->timestamp, nb, pf->timestamp); - inv_sensors_timestamp_apply_odr(&st->timestamp, fifo_period, nb, 0); + inv_sensors_timestamp_interrupt(&st->timestamp, 1, pf->timestamp); + inv_sensors_timestamp_apply_odr(&st->timestamp, fifo_period, 1, 0); /* clear internal data buffer for avoiding kernel data leak */ memset(data, 0, sizeof(data)); diff --git a/drivers/iio/imu/inv_mpu6050/inv_mpu_trigger.c b/drivers/iio/imu/inv_mpu6050/inv_mpu_trigger.c index 1b603567ccc804..84273660ca2eb9 100644 --- a/drivers/iio/imu/inv_mpu6050/inv_mpu_trigger.c +++ b/drivers/iio/imu/inv_mpu6050/inv_mpu_trigger.c @@ -300,6 +300,7 @@ int inv_mpu6050_probe_trigger(struct iio_dev *indio_dev, int irq_type) if (!st->trig) return -ENOMEM; + irq_type |= IRQF_ONESHOT; ret = devm_request_threaded_irq(&indio_dev->dev, st->irq, &inv_mpu6050_interrupt_timestamp, &inv_mpu6050_interrupt_handle, From 78f0dfa64cbd05f381849377a32e0a2f1afe9215 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 30 May 2024 09:44:16 +0200 Subject: [PATCH 0519/1578] iio: inkern: fix channel read regression A recent "cleanup" broke IIO channel read outs and thereby thermal mitigation on the Lenovo ThinkPad X13s by returning zero instead of the expected IIO value type in iio_read_channel_processed_scale(): thermal thermal_zone12: failed to read out thermal zone (-22) Fixes: 3092bde731ca ("iio: inkern: move to the cleanup.h magic") Cc: Nuno Sa Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20240530074416.13697-1-johan+linaro@kernel.org Signed-off-by: Jonathan Cameron --- drivers/iio/inkern.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/inkern.c b/drivers/iio/inkern.c index 52d7732618282e..485e6fc44a04c6 100644 --- a/drivers/iio/inkern.c +++ b/drivers/iio/inkern.c @@ -721,7 +721,7 @@ int iio_read_channel_processed_scale(struct iio_channel *chan, int *val, return ret; *val *= scale; - return 0; + return ret; } else { ret = iio_channel_read(chan, val, NULL, IIO_CHAN_INFO_RAW); if (ret < 0) From ffbe335b8d471f79b259e950cb20999700670456 Mon Sep 17 00:00:00 2001 From: Matthias Stocker Date: Fri, 31 May 2024 12:37:11 +0200 Subject: [PATCH 0520/1578] vmxnet3: disable rx data ring on dma allocation failure When vmxnet3_rq_create() fails to allocate memory for rq->data_ring.base, the subsequent call to vmxnet3_rq_destroy_all_rxdataring does not reset rq->data_ring.desc_size for the data ring that failed, which presumably causes the hypervisor to reference it on packet reception. To fix this bug, rq->data_ring.desc_size needs to be set to 0 to tell the hypervisor to disable this feature. [ 95.436876] kernel BUG at net/core/skbuff.c:207! [ 95.439074] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI [ 95.440411] CPU: 7 PID: 0 Comm: swapper/7 Not tainted 6.9.3-dirty #1 [ 95.441558] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 12/12/2018 [ 95.443481] RIP: 0010:skb_panic+0x4d/0x4f [ 95.444404] Code: 4f 70 50 8b 87 c0 00 00 00 50 8b 87 bc 00 00 00 50 ff b7 d0 00 00 00 4c 8b 8f c8 00 00 00 48 c7 c7 68 e8 be 9f e8 63 58 f9 ff <0f> 0b 48 8b 14 24 48 c7 c1 d0 73 65 9f e8 a1 ff ff ff 48 8b 14 24 [ 95.447684] RSP: 0018:ffffa13340274dd0 EFLAGS: 00010246 [ 95.448762] RAX: 0000000000000089 RBX: ffff8fbbc72b02d0 RCX: 000000000000083f [ 95.450148] RDX: 0000000000000000 RSI: 00000000000000f6 RDI: 000000000000083f [ 95.451520] RBP: 000000000000002d R08: 0000000000000000 R09: ffffa13340274c60 [ 95.452886] R10: ffffffffa04ed468 R11: 0000000000000002 R12: 0000000000000000 [ 95.454293] R13: ffff8fbbdab3c2d0 R14: ffff8fbbdbd829e0 R15: ffff8fbbdbd809e0 [ 95.455682] FS: 0000000000000000(0000) GS:ffff8fbeefd80000(0000) knlGS:0000000000000000 [ 95.457178] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 95.458340] CR2: 00007fd0d1f650c8 CR3: 0000000115f28000 CR4: 00000000000406f0 [ 95.459791] Call Trace: [ 95.460515] [ 95.461180] ? __die_body.cold+0x19/0x27 [ 95.462150] ? die+0x2e/0x50 [ 95.462976] ? do_trap+0xca/0x110 [ 95.463973] ? do_error_trap+0x6a/0x90 [ 95.464966] ? skb_panic+0x4d/0x4f [ 95.465901] ? exc_invalid_op+0x50/0x70 [ 95.466849] ? skb_panic+0x4d/0x4f [ 95.467718] ? asm_exc_invalid_op+0x1a/0x20 [ 95.468758] ? skb_panic+0x4d/0x4f [ 95.469655] skb_put.cold+0x10/0x10 [ 95.470573] vmxnet3_rq_rx_complete+0x862/0x11e0 [vmxnet3] [ 95.471853] vmxnet3_poll_rx_only+0x36/0xb0 [vmxnet3] [ 95.473185] __napi_poll+0x2b/0x160 [ 95.474145] net_rx_action+0x2c6/0x3b0 [ 95.475115] handle_softirqs+0xe7/0x2a0 [ 95.476122] __irq_exit_rcu+0x97/0xb0 [ 95.477109] common_interrupt+0x85/0xa0 [ 95.478102] [ 95.478846] [ 95.479603] asm_common_interrupt+0x26/0x40 [ 95.480657] RIP: 0010:pv_native_safe_halt+0xf/0x20 [ 95.481801] Code: 22 d7 e9 54 87 01 00 0f 1f 40 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa eb 07 0f 00 2d 93 ba 3b 00 fb f4 2c 87 01 00 66 66 2e 0f 1f 84 00 00 00 00 00 90 90 90 90 90 90 [ 95.485563] RSP: 0018:ffffa133400ffe58 EFLAGS: 00000246 [ 95.486882] RAX: 0000000000004000 RBX: ffff8fbbc1d14064 RCX: 0000000000000000 [ 95.488477] RDX: ffff8fbeefd80000 RSI: ffff8fbbc1d14000 RDI: 0000000000000001 [ 95.490067] RBP: ffff8fbbc1d14064 R08: ffffffffa0652260 R09: 00000000000010d3 [ 95.491683] R10: 0000000000000018 R11: ffff8fbeefdb4764 R12: ffffffffa0652260 [ 95.493389] R13: ffffffffa06522e0 R14: 0000000000000001 R15: 0000000000000000 [ 95.495035] acpi_safe_halt+0x14/0x20 [ 95.496127] acpi_idle_do_entry+0x2f/0x50 [ 95.497221] acpi_idle_enter+0x7f/0xd0 [ 95.498272] cpuidle_enter_state+0x81/0x420 [ 95.499375] cpuidle_enter+0x2d/0x40 [ 95.500400] do_idle+0x1e5/0x240 [ 95.501385] cpu_startup_entry+0x29/0x30 [ 95.502422] start_secondary+0x11c/0x140 [ 95.503454] common_startup_64+0x13e/0x141 [ 95.504466] [ 95.505197] Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 rfkill ip_set nf_tables vsock_loopback vmw_vsock_virtio_transport_common qrtr vmw_vsock_vmci_transport vsock sunrpc binfmt_misc pktcdvd vmw_balloon pcspkr vmw_vmci i2c_piix4 joydev loop dm_multipath nfnetlink zram crct10dif_pclmul crc32_pclmul vmwgfx crc32c_intel polyval_clmulni polyval_generic ghash_clmulni_intel sha512_ssse3 sha256_ssse3 vmxnet3 sha1_ssse3 drm_ttm_helper vmw_pvscsi ttm ata_generic pata_acpi serio_raw scsi_dh_rdac scsi_dh_emc scsi_dh_alua ip6_tables ip_tables fuse [ 95.516536] ---[ end trace 0000000000000000 ]--- Fixes: 6f4833383e85 ("net: vmxnet3: Fix NULL pointer dereference in vmxnet3_rq_rx_complete()") Signed-off-by: Matthias Stocker Reviewed-by: Subbaraya Sundeep Reviewed-by: Ronak Doshi Link: https://lore.kernel.org/r/20240531103711.101961-1-mstocker@barracuda.com Signed-off-by: Jakub Kicinski --- drivers/net/vmxnet3/vmxnet3_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index 89ca6e75fcc6b0..63822d454c00c7 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -2034,8 +2034,8 @@ vmxnet3_rq_destroy_all_rxdataring(struct vmxnet3_adapter *adapter) rq->data_ring.base, rq->data_ring.basePA); rq->data_ring.base = NULL; - rq->data_ring.desc_size = 0; } + rq->data_ring.desc_size = 0; } } From 2fe40483ec257de2a0d819ef88e3e76c7e261319 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 31 May 2024 13:26:32 +0000 Subject: [PATCH 0521/1578] ipv6: ioam: block BH from ioam6_output() As explained in commit 1378817486d6 ("tipc: block BH before using dst_cache"), net/core/dst_cache.c helpers need to be called with BH disabled. Disabling preemption in ioam6_output() is not good enough, because ioam6_output() is called from process context, lwtunnel_output() only uses rcu_read_lock(). We might be interrupted by a softirq, re-enter ioam6_output() and corrupt dst_cache data structures. Fix the race by using local_bh_disable() instead of preempt_disable(). Fixes: 8cb3bf8bff3c ("ipv6: ioam: Add support for the ip6ip6 encapsulation") Signed-off-by: Eric Dumazet Cc: Justin Iurman Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240531132636.2637995-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/ioam6_iptunnel.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c index 7563f8c6aa87cf..bf7120ecea1ebe 100644 --- a/net/ipv6/ioam6_iptunnel.c +++ b/net/ipv6/ioam6_iptunnel.c @@ -351,9 +351,9 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) goto drop; if (!ipv6_addr_equal(&orig_daddr, &ipv6_hdr(skb)->daddr)) { - preempt_disable(); + local_bh_disable(); dst = dst_cache_get(&ilwt->cache); - preempt_enable(); + local_bh_enable(); if (unlikely(!dst)) { struct ipv6hdr *hdr = ipv6_hdr(skb); @@ -373,9 +373,9 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) goto drop; } - preempt_disable(); + local_bh_disable(); dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr); - preempt_enable(); + local_bh_enable(); } skb_dst_drop(skb); From db0090c6eb12c31246438b7fe2a8f1b833e7a653 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 31 May 2024 13:26:33 +0000 Subject: [PATCH 0522/1578] net: ipv6: rpl_iptunnel: block BH in rpl_output() and rpl_input() As explained in commit 1378817486d6 ("tipc: block BH before using dst_cache"), net/core/dst_cache.c helpers need to be called with BH disabled. Disabling preemption in rpl_output() is not good enough, because rpl_output() is called from process context, lwtunnel_output() only uses rcu_read_lock(). We might be interrupted by a softirq, re-enter rpl_output() and corrupt dst_cache data structures. Fix the race by using local_bh_disable() instead of preempt_disable(). Apply a similar change in rpl_input(). Signed-off-by: Eric Dumazet Cc: Alexander Aring Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240531132636.2637995-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/rpl_iptunnel.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index a013b92cbb860a..2c83b7586422dd 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -212,9 +212,9 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (unlikely(err)) goto drop; - preempt_disable(); + local_bh_disable(); dst = dst_cache_get(&rlwt->cache); - preempt_enable(); + local_bh_enable(); if (unlikely(!dst)) { struct ipv6hdr *hdr = ipv6_hdr(skb); @@ -234,9 +234,9 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) goto drop; } - preempt_disable(); + local_bh_disable(); dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr); - preempt_enable(); + local_bh_enable(); } skb_dst_drop(skb); @@ -268,23 +268,21 @@ static int rpl_input(struct sk_buff *skb) return err; } - preempt_disable(); + local_bh_disable(); dst = dst_cache_get(&rlwt->cache); - preempt_enable(); if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); if (!dst->error) { - preempt_disable(); dst_cache_set_ip6(&rlwt->cache, dst, &ipv6_hdr(skb)->saddr); - preempt_enable(); } } else { skb_dst_drop(skb); skb_dst_set(skb, dst); } + local_bh_enable(); err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); if (unlikely(err)) From c0b98ac1cc104f48763cdb27b1e9ac25fd81fc90 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 31 May 2024 13:26:34 +0000 Subject: [PATCH 0523/1578] ipv6: sr: block BH in seg6_output_core() and seg6_input_core() As explained in commit 1378817486d6 ("tipc: block BH before using dst_cache"), net/core/dst_cache.c helpers need to be called with BH disabled. Disabling preemption in seg6_output_core() is not good enough, because seg6_output_core() is called from process context, lwtunnel_output() only uses rcu_read_lock(). We might be interrupted by a softirq, re-enter seg6_output_core() and corrupt dst_cache data structures. Fix the race by using local_bh_disable() instead of preempt_disable(). Apply a similar change in seg6_input_core(). Fixes: fa79581ea66c ("ipv6: sr: fix several BUGs when preemption is enabled") Fixes: 6c8702c60b88 ("ipv6: sr: add support for SRH encapsulation and injection with lwtunnels") Signed-off-by: Eric Dumazet Cc: David Lebrun Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240531132636.2637995-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/seg6_iptunnel.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index a75df2ec8db0d3..098632adc9b5af 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -464,23 +464,21 @@ static int seg6_input_core(struct net *net, struct sock *sk, slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); - preempt_disable(); + local_bh_disable(); dst = dst_cache_get(&slwt->cache); - preempt_enable(); if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); if (!dst->error) { - preempt_disable(); dst_cache_set_ip6(&slwt->cache, dst, &ipv6_hdr(skb)->saddr); - preempt_enable(); } } else { skb_dst_drop(skb); skb_dst_set(skb, dst); } + local_bh_enable(); err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); if (unlikely(err)) @@ -536,9 +534,9 @@ static int seg6_output_core(struct net *net, struct sock *sk, slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); - preempt_disable(); + local_bh_disable(); dst = dst_cache_get(&slwt->cache); - preempt_enable(); + local_bh_enable(); if (unlikely(!dst)) { struct ipv6hdr *hdr = ipv6_hdr(skb); @@ -558,9 +556,9 @@ static int seg6_output_core(struct net *net, struct sock *sk, goto drop; } - preempt_disable(); + local_bh_disable(); dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr); - preempt_enable(); + local_bh_enable(); } skb_dst_drop(skb); From cf28ff8e4c02e1ffa850755288ac954b6ff0db8c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 31 May 2024 13:26:35 +0000 Subject: [PATCH 0524/1578] ila: block BH in ila_output() As explained in commit 1378817486d6 ("tipc: block BH before using dst_cache"), net/core/dst_cache.c helpers need to be called with BH disabled. ila_output() is called from lwtunnel_output() possibly from process context, and under rcu_read_lock(). We might be interrupted by a softirq, re-enter ila_output() and corrupt dst_cache data structures. Fix the race by using local_bh_disable(). Signed-off-by: Eric Dumazet Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240531132636.2637995-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/ila/ila_lwt.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index 0601bad7982213..ff7e734e335b06 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -58,7 +58,9 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) return orig_dst->lwtstate->orig_output(net, sk, skb); } + local_bh_disable(); dst = dst_cache_get(&ilwt->dst_cache); + local_bh_enable(); if (unlikely(!dst)) { struct ipv6hdr *ip6h = ipv6_hdr(skb); struct flowi6 fl6; @@ -86,8 +88,11 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) goto drop; } - if (ilwt->connected) + if (ilwt->connected) { + local_bh_disable(); dst_cache_set_ip6(&ilwt->dst_cache, dst, &fl6.saddr); + local_bh_enable(); + } } skb_dst_set(skb, dst); From 2fe6fb36c781b50482b1c3323fb526bc07d1af59 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 31 May 2024 13:26:36 +0000 Subject: [PATCH 0525/1578] net: dst_cache: add two DEBUG_NET warnings After fixing four different bugs involving dst_cache users, it might be worth adding a check about BH being blocked by dst_cache callers. DEBUG_NET_WARN_ON_ONCE(!in_softirq()); It is not fatal, if we missed valid case where no BH deadlock is to be feared, we might change this. Signed-off-by: Eric Dumazet Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240531132636.2637995-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/dst_cache.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c index 6a0482e676d379..70c634b9e7b023 100644 --- a/net/core/dst_cache.c +++ b/net/core/dst_cache.c @@ -27,6 +27,7 @@ struct dst_cache_pcpu { static void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache, struct dst_entry *dst, u32 cookie) { + DEBUG_NET_WARN_ON_ONCE(!in_softirq()); dst_release(dst_cache->dst); if (dst) dst_hold(dst); @@ -40,6 +41,7 @@ static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, { struct dst_entry *dst; + DEBUG_NET_WARN_ON_ONCE(!in_softirq()); dst = idst->dst; if (!dst) goto fail; From c6cab01d7e20a028ffcee1e0a0b782332a16b5e6 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Fri, 31 May 2024 18:35:43 -0700 Subject: [PATCH 0526/1578] lib/test_rhashtable: add missing MODULE_DESCRIPTION() macro make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in lib/test_rhashtable.o Add the missing invocation of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Link: https://lore.kernel.org/r/20240531-md-lib-test_rhashtable-v1-1-cd6d4138f1b6@quicinc.com Signed-off-by: Jakub Kicinski --- lib/test_rhashtable.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c index 42b585208249c6..c63db03ebb9dcf 100644 --- a/lib/test_rhashtable.c +++ b/lib/test_rhashtable.c @@ -811,4 +811,5 @@ static void __exit test_rht_exit(void) module_init(test_rht_init); module_exit(test_rht_exit); +MODULE_DESCRIPTION("Resizable, Scalable, Concurrent Hash Table test module"); MODULE_LICENSE("GPL v2"); From 2b85b7fb1376481f7d4c2cf92e5da942f06b2547 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Mon, 3 Jun 2024 08:01:03 -0500 Subject: [PATCH 0527/1578] powerpc/crypto: Add generated P8 asm to .gitignore Looks like drivers/crypto/vmx/.gitignore should have been merged into arch/powerpc/crypto/.gitignore as part of commit 109303336a0c ("crypto: vmx - Move to arch/powerpc/crypto") so that all generated asm files are ignored. Signed-off-by: Nathan Lynch Fixes: 109303336a0c ("crypto: vmx - Move to arch/powerpc/crypto") Signed-off-by: Michael Ellerman Link: https://msgid.link/20240603-powerpc-crypto-ignore-p8-asm-v1-1-05843fec2bb7@linux.ibm.com --- arch/powerpc/crypto/.gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/crypto/.gitignore b/arch/powerpc/crypto/.gitignore index e1094f08f713ae..e9fe73aac8b61a 100644 --- a/arch/powerpc/crypto/.gitignore +++ b/arch/powerpc/crypto/.gitignore @@ -1,3 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only aesp10-ppc.S +aesp8-ppc.S ghashp10-ppc.S +ghashp8-ppc.S From 4aa2dcfbad538adf7becd0034a3754e1bd01b2b5 Mon Sep 17 00:00:00 2001 From: Nikita Zhandarovich Date: Fri, 17 May 2024 07:19:14 -0700 Subject: [PATCH 0528/1578] HID: core: remove unnecessary WARN_ON() in implement() Syzkaller hit a warning [1] in a call to implement() when trying to write a value into a field of smaller size in an output report. Since implement() already has a warn message printed out with the help of hid_warn() and value in question gets trimmed with: ... value &= m; ... WARN_ON may be considered superfluous. Remove it to suppress future syzkaller triggers. [1] WARNING: CPU: 0 PID: 5084 at drivers/hid/hid-core.c:1451 implement drivers/hid/hid-core.c:1451 [inline] WARNING: CPU: 0 PID: 5084 at drivers/hid/hid-core.c:1451 hid_output_report+0x548/0x760 drivers/hid/hid-core.c:1863 Modules linked in: CPU: 0 PID: 5084 Comm: syz-executor424 Not tainted 6.9.0-rc7-syzkaller-00183-gcf87f46fd34d #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/02/2024 RIP: 0010:implement drivers/hid/hid-core.c:1451 [inline] RIP: 0010:hid_output_report+0x548/0x760 drivers/hid/hid-core.c:1863 ... Call Trace: __usbhid_submit_report drivers/hid/usbhid/hid-core.c:591 [inline] usbhid_submit_report+0x43d/0x9e0 drivers/hid/usbhid/hid-core.c:636 hiddev_ioctl+0x138b/0x1f00 drivers/hid/usbhid/hiddev.c:726 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:904 [inline] __se_sys_ioctl+0xfc/0x170 fs/ioctl.c:890 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf5/0x240 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f ... Fixes: 95d1c8951e5b ("HID: simplify implement() a bit") Reported-by: Suggested-by: Alan Stern Signed-off-by: Nikita Zhandarovich Signed-off-by: Jiri Kosina --- drivers/hid/hid-core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index b1fa0378e8f4a4..74efda212c55f5 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1448,7 +1448,6 @@ static void implement(const struct hid_device *hid, u8 *report, hid_warn(hid, "%s() called with too large value %d (n: %d)! (%s)\n", __func__, value, n, current->comm); - WARN_ON(1); value &= m; } } From 3ec8ebd8a5b782d56347ae884de880af26f93996 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 27 May 2024 16:22:34 +0300 Subject: [PATCH 0529/1578] EDAC/amd64: Convert PCIBIOS_* return codes to errnos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gpu_get_node_map() uses pci_read_config_dword() that returns PCIBIOS_* codes. The return code is then returned all the way into the module init function amd64_edac_init() that returns it as is. The module init functions, however, should return normal errnos. Convert PCIBIOS_* returns code using pcibios_err_to_errno() into normal errno before returning it from gpu_get_node_map(). For consistency, convert also the other similar cases which return PCIBIOS_* codes even if they do not have any bugs at the moment. Fixes: 4251566ebc1c ("EDAC/amd64: Cache and use GPU node map") Signed-off-by: Ilpo Järvinen Signed-off-by: Borislav Petkov (AMD) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240527132236.13875-1-ilpo.jarvinen@linux.intel.com --- drivers/edac/amd64_edac.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 1f3520d7686135..a17f3c0cdfa601 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -81,7 +81,7 @@ int __amd64_read_pci_cfg_dword(struct pci_dev *pdev, int offset, amd64_warn("%s: error reading F%dx%03x.\n", func, PCI_FUNC(pdev->devfn), offset); - return err; + return pcibios_err_to_errno(err); } int __amd64_write_pci_cfg_dword(struct pci_dev *pdev, int offset, @@ -94,7 +94,7 @@ int __amd64_write_pci_cfg_dword(struct pci_dev *pdev, int offset, amd64_warn("%s: error writing to F%dx%03x.\n", func, PCI_FUNC(pdev->devfn), offset); - return err; + return pcibios_err_to_errno(err); } /* @@ -1025,8 +1025,10 @@ static int gpu_get_node_map(struct amd64_pvt *pvt) } ret = pci_read_config_dword(pdev, REG_LOCAL_NODE_TYPE_MAP, &tmp); - if (ret) + if (ret) { + ret = pcibios_err_to_errno(ret); goto out; + } gpu_node_map.node_count = FIELD_GET(LNTM_NODE_COUNT, tmp); gpu_node_map.base_node_id = FIELD_GET(LNTM_BASE_NODE_ID, tmp); From f8367a74aebf88dc8b58a0db6a6c90b4cb8fc9d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 27 May 2024 16:22:35 +0300 Subject: [PATCH 0530/1578] EDAC/igen6: Convert PCIBIOS_* return codes to errnos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit errcmd_enable_error_reporting() uses pci_{read,write}_config_word() that return PCIBIOS_* codes. The return code is then returned all the way into the probe function igen6_probe() that returns it as is. The probe functions, however, should return normal errnos. Convert PCIBIOS_* returns code using pcibios_err_to_errno() into normal errno before returning it from errcmd_enable_error_reporting(). Fixes: 10590a9d4f23 ("EDAC/igen6: Add EDAC driver for Intel client SoCs using IBECC") Signed-off-by: Ilpo Järvinen Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Qiuxu Zhuo Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240527132236.13875-2-ilpo.jarvinen@linux.intel.com --- drivers/edac/igen6_edac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/edac/igen6_edac.c b/drivers/edac/igen6_edac.c index cdd8480e736877..dbe9fe5f2ca6c3 100644 --- a/drivers/edac/igen6_edac.c +++ b/drivers/edac/igen6_edac.c @@ -800,7 +800,7 @@ static int errcmd_enable_error_reporting(bool enable) rc = pci_read_config_word(imc->pdev, ERRCMD_OFFSET, &errcmd); if (rc) - return rc; + return pcibios_err_to_errno(rc); if (enable) errcmd |= ERRCMD_CE | ERRSTS_UE; @@ -809,7 +809,7 @@ static int errcmd_enable_error_reporting(bool enable) rc = pci_write_config_word(imc->pdev, ERRCMD_OFFSET, errcmd); if (rc) - return rc; + return pcibios_err_to_errno(rc); return 0; } From a535d59432370343058755100ee75ab03c0e3f91 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 30 May 2024 16:26:07 -0700 Subject: [PATCH 0531/1578] net: tls: fix marking packets as decrypted For TLS offload we mark packets with skb->decrypted to make sure they don't escape the host without getting encrypted first. The crypto state lives in the socket, so it may get detached by a call to skb_orphan(). As a safety check - the egress path drops all packets with skb->decrypted and no "crypto-safe" socket. The skb marking was added to sendpage only (and not sendmsg), because tls_device injected data into the TCP stack using sendpage. This special case was missed when sendpage got folded into sendmsg. Fixes: c5c37af6ecad ("tcp: Convert do_tcp_sendpages() to use MSG_SPLICE_PAGES") Signed-off-by: Jakub Kicinski Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240530232607.82686-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/ipv4/tcp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 681b54e1f3a643..4d8cc2ebb64c7d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1165,6 +1165,9 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) process_backlog++; +#ifdef CONFIG_SKB_DECRYPTED + skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED); +#endif tcp_skb_entail(sk, skb); copy = size_goal; From c3552ab19aeb8101b751e7c7ad45deab9e1134e1 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Fri, 24 May 2024 17:15:42 +0200 Subject: [PATCH 0532/1578] staging: vchiq_debugfs: Fix NPD in vchiq_dump_state The commit 42a2f6664e18 ("staging: vc04_services: Move global g_state to vchiq_state") falsely assumed that the debugfs entry vchiq/state was created with vchiq_instance as data. This causes now a NULL pointer derefence while trying to dump the vchiq state. So fix this by passing vchiq_state as data, because this is the relevant part here. Fixes: 42a2f6664e18 ("staging: vc04_services: Move global g_state to vchiq_state") Signed-off-by: Stefan Wahren Reviewed-by: Umang Jain Link: https://lore.kernel.org/r/20240524151542.19415-1-wahrenst@gmx.net Signed-off-by: Greg Kroah-Hartman --- .../staging/vc04_services/interface/vchiq_arm/vchiq_arm.c | 2 +- .../vc04_services/interface/vchiq_arm/vchiq_debugfs.c | 8 ++++---- .../vc04_services/interface/vchiq_arm/vchiq_debugfs.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c index 297af1d80b12bd..69daeba974f2dc 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c @@ -1759,7 +1759,7 @@ static int vchiq_probe(struct platform_device *pdev) if (err) goto failed_platform_init; - vchiq_debugfs_init(); + vchiq_debugfs_init(&mgmt->state); dev_dbg(&pdev->dev, "arm: platform initialised - version %d (min %d)\n", VCHIQ_VERSION, VCHIQ_VERSION_MIN); diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.c index 54e7bf029d9a1f..1f74d0bb33bae6 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.c @@ -42,9 +42,9 @@ static int debugfs_trace_show(struct seq_file *f, void *offset) static int vchiq_dump_show(struct seq_file *f, void *offset) { - struct vchiq_instance *instance = f->private; + struct vchiq_state *state = f->private; - vchiq_dump_state(f, instance->state); + vchiq_dump_state(f, state); return 0; } @@ -121,12 +121,12 @@ void vchiq_debugfs_remove_instance(struct vchiq_instance *instance) debugfs_remove_recursive(node->dentry); } -void vchiq_debugfs_init(void) +void vchiq_debugfs_init(struct vchiq_state *state) { vchiq_dbg_dir = debugfs_create_dir("vchiq", NULL); vchiq_dbg_clients = debugfs_create_dir("clients", vchiq_dbg_dir); - debugfs_create_file("state", S_IFREG | 0444, vchiq_dbg_dir, NULL, + debugfs_create_file("state", S_IFREG | 0444, vchiq_dbg_dir, state, &vchiq_dump_fops); } diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.h b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.h index e9bf055a4ca958..fabffd81b1ec9a 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.h +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.h @@ -10,7 +10,7 @@ struct vchiq_debugfs_node { struct dentry *dentry; }; -void vchiq_debugfs_init(void); +void vchiq_debugfs_init(struct vchiq_state *state); void vchiq_debugfs_deinit(void); From a295ec52c8624883885396fde7b4df1a179627c3 Mon Sep 17 00:00:00 2001 From: "Kun(llfl)" Date: Thu, 9 May 2024 08:42:20 +0800 Subject: [PATCH 0533/1578] iommu/amd: Fix sysfs leak in iommu init During the iommu initialization, iommu_init_pci() adds sysfs nodes. However, these nodes aren't remove in free_iommu_resources() subsequently. Fixes: 39ab9555c241 ("iommu: Add sysfs bindings for struct iommu_device") Signed-off-by: Kun(llfl) Reviewed-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/c8e0d11c6ab1ee48299c288009cf9c5dae07b42d.1715215003.git.llfl@linux.alibaba.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/init.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index a18e74878f68a0..27e2937270950b 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -1626,8 +1626,17 @@ static void __init free_pci_segments(void) } } +static void __init free_sysfs(struct amd_iommu *iommu) +{ + if (iommu->iommu.dev) { + iommu_device_unregister(&iommu->iommu); + iommu_device_sysfs_remove(&iommu->iommu); + } +} + static void __init free_iommu_one(struct amd_iommu *iommu) { + free_sysfs(iommu); free_cwwb_sem(iommu); free_command_buffer(iommu); free_event_buffer(iommu); From cc8d89d0637990c66440a226f443d95340979a04 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 20 May 2024 20:14:44 +0100 Subject: [PATCH 0534/1578] iommu/dma: Fix domain init Despite carefully rewording the kerneldoc to describe the new direct interaction with dma_range_map, it seems I managed to confuse myself in removing the redundant force_aperture check and ended up making the code not do that at all. This led to dma_range_maps inadvertently being able to set iovad->start_pfn = 0, and all the nonsensical chaos which ensues from there. Restore the correct behaviour of constraining base_pfn to the domain aperture regardless of dma_range_map, and not trying to apply dma_range_map constraints to the basic IOVA domain since they will be properly handled with reserved regions later. Reported-by: Jon Hunter Reported-by: Jerry Snitselaar Fixes: ad4750b07d34 ("iommu/dma: Make limit checks self-contained") Signed-off-by: Robin Murphy Tested-by: Jerry Snitselaar Reviewed-by: Jerry Snitselaar Link: https://lore.kernel.org/r/721fa6baebb0924aa40db0b8fb86bcb4538434af.1716232484.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/dma-iommu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index f731e4b2a41724..43520e7275cc12 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -686,15 +686,15 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, struct device *dev /* Check the domain allows at least some access to the device... */ if (map) { - dma_addr_t base = dma_range_map_min(map); - if (base > domain->geometry.aperture_end || + if (dma_range_map_min(map) > domain->geometry.aperture_end || dma_range_map_max(map) < domain->geometry.aperture_start) { pr_warn("specified DMA range outside IOMMU capability\n"); return -EFAULT; } - /* ...then finally give it a kicking to make sure it fits */ - base_pfn = max(base, domain->geometry.aperture_start) >> order; } + /* ...then finally give it a kicking to make sure it fits */ + base_pfn = max_t(unsigned long, base_pfn, + domain->geometry.aperture_start >> order); /* start_pfn is always nonzero for an already-initialised domain */ mutex_lock(&cookie->mutex); From 89e8a2366e3bce584b6c01549d5019c5cda1205e Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 28 May 2024 12:25:28 +0800 Subject: [PATCH 0535/1578] iommu: Return right value in iommu_sva_bind_device() iommu_sva_bind_device() should return either a sva bond handle or an ERR_PTR value in error cases. Existing drivers (idxd and uacce) only check the return value with IS_ERR(). This could potentially lead to a kernel NULL pointer dereference issue if the function returns NULL instead of an error pointer. In reality, this doesn't cause any problems because iommu_sva_bind_device() only returns NULL when the kernel is not configured with CONFIG_IOMMU_SVA. In this case, iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA) will return an error, and the device drivers won't call iommu_sva_bind_device() at all. Fixes: 26b25a2b98e4 ("iommu: Bind process address spaces to devices") Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Reviewed-by: Kevin Tian Reviewed-by: Vasant Hegde Link: https://lore.kernel.org/r/20240528042528.71396-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7bc8dff7cf6d76..17b3f36ad843ee 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1533,7 +1533,7 @@ struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, static inline struct iommu_sva * iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) { - return NULL; + return ERR_PTR(-ENODEV); } static inline void iommu_sva_unbind_device(struct iommu_sva *handle) From 65909a7e7aa8b25c9cc5f04c1fd5d6f0f1d76fcd Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Mon, 3 Jun 2024 17:16:07 -0700 Subject: [PATCH 0536/1578] ASoC: qcom: add missing MODULE_DESCRIPTION() macro make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in sound/soc/qcom/snd-soc-qcom-sdw.o Add the missing invocation of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Link: https://msgid.link/r/20240603-md-snd-soc-qcom-sdw-v1-1-101ea8bcdd38@quicinc.com Signed-off-by: Mark Brown --- sound/soc/qcom/sdw.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/qcom/sdw.c b/sound/soc/qcom/sdw.c index eaa8bb016e5096..f2eda2ff46c04f 100644 --- a/sound/soc/qcom/sdw.c +++ b/sound/soc/qcom/sdw.c @@ -160,4 +160,5 @@ int qcom_snd_sdw_hw_free(struct snd_pcm_substream *substream, return 0; } EXPORT_SYMBOL_GPL(qcom_snd_sdw_hw_free); +MODULE_DESCRIPTION("Qualcomm ASoC SoundWire helper functions"); MODULE_LICENSE("GPL"); From 998a0a362b0b4cf501161c3319e6994d37c45a8c Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 29 May 2024 11:39:00 +0000 Subject: [PATCH 0537/1578] iommu/amd: Fix workqueue name Workqueue name length is crossing WQ_NAME_LEN limit. Fix it by changing name format. New format : "iopf_queue/amdvi-" kernel warning: [ 11.146912] workqueue: name exceeds WQ_NAME_LEN. Truncating to: iopf_queue/amdiommu-0xc002-iopf Reported-by: Borislav Petkov Fixes: 61928bab9d26 ("iommu/amd: Define per-IOMMU iopf_queue") Signed-off-by: Vasant Hegde Acked-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240529113900.5798-1-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/ppr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/iommu/amd/ppr.c b/drivers/iommu/amd/ppr.c index 091423bb8aac88..31d98a2a11a6e9 100644 --- a/drivers/iommu/amd/ppr.c +++ b/drivers/iommu/amd/ppr.c @@ -222,8 +222,7 @@ int amd_iommu_iopf_init(struct amd_iommu *iommu) if (iommu->iopf_queue) return ret; - snprintf(iommu->iopfq_name, sizeof(iommu->iopfq_name), - "amdiommu-%#x-iopfq", + snprintf(iommu->iopfq_name, sizeof(iommu->iopfq_name), "amdvi-%#x", PCI_SEG_DEVID_TO_SBDF(iommu->pci_seg->id, iommu->devid)); iommu->iopf_queue = iopf_queue_alloc(iommu->iopfq_name); From 48dc345a23b984c457d1c5878168d026c500618f Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 30 May 2024 07:11:18 +0000 Subject: [PATCH 0538/1578] iommu/amd: Check EFR[EPHSup] bit before enabling PPR Check for EFR[EPHSup] bit before enabling PPR. This bit must be set to enable PPR. Reported-by: Borislav Petkov Fixes: c4cb23111103 ("iommu/amd: Add support for enable/disable IOPF") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218900 Tested-by: Borislav Petkov Tested-by: Jean-Christophe Guillain Signed-off-by: Vasant Hegde Reviewed-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/20240530071118.10297-1-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index 2fde1302a5843c..2d5945c982bde5 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -129,7 +129,8 @@ static inline int check_feature_gpt_level(void) static inline bool amd_iommu_gt_ppr_supported(void) { return (check_feature(FEATURE_GT) && - check_feature(FEATURE_PPR)); + check_feature(FEATURE_PPR) && + check_feature(FEATURE_EPHSUP)); } static inline u64 iommu_virt_to_phys(void *vaddr) From 526606b0a1998b0791b42c199d53550c3ba724b5 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 30 May 2024 08:48:01 +0000 Subject: [PATCH 0539/1578] iommu/amd: Fix Invalid wait context issue With commit c4cb23111103 ("iommu/amd: Add support for enable/disable IOPF") we are hitting below issue. This happens because in IOPF enablement path it holds spin lock with irq disable and then tries to take mutex lock. dmesg: ----- [ 0.938739] ============================= [ 0.938740] [ BUG: Invalid wait context ] [ 0.938742] 6.10.0-rc1+ #1 Not tainted [ 0.938745] ----------------------------- [ 0.938746] swapper/0/1 is trying to lock: [ 0.938748] ffffffff8c9f01d8 (&port_lock_key){....}-{3:3}, at: serial8250_console_write+0x78/0x4a0 [ 0.938767] other info that might help us debug this: [ 0.938768] context-{5:5} [ 0.938769] 7 locks held by swapper/0/1: [ 0.938772] #0: ffff888101a91310 (&group->mutex){+.+.}-{4:4}, at: bus_iommu_probe+0x70/0x160 [ 0.938790] #1: ffff888101d1f1b8 (&domain->lock){....}-{3:3}, at: amd_iommu_attach_device+0xa5/0x700 [ 0.938799] #2: ffff888101cc3d18 (&dev_data->lock){....}-{3:3}, at: amd_iommu_attach_device+0xc5/0x700 [ 0.938806] #3: ffff888100052830 (&iommu->lock){....}-{2:2}, at: amd_iommu_iopf_add_device+0x3f/0xa0 [ 0.938813] #4: ffffffff8945a340 (console_lock){+.+.}-{0:0}, at: _printk+0x48/0x50 [ 0.938822] #5: ffffffff8945a390 (console_srcu){....}-{0:0}, at: console_flush_all+0x58/0x4e0 [ 0.938867] #6: ffffffff82459f80 (console_owner){....}-{0:0}, at: console_flush_all+0x1f0/0x4e0 [ 0.938872] stack backtrace: [ 0.938874] CPU: 2 PID: 1 Comm: swapper/0 Not tainted 6.10.0-rc1+ #1 [ 0.938877] Hardware name: HP HP EliteBook 745 G3/807E, BIOS N73 Ver. 01.39 04/16/2019 Fix above issue by re-arranging code in attach device path: - move device PASID/IOPF enablement outside lock in AMD IOMMU driver. This is safe as core layer holds group->mutex lock before calling iommu_ops->attach_dev. Reported-by: Borislav Petkov Reported-by: Mikhail Gavrilov Reported-by: Chris Bainbridge Fixes: c4cb23111103 ("iommu/amd: Add support for enable/disable IOPF") Tested-by: Borislav Petkov Tested-by: Chris Bainbridge Tested-by: Mikhail Gavrilov Signed-off-by: Vasant Hegde Link: https://lore.kernel.org/r/20240530084801.10758-1-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 48 +++++++++++++++++++-------------------- drivers/iommu/amd/ppr.c | 22 ++++-------------- 2 files changed, 28 insertions(+), 42 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 52d83730a22ad4..c2703599bb1668 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2032,7 +2032,6 @@ static int do_attach(struct iommu_dev_data *dev_data, struct protection_domain *domain) { struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); - struct pci_dev *pdev; int ret = 0; /* Update data structures */ @@ -2047,30 +2046,13 @@ static int do_attach(struct iommu_dev_data *dev_data, domain->dev_iommu[iommu->index] += 1; domain->dev_cnt += 1; - pdev = dev_is_pci(dev_data->dev) ? to_pci_dev(dev_data->dev) : NULL; + /* Setup GCR3 table */ if (pdom_is_sva_capable(domain)) { ret = init_gcr3_table(dev_data, domain); if (ret) return ret; - - if (pdev) { - pdev_enable_caps(pdev); - - /* - * Device can continue to function even if IOPF - * enablement failed. Hence in error path just - * disable device PRI support. - */ - if (amd_iommu_iopf_add_device(iommu, dev_data)) - pdev_disable_cap_pri(pdev); - } - } else if (pdev) { - pdev_enable_cap_ats(pdev); } - /* Update device table */ - amd_iommu_dev_update_dte(dev_data, true); - return ret; } @@ -2163,6 +2145,11 @@ static void detach_device(struct device *dev) do_detach(dev_data); +out: + spin_unlock(&dev_data->lock); + + spin_unlock_irqrestore(&domain->lock, flags); + /* Remove IOPF handler */ if (ppr) amd_iommu_iopf_remove_device(iommu, dev_data); @@ -2170,10 +2157,6 @@ static void detach_device(struct device *dev) if (dev_is_pci(dev)) pdev_disable_caps(to_pci_dev(dev)); -out: - spin_unlock(&dev_data->lock); - - spin_unlock_irqrestore(&domain->lock, flags); } static struct iommu_device *amd_iommu_probe_device(struct device *dev) @@ -2485,6 +2468,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); struct protection_domain *domain = to_pdomain(dom); struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); + struct pci_dev *pdev; int ret; /* @@ -2517,7 +2501,23 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, } #endif - iommu_completion_wait(iommu); + pdev = dev_is_pci(dev_data->dev) ? to_pci_dev(dev_data->dev) : NULL; + if (pdev && pdom_is_sva_capable(domain)) { + pdev_enable_caps(pdev); + + /* + * Device can continue to function even if IOPF + * enablement failed. Hence in error path just + * disable device PRI support. + */ + if (amd_iommu_iopf_add_device(iommu, dev_data)) + pdev_disable_cap_pri(pdev); + } else if (pdev) { + pdev_enable_cap_ats(pdev); + } + + /* Update device table */ + amd_iommu_dev_update_dte(dev_data, true); return ret; } diff --git a/drivers/iommu/amd/ppr.c b/drivers/iommu/amd/ppr.c index 31d98a2a11a6e9..7c67d69f0b8cad 100644 --- a/drivers/iommu/amd/ppr.c +++ b/drivers/iommu/amd/ppr.c @@ -248,40 +248,26 @@ void amd_iommu_page_response(struct device *dev, struct iopf_fault *evt, int amd_iommu_iopf_add_device(struct amd_iommu *iommu, struct iommu_dev_data *dev_data) { - unsigned long flags; int ret = 0; if (!dev_data->pri_enabled) return ret; - raw_spin_lock_irqsave(&iommu->lock, flags); - - if (!iommu->iopf_queue) { - ret = -EINVAL; - goto out_unlock; - } + if (!iommu->iopf_queue) + return -EINVAL; ret = iopf_queue_add_device(iommu->iopf_queue, dev_data->dev); if (ret) - goto out_unlock; + return ret; dev_data->ppr = true; - -out_unlock: - raw_spin_unlock_irqrestore(&iommu->lock, flags); - return ret; + return 0; } /* Its assumed that caller has verified that device was added to iopf queue */ void amd_iommu_iopf_remove_device(struct amd_iommu *iommu, struct iommu_dev_data *dev_data) { - unsigned long flags; - - raw_spin_lock_irqsave(&iommu->lock, flags); - iopf_queue_remove_device(iommu->iopf_queue, dev_data->dev); dev_data->ppr = false; - - raw_spin_unlock_irqrestore(&iommu->lock, flags); } From b19ab7ee2c4c1ec5f27c18413c3ab63907f7d55c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 14 May 2024 17:04:29 +0300 Subject: [PATCH 0540/1578] tty: n_tty: Fix buffer offsets when lookahead is used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When lookahead has "consumed" some characters (la_count > 0), n_tty_receive_buf_standard() and n_tty_receive_buf_closing() for characters beyond the la_count are given wrong cp/fp offsets which leads to duplicating and losing some characters. If la_count > 0, correct buffer pointers and make count consistent too (the latter is not strictly necessary to fix the issue but seems more logical to adjust all variables immediately to keep state consistent). Reported-by: Vadym Krevs Fixes: 6bb6fa6908eb ("tty: Implement lookahead to process XON/XOFF timely") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218834 Tested-by: Vadym Krevs Cc: stable@vger.kernel.org Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20240514140429.12087-1-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/n_tty.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c index f252d0b5a434ea..5e9ca4376d686e 100644 --- a/drivers/tty/n_tty.c +++ b/drivers/tty/n_tty.c @@ -1619,15 +1619,25 @@ static void __receive_buf(struct tty_struct *tty, const u8 *cp, const u8 *fp, else if (ldata->raw || (L_EXTPROC(tty) && !preops)) n_tty_receive_buf_raw(tty, cp, fp, count); else if (tty->closing && !L_EXTPROC(tty)) { - if (la_count > 0) + if (la_count > 0) { n_tty_receive_buf_closing(tty, cp, fp, la_count, true); - if (count > la_count) - n_tty_receive_buf_closing(tty, cp, fp, count - la_count, false); + cp += la_count; + if (fp) + fp += la_count; + count -= la_count; + } + if (count > 0) + n_tty_receive_buf_closing(tty, cp, fp, count, false); } else { - if (la_count > 0) + if (la_count > 0) { n_tty_receive_buf_standard(tty, cp, fp, la_count, true); - if (count > la_count) - n_tty_receive_buf_standard(tty, cp, fp, count - la_count, false); + cp += la_count; + if (fp) + fp += la_count; + count -= la_count; + } + if (count > 0) + n_tty_receive_buf_standard(tty, cp, fp, count, false); flush_echoes(tty); if (tty->ops->flush_chars) From 87d80bfbd577912462061b1a45c0ed9c7fcb872f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 14 May 2024 22:05:53 +0300 Subject: [PATCH 0541/1578] serial: 8250_dw: Don't use struct dw8250_data outside of 8250_dw The container of the struct dw8250_port_data is private to the actual driver. In particular, 8250_lpss and 8250_dw use different data types that are assigned to the UART port private_data. Hence, it must not be used outside the specific driver. Currently the only cpr_val is required by the common code, make it be available via struct dw8250_port_data. This fixes the UART breakage on Intel Galileo boards. Fixes: 593dea000bc1 ("serial: 8250: dw: Allow to use a fallback CPR value if not synthesized") Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240514190730.2787071-2-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_dw.c | 9 +++++++-- drivers/tty/serial/8250/8250_dwlib.c | 3 +-- drivers/tty/serial/8250/8250_dwlib.h | 3 ++- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/tty/serial/8250/8250_dw.c b/drivers/tty/serial/8250/8250_dw.c index ba9f4dc4e71da5..667e16897bde77 100644 --- a/drivers/tty/serial/8250/8250_dw.c +++ b/drivers/tty/serial/8250/8250_dw.c @@ -55,6 +55,7 @@ #define DW_UART_QUIRK_SKIP_SET_RATE BIT(2) #define DW_UART_QUIRK_IS_DMA_FC BIT(3) #define DW_UART_QUIRK_APMC0D08 BIT(4) +#define DW_UART_QUIRK_CPR_VALUE BIT(5) static inline struct dw8250_data *clk_to_dw8250_data(struct notifier_block *nb) { @@ -432,6 +433,10 @@ static void dw8250_prepare_rx_dma(struct uart_8250_port *p) static void dw8250_quirks(struct uart_port *p, struct dw8250_data *data) { unsigned int quirks = data->pdata ? data->pdata->quirks : 0; + u32 cpr_value = data->pdata ? data->pdata->cpr_value : 0; + + if (quirks & DW_UART_QUIRK_CPR_VALUE) + data->data.cpr_value = cpr_value; #ifdef CONFIG_64BIT if (quirks & DW_UART_QUIRK_OCTEON) { @@ -714,8 +719,8 @@ static const struct dw8250_platform_data dw8250_armada_38x_data = { static const struct dw8250_platform_data dw8250_renesas_rzn1_data = { .usr_reg = DW_UART_USR, - .cpr_val = 0x00012f32, - .quirks = DW_UART_QUIRK_IS_DMA_FC, + .cpr_value = 0x00012f32, + .quirks = DW_UART_QUIRK_CPR_VALUE | DW_UART_QUIRK_IS_DMA_FC, }; static const struct dw8250_platform_data dw8250_starfive_jh7100_data = { diff --git a/drivers/tty/serial/8250/8250_dwlib.c b/drivers/tty/serial/8250/8250_dwlib.c index 3e33ddf7bc8007..5a2520943dfd53 100644 --- a/drivers/tty/serial/8250/8250_dwlib.c +++ b/drivers/tty/serial/8250/8250_dwlib.c @@ -242,7 +242,6 @@ static const struct serial_rs485 dw8250_rs485_supported = { void dw8250_setup_port(struct uart_port *p) { struct dw8250_port_data *pd = p->private_data; - struct dw8250_data *data = to_dw8250_data(pd); struct uart_8250_port *up = up_to_u8250p(p); u32 reg, old_dlf; @@ -278,7 +277,7 @@ void dw8250_setup_port(struct uart_port *p) reg = dw8250_readl_ext(p, DW_UART_CPR); if (!reg) { - reg = data->pdata->cpr_val; + reg = pd->cpr_value; dev_dbg(p->dev, "CPR is not available, using 0x%08x instead\n", reg); } if (!reg) diff --git a/drivers/tty/serial/8250/8250_dwlib.h b/drivers/tty/serial/8250/8250_dwlib.h index f13e91f2cace9c..794a9014cdac1d 100644 --- a/drivers/tty/serial/8250/8250_dwlib.h +++ b/drivers/tty/serial/8250/8250_dwlib.h @@ -19,6 +19,7 @@ struct dw8250_port_data { struct uart_8250_dma dma; /* Hardware configuration */ + u32 cpr_value; u8 dlf_size; /* RS485 variables */ @@ -27,7 +28,7 @@ struct dw8250_port_data { struct dw8250_platform_data { u8 usr_reg; - u32 cpr_val; + u32 cpr_value; unsigned int quirks; }; From 2c94512055f362dd789e0f87b8566feeddec83c9 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 14 May 2024 22:05:54 +0300 Subject: [PATCH 0542/1578] serial: 8250_dw: Revert "Move definitions to the shared header" This reverts commit d9666dfb314e1ffd6eb9c3c4243fe3e094c047a7. The container of the struct dw8250_port_data is private to the actual driver. In particular, 8250_lpss and 8250_dw use different data types that are assigned to the UART port private_data. Hence, it must not be used outside the specific driver. Fix the mistake made in the past by moving the respective definitions to the specific driver. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240514190730.2787071-3-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_dw.c | 27 +++++++++++++++++++++++ drivers/tty/serial/8250/8250_dwlib.h | 32 ---------------------------- 2 files changed, 27 insertions(+), 32 deletions(-) diff --git a/drivers/tty/serial/8250/8250_dw.c b/drivers/tty/serial/8250/8250_dw.c index 667e16897bde77..fb809e32c6ae28 100644 --- a/drivers/tty/serial/8250/8250_dw.c +++ b/drivers/tty/serial/8250/8250_dw.c @@ -57,6 +57,33 @@ #define DW_UART_QUIRK_APMC0D08 BIT(4) #define DW_UART_QUIRK_CPR_VALUE BIT(5) +struct dw8250_platform_data { + u8 usr_reg; + u32 cpr_value; + unsigned int quirks; +}; + +struct dw8250_data { + struct dw8250_port_data data; + const struct dw8250_platform_data *pdata; + + int msr_mask_on; + int msr_mask_off; + struct clk *clk; + struct clk *pclk; + struct notifier_block clk_notifier; + struct work_struct clk_work; + struct reset_control *rst; + + unsigned int skip_autocfg:1; + unsigned int uart_16550_compatible:1; +}; + +static inline struct dw8250_data *to_dw8250_data(struct dw8250_port_data *data) +{ + return container_of(data, struct dw8250_data, data); +} + static inline struct dw8250_data *clk_to_dw8250_data(struct notifier_block *nb) { return container_of(nb, struct dw8250_data, clk_notifier); diff --git a/drivers/tty/serial/8250/8250_dwlib.h b/drivers/tty/serial/8250/8250_dwlib.h index 794a9014cdac1d..7dd2a8e7b78085 100644 --- a/drivers/tty/serial/8250/8250_dwlib.h +++ b/drivers/tty/serial/8250/8250_dwlib.h @@ -2,15 +2,10 @@ /* Synopsys DesignWare 8250 library header file. */ #include -#include #include -#include #include "8250.h" -struct clk; -struct reset_control; - struct dw8250_port_data { /* Port properties */ int line; @@ -26,36 +21,9 @@ struct dw8250_port_data { bool hw_rs485_support; }; -struct dw8250_platform_data { - u8 usr_reg; - u32 cpr_value; - unsigned int quirks; -}; - -struct dw8250_data { - struct dw8250_port_data data; - const struct dw8250_platform_data *pdata; - - int msr_mask_on; - int msr_mask_off; - struct clk *clk; - struct clk *pclk; - struct notifier_block clk_notifier; - struct work_struct clk_work; - struct reset_control *rst; - - unsigned int skip_autocfg:1; - unsigned int uart_16550_compatible:1; -}; - void dw8250_do_set_termios(struct uart_port *p, struct ktermios *termios, const struct ktermios *old); void dw8250_setup_port(struct uart_port *p); -static inline struct dw8250_data *to_dw8250_data(struct dw8250_port_data *data) -{ - return container_of(data, struct dw8250_data, data); -} - static inline u32 dw8250_readl_ext(struct uart_port *p, int offset) { if (p->iotype == UPIO_MEM32BE) From 5208e7ced520a813b4f4774451fbac4e517e78b2 Mon Sep 17 00:00:00 2001 From: Doug Brown Date: Sun, 19 May 2024 12:19:30 -0700 Subject: [PATCH 0543/1578] serial: 8250_pxa: Configure tx_loadsz to match FIFO IRQ level The FIFO is 64 bytes, but the FCR is configured to fire the TX interrupt when the FIFO is half empty (bit 3 = 0). Thus, we should only write 32 bytes when a TX interrupt occurs. This fixes a problem observed on the PXA168 that dropped a bunch of TX bytes during large transmissions. Fixes: ab28f51c77cd ("serial: rewrite pxa2xx-uart to use 8250_core") Signed-off-by: Doug Brown Link: https://lore.kernel.org/r/20240519191929.122202-1-doug@schmorgal.com Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_pxa.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/tty/serial/8250/8250_pxa.c b/drivers/tty/serial/8250/8250_pxa.c index f1a51b00b1b9de..ba96fa913e7f99 100644 --- a/drivers/tty/serial/8250/8250_pxa.c +++ b/drivers/tty/serial/8250/8250_pxa.c @@ -125,6 +125,7 @@ static int serial_pxa_probe(struct platform_device *pdev) uart.port.iotype = UPIO_MEM32; uart.port.regshift = 2; uart.port.fifosize = 64; + uart.tx_loadsz = 32; uart.dl_write = serial_pxa_dl_write; ret = serial8250_register_8250_port(&uart); From ca84cd379b45e9b1775b9e026f069a3a886b409d Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 31 May 2024 08:09:18 -0700 Subject: [PATCH 0544/1578] serial: port: Don't block system suspend even if bytes are left to xmit Recently, suspend testing on sc7180-trogdor based devices has started to sometimes fail with messages like this: port a88000.serial:0.0: PM: calling pm_runtime_force_suspend+0x0/0xf8 @ 28934, parent: a88000.serial:0 port a88000.serial:0.0: PM: dpm_run_callback(): pm_runtime_force_suspend+0x0/0xf8 returns -16 port a88000.serial:0.0: PM: pm_runtime_force_suspend+0x0/0xf8 returned -16 after 33 usecs port a88000.serial:0.0: PM: failed to suspend: error -16 I could reproduce these problems by logging in via an agetty on the debug serial port (which was _not_ used for kernel console) and running: cat /var/log/messages ...and then (via an SSH session) forcing a few suspend/resume cycles. Tracing through the code and doing some printf()-based debugging shows that the -16 (-EBUSY) comes from the recently added serial_port_runtime_suspend(). The idea of the serial_port_runtime_suspend() function is to prevent the port from being _runtime_ suspended if it still has bytes left to transmit. Having bytes left to transmit isn't a reason to block _system_ suspend, though. If a serdev device in the kernel needs to block system suspend it should block its own suspend and it can use serdev_device_wait_until_sent() to ensure bytes are sent. The DEFINE_RUNTIME_DEV_PM_OPS() used by the serial_port code means that the system suspend function will be pm_runtime_force_suspend(). In pm_runtime_force_suspend() we can see that before calling the runtime suspend function we'll call pm_runtime_disable(). This should be a reliable way to detect that we're called from system suspend and that we shouldn't look for busyness. Fixes: 43066e32227e ("serial: port: Don't suspend if the port is still busy") Cc: stable@vger.kernel.org Reviewed-by: Tony Lindgren Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20240531080914.v3.1.I2395e66cf70c6e67d774c56943825c289b9c13e4@changeid Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_port.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/tty/serial/serial_port.c b/drivers/tty/serial/serial_port.c index 91a338d3cb343d..d35f1d24156c22 100644 --- a/drivers/tty/serial/serial_port.c +++ b/drivers/tty/serial/serial_port.c @@ -64,6 +64,13 @@ static int serial_port_runtime_suspend(struct device *dev) if (port->flags & UPF_DEAD) return 0; + /* + * Nothing to do on pm_runtime_force_suspend(), see + * DEFINE_RUNTIME_DEV_PM_OPS. + */ + if (!pm_runtime_enabled(dev)) + return 0; + uart_port_lock_irqsave(port, &flags); if (!port_dev->tx_enabled) { uart_port_unlock_irqrestore(port, flags); From 4e534ff4b69c6960a165cab2c851b48f0a0da945 Mon Sep 17 00:00:00 2001 From: Hugo Villeneuve Date: Mon, 3 Jun 2024 11:26:00 -0400 Subject: [PATCH 0545/1578] serial: sc16is7xx: rename Kconfig CONFIG_SERIAL_SC16IS7XX_CORE Commit d49216438139 ("serial: sc16is7xx: split into core and I2C/SPI parts (core)") renamed SERIAL_SC16IS7XX_CORE by SERIAL_SC16IS7XX. This means that some configs should have been updated when I submitted the original patch, but unfortunately they were not. Geert mentioned for example: arch/mips/configs/cu1??0-neo_defconfig Rename SERIAL_SC16IS7XX to SERIAL_SC16IS7XX_CORE so that existing configs will still work correctly. Suggested-by: Geert Uytterhoeven Fixes: d49216438139 ("serial: sc16is7xx: split into core and I2C/SPI parts (core)") Signed-off-by: Hugo Villeneuve Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20240603152601.3689319-2-hugo@hugovil.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/Kconfig | 2 +- drivers/tty/serial/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig index 4fdd7857ef4dff..2a0da35c0ef8f5 100644 --- a/drivers/tty/serial/Kconfig +++ b/drivers/tty/serial/Kconfig @@ -1023,7 +1023,7 @@ config SERIAL_SCCNXP_CONSOLE help Support for console on SCCNXP serial ports. -config SERIAL_SC16IS7XX_CORE +config SERIAL_SC16IS7XX tristate "NXP SC16IS7xx UART support" select SERIAL_CORE select SERIAL_SC16IS7XX_SPI if SPI_MASTER diff --git a/drivers/tty/serial/Makefile b/drivers/tty/serial/Makefile index faa45f2b8bb054..6ff74f0a9530c4 100644 --- a/drivers/tty/serial/Makefile +++ b/drivers/tty/serial/Makefile @@ -75,7 +75,7 @@ obj-$(CONFIG_SERIAL_SA1100) += sa1100.o obj-$(CONFIG_SERIAL_SAMSUNG) += samsung_tty.o obj-$(CONFIG_SERIAL_SB1250_DUART) += sb1250-duart.o obj-$(CONFIG_SERIAL_SCCNXP) += sccnxp.o -obj-$(CONFIG_SERIAL_SC16IS7XX_CORE) += sc16is7xx.o +obj-$(CONFIG_SERIAL_SC16IS7XX) += sc16is7xx.o obj-$(CONFIG_SERIAL_SC16IS7XX_SPI) += sc16is7xx_spi.o obj-$(CONFIG_SERIAL_SC16IS7XX_I2C) += sc16is7xx_i2c.o obj-$(CONFIG_SERIAL_SH_SCI) += sh-sci.o From 7a2e8e30ad89f498b206977396d028e10174390a Mon Sep 17 00:00:00 2001 From: Hugo Villeneuve Date: Mon, 3 Jun 2024 11:26:01 -0400 Subject: [PATCH 0546/1578] serial: sc16is7xx: re-add Kconfig SPI or I2C dependency Commit d49216438139 ("serial: sc16is7xx: split into core and I2C/SPI parts (core)") removed Kconfig SPI_MASTER or I2C dependency for SERIAL_SC16IS7XX (core). This removal was done because I inadvertently misinterpreted some review comments. Because of that, the driver question now pops up if both I2C and SPI_MASTER are disabled. Re-add Kconfig SPI_MASTER or I2C dependency to fix the problem. Suggested-by: Geert Uytterhoeven Fixes: d49216438139 ("serial: sc16is7xx: split into core and I2C/SPI parts (core)") Signed-off-by: Hugo Villeneuve Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20240603152601.3689319-3-hugo@hugovil.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig index 2a0da35c0ef8f5..28e4beeabf8f37 100644 --- a/drivers/tty/serial/Kconfig +++ b/drivers/tty/serial/Kconfig @@ -1025,6 +1025,7 @@ config SERIAL_SCCNXP_CONSOLE config SERIAL_SC16IS7XX tristate "NXP SC16IS7xx UART support" + depends on SPI_MASTER || I2C select SERIAL_CORE select SERIAL_SC16IS7XX_SPI if SPI_MASTER select SERIAL_SC16IS7XX_I2C if I2C From ae01e52da244af5d650378ada1bfd2d946dc1b45 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 29 May 2024 00:05:53 +0900 Subject: [PATCH 0547/1578] serial: drop debugging WARN_ON_ONCE() from uart_write() syzbot is reporting lockdep warning upon int disc = 7; ioctl(open("/dev/ttyS3", O_RDONLY), TIOCSETD, &disc); sequence. Do like what commit 5f1149d2f4bf ("serial: drop debugging WARN_ON_ONCE() from uart_put_char()") does. Reported-by: syzbot+f78380e4eae53c64125c@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=f78380e4eae53c64125c Signed-off-by: Tetsuo Handa Acked-by: Jiri Slaby Link: https://lore.kernel.org/r/d775ae2d-a2ac-439e-8e2b-134749f60f30@I-love.SAKURA.ne.jp Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 2c1a0254d3f4a9..0c4d609766633b 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -622,7 +622,7 @@ static ssize_t uart_write(struct tty_struct *tty, const u8 *buf, size_t count) return -EL3HLT; port = uart_port_lock(state, flags); - if (WARN_ON_ONCE(!state->port.xmit_buf)) { + if (!state->port.xmit_buf) { uart_port_unlock(port, flags); return 0; } From 8141b6da1763b9db009e5dcf873869bb31bcef45 Mon Sep 17 00:00:00 2001 From: Thomas Richard Date: Mon, 3 Jun 2024 19:01:00 +0200 Subject: [PATCH 0548/1578] regulator: tps6594-regulator: Fix the number of irqs for TPS65224 and TPS6594 The number of irqs is computed to allocate the right amount of memory for the irq data. An array of struct tps6594_regulator_irq_data is allocated one time for all the irqs. Each irq uses one cell of the array. If the computed number of irqs is not correct, not allocated memory could be used. Fix the values used in the calculation for TPS6594 and TPS65224. Fixes: 00c826525fba (regulator: tps6594-regulator: Add TI TPS65224 PMIC regulators) Signed-off-by: Thomas Richard Tested-by: Nishanth Menon Link: https://msgid.link/r/20240603170100.2394402-1-thomas.richard@bootlin.com Signed-off-by: Mark Brown --- drivers/regulator/tps6594-regulator.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/regulator/tps6594-regulator.c b/drivers/regulator/tps6594-regulator.c index 4a859f4c0f8358..ac53792e3fedee 100644 --- a/drivers/regulator/tps6594-regulator.c +++ b/drivers/regulator/tps6594-regulator.c @@ -653,18 +653,14 @@ static int tps6594_regulator_probe(struct platform_device *pdev) } } - if (tps->chip_id == LP8764) { - nr_buck = ARRAY_SIZE(buck_regs); - nr_ldo = 0; - nr_types = REGS_INT_NB; - } else if (tps->chip_id == TPS65224) { + if (tps->chip_id == TPS65224) { nr_buck = ARRAY_SIZE(tps65224_buck_regs); nr_ldo = ARRAY_SIZE(tps65224_ldo_regs); - nr_types = REGS_INT_NB; + nr_types = TPS65224_REGS_INT_NB; } else { nr_buck = ARRAY_SIZE(buck_regs); - nr_ldo = ARRAY_SIZE(tps6594_ldo_regs); - nr_types = TPS65224_REGS_INT_NB; + nr_ldo = (tps->chip_id == LP8764) ? 0 : ARRAY_SIZE(tps6594_ldo_regs); + nr_types = REGS_INT_NB; } reg_irq_nb = nr_types * (nr_buck + nr_ldo); From 718d4a63c0a62d16af1d0425d515d7e76f35681e Mon Sep 17 00:00:00 2001 From: Peter Chen Date: Fri, 17 May 2024 10:36:48 +0800 Subject: [PATCH 0549/1578] Revert "usb: chipidea: move ci_ulpi_init after the phy initialization" This reverts commit 22ffd399e6e7aa18ae0314278ed0b7f05f8ab679. People report this commit causes the driver defer probed, and never back to work[1][2]. [1] https://lore.kernel.org/lkml/20240407011913.GA168730@nchen-desktop/T/#mc2b93bc11a8b01ec7cd0d0bf6b0b03951d9ef751 [2] https://lore.kernel.org/lkml/20240407011913.GA168730@nchen-desktop/T/#me87d9a2a76c07619d83b3879ea14780da89fbbbf Cc: Michael Grzeschik Cc: Marek Szyprowski Cc: Wouter Franken Signed-off-by: Peter Chen Link: https://lore.kernel.org/r/20240517023648.3459188-1-peter.chen@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/chipidea/core.c | 8 ++++---- drivers/usb/chipidea/ulpi.c | 5 +++++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/usb/chipidea/core.c b/drivers/usb/chipidea/core.c index bada13f704b62d..835bf2428dc6ec 100644 --- a/drivers/usb/chipidea/core.c +++ b/drivers/usb/chipidea/core.c @@ -1084,6 +1084,10 @@ static int ci_hdrc_probe(struct platform_device *pdev) return -ENODEV; } + ret = ci_ulpi_init(ci); + if (ret) + return ret; + if (ci->platdata->phy) { ci->phy = ci->platdata->phy; } else if (ci->platdata->usb_phy) { @@ -1138,10 +1142,6 @@ static int ci_hdrc_probe(struct platform_device *pdev) goto ulpi_exit; } - ret = ci_ulpi_init(ci); - if (ret) - return ret; - ci->hw_bank.phys = res->start; ci->irq = platform_get_irq(pdev, 0); diff --git a/drivers/usb/chipidea/ulpi.c b/drivers/usb/chipidea/ulpi.c index 89fb51e2c3ded0..dfec07e8ae1d26 100644 --- a/drivers/usb/chipidea/ulpi.c +++ b/drivers/usb/chipidea/ulpi.c @@ -68,6 +68,11 @@ int ci_ulpi_init(struct ci_hdrc *ci) if (ci->platdata->phy_mode != USBPHY_INTERFACE_MODE_ULPI) return 0; + /* + * Set PORTSC correctly so we can read/write ULPI registers for + * identification purposes + */ + hw_phymode_configure(ci); ci->ulpi_ops.read = ci_ulpi_read; ci->ulpi_ops.write = ci_ulpi_write; From 415ce0ea55c5a3afea501a773e002be9ed7149f5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 3 Jun 2024 13:56:53 -0600 Subject: [PATCH 0550/1578] io_uring/napi: fix timeout calculation Not quite sure what __io_napi_adjust_timeout() was attemping to do, it's adjusting both the NAPI timeout and the general overall timeout, and calculating a value that is never used. The overall timeout is a super set of the NAPI timeout, and doesn't need adjusting. The only thing we really need to care about is that the NAPI timeout doesn't exceed the overall timeout. If a user asked for a timeout of eg 5 usec and NAPI timeout is 10 usec, then we should not spin for 10 usec. While in there, sanitize the time checking a bit. If we have a negative value in the passed in timeout, discard it. Round up the value as well, so we don't end up with a NAPI timeout for the majority of the wait, with only a tiny sleep value at the end. Hence the only case we need to care about is if the NAPI timeout is larger than the overall timeout. If it is, cap the NAPI timeout at what the overall timeout is. Cc: stable@vger.kernel.org Fixes: 8d0c12a80cde ("io-uring: add napi busy poll support") Reported-by: Lewis Baker Signed-off-by: Jens Axboe --- io_uring/napi.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/io_uring/napi.c b/io_uring/napi.c index 883a1a66590756..8c18ede595c415 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -261,12 +261,14 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) } /* - * __io_napi_adjust_timeout() - Add napi id to the busy poll list + * __io_napi_adjust_timeout() - adjust busy loop timeout * @ctx: pointer to io-uring context structure * @iowq: pointer to io wait queue * @ts: pointer to timespec or NULL * * Adjust the busy loop timeout according to timespec and busy poll timeout. + * If the specified NAPI timeout is bigger than the wait timeout, then adjust + * the NAPI timeout accordingly. */ void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, struct timespec64 *ts) @@ -274,16 +276,16 @@ void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iow unsigned int poll_to = READ_ONCE(ctx->napi_busy_poll_to); if (ts) { - struct timespec64 poll_to_ts = ns_to_timespec64(1000 * (s64)poll_to); - - if (timespec64_compare(ts, &poll_to_ts) > 0) { - *ts = timespec64_sub(*ts, poll_to_ts); - } else { - u64 to = timespec64_to_ns(ts); - - do_div(to, 1000); - ts->tv_sec = 0; - ts->tv_nsec = 0; + struct timespec64 poll_to_ts; + + poll_to_ts = ns_to_timespec64(1000 * (s64)poll_to); + if (timespec64_compare(ts, &poll_to_ts) < 0) { + s64 poll_to_ns = timespec64_to_ns(ts); + if (poll_to_ns > 0) { + u64 val = poll_to_ns + 999; + do_div(val, (s64) 1000); + poll_to = val; + } } } From fc3568f142cc5fe071d83be02d1851717d1fbf66 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 3 Jun 2024 12:00:07 +0200 Subject: [PATCH 0551/1578] usb: typec: ucsi: glink: increase max ports for x1e80100 The new X Elite (x1e80100) platform has three ports so increase the maximum so that all ports can be registered. Signed-off-by: Johan Hovold Reviewed-by: Dmitry Baryshkov Reviewed-by: Heikki Krogerus Reviewed-by: Abel Vesa Link: https://lore.kernel.org/r/20240603100007.10236-1-johan+linaro@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/ucsi/ucsi_glink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/typec/ucsi/ucsi_glink.c b/drivers/usb/typec/ucsi/ucsi_glink.c index f7546bb488c328..985a880e86da43 100644 --- a/drivers/usb/typec/ucsi/ucsi_glink.c +++ b/drivers/usb/typec/ucsi/ucsi_glink.c @@ -14,7 +14,7 @@ #include #include "ucsi.h" -#define PMIC_GLINK_MAX_PORTS 2 +#define PMIC_GLINK_MAX_PORTS 3 #define UCSI_BUF_SIZE 48 From 8475ffcfb381a77075562207ce08552414a80326 Mon Sep 17 00:00:00 2001 From: John Ernberg Date: Fri, 17 May 2024 11:43:52 +0000 Subject: [PATCH 0552/1578] USB: xen-hcd: Traverse host/ when CONFIG_USB_XEN_HCD is selected If no other USB HCDs are selected when compiling a small pure virutal machine, the Xen HCD driver cannot be built. Fix it by traversing down host/ if CONFIG_USB_XEN_HCD is selected. Fixes: 494ed3997d75 ("usb: Introduce Xen pvUSB frontend (xen hcd)") Cc: stable@vger.kernel.org # v5.17+ Signed-off-by: John Ernberg Link: https://lore.kernel.org/r/20240517114345.1190755-1-john.ernberg@actia.se Signed-off-by: Greg Kroah-Hartman --- drivers/usb/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/Makefile b/drivers/usb/Makefile index 3a9a0dd4be706d..949eca0adebea3 100644 --- a/drivers/usb/Makefile +++ b/drivers/usb/Makefile @@ -35,6 +35,7 @@ obj-$(CONFIG_USB_R8A66597_HCD) += host/ obj-$(CONFIG_USB_FSL_USB2) += host/ obj-$(CONFIG_USB_FOTG210_HCD) += host/ obj-$(CONFIG_USB_MAX3421_HCD) += host/ +obj-$(CONFIG_USB_XEN_HCD) += host/ obj-$(CONFIG_USB_C67X00_HCD) += c67x00/ From e7e921918d905544500ca7a95889f898121ba886 Mon Sep 17 00:00:00 2001 From: Amit Sunil Dhamne Date: Tue, 14 May 2024 15:01:31 -0700 Subject: [PATCH 0553/1578] usb: typec: tcpm: fix use-after-free case in tcpm_register_source_caps There could be a potential use-after-free case in tcpm_register_source_caps(). This could happen when: * new (say invalid) source caps are advertised * the existing source caps are unregistered * tcpm_register_source_caps() returns with an error as usb_power_delivery_register_capabilities() fails This causes port->partner_source_caps to hold on to the now freed source caps. Reset port->partner_source_caps value to NULL after unregistering existing source caps. Fixes: 230ecdf71a64 ("usb: typec: tcpm: unregister existing source caps before re-registration") Cc: stable@vger.kernel.org Signed-off-by: Amit Sunil Dhamne Reviewed-by: Ondrej Jirman Reviewed-by: Heikki Krogerus Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20240514220134.2143181-1-amitsd@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index 8a1af08f71b645..be4127ef84e999 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -3014,8 +3014,10 @@ static int tcpm_register_source_caps(struct tcpm_port *port) memcpy(caps.pdo, port->source_caps, sizeof(u32) * port->nr_source_caps); caps.role = TYPEC_SOURCE; - if (cap) + if (cap) { usb_power_delivery_unregister_capabilities(cap); + port->partner_source_caps = NULL; + } cap = usb_power_delivery_register_capabilities(port->partner_pd, &caps); if (IS_ERR(cap)) From fc8fb9eea94d8f476e15f3a4a7addeb16b3b99d6 Mon Sep 17 00:00:00 2001 From: Kyle Tso Date: Mon, 20 May 2024 23:48:58 +0800 Subject: [PATCH 0554/1578] usb: typec: tcpm: Ignore received Hard Reset in TOGGLING state Similar to what fixed in Commit a6fe37f428c1 ("usb: typec: tcpm: Skip hard reset when in error recovery"), the handling of the received Hard Reset has to be skipped during TOGGLING state. [ 4086.021288] VBUS off [ 4086.021295] pending state change SNK_READY -> SNK_UNATTACHED @ 650 ms [rev2 NONE_AMS] [ 4086.022113] VBUS VSAFE0V [ 4086.022117] state change SNK_READY -> SNK_UNATTACHED [rev2 NONE_AMS] [ 4086.022447] VBUS off [ 4086.022450] state change SNK_UNATTACHED -> SNK_UNATTACHED [rev2 NONE_AMS] [ 4086.023060] VBUS VSAFE0V [ 4086.023064] state change SNK_UNATTACHED -> SNK_UNATTACHED [rev2 NONE_AMS] [ 4086.023070] disable BIST MODE TESTDATA [ 4086.023766] disable vbus discharge ret:0 [ 4086.023911] Setting usb_comm capable false [ 4086.028874] Setting voltage/current limit 0 mV 0 mA [ 4086.028888] polarity 0 [ 4086.030305] Requesting mux state 0, usb-role 0, orientation 0 [ 4086.033539] Start toggling [ 4086.038496] state change SNK_UNATTACHED -> TOGGLING [rev2 NONE_AMS] // This Hard Reset is unexpected [ 4086.038499] Received hard reset [ 4086.038501] state change TOGGLING -> HARD_RESET_START [rev2 HARD_RESET] Fixes: f0690a25a140 ("staging: typec: USB Type-C Port Manager (tcpm)") Cc: stable@vger.kernel.org Signed-off-by: Kyle Tso Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240520154858.1072347-1-kyletso@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index be4127ef84e999..5d4da962acc87d 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -6174,6 +6174,7 @@ static void _tcpm_pd_hard_reset(struct tcpm_port *port) port->tcpc->set_bist_data(port->tcpc, false); switch (port->state) { + case TOGGLING: case ERROR_RECOVERY: case PORT_RESET: case PORT_RESET_WAIT_OFF: From e4228cfd092351c2d9b1a3048b2070287291ccbb Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Thu, 23 May 2024 14:44:59 -0500 Subject: [PATCH 0555/1578] dt-bindings: usb: realtek,rts5411: Add missing "additionalProperties" on child nodes All nodes need an explicit additionalProperties or unevaluatedProperties unless a $ref has one that's false. As that is not the case with usb-device.yaml, "additionalProperties" is needed here. Fixes: c44d9dab31d6 ("dt-bindings: usb: Add downstream facing ports to realtek binding") Signed-off-by: "Rob Herring (Arm)" Reviewed-by: Stephen Boyd Link: https://lore.kernel.org/r/20240523194500.2958192-1-robh@kernel.org Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/usb/realtek,rts5411.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/usb/realtek,rts5411.yaml b/Documentation/devicetree/bindings/usb/realtek,rts5411.yaml index 0874fc21f66fbb..6577a61cc07531 100644 --- a/Documentation/devicetree/bindings/usb/realtek,rts5411.yaml +++ b/Documentation/devicetree/bindings/usb/realtek,rts5411.yaml @@ -65,6 +65,7 @@ patternProperties: description: The hard wired USB devices type: object $ref: /schemas/usb/usb-device.yaml + additionalProperties: true required: - peer-hub From f85d39dd7ed89ffdd622bc1de247ffba8d961504 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 27 May 2024 19:35:38 +0200 Subject: [PATCH 0556/1578] kcov, usb: disable interrupts in kcov_remote_start_usb_softirq After commit 8fea0c8fda30 ("usb: core: hcd: Convert from tasklet to BH workqueue"), usb_giveback_urb_bh() runs in the BH workqueue with interrupts enabled. Thus, the remote coverage collection section in usb_giveback_urb_bh()-> __usb_hcd_giveback_urb() might be interrupted, and the interrupt handler might invoke __usb_hcd_giveback_urb() again. This breaks KCOV, as it does not support nested remote coverage collection sections within the same context (neither in task nor in softirq). Update kcov_remote_start/stop_usb_softirq() to disable interrupts for the duration of the coverage collection section to avoid nested sections in the softirq context (in addition to such in the task context, which are already handled). Reported-by: Tetsuo Handa Closes: https://lore.kernel.org/linux-usb/0f4d1964-7397-485b-bc48-11c01e2fcbca@I-love.SAKURA.ne.jp/ Closes: https://syzkaller.appspot.com/bug?extid=0438378d6f157baae1a2 Suggested-by: Alan Stern Fixes: 8fea0c8fda30 ("usb: core: hcd: Convert from tasklet to BH workqueue") Cc: stable@vger.kernel.org Acked-by: Dmitry Vyukov Signed-off-by: Andrey Konovalov Link: https://lore.kernel.org/r/20240527173538.4989-1-andrey.konovalov@linux.dev Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hcd.c | 12 ++++++----- include/linux/kcov.h | 47 ++++++++++++++++++++++++++++++++++-------- 2 files changed, 45 insertions(+), 14 deletions(-) diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index e3366f4d82b963..1ff7d901fedead 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -1623,6 +1623,7 @@ static void __usb_hcd_giveback_urb(struct urb *urb) struct usb_hcd *hcd = bus_to_hcd(urb->dev->bus); struct usb_anchor *anchor = urb->anchor; int status = urb->unlinked; + unsigned long flags; urb->hcpriv = NULL; if (unlikely((urb->transfer_flags & URB_SHORT_NOT_OK) && @@ -1640,13 +1641,14 @@ static void __usb_hcd_giveback_urb(struct urb *urb) /* pass ownership to the completion handler */ urb->status = status; /* - * This function can be called in task context inside another remote - * coverage collection section, but kcov doesn't support that kind of - * recursion yet. Only collect coverage in softirq context for now. + * Only collect coverage in the softirq context and disable interrupts + * to avoid scenarios with nested remote coverage collection sections + * that KCOV does not support. + * See the comment next to kcov_remote_start_usb_softirq() for details. */ - kcov_remote_start_usb_softirq((u64)urb->dev->bus->busnum); + flags = kcov_remote_start_usb_softirq((u64)urb->dev->bus->busnum); urb->complete(urb); - kcov_remote_stop_softirq(); + kcov_remote_stop_softirq(flags); usb_anchor_resume_wakeups(anchor); atomic_dec(&urb->use_count); diff --git a/include/linux/kcov.h b/include/linux/kcov.h index b851ba415e03fd..1068a7318d897e 100644 --- a/include/linux/kcov.h +++ b/include/linux/kcov.h @@ -55,21 +55,47 @@ static inline void kcov_remote_start_usb(u64 id) /* * The softirq flavor of kcov_remote_*() functions is introduced as a temporary - * work around for kcov's lack of nested remote coverage sections support in - * task context. Adding support for nested sections is tracked in: - * https://bugzilla.kernel.org/show_bug.cgi?id=210337 + * workaround for KCOV's lack of nested remote coverage sections support. + * + * Adding support is tracked in https://bugzilla.kernel.org/show_bug.cgi?id=210337. + * + * kcov_remote_start_usb_softirq(): + * + * 1. Only collects coverage when called in the softirq context. This allows + * avoiding nested remote coverage collection sections in the task context. + * For example, USB/IP calls usb_hcd_giveback_urb() in the task context + * within an existing remote coverage collection section. Thus, KCOV should + * not attempt to start collecting coverage within the coverage collection + * section in __usb_hcd_giveback_urb() in this case. + * + * 2. Disables interrupts for the duration of the coverage collection section. + * This allows avoiding nested remote coverage collection sections in the + * softirq context (a softirq might occur during the execution of a work in + * the BH workqueue, which runs with in_serving_softirq() > 0). + * For example, usb_giveback_urb_bh() runs in the BH workqueue with + * interrupts enabled, so __usb_hcd_giveback_urb() might be interrupted in + * the middle of its remote coverage collection section, and the interrupt + * handler might invoke __usb_hcd_giveback_urb() again. */ -static inline void kcov_remote_start_usb_softirq(u64 id) +static inline unsigned long kcov_remote_start_usb_softirq(u64 id) { - if (in_serving_softirq()) + unsigned long flags = 0; + + if (in_serving_softirq()) { + local_irq_save(flags); kcov_remote_start_usb(id); + } + + return flags; } -static inline void kcov_remote_stop_softirq(void) +static inline void kcov_remote_stop_softirq(unsigned long flags) { - if (in_serving_softirq()) + if (in_serving_softirq()) { kcov_remote_stop(); + local_irq_restore(flags); + } } #ifdef CONFIG_64BIT @@ -103,8 +129,11 @@ static inline u64 kcov_common_handle(void) } static inline void kcov_remote_start_common(u64 id) {} static inline void kcov_remote_start_usb(u64 id) {} -static inline void kcov_remote_start_usb_softirq(u64 id) {} -static inline void kcov_remote_stop_softirq(void) {} +static inline unsigned long kcov_remote_start_usb_softirq(u64 id) +{ + return 0; +} +static inline void kcov_remote_stop_softirq(unsigned long flags) {} #endif /* CONFIG_KCOV */ #endif /* _LINUX_KCOV_H */ From 8bdf8a42bca4f47646fd105a387ab6926948c7f1 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Fri, 31 May 2024 13:46:52 +0300 Subject: [PATCH 0557/1578] usb: typec: ucsi: Ack also failed Get Error commands It is possible that also the GET_ERROR command fails. If that happens, the command completion still needs to be acknowledged. Otherwise the interface will be stuck until it's reset. Reported-by: Ammy Yi Fixes: bdc62f2bae8f ("usb: typec: ucsi: Simplified registration and I/O API") Cc: stable@vger.kernel.org Signed-off-by: Heikki Krogerus Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20240531104653.1303519-1-heikki.krogerus@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/ucsi/ucsi.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/usb/typec/ucsi/ucsi.c b/drivers/usb/typec/ucsi/ucsi.c index cb52e7b0a2c5cd..2cc7aedd490f16 100644 --- a/drivers/usb/typec/ucsi/ucsi.c +++ b/drivers/usb/typec/ucsi/ucsi.c @@ -153,8 +153,13 @@ static int ucsi_exec_command(struct ucsi *ucsi, u64 cmd) } if (cci & UCSI_CCI_ERROR) { - if (cmd == UCSI_GET_ERROR_STATUS) + if (cmd == UCSI_GET_ERROR_STATUS) { + ret = ucsi_acknowledge(ucsi, false); + if (ret) + return ret; + return -EIO; + } return ucsi_read_error(ucsi); } From 16637fea001ab3c8df528a8995b3211906165a30 Mon Sep 17 00:00:00 2001 From: Shichao Lai Date: Sun, 26 May 2024 09:27:45 +0800 Subject: [PATCH 0558/1578] usb-storage: alauda: Check whether the media is initialized The member "uzonesize" of struct alauda_info will remain 0 if alauda_init_media() fails, potentially causing divide errors in alauda_read_data() and alauda_write_lba(). - Add a member "media_initialized" to struct alauda_info. - Change a condition in alauda_check_media() to ensure the first initialization. - Add an error check for the return value of alauda_init_media(). Fixes: e80b0fade09e ("[PATCH] USB Storage: add alauda support") Reported-by: xingwei lee Reported-by: yue sun Reviewed-by: Alan Stern Signed-off-by: Shichao Lai Link: https://lore.kernel.org/r/20240526012745.2852061-1-shichaorai@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/alauda.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/usb/storage/alauda.c b/drivers/usb/storage/alauda.c index 115f05a6201a16..40d34cc28344a4 100644 --- a/drivers/usb/storage/alauda.c +++ b/drivers/usb/storage/alauda.c @@ -105,6 +105,8 @@ struct alauda_info { unsigned char sense_key; unsigned long sense_asc; /* additional sense code */ unsigned long sense_ascq; /* additional sense code qualifier */ + + bool media_initialized; }; #define short_pack(lsb,msb) ( ((u16)(lsb)) | ( ((u16)(msb))<<8 ) ) @@ -476,11 +478,12 @@ static int alauda_check_media(struct us_data *us) } /* Check for media change */ - if (status[0] & 0x08) { + if (status[0] & 0x08 || !info->media_initialized) { usb_stor_dbg(us, "Media change detected\n"); alauda_free_maps(&MEDIA_INFO(us)); - alauda_init_media(us); - + rc = alauda_init_media(us); + if (rc == USB_STOR_TRANSPORT_GOOD) + info->media_initialized = true; info->sense_key = UNIT_ATTENTION; info->sense_asc = 0x28; info->sense_ascq = 0x00; From 91215f70ea8541e9011c0b48f8b59b9e0ce6953b Mon Sep 17 00:00:00 2001 From: Su Hui Date: Tue, 4 Jun 2024 20:12:43 +0800 Subject: [PATCH 0559/1578] io_uring/io-wq: avoid garbage value of 'match' in io_wq_enqueue() Clang static checker (scan-build) warning: o_uring/io-wq.c:line 1051, column 3 The expression is an uninitialized value. The computed value will also be garbage. 'match.nr_pending' is used in io_acct_cancel_pending_work(), but it is not fully initialized. Change the order of assignment for 'match' to fix this problem. Fixes: 42abc95f05bf ("io-wq: decouple work_list protection from the big wqe->lock") Signed-off-by: Su Hui Link: https://lore.kernel.org/r/20240604121242.2661244-1-suhui@nfschina.com Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index d1c47a9d921582..7d3316fe9bfc46 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -927,7 +927,11 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) { struct io_wq_acct *acct = io_work_get_acct(wq, work); unsigned long work_flags = work->flags; - struct io_cb_cancel_data match; + struct io_cb_cancel_data match = { + .fn = io_wq_work_match_item, + .data = work, + .cancel_all = false, + }; bool do_create; /* @@ -965,10 +969,6 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) raw_spin_unlock(&wq->lock); /* fatal condition, failed to create the first worker */ - match.fn = io_wq_work_match_item, - match.data = work, - match.cancel_all = false, - io_acct_cancel_pending_work(wq, acct, &match); } } From 73254a297c2dd094abec7c9efee32455ae875bdf Mon Sep 17 00:00:00 2001 From: Hagar Hemdan Date: Tue, 4 Jun 2024 13:05:27 +0000 Subject: [PATCH 0560/1578] io_uring: fix possible deadlock in io_register_iowq_max_workers() The io_register_iowq_max_workers() function calls io_put_sq_data(), which acquires the sqd->lock without releasing the uring_lock. Similar to the commit 009ad9f0c6ee ("io_uring: drop ctx->uring_lock before acquiring sqd->lock"), this can lead to a potential deadlock situation. To resolve this issue, the uring_lock is released before calling io_put_sq_data(), and then it is re-acquired after the function call. This change ensures that the locks are acquired in the correct order, preventing the possibility of a deadlock. Suggested-by: Maximilian Heyne Signed-off-by: Hagar Hemdan Link: https://lore.kernel.org/r/20240604130527.3597-1-hagarhem@amazon.com Signed-off-by: Jens Axboe --- io_uring/register.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/io_uring/register.c b/io_uring/register.c index ef8c908346a4ef..c0010a66a6f2c2 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -355,8 +355,10 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, } if (sqd) { + mutex_unlock(&ctx->uring_lock); mutex_unlock(&sqd->lock); io_put_sq_data(sqd); + mutex_lock(&ctx->uring_lock); } if (copy_to_user(arg, new_count, sizeof(new_count))) @@ -380,8 +382,10 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, return 0; err: if (sqd) { + mutex_unlock(&ctx->uring_lock); mutex_unlock(&sqd->lock); io_put_sq_data(sqd); + mutex_lock(&ctx->uring_lock); } return ret; } From 971187350602d03c4a27c0783ff412502b95720a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 4 Jul 2023 14:17:19 +0100 Subject: [PATCH 0561/1578] driver core: remove devm_device_add_groups() There is no more in-kernel users of this function, and no driver should ever be using it, so remove it from the kernel. Acked-by: Dmitry Torokhov Acked-by: "Rafael J. Wysocki" Link: https://lore.kernel.org/r/20230704131715.44454-8-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 45 ------------------------------------------ include/linux/device.h | 2 -- 2 files changed, 47 deletions(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index 131d96c6090be4..2e776fcc31b695 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -2845,15 +2845,6 @@ static void devm_attr_group_remove(struct device *dev, void *res) sysfs_remove_group(&dev->kobj, group); } -static void devm_attr_groups_remove(struct device *dev, void *res) -{ - union device_attr_group_devres *devres = res; - const struct attribute_group **groups = devres->groups; - - dev_dbg(dev, "%s: removing groups %p\n", __func__, groups); - sysfs_remove_groups(&dev->kobj, groups); -} - /** * devm_device_add_group - given a device, create a managed attribute group * @dev: The device to create the group for @@ -2886,42 +2877,6 @@ int devm_device_add_group(struct device *dev, const struct attribute_group *grp) } EXPORT_SYMBOL_GPL(devm_device_add_group); -/** - * devm_device_add_groups - create a bunch of managed attribute groups - * @dev: The device to create the group for - * @groups: The attribute groups to create, NULL terminated - * - * This function creates a bunch of managed attribute groups. If an error - * occurs when creating a group, all previously created groups will be - * removed, unwinding everything back to the original state when this - * function was called. It will explicitly warn and error if any of the - * attribute files being created already exist. - * - * Returns 0 on success or error code from sysfs_create_group on failure. - */ -int devm_device_add_groups(struct device *dev, - const struct attribute_group **groups) -{ - union device_attr_group_devres *devres; - int error; - - devres = devres_alloc(devm_attr_groups_remove, - sizeof(*devres), GFP_KERNEL); - if (!devres) - return -ENOMEM; - - error = sysfs_create_groups(&dev->kobj, groups); - if (error) { - devres_free(devres); - return error; - } - - devres->groups = groups; - devres_add(dev, devres); - return 0; -} -EXPORT_SYMBOL_GPL(devm_device_add_groups); - static int device_add_attrs(struct device *dev) { const struct class *class = dev->class; diff --git a/include/linux/device.h b/include/linux/device.h index fc3bd7116ab9cd..ace039151cb86f 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1220,8 +1220,6 @@ static inline void device_remove_group(struct device *dev, return device_remove_groups(dev, groups); } -int __must_check devm_device_add_groups(struct device *dev, - const struct attribute_group **groups); int __must_check devm_device_add_group(struct device *dev, const struct attribute_group *grp); From 44a45be57f85165761fdabf072f9a97aa026ff61 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Thu, 23 May 2024 13:00:00 +0200 Subject: [PATCH 0562/1578] sysfs: Unbreak the build around sysfs_bin_attr_simple_read() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Günter reports build breakage for m68k "m5208evb_defconfig" plus CONFIG_BLK_DEV_INITRD=y caused by commit 66bc1a173328 ("treewide: Use sysfs_bin_attr_simple_read() helper"). The defconfig disables CONFIG_SYSFS, so sysfs_bin_attr_simple_read() is not compiled into the kernel. But init/initramfs.c references that function in the initializer of a struct bin_attribute. Add an empty static inline to avoid the build breakage. Fixes: 66bc1a173328 ("treewide: Use sysfs_bin_attr_simple_read() helper") Reported-by: Guenter Roeck Closes: https://lore.kernel.org/r/e12b0027-b199-4de7-b83d-668171447ccc@roeck-us.net Signed-off-by: Lukas Wunner Tested-by: Guenter Roeck Reviewed-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/05f4290439a58730738a15b0c99cd8576c4aa0d9.1716461752.git.lukas@wunner.de Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index a7d725fbf73937..c4e64dc112063f 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -750,6 +750,15 @@ static inline int sysfs_emit_at(char *buf, int at, const char *fmt, ...) { return 0; } + +static inline ssize_t sysfs_bin_attr_simple_read(struct file *file, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, + size_t count) +{ + return 0; +} #endif /* CONFIG_SYSFS */ static inline int __must_check sysfs_create_file(struct kobject *kobj, From 87bb39ed40bdf1596b8820e800226e24eb642677 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 3 Jun 2024 13:28:43 +0100 Subject: [PATCH 0563/1578] KVM: arm64: Reintroduce __sve_save_state Now that the hypervisor is handling the host sve state in protected mode, it needs to be able to save it. This reverts commit e66425fc9ba3 ("KVM: arm64: Remove unused __sve_save_state"). Reviewed-by: Oliver Upton Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20240603122852.3923848-2-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_hyp.h | 1 + arch/arm64/kvm/hyp/fpsimd.S | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 3e80464f895317..2ab23589339a0f 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -111,6 +111,7 @@ void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu); void __fpsimd_save_state(struct user_fpsimd_state *fp_regs); void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs); +void __sve_save_state(void *sve_pffr, u32 *fpsr); void __sve_restore_state(void *sve_pffr, u32 *fpsr); u64 __guest_enter(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/hyp/fpsimd.S b/arch/arm64/kvm/hyp/fpsimd.S index 61e6f3ba7b7d17..e950875e31cee4 100644 --- a/arch/arm64/kvm/hyp/fpsimd.S +++ b/arch/arm64/kvm/hyp/fpsimd.S @@ -25,3 +25,9 @@ SYM_FUNC_START(__sve_restore_state) sve_load 0, x1, x2, 3 ret SYM_FUNC_END(__sve_restore_state) + +SYM_FUNC_START(__sve_save_state) + mov x2, #1 + sve_save 0, x1, x2, 3 + ret +SYM_FUNC_END(__sve_save_state) From 45f4ea9bcfe909b3461059990b1e232e55dde809 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 3 Jun 2024 13:28:44 +0100 Subject: [PATCH 0564/1578] KVM: arm64: Fix prototype for __sve_save_state/__sve_restore_state Since the prototypes for __sve_save_state/__sve_restore_state at hyp were added, the underlying macro has acquired a third parameter for saving/restoring ffr. Fix the prototypes to account for the third parameter, and restore the ffr for the guest since it is saved. Suggested-by: Mark Brown Signed-off-by: Fuad Tabba Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240603122852.3923848-3-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_hyp.h | 4 ++-- arch/arm64/kvm/hyp/include/hyp/switch.h | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 2ab23589339a0f..686cce7e4e96c8 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -111,8 +111,8 @@ void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu); void __fpsimd_save_state(struct user_fpsimd_state *fp_regs); void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs); -void __sve_save_state(void *sve_pffr, u32 *fpsr); -void __sve_restore_state(void *sve_pffr, u32 *fpsr); +void __sve_save_state(void *sve_pffr, u32 *fpsr, int save_ffr); +void __sve_restore_state(void *sve_pffr, u32 *fpsr, int restore_ffr); u64 __guest_enter(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index a92566f36022e6..d58933ae8fd5c8 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -316,7 +316,8 @@ static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu) { sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2); __sve_restore_state(vcpu_sve_pffr(vcpu), - &vcpu->arch.ctxt.fp_regs.fpsr); + &vcpu->arch.ctxt.fp_regs.fpsr, + true); write_sysreg_el1(__vcpu_sys_reg(vcpu, ZCR_EL1), SYS_ZCR); } From 6d8fb3cbf7e06431a607c30c1bc4cd53a62c220a Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 3 Jun 2024 13:28:45 +0100 Subject: [PATCH 0565/1578] KVM: arm64: Abstract set/clear of CPTR_EL2 bits behind helper The same traps controlled by CPTR_EL2 or CPACR_EL1 need to be toggled in different parts of the code, but the exact bits and their polarity differ between these two formats and the mode (vhe/nvhe/hvhe). To reduce the amount of duplicated code and the chance of getting the wrong bit/polarity or missing a field, abstract the set/clear of CPTR_EL2 bits behind a helper. Since (h)VHE is the way of the future, use the CPACR_EL1 format, which is a subset of the VHE CPTR_EL2, as a reference. No functional change intended. Suggested-by: Oliver Upton Reviewed-by: Oliver Upton Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20240603122852.3923848-4-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_arm.h | 6 +++ arch/arm64/include/asm/kvm_emulate.h | 62 +++++++++++++++++++++++++ arch/arm64/kvm/hyp/include/hyp/switch.h | 18 ++----- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 6 +-- 4 files changed, 73 insertions(+), 19 deletions(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index e01bb5ca13b7cc..b2adc2c6c82a54 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -305,6 +305,12 @@ GENMASK(19, 14) | \ BIT(11)) +#define CPTR_VHE_EL2_RES0 (GENMASK(63, 32) | \ + GENMASK(27, 26) | \ + GENMASK(23, 22) | \ + GENMASK(19, 18) | \ + GENMASK(15, 0)) + /* Hyp Debug Configuration Register bits */ #define MDCR_EL2_E2TB_MASK (UL(0x3)) #define MDCR_EL2_E2TB_SHIFT (UL(24)) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 501e3e019c930c..2d7a0bdf9d03f5 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -557,6 +557,68 @@ static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu) vcpu_set_flag((v), e); \ } while (0) +#define __build_check_all_or_none(r, bits) \ + BUILD_BUG_ON(((r) & (bits)) && ((r) & (bits)) != (bits)) + +#define __cpacr_to_cptr_clr(clr, set) \ + ({ \ + u64 cptr = 0; \ + \ + if ((set) & CPACR_ELx_FPEN) \ + cptr |= CPTR_EL2_TFP; \ + if ((set) & CPACR_ELx_ZEN) \ + cptr |= CPTR_EL2_TZ; \ + if ((set) & CPACR_ELx_SMEN) \ + cptr |= CPTR_EL2_TSM; \ + if ((clr) & CPACR_ELx_TTA) \ + cptr |= CPTR_EL2_TTA; \ + if ((clr) & CPTR_EL2_TAM) \ + cptr |= CPTR_EL2_TAM; \ + if ((clr) & CPTR_EL2_TCPAC) \ + cptr |= CPTR_EL2_TCPAC; \ + \ + cptr; \ + }) + +#define __cpacr_to_cptr_set(clr, set) \ + ({ \ + u64 cptr = 0; \ + \ + if ((clr) & CPACR_ELx_FPEN) \ + cptr |= CPTR_EL2_TFP; \ + if ((clr) & CPACR_ELx_ZEN) \ + cptr |= CPTR_EL2_TZ; \ + if ((clr) & CPACR_ELx_SMEN) \ + cptr |= CPTR_EL2_TSM; \ + if ((set) & CPACR_ELx_TTA) \ + cptr |= CPTR_EL2_TTA; \ + if ((set) & CPTR_EL2_TAM) \ + cptr |= CPTR_EL2_TAM; \ + if ((set) & CPTR_EL2_TCPAC) \ + cptr |= CPTR_EL2_TCPAC; \ + \ + cptr; \ + }) + +#define cpacr_clear_set(clr, set) \ + do { \ + BUILD_BUG_ON((set) & CPTR_VHE_EL2_RES0); \ + BUILD_BUG_ON((clr) & CPACR_ELx_E0POE); \ + __build_check_all_or_none((clr), CPACR_ELx_FPEN); \ + __build_check_all_or_none((set), CPACR_ELx_FPEN); \ + __build_check_all_or_none((clr), CPACR_ELx_ZEN); \ + __build_check_all_or_none((set), CPACR_ELx_ZEN); \ + __build_check_all_or_none((clr), CPACR_ELx_SMEN); \ + __build_check_all_or_none((set), CPACR_ELx_SMEN); \ + \ + if (has_vhe() || has_hvhe()) \ + sysreg_clear_set(cpacr_el1, clr, set); \ + else \ + sysreg_clear_set(cptr_el2, \ + __cpacr_to_cptr_clr(clr, set), \ + __cpacr_to_cptr_set(clr, set));\ + } while (0) + static __always_inline void kvm_write_cptr_el2(u64 val) { if (has_vhe() || has_hvhe()) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index d58933ae8fd5c8..055d2ca7264e02 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -331,7 +331,6 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) { bool sve_guest; u8 esr_ec; - u64 reg; if (!system_supports_fpsimd()) return false; @@ -354,19 +353,10 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) /* Valid trap. Switch the context: */ /* First disable enough traps to allow us to update the registers */ - if (has_vhe() || has_hvhe()) { - reg = CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN; - if (sve_guest) - reg |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN; - - sysreg_clear_set(cpacr_el1, 0, reg); - } else { - reg = CPTR_EL2_TFP; - if (sve_guest) - reg |= CPTR_EL2_TZ; - - sysreg_clear_set(cptr_el2, reg, 0); - } + if (sve_guest) + cpacr_clear_set(0, CPACR_ELx_FPEN | CPACR_ELx_ZEN); + else + cpacr_clear_set(0, CPACR_ELx_FPEN); isb(); /* Write out the host state if it's in the registers */ diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index d5c48dc98f6794..f71394d0e32a74 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -405,11 +405,7 @@ void handle_trap(struct kvm_cpu_context *host_ctxt) handle_host_smc(host_ctxt); break; case ESR_ELx_EC_SVE: - if (has_hvhe()) - sysreg_clear_set(cpacr_el1, 0, (CPACR_EL1_ZEN_EL1EN | - CPACR_EL1_ZEN_EL0EN)); - else - sysreg_clear_set(cptr_el2, CPTR_EL2_TZ, 0); + cpacr_clear_set(0, CPACR_ELx_ZEN); isb(); sve_cond_update_zcr_vq(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2); break; From e511e08a9f496948b13aac50610f2d17335f56c3 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 3 Jun 2024 13:28:46 +0100 Subject: [PATCH 0566/1578] KVM: arm64: Specialize handling of host fpsimd state on trap In subsequent patches, n/vhe will diverge on saving the host fpsimd/sve state when taking a guest fpsimd/sve trap. Add a specialized helper to handle it. No functional change intended. Reviewed-by: Mark Brown Reviewed-by: Oliver Upton Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20240603122852.3923848-5-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/hyp/switch.h | 4 +++- arch/arm64/kvm/hyp/nvhe/switch.c | 5 +++++ arch/arm64/kvm/hyp/vhe/switch.c | 5 +++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 055d2ca7264e02..9b904c858df089 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -321,6 +321,8 @@ static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu) write_sysreg_el1(__vcpu_sys_reg(vcpu, ZCR_EL1), SYS_ZCR); } +static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu); + /* * We trap the first access to the FP/SIMD to save the host context and * restore the guest context lazily. @@ -361,7 +363,7 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) /* Write out the host state if it's in the registers */ if (host_owns_fp_regs()) - __fpsimd_save_state(*host_data_ptr(fpsimd_state)); + kvm_hyp_save_fpsimd_host(vcpu); /* Restore the guest state */ if (sve_guest) diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 6758cd90557061..019f863922fa42 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -182,6 +182,11 @@ static bool kvm_handle_pvm_sys64(struct kvm_vcpu *vcpu, u64 *exit_code) kvm_handle_pvm_sysreg(vcpu, exit_code)); } +static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) +{ + __fpsimd_save_state(*host_data_ptr(fpsimd_state)); +} + static const exit_handler_fn hyp_exit_handlers[] = { [0 ... ESR_ELx_EC_MAX] = NULL, [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index d7af5f46f22a3e..20073579e9f5b4 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -262,6 +262,11 @@ static bool kvm_hyp_handle_eret(struct kvm_vcpu *vcpu, u64 *exit_code) return true; } +static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) +{ + __fpsimd_save_state(*host_data_ptr(fpsimd_state)); +} + static const exit_handler_fn hyp_exit_handlers[] = { [0 ... ESR_ELx_EC_MAX] = NULL, [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, From 66d5b53e20a6e00b7ce3b652a3e2db967f7b33d0 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 3 Jun 2024 13:28:47 +0100 Subject: [PATCH 0567/1578] KVM: arm64: Allocate memory mapped at hyp for host sve state in pKVM Protected mode needs to maintain (save/restore) the host's sve state, rather than relying on the host kernel to do that. This is to avoid leaking information to the host about guests and the type of operations they are performing. As a first step towards that, allocate memory mapped at hyp, per cpu, for the host sve state. The following patch will use this memory to save/restore the host state. Reviewed-by: Oliver Upton Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20240603122852.3923848-6-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 17 ++++++++ arch/arm64/include/asm/kvm_hyp.h | 1 + arch/arm64/include/asm/kvm_pkvm.h | 9 ++++ arch/arm64/kvm/arm.c | 68 +++++++++++++++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/pkvm.c | 2 + arch/arm64/kvm/hyp/nvhe/setup.c | 24 +++++++++++ arch/arm64/kvm/reset.c | 3 ++ 7 files changed, 124 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 8170c04fde914c..90df7ccec5f4fa 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -76,6 +76,7 @@ static inline enum kvm_mode kvm_get_mode(void) { return KVM_MODE_NONE; }; DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); extern unsigned int __ro_after_init kvm_sve_max_vl; +extern unsigned int __ro_after_init kvm_host_sve_max_vl; int __init kvm_arm_init_sve(void); u32 __attribute_const__ kvm_target_cpu(void); @@ -521,6 +522,20 @@ struct kvm_cpu_context { u64 *vncr_array; }; +struct cpu_sve_state { + __u64 zcr_el1; + + /* + * Ordering is important since __sve_save_state/__sve_restore_state + * relies on it. + */ + __u32 fpsr; + __u32 fpcr; + + /* Must be SVE_VQ_BYTES (128 bit) aligned. */ + __u8 sve_regs[]; +}; + /* * This structure is instantiated on a per-CPU basis, and contains * data that is: @@ -534,7 +549,9 @@ struct kvm_cpu_context { */ struct kvm_host_data { struct kvm_cpu_context host_ctxt; + struct user_fpsimd_state *fpsimd_state; /* hyp VA */ + struct cpu_sve_state *sve_state; /* hyp VA */ /* Ownership of the FP regs */ enum { diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 686cce7e4e96c8..b05bceca33850d 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -143,5 +143,6 @@ extern u64 kvm_nvhe_sym(id_aa64smfr0_el1_sys_val); extern unsigned long kvm_nvhe_sym(__icache_flags); extern unsigned int kvm_nvhe_sym(kvm_arm_vmid_bits); +extern unsigned int kvm_nvhe_sym(kvm_host_sve_max_vl); #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index ad9cfb5c1ff4e6..cd56acd9a842cc 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -128,4 +128,13 @@ static inline unsigned long hyp_ffa_proxy_pages(void) return (2 * KVM_FFA_MBOX_NR_PAGES) + DIV_ROUND_UP(desc_max, PAGE_SIZE); } +static inline size_t pkvm_host_sve_state_size(void) +{ + if (!system_supports_sve()) + return 0; + + return size_add(sizeof(struct cpu_sve_state), + SVE_SIG_REGS_SIZE(sve_vq_from_vl(kvm_host_sve_max_vl))); +} + #endif /* __ARM64_KVM_PKVM_H__ */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 9996a989b52e87..1acf7415e831b8 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1931,6 +1931,11 @@ static unsigned long nvhe_percpu_order(void) return size ? get_order(size) : 0; } +static size_t pkvm_host_sve_state_order(void) +{ + return get_order(pkvm_host_sve_state_size()); +} + /* A lookup table holding the hypervisor VA for each vector slot */ static void *hyp_spectre_vector_selector[BP_HARDEN_EL2_SLOTS]; @@ -2310,12 +2315,20 @@ static void __init teardown_subsystems(void) static void __init teardown_hyp_mode(void) { + bool free_sve = system_supports_sve() && is_protected_kvm_enabled(); int cpu; free_hyp_pgds(); for_each_possible_cpu(cpu) { free_page(per_cpu(kvm_arm_hyp_stack_page, cpu)); free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order()); + + if (free_sve) { + struct cpu_sve_state *sve_state; + + sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state; + free_pages((unsigned long) sve_state, pkvm_host_sve_state_order()); + } } } @@ -2398,6 +2411,50 @@ static int __init kvm_hyp_init_protection(u32 hyp_va_bits) return 0; } +static int init_pkvm_host_sve_state(void) +{ + int cpu; + + if (!system_supports_sve()) + return 0; + + /* Allocate pages for host sve state in protected mode. */ + for_each_possible_cpu(cpu) { + struct page *page = alloc_pages(GFP_KERNEL, pkvm_host_sve_state_order()); + + if (!page) + return -ENOMEM; + + per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = page_address(page); + } + + /* + * Don't map the pages in hyp since these are only used in protected + * mode, which will (re)create its own mapping when initialized. + */ + + return 0; +} + +/* + * Finalizes the initialization of hyp mode, once everything else is initialized + * and the initialziation process cannot fail. + */ +static void finalize_init_hyp_mode(void) +{ + int cpu; + + if (!is_protected_kvm_enabled() || !system_supports_sve()) + return; + + for_each_possible_cpu(cpu) { + struct cpu_sve_state *sve_state; + + sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state; + per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = kern_hyp_va(sve_state); + } +} + static void pkvm_hyp_init_ptrauth(void) { struct kvm_cpu_context *hyp_ctxt; @@ -2566,6 +2623,10 @@ static int __init init_hyp_mode(void) goto out_err; } + err = init_pkvm_host_sve_state(); + if (err) + goto out_err; + err = kvm_hyp_init_protection(hyp_va_bits); if (err) { kvm_err("Failed to init hyp memory protection\n"); @@ -2730,6 +2791,13 @@ static __init int kvm_arm_init(void) if (err) goto out_subs; + /* + * This should be called after initialization is done and failure isn't + * possible anymore. + */ + if (!in_hyp_mode) + finalize_init_hyp_mode(); + kvm_arm_initialised = true; return 0; diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 16aa4875ddb8c0..25e9a94f6d7642 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -18,6 +18,8 @@ unsigned long __icache_flags; /* Used by kvm_get_vttbr(). */ unsigned int kvm_arm_vmid_bits; +unsigned int kvm_host_sve_max_vl; + /* * Set trap register values based on features in ID_AA64PFR0. */ diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 859f22f754d373..3fae4247959823 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -67,6 +67,28 @@ static int divide_memory_pool(void *virt, unsigned long size) return 0; } +static int pkvm_create_host_sve_mappings(void) +{ + void *start, *end; + int ret, i; + + if (!system_supports_sve()) + return 0; + + for (i = 0; i < hyp_nr_cpus; i++) { + struct kvm_host_data *host_data = per_cpu_ptr(&kvm_host_data, i); + struct cpu_sve_state *sve_state = host_data->sve_state; + + start = kern_hyp_va(sve_state); + end = start + PAGE_ALIGN(pkvm_host_sve_state_size()); + ret = pkvm_create_mappings(start, end, PAGE_HYP); + if (ret) + return ret; + } + + return 0; +} + static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, unsigned long *per_cpu_base, u32 hyp_va_bits) @@ -125,6 +147,8 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, return ret; } + pkvm_create_host_sve_mappings(); + /* * Map the host sections RO in the hypervisor, but transfer the * ownership from the host to the hypervisor itself to make sure they diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 1b7b58cb121f98..3fc8ca164dbe4e 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -32,6 +32,7 @@ /* Maximum phys_shift supported for any VM on this host */ static u32 __ro_after_init kvm_ipa_limit; +unsigned int __ro_after_init kvm_host_sve_max_vl; /* * ARMv8 Reset Values @@ -51,6 +52,8 @@ int __init kvm_arm_init_sve(void) { if (system_supports_sve()) { kvm_sve_max_vl = sve_max_virtualisable_vl(); + kvm_host_sve_max_vl = sve_max_vl(); + kvm_nvhe_sym(kvm_host_sve_max_vl) = kvm_host_sve_max_vl; /* * The get_sve_reg()/set_sve_reg() ioctl interface will need From b5b9955617bc0b41546f2fa7c3dbcc048b43dc82 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 3 Jun 2024 13:28:48 +0100 Subject: [PATCH 0568/1578] KVM: arm64: Eagerly restore host fpsimd/sve state in pKVM When running in protected mode we don't want to leak protected guest state to the host, including whether a guest has used fpsimd/sve. Therefore, eagerly restore the host state on guest exit when running in protected mode, which happens only if the guest has used fpsimd/sve. Reviewed-by: Oliver Upton Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20240603122852.3923848-7-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/hyp/switch.h | 13 ++++- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 67 +++++++++++++++++++++++-- arch/arm64/kvm/hyp/nvhe/pkvm.c | 2 + arch/arm64/kvm/hyp/nvhe/switch.c | 16 +++++- 4 files changed, 93 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 9b904c858df089..0c4de44534b7a8 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -321,6 +321,17 @@ static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu) write_sysreg_el1(__vcpu_sys_reg(vcpu, ZCR_EL1), SYS_ZCR); } +static inline void __hyp_sve_save_host(void) +{ + struct cpu_sve_state *sve_state = *host_data_ptr(sve_state); + + sve_state->zcr_el1 = read_sysreg_el1(SYS_ZCR); + write_sysreg_s(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2); + __sve_save_state(sve_state->sve_regs + sve_ffr_offset(kvm_host_sve_max_vl), + &sve_state->fpsr, + true); +} + static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu); /* @@ -355,7 +366,7 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) /* Valid trap. Switch the context: */ /* First disable enough traps to allow us to update the registers */ - if (sve_guest) + if (sve_guest || (is_protected_kvm_enabled() && system_supports_sve())) cpacr_clear_set(0, CPACR_ELx_FPEN | CPACR_ELx_ZEN); else cpacr_clear_set(0, CPACR_ELx_FPEN); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index f71394d0e32a74..bd93b8a9e17261 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -23,20 +23,80 @@ DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt); +static void __hyp_sve_save_guest(struct kvm_vcpu *vcpu) +{ + __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_el1(SYS_ZCR); + /* + * On saving/restoring guest sve state, always use the maximum VL for + * the guest. The layout of the data when saving the sve state depends + * on the VL, so use a consistent (i.e., the maximum) guest VL. + */ + sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2); + __sve_save_state(vcpu_sve_pffr(vcpu), &vcpu->arch.ctxt.fp_regs.fpsr, true); + write_sysreg_s(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2); +} + +static void __hyp_sve_restore_host(void) +{ + struct cpu_sve_state *sve_state = *host_data_ptr(sve_state); + + /* + * On saving/restoring host sve state, always use the maximum VL for + * the host. The layout of the data when saving the sve state depends + * on the VL, so use a consistent (i.e., the maximum) host VL. + * + * Setting ZCR_EL2 to ZCR_ELx_LEN_MASK sets the effective length + * supported by the system (or limited at EL3). + */ + write_sysreg_s(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2); + __sve_restore_state(sve_state->sve_regs + sve_ffr_offset(kvm_host_sve_max_vl), + &sve_state->fpsr, + true); + write_sysreg_el1(sve_state->zcr_el1, SYS_ZCR); +} + +static void fpsimd_sve_flush(void) +{ + *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; +} + +static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) +{ + if (!guest_owns_fp_regs()) + return; + + cpacr_clear_set(0, CPACR_ELx_FPEN | CPACR_ELx_ZEN); + isb(); + + if (vcpu_has_sve(vcpu)) + __hyp_sve_save_guest(vcpu); + else + __fpsimd_save_state(&vcpu->arch.ctxt.fp_regs); + + if (system_supports_sve()) + __hyp_sve_restore_host(); + else + __fpsimd_restore_state(*host_data_ptr(fpsimd_state)); + + *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; +} + static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) { struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + fpsimd_sve_flush(); + hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt; hyp_vcpu->vcpu.arch.sve_state = kern_hyp_va(host_vcpu->arch.sve_state); - hyp_vcpu->vcpu.arch.sve_max_vl = host_vcpu->arch.sve_max_vl; + /* Limit guest vector length to the maximum supported by the host. */ + hyp_vcpu->vcpu.arch.sve_max_vl = min(host_vcpu->arch.sve_max_vl, kvm_host_sve_max_vl); hyp_vcpu->vcpu.arch.hw_mmu = host_vcpu->arch.hw_mmu; hyp_vcpu->vcpu.arch.hcr_el2 = host_vcpu->arch.hcr_el2; hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2; - hyp_vcpu->vcpu.arch.cptr_el2 = host_vcpu->arch.cptr_el2; hyp_vcpu->vcpu.arch.iflags = host_vcpu->arch.iflags; @@ -54,10 +114,11 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) struct vgic_v3_cpu_if *host_cpu_if = &host_vcpu->arch.vgic_cpu.vgic_v3; unsigned int i; + fpsimd_sve_sync(&hyp_vcpu->vcpu); + host_vcpu->arch.ctxt = hyp_vcpu->vcpu.arch.ctxt; host_vcpu->arch.hcr_el2 = hyp_vcpu->vcpu.arch.hcr_el2; - host_vcpu->arch.cptr_el2 = hyp_vcpu->vcpu.arch.cptr_el2; host_vcpu->arch.fault = hyp_vcpu->vcpu.arch.fault; diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 25e9a94f6d7642..feb27b4ce45925 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -588,6 +588,8 @@ int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu, if (ret) unmap_donated_memory(hyp_vcpu, sizeof(*hyp_vcpu)); + hyp_vcpu->vcpu.arch.cptr_el2 = kvm_get_reset_cptr_el2(&hyp_vcpu->vcpu); + return ret; } diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 019f863922fa42..bef74de7065b31 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -184,7 +184,21 @@ static bool kvm_handle_pvm_sys64(struct kvm_vcpu *vcpu, u64 *exit_code) static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) { - __fpsimd_save_state(*host_data_ptr(fpsimd_state)); + /* + * Non-protected kvm relies on the host restoring its sve state. + * Protected kvm restores the host's sve state as not to reveal that + * fpsimd was used by a guest nor leak upper sve bits. + */ + if (unlikely(is_protected_kvm_enabled() && system_supports_sve())) { + __hyp_sve_save_host(); + + /* Re-enable SVE traps if not supported for the guest vcpu. */ + if (!vcpu_has_sve(vcpu)) + cpacr_clear_set(CPACR_ELx_ZEN, 0); + + } else { + __fpsimd_save_state(*host_data_ptr(fpsimd_state)); + } } static const exit_handler_fn hyp_exit_handlers[] = { From 1696fc2174dbab12228ea9ec4c213d6aeea348f8 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 3 Jun 2024 13:28:49 +0100 Subject: [PATCH 0569/1578] KVM: arm64: Consolidate initializing the host data's fpsimd_state/sve in pKVM Now that we have introduced finalize_init_hyp_mode(), lets consolidate the initializing of the host_data fpsimd_state and sve state. Reviewed-by: Oliver Upton Signed-off-by: Fuad Tabba Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240603122852.3923848-8-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 10 ++++++++-- arch/arm64/kvm/arm.c | 20 ++++++++++++++------ arch/arm64/kvm/hyp/include/nvhe/pkvm.h | 1 - arch/arm64/kvm/hyp/nvhe/pkvm.c | 11 ----------- arch/arm64/kvm/hyp/nvhe/setup.c | 1 - 5 files changed, 22 insertions(+), 21 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 90df7ccec5f4fa..36b8e97bf49ec4 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -550,8 +550,14 @@ struct cpu_sve_state { struct kvm_host_data { struct kvm_cpu_context host_ctxt; - struct user_fpsimd_state *fpsimd_state; /* hyp VA */ - struct cpu_sve_state *sve_state; /* hyp VA */ + /* + * All pointers in this union are hyp VA. + * sve_state is only used in pKVM and if system_supports_sve(). + */ + union { + struct user_fpsimd_state *fpsimd_state; + struct cpu_sve_state *sve_state; + }; /* Ownership of the FP regs */ enum { diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 1acf7415e831b8..59716789fe0f38 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2444,14 +2444,22 @@ static void finalize_init_hyp_mode(void) { int cpu; - if (!is_protected_kvm_enabled() || !system_supports_sve()) - return; + if (system_supports_sve() && is_protected_kvm_enabled()) { + for_each_possible_cpu(cpu) { + struct cpu_sve_state *sve_state; - for_each_possible_cpu(cpu) { - struct cpu_sve_state *sve_state; + sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state; + per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = + kern_hyp_va(sve_state); + } + } else { + for_each_possible_cpu(cpu) { + struct user_fpsimd_state *fpsimd_state; - sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state; - per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = kern_hyp_va(sve_state); + fpsimd_state = &per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->host_ctxt.fp_regs; + per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->fpsimd_state = + kern_hyp_va(fpsimd_state); + } } } diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index 22f374e9f53296..24a9a8330d1903 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -59,7 +59,6 @@ static inline bool pkvm_hyp_vcpu_is_protected(struct pkvm_hyp_vcpu *hyp_vcpu) } void pkvm_hyp_vm_table_init(void *tbl); -void pkvm_host_fpsimd_state_init(void); int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, unsigned long pgd_hva); diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index feb27b4ce45925..ea67fcbf837617 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -249,17 +249,6 @@ void pkvm_hyp_vm_table_init(void *tbl) vm_table = tbl; } -void pkvm_host_fpsimd_state_init(void) -{ - unsigned long i; - - for (i = 0; i < hyp_nr_cpus; i++) { - struct kvm_host_data *host_data = per_cpu_ptr(&kvm_host_data, i); - - host_data->fpsimd_state = &host_data->host_ctxt.fp_regs; - } -} - /* * Return the hyp vm structure corresponding to the handle. */ diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 3fae4247959823..f4350ba07b0b0c 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -324,7 +324,6 @@ void __noreturn __pkvm_init_finalise(void) goto out; pkvm_hyp_vm_table_init(vm_table_base); - pkvm_host_fpsimd_state_init(); out: /* * We tail-called to here from handle___pkvm_init() and will not return, From a69283ae1db8dd416870d931caa9e2d3d2c1cd8b Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 3 Jun 2024 13:28:50 +0100 Subject: [PATCH 0570/1578] KVM: arm64: Refactor CPACR trap bit setting/clearing to use ELx format When setting/clearing CPACR bits for EL0 and EL1, use the ELx format of the bits, which covers both. This makes the code clearer, and reduces the chances of accidentally missing a bit. No functional change intended. Reviewed-by: Oliver Upton Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20240603122852.3923848-9-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/el2_setup.h | 6 +++--- arch/arm64/include/asm/kvm_emulate.h | 9 ++++----- arch/arm64/kvm/fpsimd.c | 4 +--- arch/arm64/kvm/hyp/nvhe/pkvm.c | 2 +- arch/arm64/kvm/hyp/nvhe/switch.c | 5 ++--- arch/arm64/kvm/hyp/vhe/switch.c | 7 +++---- 6 files changed, 14 insertions(+), 19 deletions(-) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index e4546b29dd0cbf..fd87c4b8f98403 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -146,7 +146,7 @@ /* Coprocessor traps */ .macro __init_el2_cptr __check_hvhe .LnVHE_\@, x1 - mov x0, #(CPACR_EL1_FPEN_EL1EN | CPACR_EL1_FPEN_EL0EN) + mov x0, #CPACR_ELx_FPEN msr cpacr_el1, x0 b .Lskip_set_cptr_\@ .LnVHE_\@: @@ -277,7 +277,7 @@ // (h)VHE case mrs x0, cpacr_el1 // Disable SVE traps - orr x0, x0, #(CPACR_EL1_ZEN_EL1EN | CPACR_EL1_ZEN_EL0EN) + orr x0, x0, #CPACR_ELx_ZEN msr cpacr_el1, x0 b .Lskip_set_cptr_\@ @@ -298,7 +298,7 @@ // (h)VHE case mrs x0, cpacr_el1 // Disable SME traps - orr x0, x0, #(CPACR_EL1_SMEN_EL0EN | CPACR_EL1_SMEN_EL1EN) + orr x0, x0, #CPACR_ELx_SMEN msr cpacr_el1, x0 b .Lskip_set_cptr_sme_\@ diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 2d7a0bdf9d03f5..21650e7924d458 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -632,17 +632,16 @@ static __always_inline u64 kvm_get_reset_cptr_el2(struct kvm_vcpu *vcpu) u64 val; if (has_vhe()) { - val = (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN | - CPACR_EL1_ZEN_EL1EN); + val = (CPACR_ELx_FPEN | CPACR_EL1_ZEN_EL1EN); if (cpus_have_final_cap(ARM64_SME)) val |= CPACR_EL1_SMEN_EL1EN; } else if (has_hvhe()) { - val = (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN); + val = CPACR_ELx_FPEN; if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs()) - val |= CPACR_EL1_ZEN_EL1EN | CPACR_EL1_ZEN_EL0EN; + val |= CPACR_ELx_ZEN; if (cpus_have_final_cap(ARM64_SME)) - val |= CPACR_EL1_SMEN_EL1EN | CPACR_EL1_SMEN_EL0EN; + val |= CPACR_ELx_SMEN; } else { val = CPTR_NVHE_EL2_RES1; diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 1807d3a79a8af8..eb21f29d91fc5e 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -161,9 +161,7 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) if (has_vhe() && system_supports_sme()) { /* Also restore EL0 state seen on entry */ if (vcpu_get_flag(vcpu, HOST_SME_ENABLED)) - sysreg_clear_set(CPACR_EL1, 0, - CPACR_EL1_SMEN_EL0EN | - CPACR_EL1_SMEN_EL1EN); + sysreg_clear_set(CPACR_EL1, 0, CPACR_ELx_SMEN); else sysreg_clear_set(CPACR_EL1, CPACR_EL1_SMEN_EL0EN, diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index ea67fcbf837617..95cf1857425175 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -65,7 +65,7 @@ static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu) /* Trap SVE */ if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), feature_ids)) { if (has_hvhe()) - cptr_clear |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN; + cptr_clear |= CPACR_ELx_ZEN; else cptr_set |= CPTR_EL2_TZ; } diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index bef74de7065b31..6af179c6356d66 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -48,15 +48,14 @@ static void __activate_traps(struct kvm_vcpu *vcpu) val |= has_hvhe() ? CPACR_EL1_TTA : CPTR_EL2_TTA; if (cpus_have_final_cap(ARM64_SME)) { if (has_hvhe()) - val &= ~(CPACR_EL1_SMEN_EL1EN | CPACR_EL1_SMEN_EL0EN); + val &= ~CPACR_ELx_SMEN; else val |= CPTR_EL2_TSM; } if (!guest_owns_fp_regs()) { if (has_hvhe()) - val &= ~(CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN | - CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN); + val &= ~(CPACR_ELx_FPEN | CPACR_ELx_ZEN); else val |= CPTR_EL2_TFP | CPTR_EL2_TZ; diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 20073579e9f5b4..8fbb6a2e0559d7 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -93,8 +93,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu) val = read_sysreg(cpacr_el1); val |= CPACR_ELx_TTA; - val &= ~(CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN | - CPACR_EL1_SMEN_EL0EN | CPACR_EL1_SMEN_EL1EN); + val &= ~(CPACR_ELx_ZEN | CPACR_ELx_SMEN); /* * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to @@ -109,9 +108,9 @@ static void __activate_traps(struct kvm_vcpu *vcpu) if (guest_owns_fp_regs()) { if (vcpu_has_sve(vcpu)) - val |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN; + val |= CPACR_ELx_ZEN; } else { - val &= ~(CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN); + val &= ~CPACR_ELx_FPEN; __activate_traps_fpsimd32(vcpu); } From afb91f5f8ad7af172d993a34fde1947892408f53 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 3 Jun 2024 13:28:51 +0100 Subject: [PATCH 0571/1578] KVM: arm64: Ensure that SME controls are disabled in protected mode KVM (and pKVM) do not support SME guests. Therefore KVM ensures that the host's SME state is flushed and that SME controls for enabling access to ZA storage and for streaming are disabled. pKVM needs to protect against a buggy/malicious host. Ensure that it wouldn't run a guest when protected mode is enabled should any of the SME controls be enabled. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20240603122852.3923848-10-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/fpsimd.c | 7 +++++++ arch/arm64/kvm/hyp/nvhe/hyp-main.c | 11 +++++++++++ 2 files changed, 18 insertions(+) diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index eb21f29d91fc5e..521b32868d0d26 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -90,6 +90,13 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) fpsimd_save_and_flush_cpu_state(); } } + + /* + * If normal guests gain SME support, maintain this behavior for pKVM + * guests, which don't support SME. + */ + WARN_ON(is_protected_kvm_enabled() && system_supports_sme() && + read_sysreg_s(SYS_SVCR)); } /* diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index bd93b8a9e17261..f43d845f3c4ecd 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -140,6 +140,17 @@ static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) struct pkvm_hyp_vcpu *hyp_vcpu; struct kvm *host_kvm; + /* + * KVM (and pKVM) doesn't support SME guests for now, and + * ensures that SME features aren't enabled in pstate when + * loading a vcpu. Therefore, if SME features enabled the host + * is misbehaving. + */ + if (unlikely(system_supports_sme() && read_sysreg_s(SYS_SVCR))) { + ret = -EINVAL; + goto out; + } + host_kvm = kern_hyp_va(host_vcpu->kvm); hyp_vcpu = pkvm_load_hyp_vcpu(host_kvm->arch.pkvm.handle, host_vcpu->vcpu_idx); From ce3af2ee95170b7d9e15fff6e500d67deab1e7b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Exp=C3=B3sito?= Date: Fri, 24 May 2024 15:05:39 +0200 Subject: [PATCH 0572/1578] HID: logitech-dj: Fix memory leak in logi_dj_recv_switch_to_dj_mode() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a memory leak on logi_dj_recv_send_report() error path. Fixes: 6f20d3261265 ("HID: logitech-dj: Fix error handling in logi_dj_recv_switch_to_dj_mode()") Signed-off-by: José Expósito Signed-off-by: Jiri Kosina --- drivers/hid/hid-logitech-dj.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/hid/hid-logitech-dj.c b/drivers/hid/hid-logitech-dj.c index 3c3c497b6b9114..37958edec55f5f 100644 --- a/drivers/hid/hid-logitech-dj.c +++ b/drivers/hid/hid-logitech-dj.c @@ -1284,8 +1284,10 @@ static int logi_dj_recv_switch_to_dj_mode(struct dj_receiver_dev *djrcv_dev, */ msleep(50); - if (retval) + if (retval) { + kfree(dj_report); return retval; + } } /* From 655a8a7684b897721f87c59798fd04d8b79f1b69 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 26 May 2024 17:05:59 +0200 Subject: [PATCH 0573/1578] HID: nintendo: Fix an error handling path in nintendo_hid_probe() joycon_leds_create() has a ida_alloc() call. So if an error occurs after it, a corresponding ida_free() call is needed, as already done in the .remove function. This is not 100% perfect, because if ida_alloc() fails, then 'ctlr->player_id' will forced to be U32_MAX, and an error will be logged when ida_free() is called. Considering that this can't happen in real life, no special handling is done to handle it. Fixes: 5307de63d71d ("HID: nintendo: use ida for LED player id") Signed-off-by: Christophe JAILLET Reviewed-by: Silvan Jegen Signed-off-by: Jiri Kosina --- drivers/hid/hid-nintendo.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/hid/hid-nintendo.c b/drivers/hid/hid-nintendo.c index b4a97803eca3e7..3062daf68d312c 100644 --- a/drivers/hid/hid-nintendo.c +++ b/drivers/hid/hid-nintendo.c @@ -2725,13 +2725,13 @@ static int nintendo_hid_probe(struct hid_device *hdev, ret = joycon_power_supply_create(ctlr); if (ret) { hid_err(hdev, "Failed to create power_supply; ret=%d\n", ret); - goto err_close; + goto err_ida; } ret = joycon_input_create(ctlr); if (ret) { hid_err(hdev, "Failed to create input device; ret=%d\n", ret); - goto err_close; + goto err_ida; } ctlr->ctlr_state = JOYCON_CTLR_STATE_READ; @@ -2739,6 +2739,8 @@ static int nintendo_hid_probe(struct hid_device *hdev, hid_dbg(hdev, "probe - success\n"); return 0; +err_ida: + ida_free(&nintendo_player_id_allocator, ctlr->player_id); err_close: hid_hw_close(hdev); err_stop: From 9e438fe31e96b75c2ec599f6a9239950e6712358 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 3 Jun 2024 09:41:03 +0200 Subject: [PATCH 0574/1578] HID: intel-ish-hid: fix endian-conversion The newly added file causes a ton of sparse warnings about the incorrect use of __le32 and similar types: drivers/hid/intel-ish-hid/ishtp/loader.h:41:23: error: invalid bitfield specifier for type restricted __le32. drivers/hid/intel-ish-hid/ishtp/loader.h:42:27: error: invalid bitfield specifier for type restricted __le32. drivers/hid/intel-ish-hid/ishtp/loader.h:43:24: error: invalid bitfield specifier for type restricted __le32. drivers/hid/intel-ish-hid/ishtp/loader.h:44:24: error: invalid bitfield specifier for type restricted __le32. drivers/hid/intel-ish-hid/ishtp/loader.h:45:22: error: invalid bitfield specifier for type restricted __le32. drivers/hid/intel-ish-hid/ishtp/loader.c:172:33: warning: restricted __le32 degrades to integer drivers/hid/intel-ish-hid/ishtp/loader.c:178:50: warning: incorrect type in assignment (different base types) drivers/hid/intel-ish-hid/ishtp/loader.c:178:50: expected restricted __le32 [usertype] length drivers/hid/intel-ish-hid/ishtp/loader.c:178:50: got unsigned long drivers/hid/intel-ish-hid/ishtp/loader.c:179:50: warning: incorrect type in assignment (different base types) drivers/hid/intel-ish-hid/ishtp/loader.c:179:50: expected restricted __le32 [usertype] fw_off drivers/hid/intel-ish-hid/ishtp/loader.c:179:50: got unsigned int [usertype] offset drivers/hid/intel-ish-hid/ishtp/loader.c:180:17: warning: cast from restricted __le32 drivers/hid/intel-ish-hid/ishtp/loader.c:183:24: warning: invalid assignment: += drivers/hid/intel-ish-hid/ishtp/loader.c:183:24: left side has type unsigned int drivers/hid/intel-ish-hid/ishtp/loader.c:183:24: right side has type restricted __le32 Add the necessary conversions and use temporary variables where appropriate to avoid converting back. Fixes: 579a267e4617 ("HID: intel-ish-hid: Implement loading firmware from host feature") Signed-off-by: Arnd Bergmann Reviewed-by: Zhang Lixu Tested-by: Zhang Lixu Signed-off-by: Jiri Kosina --- drivers/hid/intel-ish-hid/ishtp/loader.c | 69 +++++++++++++----------- drivers/hid/intel-ish-hid/ishtp/loader.h | 33 +++++++----- 2 files changed, 57 insertions(+), 45 deletions(-) diff --git a/drivers/hid/intel-ish-hid/ishtp/loader.c b/drivers/hid/intel-ish-hid/ishtp/loader.c index 2785b04a2f5a0b..fcca070bdecbe9 100644 --- a/drivers/hid/intel-ish-hid/ishtp/loader.c +++ b/drivers/hid/intel-ish-hid/ishtp/loader.c @@ -84,8 +84,8 @@ static int loader_write_message(struct ishtp_device *dev, void *buf, int len) static int loader_xfer_cmd(struct ishtp_device *dev, void *req, int req_len, void *resp, int resp_len) { - struct loader_msg_header *req_hdr = req; - struct loader_msg_header *resp_hdr = resp; + union loader_msg_header req_hdr; + union loader_msg_header resp_hdr; struct device *devc = dev->devc; int rv; @@ -93,34 +93,37 @@ static int loader_xfer_cmd(struct ishtp_device *dev, void *req, int req_len, dev->fw_loader_rx_size = resp_len; rv = loader_write_message(dev, req, req_len); + req_hdr.val32 = le32_to_cpup(req); + if (rv < 0) { - dev_err(devc, "write cmd %u failed:%d\n", req_hdr->command, rv); + dev_err(devc, "write cmd %u failed:%d\n", req_hdr.command, rv); return rv; } /* Wait the ACK */ wait_event_interruptible_timeout(dev->wait_loader_recvd_msg, dev->fw_loader_received, ISHTP_LOADER_TIMEOUT); + resp_hdr.val32 = le32_to_cpup(resp); dev->fw_loader_rx_size = 0; dev->fw_loader_rx_buf = NULL; if (!dev->fw_loader_received) { - dev_err(devc, "wait response of cmd %u timeout\n", req_hdr->command); + dev_err(devc, "wait response of cmd %u timeout\n", req_hdr.command); return -ETIMEDOUT; } - if (!resp_hdr->is_response) { - dev_err(devc, "not a response for %u\n", req_hdr->command); + if (!resp_hdr.is_response) { + dev_err(devc, "not a response for %u\n", req_hdr.command); return -EBADMSG; } - if (req_hdr->command != resp_hdr->command) { - dev_err(devc, "unexpected cmd response %u:%u\n", req_hdr->command, - resp_hdr->command); + if (req_hdr.command != resp_hdr.command) { + dev_err(devc, "unexpected cmd response %u:%u\n", req_hdr.command, + resp_hdr.command); return -EBADMSG; } - if (resp_hdr->status) { - dev_err(devc, "cmd %u failed %u\n", req_hdr->command, resp_hdr->status); + if (resp_hdr.status) { + dev_err(devc, "cmd %u failed %u\n", req_hdr.command, resp_hdr.status); return -EIO; } @@ -157,30 +160,33 @@ static void release_dma_bufs(struct ishtp_device *dev, * @fragment: The ISHTP firmware fragment descriptor * @dma_bufs: The array of DMA fragment buffers * @fragment_size: The size of a single DMA fragment + * @fragment_count: Number of fragments * * Return: 0 on success, negative error code on failure */ static int prepare_dma_bufs(struct ishtp_device *dev, const struct firmware *ish_fw, struct loader_xfer_dma_fragment *fragment, - void **dma_bufs, u32 fragment_size) + void **dma_bufs, u32 fragment_size, u32 fragment_count) { dma_addr_t dma_addr; u32 offset = 0; + u32 length; int i; - for (i = 0; i < fragment->fragment_cnt && offset < ish_fw->size; i++) { + for (i = 0; i < fragment_count && offset < ish_fw->size; i++) { dma_bufs[i] = dma_alloc_coherent(dev->devc, fragment_size, &dma_addr, GFP_KERNEL); if (!dma_bufs[i]) return -ENOMEM; fragment->fragment_tbl[i].ddr_adrs = cpu_to_le64(dma_addr); - fragment->fragment_tbl[i].length = clamp(ish_fw->size - offset, 0, fragment_size); - fragment->fragment_tbl[i].fw_off = offset; - memcpy(dma_bufs[i], ish_fw->data + offset, fragment->fragment_tbl[i].length); + length = clamp(ish_fw->size - offset, 0, fragment_size); + fragment->fragment_tbl[i].length = cpu_to_le32(length); + fragment->fragment_tbl[i].fw_off = cpu_to_le32(offset); + memcpy(dma_bufs[i], ish_fw->data + offset, length); clflush_cache_range(dma_bufs[i], fragment_size); - offset += fragment->fragment_tbl[i].length; + offset += length; } return 0; @@ -208,17 +214,17 @@ void ishtp_loader_work(struct work_struct *work) { DEFINE_RAW_FLEX(struct loader_xfer_dma_fragment, fragment, fragment_tbl, FRAGMENT_MAX_NUM); struct ishtp_device *dev = container_of(work, struct ishtp_device, work_fw_loader); - struct loader_xfer_query query = { - .header.command = LOADER_CMD_XFER_QUERY, - }; - struct loader_start start = { - .header.command = LOADER_CMD_START, - }; + union loader_msg_header query_hdr = { .command = LOADER_CMD_XFER_QUERY, }; + union loader_msg_header start_hdr = { .command = LOADER_CMD_START, }; + union loader_msg_header fragment_hdr = { .command = LOADER_CMD_XFER_FRAGMENT, }; + struct loader_xfer_query query = { .header = cpu_to_le32(query_hdr.val32), }; + struct loader_start start = { .header = cpu_to_le32(start_hdr.val32), }; union loader_recv_message recv_msg; char *filename = dev->driver_data->fw_filename; const struct firmware *ish_fw; void *dma_bufs[FRAGMENT_MAX_NUM] = {}; u32 fragment_size; + u32 fragment_count; int retry = ISHTP_LOADER_RETRY_TIMES; int rv; @@ -228,23 +234,24 @@ void ishtp_loader_work(struct work_struct *work) return; } - fragment->fragment.header.command = LOADER_CMD_XFER_FRAGMENT; - fragment->fragment.xfer_mode = LOADER_XFER_MODE_DMA; - fragment->fragment.is_last = 1; - fragment->fragment.size = ish_fw->size; + fragment->fragment.header = cpu_to_le32(fragment_hdr.val32); + fragment->fragment.xfer_mode = cpu_to_le32(LOADER_XFER_MODE_DMA); + fragment->fragment.is_last = cpu_to_le32(1); + fragment->fragment.size = cpu_to_le32(ish_fw->size); /* Calculate the size of a single DMA fragment */ fragment_size = PFN_ALIGN(DIV_ROUND_UP(ish_fw->size, FRAGMENT_MAX_NUM)); /* Calculate the count of DMA fragments */ - fragment->fragment_cnt = DIV_ROUND_UP(ish_fw->size, fragment_size); + fragment_count = DIV_ROUND_UP(ish_fw->size, fragment_size); + fragment->fragment_cnt = cpu_to_le32(fragment_count); - rv = prepare_dma_bufs(dev, ish_fw, fragment, dma_bufs, fragment_size); + rv = prepare_dma_bufs(dev, ish_fw, fragment, dma_bufs, fragment_size, fragment_count); if (rv) { dev_err(dev->devc, "prepare DMA buffer failed.\n"); goto out; } do { - query.image_size = ish_fw->size; + query.image_size = cpu_to_le32(ish_fw->size); rv = loader_xfer_cmd(dev, &query, sizeof(query), recv_msg.raw_data, sizeof(struct loader_xfer_query_ack)); if (rv) @@ -257,7 +264,7 @@ void ishtp_loader_work(struct work_struct *work) recv_msg.query_ack.version_build); rv = loader_xfer_cmd(dev, fragment, - struct_size(fragment, fragment_tbl, fragment->fragment_cnt), + struct_size(fragment, fragment_tbl, fragment_count), recv_msg.raw_data, sizeof(struct loader_xfer_fragment_ack)); if (rv) continue; /* try again if failed */ diff --git a/drivers/hid/intel-ish-hid/ishtp/loader.h b/drivers/hid/intel-ish-hid/ishtp/loader.h index 7aa45ebc3f7be5..308b96085a4db4 100644 --- a/drivers/hid/intel-ish-hid/ishtp/loader.h +++ b/drivers/hid/intel-ish-hid/ishtp/loader.h @@ -30,19 +30,23 @@ struct work_struct; #define LOADER_XFER_MODE_DMA BIT(0) /** - * struct loader_msg_header - ISHTP firmware loader message header + * union loader_msg_header - ISHTP firmware loader message header * @command: Command type * @is_response: Indicates if the message is a response * @has_next: Indicates if there is a next message * @reserved: Reserved for future use * @status: Status of the message - */ -struct loader_msg_header { - __le32 command:7; - __le32 is_response:1; - __le32 has_next:1; - __le32 reserved:15; - __le32 status:8; + * @val32: entire header as a 32-bit value + */ +union loader_msg_header { + struct { + __u32 command:7; + __u32 is_response:1; + __u32 has_next:1; + __u32 reserved:15; + __u32 status:8; + }; + __u32 val32; }; /** @@ -51,7 +55,7 @@ struct loader_msg_header { * @image_size: Size of the image */ struct loader_xfer_query { - struct loader_msg_header header; + __le32 header; __le32 image_size; }; @@ -103,7 +107,7 @@ struct loader_capability { * @capability: Loader capability */ struct loader_xfer_query_ack { - struct loader_msg_header header; + __le32 header; __le16 version_major; __le16 version_minor; __le16 version_hotfix; @@ -122,7 +126,7 @@ struct loader_xfer_query_ack { * @is_last: Is last */ struct loader_xfer_fragment { - struct loader_msg_header header; + __le32 header; __le32 xfer_mode; __le32 offset; __le32 size; @@ -134,7 +138,7 @@ struct loader_xfer_fragment { * @header: Header of the message */ struct loader_xfer_fragment_ack { - struct loader_msg_header header; + __le32 header; }; /** @@ -170,7 +174,7 @@ struct loader_xfer_dma_fragment { * @header: Header of the message */ struct loader_start { - struct loader_msg_header header; + __le32 header; }; /** @@ -178,10 +182,11 @@ struct loader_start { * @header: Header of the message */ struct loader_start_ack { - struct loader_msg_header header; + __le32 header; }; union loader_recv_message { + __le32 header; struct loader_xfer_query_ack query_ack; struct loader_xfer_fragment_ack fragment_ack; struct loader_start_ack start_ack; From 9c8f05cf1d7abd1bfb53cebe691bf3acb7baee99 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Mon, 3 Jun 2024 23:00:59 -0700 Subject: [PATCH 0575/1578] HID: logitech-hidpp: add missing MODULE_DESCRIPTION() macro make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/hid/hid-logitech-hidpp.o Add the missing invocation of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Signed-off-by: Jiri Kosina --- drivers/hid/hid-logitech-hidpp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hid/hid-logitech-hidpp.c b/drivers/hid/hid-logitech-hidpp.c index b81d5bcc76a77a..400d70e6dbe2dd 100644 --- a/drivers/hid/hid-logitech-hidpp.c +++ b/drivers/hid/hid-logitech-hidpp.c @@ -27,6 +27,7 @@ #include "usbhid/usbhid.h" #include "hid-ids.h" +MODULE_DESCRIPTION("Support for Logitech devices relying on the HID++ specification"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Benjamin Tissoires "); MODULE_AUTHOR("Nestor Lopez Casado "); From 0698ff57bf327d9a5735a898f78161b8dada160b Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Mon, 27 May 2024 13:54:08 +0200 Subject: [PATCH 0576/1578] drm/xe/pf: Update the LMTT when freeing VF GT config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The LMTT must be updated whenever we change the VF LMEM configuration. We missed that step when freeing the whole VF GT config, which could result in stale PTE in LMTT or LMTT PT object leaks. Fix that. Fixes: ac6598aed1b3 ("drm/xe/pf: Add support to configure SR-IOV VFs") Signed-off-by: Michal Wajdeczko Reviewed-by: Piotr Piórkowski Link: https://patchwork.freedesktop.org/patch/msgid/20240527115408.1064-1-michal.wajdeczko@intel.com (cherry picked from commit c063cce7df3a765539e2a2d75ab943f334446cce) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c index 79116ad5862041..476d613333a981 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c @@ -1749,6 +1749,7 @@ static void pf_release_vf_config(struct xe_gt *gt, unsigned int vfid) if (!xe_gt_is_media_type(gt)) { pf_release_vf_config_ggtt(gt, config); pf_release_vf_config_lmem(gt, config); + pf_update_vf_lmtt(gt_to_xe(gt), vfid); } pf_release_config_ctxs(gt, config); pf_release_config_dbs(gt, config); From 1db5322b7e6b58e1b304ce69a50e9dca798ca95b Mon Sep 17 00:00:00 2001 From: Alexander Usyskin Date: Thu, 30 May 2024 12:14:15 +0300 Subject: [PATCH 0577/1578] mei: demote client disconnect warning on suspend to debug Change level for the "not connected" client message in the write callback from error to debug. The MEI driver currently disconnects all clients upon system suspend. This behavior is by design and user-space applications with open connections before the suspend are expected to handle errors upon resume, by reopening their handles, reconnecting, and retrying their operations. However, the current driver implementation logs an error message every time a write operation is attempted on a disconnected client. Since this is a normal and expected flow after system resume logging this as an error can be misleading. Signed-off-by: Alexander Usyskin Signed-off-by: Tomas Winkler Link: https://lore.kernel.org/r/20240530091415.725247-1-tomas.winkler@intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/mei/main.c b/drivers/misc/mei/main.c index 79e6f3c1341feb..40c3fe26f76df0 100644 --- a/drivers/misc/mei/main.c +++ b/drivers/misc/mei/main.c @@ -329,7 +329,7 @@ static ssize_t mei_write(struct file *file, const char __user *ubuf, } if (!mei_cl_is_connected(cl)) { - cl_err(dev, cl, "is not connected"); + cl_dbg(dev, cl, "is not connected"); rets = -ENODEV; goto out; } From 283cb234ef95d94c61f59e1cd070cd9499b51292 Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Tue, 4 Jun 2024 12:07:28 +0300 Subject: [PATCH 0578/1578] mei: me: release irq in mei_me_pci_resume error path The mei_me_pci_resume doesn't release irq on the error path, in case mei_start() fails. Cc: Fixes: 33ec08263147 ("mei: revamp mei reset state machine") Signed-off-by: Tomas Winkler Link: https://lore.kernel.org/r/20240604090728.1027307-1-tomas.winkler@intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/pci-me.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/misc/mei/pci-me.c b/drivers/misc/mei/pci-me.c index 7f59dd38c32f52..6589635f8ba32b 100644 --- a/drivers/misc/mei/pci-me.c +++ b/drivers/misc/mei/pci-me.c @@ -385,8 +385,10 @@ static int mei_me_pci_resume(struct device *device) } err = mei_restart(dev); - if (err) + if (err) { + free_irq(pdev->irq, dev); return err; + } /* Start timer if stopped in suspend */ schedule_delayed_work(&dev->timer_work, HZ); From 9b5e045029d8bded4c6979874ed3abc347c1415c Mon Sep 17 00:00:00 2001 From: Wentong Wu Date: Mon, 27 May 2024 20:38:35 +0800 Subject: [PATCH 0579/1578] mei: vsc: Don't stop/restart mei device during system suspend/resume The dynamically created mei client device (mei csi) is used as one V4L2 sub device of the whole video pipeline, and the V4L2 connection graph is built by software node. The mei_stop() and mei_restart() will delete the old mei csi client device and create a new mei client device, which will cause the software node information saved in old mei csi device lost and the whole video pipeline will be broken. Removing mei_stop()/mei_restart() during system suspend/resume can fix the issue above and won't impact hardware actual power saving logic. Fixes: f6085a96c973 ("mei: vsc: Unregister interrupt handler for system suspend") Cc: stable@vger.kernel.org # for 6.8+ Reported-by: Hao Yao Signed-off-by: Wentong Wu Reviewed-by: Sakari Ailus Tested-by: Jason Chen Tested-by: Sakari Ailus Acked-by: Tomas Winkler Link: https://lore.kernel.org/r/20240527123835.522384-1-wentong.wu@intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/platform-vsc.c | 39 +++++++++++++-------------------- 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/drivers/misc/mei/platform-vsc.c b/drivers/misc/mei/platform-vsc.c index b543e6b9f3cfd6..1ec65d87488a33 100644 --- a/drivers/misc/mei/platform-vsc.c +++ b/drivers/misc/mei/platform-vsc.c @@ -399,41 +399,32 @@ static void mei_vsc_remove(struct platform_device *pdev) static int mei_vsc_suspend(struct device *dev) { - struct mei_device *mei_dev = dev_get_drvdata(dev); - struct mei_vsc_hw *hw = mei_dev_to_vsc_hw(mei_dev); + struct mei_device *mei_dev; + int ret = 0; - mei_stop(mei_dev); + mei_dev = dev_get_drvdata(dev); + if (!mei_dev) + return -ENODEV; - mei_disable_interrupts(mei_dev); + mutex_lock(&mei_dev->device_lock); - vsc_tp_free_irq(hw->tp); + if (!mei_write_is_idle(mei_dev)) + ret = -EAGAIN; - return 0; + mutex_unlock(&mei_dev->device_lock); + + return ret; } static int mei_vsc_resume(struct device *dev) { - struct mei_device *mei_dev = dev_get_drvdata(dev); - struct mei_vsc_hw *hw = mei_dev_to_vsc_hw(mei_dev); - int ret; - - ret = vsc_tp_request_irq(hw->tp); - if (ret) - return ret; - - ret = mei_restart(mei_dev); - if (ret) - goto err_free; + struct mei_device *mei_dev; - /* start timer if stopped in suspend */ - schedule_delayed_work(&mei_dev->timer_work, HZ); + mei_dev = dev_get_drvdata(dev); + if (!mei_dev) + return -ENODEV; return 0; - -err_free: - vsc_tp_free_irq(hw->tp); - - return ret; } static DEFINE_SIMPLE_DEV_PM_OPS(mei_vsc_pm_ops, mei_vsc_suspend, mei_vsc_resume); From af076156ec6d70332f1555754e99d4a3771ec297 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 3 Jun 2024 22:50:50 +0200 Subject: [PATCH 0580/1578] mei: vsc: Fix wrong invocation of ACPI SID method When using an initializer for a union only one of the union members must be initialized. The initializer for the acpi_object union variable passed as argument to the SID ACPI method was initializing both the type and the integer members of the union. Unfortunately rather then complaining about this gcc simply ignores the first initializer and only used the second integer.value = 1 initializer. Leaving type set to 0 which leads to the argument being skipped by acpi acpi_ns_evaluate() resulting in: ACPI Warning: \_SB.PC00.SPI1.SPFD.CVFD.SID: Insufficient arguments - Caller passed 0, method requires 1 (20240322/nsarguments-232) Fix this by initializing only the integer struct part of the union and initializing both members of the integer struct. Signed-off-by: Hans de Goede Reviewed-by: Sakari Ailus Fixes: 566f5ca97680 ("mei: Add transport driver for IVSC device") Reviewed-by: Wentong Wu Link: https://lore.kernel.org/r/20240603205050.505389-1-hdegoede@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/vsc-fw-loader.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/mei/vsc-fw-loader.c b/drivers/misc/mei/vsc-fw-loader.c index ffa4ccd96a1043..596a9d695dfc14 100644 --- a/drivers/misc/mei/vsc-fw-loader.c +++ b/drivers/misc/mei/vsc-fw-loader.c @@ -252,7 +252,7 @@ static int vsc_get_sensor_name(struct vsc_fw_loader *fw_loader, { struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER }; union acpi_object obj = { - .type = ACPI_TYPE_INTEGER, + .integer.type = ACPI_TYPE_INTEGER, .integer.value = 1, }; struct acpi_object_list arg_list = { From 73fedc31fed38cb6039fd8a7efea1774143b68b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 13 May 2024 09:52:06 +0200 Subject: [PATCH 0581/1578] parport: amiga: Mark driver struct with __refdata to prevent section mismatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As described in the added code comment, a reference to .exit.text is ok for drivers registered via module_platform_driver_probe(). Make this explicit to prevent the following section mismatch warning WARNING: modpost: drivers/parport/parport_amiga: section mismatch in reference: amiga_parallel_driver+0x8 (section: .data) -> amiga_parallel_remove (section: .exit.text) that triggers on an allmodconfig W=1 build. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20240513075206.2337310-2-u.kleine-koenig@pengutronix.de Signed-off-by: Greg Kroah-Hartman --- drivers/parport/parport_amiga.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/parport/parport_amiga.c b/drivers/parport/parport_amiga.c index e6dc857aac3feb..e06c7b2aac5c46 100644 --- a/drivers/parport/parport_amiga.c +++ b/drivers/parport/parport_amiga.c @@ -229,7 +229,13 @@ static void __exit amiga_parallel_remove(struct platform_device *pdev) parport_put_port(port); } -static struct platform_driver amiga_parallel_driver = { +/* + * amiga_parallel_remove() lives in .exit.text. For drivers registered via + * module_platform_driver_probe() this is ok because they cannot get unbound at + * runtime. So mark the driver struct with __refdata to prevent modpost + * triggering a section mismatch warning. + */ +static struct platform_driver amiga_parallel_driver __refdata = { .remove_new = __exit_p(amiga_parallel_remove), .driver = { .name = "amiga-parallel", From 086c6cbcc563c81d55257f9b27e14faf1d0963d3 Mon Sep 17 00:00:00 2001 From: Yongzhi Liu Date: Thu, 23 May 2024 20:14:33 +0800 Subject: [PATCH 0582/1578] misc: microchip: pci1xxxx: fix double free in the error handling of gp_aux_bus_probe() When auxiliary_device_add() returns error and then calls auxiliary_device_uninit(), callback function gp_auxiliary_device_release() calls ida_free() and kfree(aux_device_wrapper) to free memory. We should't call them again in the error handling path. Fix this by skipping the redundant cleanup functions. Fixes: 393fc2f5948f ("misc: microchip: pci1xxxx: load auxiliary bus driver for the PIO function in the multi-function endpoint of pci1xxxx device.") Signed-off-by: Yongzhi Liu Link: https://lore.kernel.org/r/20240523121434.21855-3-hyperlyzcs@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gp.c b/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gp.c index 32af2b14ff3448..de75d89ef53e85 100644 --- a/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gp.c +++ b/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gp.c @@ -111,6 +111,7 @@ static int gp_aux_bus_probe(struct pci_dev *pdev, const struct pci_device_id *id err_aux_dev_add_1: auxiliary_device_uninit(&aux_bus->aux_device_wrapper[1]->aux_dev); + goto err_aux_dev_add_0; err_aux_dev_init_1: ida_free(&gp_client_ida, aux_bus->aux_device_wrapper[1]->aux_dev.id); @@ -120,6 +121,7 @@ static int gp_aux_bus_probe(struct pci_dev *pdev, const struct pci_device_id *id err_aux_dev_add_0: auxiliary_device_uninit(&aux_bus->aux_device_wrapper[0]->aux_dev); + goto err_ret; err_aux_dev_init_0: ida_free(&gp_client_ida, aux_bus->aux_device_wrapper[0]->aux_dev.id); @@ -127,6 +129,7 @@ static int gp_aux_bus_probe(struct pci_dev *pdev, const struct pci_device_id *id err_ida_alloc_0: kfree(aux_bus->aux_device_wrapper[0]); +err_ret: return retval; } From 77427e3d5c353e3dd98c7c0af322f8d9e3131ace Mon Sep 17 00:00:00 2001 From: Yongzhi Liu Date: Thu, 23 May 2024 20:14:34 +0800 Subject: [PATCH 0583/1578] misc: microchip: pci1xxxx: Fix a memory leak in the error handling of gp_aux_bus_probe() There is a memory leak (forget to free allocated buffers) in a memory allocation failure path. Fix it to jump to the correct error handling code. Fixes: 393fc2f5948f ("misc: microchip: pci1xxxx: load auxiliary bus driver for the PIO function in the multi-function endpoint of pci1xxxx device.") Signed-off-by: Yongzhi Liu Reviewed-by: Kumaravel Thiagarajan Link: https://lore.kernel.org/r/20240523121434.21855-4-hyperlyzcs@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gp.c b/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gp.c index de75d89ef53e85..34c9be437432a3 100644 --- a/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gp.c +++ b/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gp.c @@ -69,8 +69,10 @@ static int gp_aux_bus_probe(struct pci_dev *pdev, const struct pci_device_id *id aux_bus->aux_device_wrapper[1] = kzalloc(sizeof(*aux_bus->aux_device_wrapper[1]), GFP_KERNEL); - if (!aux_bus->aux_device_wrapper[1]) - return -ENOMEM; + if (!aux_bus->aux_device_wrapper[1]) { + retval = -ENOMEM; + goto err_aux_dev_add_0; + } retval = ida_alloc(&gp_client_ida, GFP_KERNEL); if (retval < 0) From 7c55b78818cfb732680c4a72ab270cc2d2ee3d0f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 14 May 2024 12:06:34 +0200 Subject: [PATCH 0584/1578] jfs: xattr: fix buffer overflow for invalid xattr When an xattr size is not what is expected, it is printed out to the kernel log in hex format as a form of debugging. But when that xattr size is bigger than the expected size, printing it out can cause an access off the end of the buffer. Fix this all up by properly restricting the size of the debug hex dump in the kernel log. Reported-by: syzbot+9dfe490c8176301c1d06@syzkaller.appspotmail.com Cc: Dave Kleikamp Link: https://lore.kernel.org/r/2024051433-slider-cloning-98f9@gregkh Signed-off-by: Greg Kroah-Hartman --- fs/jfs/xattr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 0fb7afac298e13..9987055293b359 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -557,9 +557,11 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) size_check: if (EALIST_SIZE(ea_buf->xattr) != ea_size) { + int size = min_t(int, EALIST_SIZE(ea_buf->xattr), ea_size); + printk(KERN_ERR "ea_get: invalid extended attribute\n"); print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, - ea_buf->xattr, ea_size, 1); + ea_buf->xattr, size, 1); ea_release(inode, ea_buf); rc = -EIO; goto clean_up; From 616501eccb58615f8f352a29239ea6c6fc5e6546 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 27 May 2024 09:00:06 +0100 Subject: [PATCH 0585/1578] clkdev: don't fail clkdev_alloc() if over-sized Don't fail clkdev_alloc() if the strings are over-sized. In this case, the entry will not match during lookup, so its useless. However, since code fails if we return NULL leading to boot failure, return a dummy entry with the connection and device IDs set to "bad". Leave the warning so these problems can be found, and the useless wasteful clkdev registrations removed. Reported-by: Ron Economos Reported-by: Guenter Roeck Fixes: 8d532528ff6a ("clkdev: report over-sized strings when creating clkdev entries") Closes: https://lore.kernel.org/linux-clk/7eda7621-0dde-4153-89e4-172e4c095d01@roeck-us.net. Link: https://lore.kernel.org/r/28114882-f8d7-21bf-4536-a186e8d7a22a@w6rz.net Tested-by: Ron Economos Signed-off-by: Russell King (Oracle) --- drivers/clk/clkdev.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/clk/clkdev.c b/drivers/clk/clkdev.c index 56a12f1da472d7..476fb9b7adb5ed 100644 --- a/drivers/clk/clkdev.c +++ b/drivers/clk/clkdev.c @@ -204,8 +204,15 @@ vclkdev_alloc(struct clk_hw *hw, const char *con_id, const char *dev_fmt, pr_err("%pV:%s: %s ID is greater than %zu\n", &vaf, con_id, failure, max_size); va_end(ap_copy); - kfree(cla); - return NULL; + + /* + * Don't fail in this case, but as the entry won't ever match just + * fill it with something that also won't match. + */ + strscpy(cla->con_id, "bad", sizeof(cla->con_id)); + strscpy(cla->dev_id, "bad", sizeof(cla->dev_id)); + + return &cla->cl; } static struct clk_lookup * From c0a40097f0bc81deafc15f9195d1fb54595cd6d0 Mon Sep 17 00:00:00 2001 From: Dirk Behme Date: Mon, 13 May 2024 07:06:34 +0200 Subject: [PATCH 0586/1578] drivers: core: synchronize really_probe() and dev_uevent() Synchronize the dev->driver usage in really_probe() and dev_uevent(). These can run in different threads, what can result in the following race condition for dev->driver uninitialization: Thread #1: ========== really_probe() { ... probe_failed: ... device_unbind_cleanup(dev) { ... dev->driver = NULL; // <= Failed probe sets dev->driver to NULL ... } ... } Thread #2: ========== dev_uevent() { ... if (dev->driver) // If dev->driver is NULLed from really_probe() from here on, // after above check, the system crashes add_uevent_var(env, "DRIVER=%s", dev->driver->name); ... } really_probe() holds the lock, already. So nothing needs to be done there. dev_uevent() is called with lock held, often, too. But not always. What implies that we can't add any locking in dev_uevent() itself. So fix this race by adding the lock to the non-protected path. This is the path where above race is observed: dev_uevent+0x235/0x380 uevent_show+0x10c/0x1f0 <= Add lock here dev_attr_show+0x3a/0xa0 sysfs_kf_seq_show+0x17c/0x250 kernfs_seq_show+0x7c/0x90 seq_read_iter+0x2d7/0x940 kernfs_fop_read_iter+0xc6/0x310 vfs_read+0x5bc/0x6b0 ksys_read+0xeb/0x1b0 __x64_sys_read+0x42/0x50 x64_sys_call+0x27ad/0x2d30 do_syscall_64+0xcd/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f Similar cases are reported by syzkaller in https://syzkaller.appspot.com/bug?extid=ffa8143439596313a85a But these are regarding the *initialization* of dev->driver dev->driver = drv; As this switches dev->driver to non-NULL these reports can be considered to be false-positives (which should be "fixed" by this commit, as well, though). The same issue was reported and tried to be fixed back in 2015 in https://lore.kernel.org/lkml/1421259054-2574-1-git-send-email-a.sangwan@samsung.com/ already. Fixes: 239378f16aa1 ("Driver core: add uevent vars for devices of a class") Cc: stable Cc: syzbot+ffa8143439596313a85a@syzkaller.appspotmail.com Cc: Ashish Sangwan Cc: Namjae Jeon Signed-off-by: Dirk Behme Link: https://lore.kernel.org/r/20240513050634.3964461-1-dirk.behme@de.bosch.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/base/core.c b/drivers/base/core.c index 2e776fcc31b695..2b4c0624b7043a 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -2739,8 +2739,11 @@ static ssize_t uevent_show(struct device *dev, struct device_attribute *attr, if (!env) return -ENOMEM; + /* Synchronize with really_probe() */ + device_lock(dev); /* let the kset specific function add its keys */ retval = kset->uevent_ops->uevent(&dev->kobj, env); + device_unlock(dev); if (retval) goto out; From c9d52fb313d3719d69a040f4ca78a3e2e95fba21 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 30 May 2024 18:04:24 -0700 Subject: [PATCH 0587/1578] PCI: Revert the cfg_access_lock lockdep mechanism While the experiment did reveal that there are additional places that are missing the lock during secondary bus reset, one of the places that needs to take cfg_access_lock (pci_bus_lock()) is not prepared for lockdep annotation. Specifically, pci_bus_lock() takes pci_dev_lock() recursively and is currently dependent on the fact that the device_lock() is marked lockdep_set_novalidate_class(&dev->mutex). Otherwise, without that annotation, pci_bus_lock() would need to use something like a new pci_dev_lock_nested() helper, a scheme to track a PCI device's depth in the topology, and a hope that the depth of a PCI tree never exceeds the max value for a lockdep subclass. The alternative to ripping out the lockdep coverage would be to deploy a dynamic lock key for every PCI device. Unfortunately, there is evidence that increasing the number of keys that lockdep needs to track to be per-PCI-device is prohibitively expensive for something like the cfg_access_lock. The main motivation for adding the annotation in the first place was to catch unlocked secondary bus resets, not necessarily catch lock ordering problems between cfg_access_lock and other locks. Solve that narrower problem with follow-on patches, and just due to targeted revert for now. Link: https://lore.kernel.org/r/171711746402.1628941.14575335981264103013.stgit@dwillia2-xfh.jf.intel.com Fixes: 7e89efc6e9e4 ("PCI: Lock upstream bridge for pci_reset_function()") Reported-by: Imre Deak Closes: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_134186v1/shard-dg2-1/igt@device_reset@unbind-reset-rebind.html Signed-off-by: Dan Williams Signed-off-by: Bjorn Helgaas Tested-by: Hans de Goede Tested-by: Kalle Valo Reviewed-by: Dave Jiang Cc: Jani Saarinen --- drivers/pci/access.c | 4 ---- drivers/pci/pci.c | 1 - drivers/pci/probe.c | 3 --- include/linux/lockdep.h | 5 ----- include/linux/pci.h | 2 -- 5 files changed, 15 deletions(-) diff --git a/drivers/pci/access.c b/drivers/pci/access.c index 30f031de9cfe8c..b123da16b63ba3 100644 --- a/drivers/pci/access.c +++ b/drivers/pci/access.c @@ -289,8 +289,6 @@ void pci_cfg_access_lock(struct pci_dev *dev) { might_sleep(); - lock_map_acquire(&dev->cfg_access_lock); - raw_spin_lock_irq(&pci_lock); if (dev->block_cfg_access) pci_wait_cfg(dev); @@ -345,8 +343,6 @@ void pci_cfg_access_unlock(struct pci_dev *dev) raw_spin_unlock_irqrestore(&pci_lock, flags); wake_up_all(&pci_cfg_wait); - - lock_map_release(&dev->cfg_access_lock); } EXPORT_SYMBOL_GPL(pci_cfg_access_unlock); diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 59e0949fb079d5..35fb1f17a589c1 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -4883,7 +4883,6 @@ void __weak pcibios_reset_secondary_bus(struct pci_dev *dev) */ int pci_bridge_secondary_bus_reset(struct pci_dev *dev) { - lock_map_assert_held(&dev->cfg_access_lock); pcibios_reset_secondary_bus(dev); return pci_bridge_wait_for_secondary_bus(dev, "bus reset"); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 8e696e547565c3..5fbabb4e3425fb 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2546,9 +2546,6 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus) dev->dev.dma_mask = &dev->dma_mask; dev->dev.dma_parms = &dev->dma_parms; dev->dev.coherent_dma_mask = 0xffffffffull; - lockdep_register_key(&dev->cfg_access_key); - lockdep_init_map(&dev->cfg_access_lock, dev_name(&dev->dev), - &dev->cfg_access_key, 0); dma_set_max_seg_size(&dev->dev, 65536); dma_set_seg_boundary(&dev->dev, 0xffffffff); diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 5e51b0de4c4b57..08b0d1d9d78b76 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -297,9 +297,6 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); .wait_type_inner = _wait_type, \ .lock_type = LD_LOCK_WAIT_OVERRIDE, } -#define lock_map_assert_held(l) \ - lockdep_assert(lock_is_held(l) != LOCK_STATE_NOT_HELD) - #else /* !CONFIG_LOCKDEP */ static inline void lockdep_init_task(struct task_struct *task) @@ -391,8 +388,6 @@ extern int lockdep_is_held(const void *); #define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type) \ struct lockdep_map __maybe_unused _name = {} -#define lock_map_assert_held(l) do { (void)(l); } while (0) - #endif /* !LOCKDEP */ #ifdef CONFIG_PROVE_LOCKING diff --git a/include/linux/pci.h b/include/linux/pci.h index fb004fd4e88905..cafc5ab1cbcb42 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -413,8 +413,6 @@ struct pci_dev { struct resource driver_exclusive_resource; /* driver exclusive resource ranges */ bool match_driver; /* Skip attaching driver */ - struct lock_class_key cfg_access_key; - struct lockdep_map cfg_access_lock; unsigned int transparent:1; /* Subtractive decode bridge */ unsigned int io_window:1; /* Bridge has I/O window */ From d9fef76e89498bf99cdb03f77b7091d7e95d7edd Mon Sep 17 00:00:00 2001 From: Julien Panis Date: Thu, 16 May 2024 12:44:37 +0200 Subject: [PATCH 0588/1578] thermal/drivers/mediatek/lvts_thermal: Remove filtered mode for mt8188 Filtered mode is not supported on mt8188 SoC and is the source of bad results. Move to immediate mode which provides good temperatures. Fixes: f4745f546e60 ("thermal/drivers/mediatek/lvts_thermal: Add MT8188 support") Reviewed-by: Nicolas Pitre Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Julien Panis Link: https://lore.kernel.org/r/20240516-mtk-thermal-mt8188-mode-fix-v2-1-40a317442c62@baylibre.com Signed-off-by: Daniel Lezcano --- drivers/thermal/mediatek/lvts_thermal.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/thermal/mediatek/lvts_thermal.c b/drivers/thermal/mediatek/lvts_thermal.c index 0bb3a495b56ed7..82c355c466cfec 100644 --- a/drivers/thermal/mediatek/lvts_thermal.c +++ b/drivers/thermal/mediatek/lvts_thermal.c @@ -1458,7 +1458,6 @@ static const struct lvts_ctrl_data mt8188_lvts_mcu_data_ctrl[] = { }, VALID_SENSOR_MAP(1, 1, 1, 1), .offset = 0x0, - .mode = LVTS_MSR_FILTERED_MODE, }, { .lvts_sensor = { @@ -1469,7 +1468,6 @@ static const struct lvts_ctrl_data mt8188_lvts_mcu_data_ctrl[] = { }, VALID_SENSOR_MAP(1, 1, 0, 0), .offset = 0x100, - .mode = LVTS_MSR_FILTERED_MODE, } }; @@ -1483,7 +1481,6 @@ static const struct lvts_ctrl_data mt8188_lvts_ap_data_ctrl[] = { }, VALID_SENSOR_MAP(0, 1, 0, 0), .offset = 0x0, - .mode = LVTS_MSR_FILTERED_MODE, }, { .lvts_sensor = { @@ -1496,7 +1493,6 @@ static const struct lvts_ctrl_data mt8188_lvts_ap_data_ctrl[] = { }, VALID_SENSOR_MAP(1, 1, 1, 0), .offset = 0x100, - .mode = LVTS_MSR_FILTERED_MODE, }, { .lvts_sensor = { @@ -1507,7 +1503,6 @@ static const struct lvts_ctrl_data mt8188_lvts_ap_data_ctrl[] = { }, VALID_SENSOR_MAP(1, 1, 0, 0), .offset = 0x200, - .mode = LVTS_MSR_FILTERED_MODE, }, { .lvts_sensor = { @@ -1518,7 +1513,6 @@ static const struct lvts_ctrl_data mt8188_lvts_ap_data_ctrl[] = { }, VALID_SENSOR_MAP(1, 1, 0, 0), .offset = 0x300, - .mode = LVTS_MSR_FILTERED_MODE, } }; From afe377286ad49e0b69071d2a767e2c6553f4094b Mon Sep 17 00:00:00 2001 From: Maciej Strozek Date: Tue, 4 Jun 2024 14:28:43 +0100 Subject: [PATCH 0589/1578] ASoC: cs42l43: Increase default type detect time and button delay Some problematic headsets have been discovered, to help with correctly identifying these, the detect time must be increased. Also improve the reliability of the impedance value from the button detect by slightly increasing the button detect delay. Fixes: 686b8f711b99 ("ASoC: cs42l43: Lower default type detect time") Signed-off-by: Maciej Strozek Signed-off-by: Charles Keepax Link: https://msgid.link/r/20240604132843.3309114-1-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs42l43-jack.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/cs42l43-jack.c b/sound/soc/codecs/cs42l43-jack.c index 901b9dbcf58543..d9ab003e166bfa 100644 --- a/sound/soc/codecs/cs42l43-jack.c +++ b/sound/soc/codecs/cs42l43-jack.c @@ -121,7 +121,7 @@ int cs42l43_set_jack(struct snd_soc_component *component, priv->buttons[3] = 735; } - ret = cs42l43_find_index(priv, "cirrus,detect-us", 1000, &priv->detect_us, + ret = cs42l43_find_index(priv, "cirrus,detect-us", 50000, &priv->detect_us, cs42l43_accdet_us, ARRAY_SIZE(cs42l43_accdet_us)); if (ret < 0) goto error; @@ -433,7 +433,7 @@ irqreturn_t cs42l43_button_press(int irq, void *data) // Wait for 2 full cycles of comb filter to ensure good reading queue_delayed_work(system_wq, &priv->button_press_work, - msecs_to_jiffies(10)); + msecs_to_jiffies(20)); return IRQ_HANDLED; } From b7c40988808f8d7426dee1e4d96a4e204de4a8bc Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 4 Jun 2024 10:19:46 +0800 Subject: [PATCH 0590/1578] ASoC: codecs: ES8326: Solve headphone detection issue When switching between OMTP and CTIA headset, we can hear pop noise. To solve this issue, We modified the configuration for headphone detection Signed-off-by: Zhang Yi Link: https://msgid.link/r/20240604021946.2911-1-zhangyi@everest-semi.com Signed-off-by: Mark Brown --- sound/soc/codecs/es8326.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/es8326.c b/sound/soc/codecs/es8326.c index 03b539ba540f66..6a4e42e5e35b99 100644 --- a/sound/soc/codecs/es8326.c +++ b/sound/soc/codecs/es8326.c @@ -857,12 +857,16 @@ static void es8326_jack_detect_handler(struct work_struct *work) * set auto-check mode, then restart jack_detect_work after 400ms. * Don't report jack status. */ - regmap_write(es8326->regmap, ES8326_INT_SOURCE, - (ES8326_INT_SRC_PIN9 | ES8326_INT_SRC_BUTTON)); + regmap_write(es8326->regmap, ES8326_INT_SOURCE, 0x00); regmap_update_bits(es8326->regmap, ES8326_HPDET_TYPE, 0x03, 0x01); + regmap_update_bits(es8326->regmap, ES8326_HPDET_TYPE, 0x10, 0x00); es8326_enable_micbias(es8326->component); usleep_range(50000, 70000); regmap_update_bits(es8326->regmap, ES8326_HPDET_TYPE, 0x03, 0x00); + regmap_update_bits(es8326->regmap, ES8326_HPDET_TYPE, 0x10, 0x10); + usleep_range(50000, 70000); + regmap_write(es8326->regmap, ES8326_INT_SOURCE, + (ES8326_INT_SRC_PIN9 | ES8326_INT_SRC_BUTTON)); regmap_write(es8326->regmap, ES8326_SYS_BIAS, 0x1f); regmap_update_bits(es8326->regmap, ES8326_HP_DRIVER_REF, 0x0f, 0x08); queue_delayed_work(system_wq, &es8326->jack_detect_work, From 4eecb644b8b82f5279a348f6ebe77e3d6e5b1b05 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 4 Jun 2024 14:17:04 +0100 Subject: [PATCH 0591/1578] spi: cs42l43: Correct SPI root clock speed The root clock is actually 49.152MHz not 40MHz, as it is derived from the primary audio clock, update the driver to match. This error can cause the actual clock rate to be higher than the requested clock rate on the SPI bus. Fixes: ef75e767167a ("spi: cs42l43: Add SPI controller support") Signed-off-by: Charles Keepax Link: https://msgid.link/r/20240604131704.3227500-1-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- drivers/spi/spi-cs42l43.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-cs42l43.c b/drivers/spi/spi-cs42l43.c index 9d747ea6992640..902a0734cc361e 100644 --- a/drivers/spi/spi-cs42l43.c +++ b/drivers/spi/spi-cs42l43.c @@ -26,7 +26,7 @@ #include #define CS42L43_FIFO_SIZE 16 -#define CS42L43_SPI_ROOT_HZ (40 * HZ_PER_MHZ) +#define CS42L43_SPI_ROOT_HZ 49152000 #define CS42L43_SPI_MAX_LENGTH 65532 enum cs42l43_spi_cmd { From dc6abbbde4b099e936cd5428e196d86a5e119aae Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 3 Jun 2024 15:25:23 -0300 Subject: [PATCH 0592/1578] tools headers arm64: Sync arm64's cputype.h with the kernel sources To get the changes in: 0ce85db6c2141b7f ("arm64: cputype: Add Neoverse-V3 definitions") 02a0a04676fa7796 ("arm64: cputype: Add Cortex-X4 definitions") f4d9d9dcc70b96b5 ("arm64: Add Neoverse-V2 part") That makes this perf source code to be rebuilt: CC /tmp/build/perf-tools/util/arm-spe.o The changes in the above patch add MIDR_NEOVERSE_V[23] and MIDR_NEOVERSE_V1 is used in arm-spe.c, so probably we need to add those and perhaps MIDR_CORTEX_X4 to that array? Or maybe we need to leave this for later when this is all tested on those machines? static const struct midr_range neoverse_spe[] = { MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), {}, }; Mark Rutland recommended about arm-spe.c: "I would not touch this for now -- someone would have to go audit the TRMs to check that those other cores have the same encoding, and I think it'd be better to do that as a follow-up." That addresses this perf build warning: Warning: Kernel ABI header differences: diff -u tools/arch/arm64/include/asm/cputype.h arch/arm64/include/asm/cputype.h Acked-by: Mark Rutland Cc: Adrian Hunter Cc: Besar Wicaksono Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Will Deacon Link: https://lore.kernel.org/lkml/Zl8cYk0Tai2fs7aM@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/arm64/include/asm/cputype.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/arch/arm64/include/asm/cputype.h b/tools/arch/arm64/include/asm/cputype.h index 52f076afeb9600..7b32b99023a21d 100644 --- a/tools/arch/arm64/include/asm/cputype.h +++ b/tools/arch/arm64/include/asm/cputype.h @@ -86,6 +86,9 @@ #define ARM_CPU_PART_CORTEX_X2 0xD48 #define ARM_CPU_PART_NEOVERSE_N2 0xD49 #define ARM_CPU_PART_CORTEX_A78C 0xD4B +#define ARM_CPU_PART_NEOVERSE_V2 0xD4F +#define ARM_CPU_PART_CORTEX_X4 0xD82 +#define ARM_CPU_PART_NEOVERSE_V3 0xD84 #define APM_CPU_PART_XGENE 0x000 #define APM_CPU_VAR_POTENZA 0x00 @@ -159,6 +162,9 @@ #define MIDR_CORTEX_X2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X2) #define MIDR_NEOVERSE_N2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N2) #define MIDR_CORTEX_A78C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78C) +#define MIDR_NEOVERSE_V2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V2) +#define MIDR_CORTEX_X4 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X4) +#define MIDR_NEOVERSE_V3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V3) #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) #define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX) From 0ea00e249ca992adee54dc71a526ee70ef109e40 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 29 May 2024 15:23:25 +0300 Subject: [PATCH 0593/1578] tpm_tis: Do *not* flush uninitialized work tpm_tis_core_init() may fail before tpm_tis_probe_irq_single() is called, in which case tpm_tis_remove() unconditionally calling flush_work() is triggering a warning for .func still being NULL. Cc: stable@vger.kernel.org # v6.5+ Fixes: 481c2d14627d ("tpm,tpm_tis: Disable interrupts after 1000 unhandled IRQs") Signed-off-by: Jan Beulich Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_tis_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c index 176cd8dbf1db2c..fdef214b9f6bff 100644 --- a/drivers/char/tpm/tpm_tis_core.c +++ b/drivers/char/tpm/tpm_tis_core.c @@ -1020,7 +1020,8 @@ void tpm_tis_remove(struct tpm_chip *chip) interrupt = 0; tpm_tis_write32(priv, reg, ~TPM_GLOBAL_INT_ENABLE & interrupt); - flush_work(&priv->free_irq_work); + if (priv->free_irq_work.func) + flush_work(&priv->free_irq_work); tpm_tis_clkrun_enable(chip, false); From f071d02ecad4cfbf3ab41807c90bd1fef1cbfd3f Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 5 Jun 2024 01:28:42 +0300 Subject: [PATCH 0594/1578] tpm: Switch to new Intel CPU model defines New CPU #defines encode vendor and family as well as model. Link: https://lore.kernel.org/all/20240520224620.9480-4-tony.luck@intel.com/ Signed-off-by: Tony Luck Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm.h | 2 +- drivers/char/tpm/tpm_tis_core.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h index 6b8b9956ba6944..7bb87fa5f7a126 100644 --- a/drivers/char/tpm/tpm.h +++ b/drivers/char/tpm/tpm.h @@ -28,7 +28,7 @@ #include #ifdef CONFIG_X86 -#include +#include #endif #define TPM_MINOR 224 /* officially assigned */ diff --git a/drivers/char/tpm/tpm_tis_core.h b/drivers/char/tpm/tpm_tis_core.h index 13e99cf65efe44..690ad8e9b73190 100644 --- a/drivers/char/tpm/tpm_tis_core.h +++ b/drivers/char/tpm/tpm_tis_core.h @@ -210,7 +210,7 @@ static inline int tpm_tis_verify_crc(struct tpm_tis_data *data, size_t len, static inline bool is_bsw(void) { #ifdef CONFIG_X86 - return ((boot_cpu_data.x86_model == INTEL_FAM6_ATOM_AIRMONT) ? 1 : 0); + return (boot_cpu_data.x86_vfm == INTEL_ATOM_AIRMONT) ? 1 : 0; #else return false; #endif From d0d1df8ba18abc57f28fb3bc053b2bf319367f2c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 4 Jun 2024 17:00:24 +0200 Subject: [PATCH 0595/1578] bpf: Set run context for rawtp test_run callback syzbot reported crash when rawtp program executed through the test_run interface calls bpf_get_attach_cookie helper or any other helper that touches task->bpf_ctx pointer. Setting the run context (task->bpf_ctx pointer) for test_run callback. Fixes: 7adfc6c9b315 ("bpf: Add bpf_get_attach_cookie() BPF helper to access bpf_cookie value") Reported-by: syzbot+3ab78ff125b7979e45f9@syzkaller.appspotmail.com Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Closes: https://syzkaller.appspot.com/bug?extid=3ab78ff125b7979e45f9 Link: https://lore.kernel.org/bpf/20240604150024.359247-1-jolsa@kernel.org --- net/bpf/test_run.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index f6aad4ed2ab2f6..36ae54f57bf574 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -727,10 +727,16 @@ static void __bpf_prog_test_run_raw_tp(void *data) { struct bpf_raw_tp_test_run_info *info = data; + struct bpf_trace_run_ctx run_ctx = {}; + struct bpf_run_ctx *old_run_ctx; + + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); rcu_read_lock(); info->retval = bpf_prog_run(info->prog, info->ctx); rcu_read_unlock(); + + bpf_reset_run_ctx(old_run_ctx); } int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, From 7fcf26b315bbb728036da0862de6b335da83dff2 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Tue, 4 Jun 2024 14:29:25 +0200 Subject: [PATCH 0596/1578] Revert "xsk: Support redirect to any socket bound to the same umem" This reverts commit 2863d665ea41282379f108e4da6c8a2366ba66db. This patch introduced a potential kernel crash when multiple napi instances redirect to the same AF_XDP socket. By removing the queue_index check, it is possible for multiple napi instances to access the Rx ring at the same time, which will result in a corrupted ring state which can lead to a crash when flushing the rings in __xsk_flush(). This can happen when the linked list of sockets to flush gets corrupted by concurrent accesses. A quick and small fix is not possible, so let us revert this for now. Reported-by: Yuval El-Hanany Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/xdp-newbies/8100DBDC-0B7C-49DB-9995-6027F6E63147@radware.com Link: https://lore.kernel.org/bpf/20240604122927.29080-2-magnus.karlsson@gmail.com --- net/xdp/xsk.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 727aa20be4bde8..7d1c0986f9bb35 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -313,13 +313,10 @@ static bool xsk_is_bound(struct xdp_sock *xs) static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - struct net_device *dev = xdp->rxq->dev; - u32 qid = xdp->rxq->queue_index; - if (!xsk_is_bound(xs)) return -ENXIO; - if (!dev->_rx[qid].pool || xs->umem != dev->_rx[qid].pool->umem) + if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { From 03e38d315f3c5258270ad50f2ae784b6372e87c3 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Tue, 4 Jun 2024 14:29:26 +0200 Subject: [PATCH 0597/1578] Revert "xsk: Document ability to redirect to any socket bound to the same umem" This reverts commit 968595a93669b6b4f6d1fcf80cf2d97956b6868f. Reported-by: Yuval El-Hanany Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/xdp-newbies/8100DBDC-0B7C-49DB-9995-6027F6E63147@radware.com Link: https://lore.kernel.org/bpf/20240604122927.29080-3-magnus.karlsson@gmail.com --- Documentation/networking/af_xdp.rst | 33 ++++++++++++----------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst index 72da7057e4cf96..dceeb0d763aa23 100644 --- a/Documentation/networking/af_xdp.rst +++ b/Documentation/networking/af_xdp.rst @@ -329,24 +329,23 @@ XDP_SHARED_UMEM option and provide the initial socket's fd in the sxdp_shared_umem_fd field as you registered the UMEM on that socket. These two sockets will now share one and the same UMEM. -In this case, it is possible to use the NIC's packet steering -capabilities to steer the packets to the right queue. This is not -possible in the previous example as there is only one queue shared -among sockets, so the NIC cannot do this steering as it can only steer -between queues. - -In libxdp (or libbpf prior to version 1.0), you need to use the -xsk_socket__create_shared() API as it takes a reference to a FILL ring -and a COMPLETION ring that will be created for you and bound to the -shared UMEM. You can use this function for all the sockets you create, -or you can use it for the second and following ones and use -xsk_socket__create() for the first one. Both methods yield the same -result. +There is no need to supply an XDP program like the one in the previous +case where sockets were bound to the same queue id and +device. Instead, use the NIC's packet steering capabilities to steer +the packets to the right queue. In the previous example, there is only +one queue shared among sockets, so the NIC cannot do this steering. It +can only steer between queues. + +In libbpf, you need to use the xsk_socket__create_shared() API as it +takes a reference to a FILL ring and a COMPLETION ring that will be +created for you and bound to the shared UMEM. You can use this +function for all the sockets you create, or you can use it for the +second and following ones and use xsk_socket__create() for the first +one. Both methods yield the same result. Note that a UMEM can be shared between sockets on the same queue id and device, as well as between queues on the same device and between -devices at the same time. It is also possible to redirect to any -socket as long as it is bound to the same umem with XDP_SHARED_UMEM. +devices at the same time. XDP_USE_NEED_WAKEUP bind flag ----------------------------- @@ -823,10 +822,6 @@ A: The short answer is no, that is not supported at the moment. The switch, or other distribution mechanism, in your NIC to direct traffic to the correct queue id and socket. - Note that if you are using the XDP_SHARED_UMEM option, it is - possible to switch traffic between any socket bound to the same - umem. - Q: My packets are sometimes corrupted. What is wrong? A: Care has to be taken not to feed the same buffer in the UMEM into From 99280413a5b785f22d91e8a8a66dc38f4a214495 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 4 Jun 2024 17:45:20 +0200 Subject: [PATCH 0598/1578] efi: Add missing __nocfi annotations to runtime wrappers The EFI runtime wrappers are a sandbox for calling into EFI runtime services, which are invoked using indirect calls. When running with kCFI enabled, the compiler will require the target of any indirect call to be type annotated. Given that the EFI runtime services prototypes and calling convention are governed by the EFI spec, not the Linux kernel, adding such type annotations for firmware routines is infeasible, and so the compiler must be informed that prototype validation should be omitted. Add the __nocfi annotation at the appropriate places in the EFI runtime wrapper code to achieve this. Note that this currently only affects 32-bit ARM, given that other architectures that support both kCFI and EFI use an asm wrapper to call EFI runtime services, and this hides the indirect call from the compiler. Fixes: 1a4fec49efe5 ("ARM: 9392/2: Support CLANG CFI") Reviewed-by: Linus Walleij Tested-by: Nathan Chancellor Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/runtime-wrappers.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/firmware/efi/runtime-wrappers.c b/drivers/firmware/efi/runtime-wrappers.c index 5d56bc40a79d72..708b777857d341 100644 --- a/drivers/firmware/efi/runtime-wrappers.c +++ b/drivers/firmware/efi/runtime-wrappers.c @@ -213,7 +213,7 @@ extern struct semaphore __efi_uv_runtime_lock __alias(efi_runtime_lock); * Calls the appropriate efi_runtime_service() with the appropriate * arguments. */ -static void efi_call_rts(struct work_struct *work) +static void __nocfi efi_call_rts(struct work_struct *work) { const union efi_rts_args *args = efi_rts_work.args; efi_status_t status = EFI_NOT_FOUND; @@ -435,7 +435,7 @@ static efi_status_t virt_efi_set_variable(efi_char16_t *name, return status; } -static efi_status_t +static efi_status_t __nocfi virt_efi_set_variable_nb(efi_char16_t *name, efi_guid_t *vendor, u32 attr, unsigned long data_size, void *data) { @@ -469,7 +469,7 @@ static efi_status_t virt_efi_query_variable_info(u32 attr, return status; } -static efi_status_t +static efi_status_t __nocfi virt_efi_query_variable_info_nb(u32 attr, u64 *storage_space, u64 *remaining_space, u64 *max_variable_size) { @@ -499,10 +499,9 @@ static efi_status_t virt_efi_get_next_high_mono_count(u32 *count) return status; } -static void virt_efi_reset_system(int reset_type, - efi_status_t status, - unsigned long data_size, - efi_char16_t *data) +static void __nocfi +virt_efi_reset_system(int reset_type, efi_status_t status, + unsigned long data_size, efi_char16_t *data) { if (down_trylock(&efi_runtime_lock)) { pr_warn("failed to invoke the reset_system() runtime service:\n" From 8b0f7410942cdc420c4557eda02bfcdf60ccec17 Mon Sep 17 00:00:00 2001 From: Subbaraya Sundeep Date: Wed, 29 May 2024 20:59:44 +0530 Subject: [PATCH 0599/1578] octeontx2-af: Always allocate PF entries from low prioriy zone PF mcam entries has to be at low priority always so that VF can install longest prefix match rules at higher priority. This was taken care currently but when priority allocation wrt reference entry is requested then entries are allocated from mid-zone instead of low priority zone. Fix this and always allocate entries from low priority zone for PFs. Fixes: 7df5b4b260dd ("octeontx2-af: Allocate low priority entries for PF") Signed-off-by: Subbaraya Sundeep Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- .../ethernet/marvell/octeontx2/af/rvu_npc.c | 33 ++++++++++++------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c index e8b73b9d75e311..97722ce8c4cb34 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c @@ -2519,7 +2519,17 @@ static int npc_mcam_alloc_entries(struct npc_mcam *mcam, u16 pcifunc, * - when available free entries are less. * Lower priority ones out of avaialble free entries are always * chosen when 'high vs low' question arises. + * + * For a VF base MCAM match rule is set by its PF. And all the + * further MCAM rules installed by VF on its own are + * concatenated with the base rule set by its PF. Hence PF entries + * should be at lower priority compared to VF entries. Otherwise + * base rule is hit always and rules installed by VF will be of + * no use. Hence if the request is from PF then allocate low + * priority entries. */ + if (!(pcifunc & RVU_PFVF_FUNC_MASK)) + goto lprio_alloc; /* Get the search range for priority allocation request */ if (req->priority) { @@ -2528,17 +2538,6 @@ static int npc_mcam_alloc_entries(struct npc_mcam *mcam, u16 pcifunc, goto alloc; } - /* For a VF base MCAM match rule is set by its PF. And all the - * further MCAM rules installed by VF on its own are - * concatenated with the base rule set by its PF. Hence PF entries - * should be at lower priority compared to VF entries. Otherwise - * base rule is hit always and rules installed by VF will be of - * no use. Hence if the request is from PF and NOT a priority - * allocation request then allocate low priority entries. - */ - if (!(pcifunc & RVU_PFVF_FUNC_MASK)) - goto lprio_alloc; - /* Find out the search range for non-priority allocation request * * Get MCAM free entry count in middle zone. @@ -2568,6 +2567,18 @@ static int npc_mcam_alloc_entries(struct npc_mcam *mcam, u16 pcifunc, reverse = true; start = 0; end = mcam->bmap_entries; + /* Ensure PF requests are always at bottom and if PF requests + * for higher/lower priority entry wrt reference entry then + * honour that criteria and start search for entries from bottom + * and not in mid zone. + */ + if (!(pcifunc & RVU_PFVF_FUNC_MASK) && + req->priority == NPC_MCAM_HIGHER_PRIO) + end = req->ref_entry; + + if (!(pcifunc & RVU_PFVF_FUNC_MASK) && + req->priority == NPC_MCAM_LOWER_PRIO) + start = req->ref_entry; } alloc: From fb0aa0781a5f457e3864da68af52c3b1f4f7fd8f Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 31 May 2024 16:54:17 +0800 Subject: [PATCH 0600/1578] net/smc: avoid overwriting when adjusting sock bufsizes When copying smc settings to clcsock, avoid setting clcsock's sk_sndbuf to sysctl_tcp_wmem[1], since this may overwrite the value set by tcp_sndbuf_expand() in TCP connection establishment. And the other setting sk_{snd|rcv}buf to sysctl value in smc_adjust_sock_bufsizes() can also be omitted since the initialization of smc sock and clcsock has set sk_{snd|rcv}buf to smc.sysctl_{w|r}mem or ipv4_sysctl_tcp_{w|r}mem[1]. Fixes: 30c3c4a4497c ("net/smc: Use correct buffer sizes when switching between TCP and SMC") Link: https://lore.kernel.org/r/5eaf3858-e7fd-4db8-83e8-3d7a3e0e9ae2@linux.alibaba.com Signed-off-by: Wen Gu Reviewed-by: Wenjia Zhang Reviewed-by: Gerd Bayer , too. Signed-off-by: David S. Miller --- net/smc/af_smc.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e50a286fd0fb77..c5f98c6b25613f 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -459,29 +459,11 @@ static int smc_bind(struct socket *sock, struct sockaddr *uaddr, static void smc_adjust_sock_bufsizes(struct sock *nsk, struct sock *osk, unsigned long mask) { - struct net *nnet = sock_net(nsk); - nsk->sk_userlocks = osk->sk_userlocks; - if (osk->sk_userlocks & SOCK_SNDBUF_LOCK) { + if (osk->sk_userlocks & SOCK_SNDBUF_LOCK) nsk->sk_sndbuf = osk->sk_sndbuf; - } else { - if (mask == SK_FLAGS_SMC_TO_CLC) - WRITE_ONCE(nsk->sk_sndbuf, - READ_ONCE(nnet->ipv4.sysctl_tcp_wmem[1])); - else - WRITE_ONCE(nsk->sk_sndbuf, - 2 * READ_ONCE(nnet->smc.sysctl_wmem)); - } - if (osk->sk_userlocks & SOCK_RCVBUF_LOCK) { + if (osk->sk_userlocks & SOCK_RCVBUF_LOCK) nsk->sk_rcvbuf = osk->sk_rcvbuf; - } else { - if (mask == SK_FLAGS_SMC_TO_CLC) - WRITE_ONCE(nsk->sk_rcvbuf, - READ_ONCE(nnet->ipv4.sysctl_tcp_rmem[1])); - else - WRITE_ONCE(nsk->sk_rcvbuf, - 2 * READ_ONCE(nnet->smc.sysctl_rmem)); - } } static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, From 0a8d3f2e3e8d8aea8af017e14227b91d5989b696 Mon Sep 17 00:00:00 2001 From: Tristram Ha Date: Thu, 30 May 2024 18:38:01 -0700 Subject: [PATCH 0601/1578] net: phy: Micrel KSZ8061: fix errata solution not taking effect problem KSZ8061 needs to write to a MMD register at driver initialization to fix an errata. This worked in 5.0 kernel but not in newer kernels. The issue is the main phylib code no longer resets PHY at the very beginning. Calling phy resuming code later will reset the chip if it is already powered down at the beginning. This wipes out the MMD register write. Solution is to implement a phy resume function for KSZ8061 to take care of this problem. Fixes: 232ba3a51cc2 ("net: phy: Micrel KSZ8061: link failure after cable connect") Signed-off-by: Tristram Ha Signed-off-by: David S. Miller --- drivers/net/phy/micrel.c | 42 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 8c20cf93753066..5aada7cf3da726 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -866,6 +866,17 @@ static int ksz8061_config_init(struct phy_device *phydev) { int ret; + /* Chip can be powered down by the bootstrap code. */ + ret = phy_read(phydev, MII_BMCR); + if (ret < 0) + return ret; + if (ret & BMCR_PDOWN) { + ret = phy_write(phydev, MII_BMCR, ret & ~BMCR_PDOWN); + if (ret < 0) + return ret; + usleep_range(1000, 2000); + } + ret = phy_write_mmd(phydev, MDIO_MMD_PMAPMD, MDIO_DEVID1, 0xB61A); if (ret) return ret; @@ -2135,6 +2146,35 @@ static int ksz9477_resume(struct phy_device *phydev) return 0; } +static int ksz8061_resume(struct phy_device *phydev) +{ + int ret; + + /* This function can be called twice when the Ethernet device is on. */ + ret = phy_read(phydev, MII_BMCR); + if (ret < 0) + return ret; + if (!(ret & BMCR_PDOWN)) + return 0; + + genphy_resume(phydev); + usleep_range(1000, 2000); + + /* Re-program the value after chip is reset. */ + ret = phy_write_mmd(phydev, MDIO_MMD_PMAPMD, MDIO_DEVID1, 0xB61A); + if (ret) + return ret; + + /* Enable PHY Interrupts */ + if (phy_interrupt_is_valid(phydev)) { + phydev->interrupts = PHY_INTERRUPT_ENABLED; + if (phydev->drv->config_intr) + phydev->drv->config_intr(phydev); + } + + return 0; +} + static int kszphy_probe(struct phy_device *phydev) { const struct kszphy_type *type = phydev->drv->driver_data; @@ -5389,7 +5429,7 @@ static struct phy_driver ksphy_driver[] = { .config_intr = kszphy_config_intr, .handle_interrupt = kszphy_handle_interrupt, .suspend = kszphy_suspend, - .resume = kszphy_resume, + .resume = ksz8061_resume, }, { .phy_id = PHY_ID_KSZ9021, .phy_id_mask = 0x000ffffe, From 491aee894a08bc9b8bb52e7363b9d4bc6403f363 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Mon, 3 Jun 2024 04:57:55 +0000 Subject: [PATCH 0602/1578] ionic: fix kernel panic in XDP_TX action In the XDP_TX path, ionic driver sends a packet to the TX path with rx page and corresponding dma address. After tx is done, ionic_tx_clean() frees that page. But RX ring buffer isn't reset to NULL. So, it uses a freed page, which causes kernel panic. BUG: unable to handle page fault for address: ffff8881576c110c PGD 773801067 P4D 773801067 PUD 87f086067 PMD 87efca067 PTE 800ffffea893e060 Oops: Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC KASAN NOPTI CPU: 1 PID: 25 Comm: ksoftirqd/1 Not tainted 6.9.0+ #11 Hardware name: ASUS System Product Name/PRIME Z690-P D4, BIOS 0603 11/01/2021 RIP: 0010:bpf_prog_f0b8caeac1068a55_balancer_ingress+0x3b/0x44f Code: 00 53 41 55 41 56 41 57 b8 01 00 00 00 48 8b 5f 08 4c 8b 77 00 4c 89 f7 48 83 c7 0e 48 39 d8 RSP: 0018:ffff888104e6fa28 EFLAGS: 00010283 RAX: 0000000000000002 RBX: ffff8881576c1140 RCX: 0000000000000002 RDX: ffffffffc0051f64 RSI: ffffc90002d33048 RDI: ffff8881576c110e RBP: ffff888104e6fa88 R08: 0000000000000000 R09: ffffed1027a04a23 R10: 0000000000000000 R11: 0000000000000000 R12: ffff8881b03a21a8 R13: ffff8881589f800f R14: ffff8881576c1100 R15: 00000001576c1100 FS: 0000000000000000(0000) GS:ffff88881ae00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff8881576c110c CR3: 0000000767a90000 CR4: 00000000007506f0 PKRU: 55555554 Call Trace: ? __die+0x20/0x70 ? page_fault_oops+0x254/0x790 ? __pfx_page_fault_oops+0x10/0x10 ? __pfx_is_prefetch.constprop.0+0x10/0x10 ? search_bpf_extables+0x165/0x260 ? fixup_exception+0x4a/0x970 ? exc_page_fault+0xcb/0xe0 ? asm_exc_page_fault+0x22/0x30 ? 0xffffffffc0051f64 ? bpf_prog_f0b8caeac1068a55_balancer_ingress+0x3b/0x44f ? do_raw_spin_unlock+0x54/0x220 ionic_rx_service+0x11ab/0x3010 [ionic 9180c3001ab627d82bbc5f3ebe8a0decaf6bb864] ? ionic_tx_clean+0x29b/0xc60 [ionic 9180c3001ab627d82bbc5f3ebe8a0decaf6bb864] ? __pfx_ionic_tx_clean+0x10/0x10 [ionic 9180c3001ab627d82bbc5f3ebe8a0decaf6bb864] ? __pfx_ionic_rx_service+0x10/0x10 [ionic 9180c3001ab627d82bbc5f3ebe8a0decaf6bb864] ? ionic_tx_cq_service+0x25d/0xa00 [ionic 9180c3001ab627d82bbc5f3ebe8a0decaf6bb864] ? __pfx_ionic_rx_service+0x10/0x10 [ionic 9180c3001ab627d82bbc5f3ebe8a0decaf6bb864] ionic_cq_service+0x69/0x150 [ionic 9180c3001ab627d82bbc5f3ebe8a0decaf6bb864] ionic_txrx_napi+0x11a/0x540 [ionic 9180c3001ab627d82bbc5f3ebe8a0decaf6bb864] __napi_poll.constprop.0+0xa0/0x440 net_rx_action+0x7e7/0xc30 ? __pfx_net_rx_action+0x10/0x10 Fixes: 8eeed8373e1c ("ionic: Add XDP_TX support") Signed-off-by: Taehee Yoo Reviewed-by: Shannon Nelson Reviewed-by: Brett Creeley Signed-off-by: David S. Miller --- drivers/net/ethernet/pensando/ionic/ionic_txrx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c index 5dba6d2d633cb6..2427610f4306d9 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c @@ -586,6 +586,7 @@ static bool ionic_run_xdp(struct ionic_rx_stats *stats, netdev_dbg(netdev, "tx ionic_xdp_post_frame err %d\n", err); goto out_xdp_abort; } + buf_info->page = NULL; stats->xdp_tx++; /* the Tx completion will free the buffers */ From affc18fdc694190ca7575b9a86632a73b9fe043d Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Mon, 3 Jun 2024 15:13:03 +0800 Subject: [PATCH 0603/1578] net: sched: sch_multiq: fix possible OOB write in multiq_tune() q->bands will be assigned to qopt->bands to execute subsequent code logic after kmalloc. So the old q->bands should not be used in kmalloc. Otherwise, an out-of-bounds write will occur. Fixes: c2999f7fb05b ("net: sched: multiq: don't call qdisc_put() while holding tree lock") Signed-off-by: Hangyu Hua Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/sch_multiq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 79e93a19d5fabe..06e03f5cd7ce18 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -185,7 +185,7 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, qopt->bands = qdisc_dev(sch)->real_num_tx_queues; - removed = kmalloc(sizeof(*removed) * (q->max_bands - q->bands), + removed = kmalloc(sizeof(*removed) * (q->max_bands - qopt->bands), GFP_KERNEL); if (!removed) return -ENOMEM; From 1cd4bc987abb2823836cbb8f887026011ccddc8a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 3 Jun 2024 10:59:26 +0200 Subject: [PATCH 0604/1578] vxlan: Fix regression when dropping packets due to invalid src addresses Commit f58f45c1e5b9 ("vxlan: drop packets from invalid src-address") has recently been added to vxlan mainly in the context of source address snooping/learning so that when it is enabled, an entry in the FDB is not being created for an invalid address for the corresponding tunnel endpoint. Before commit f58f45c1e5b9 vxlan was similarly behaving as geneve in that it passed through whichever macs were set in the L2 header. It turns out that this change in behavior breaks setups, for example, Cilium with netkit in L3 mode for Pods as well as tunnel mode has been passing before the change in f58f45c1e5b9 for both vxlan and geneve. After mentioned change it is only passing for geneve as in case of vxlan packets are dropped due to vxlan_set_mac() returning false as source and destination macs are zero which for E/W traffic via tunnel is totally fine. Fix it by only opting into the is_valid_ether_addr() check in vxlan_set_mac() when in fact source address snooping/learning is actually enabled in vxlan. This is done by moving the check into vxlan_snoop(). With this change, the Cilium connectivity test suite passes again for both tunnel flavors. Fixes: f58f45c1e5b9 ("vxlan: drop packets from invalid src-address") Signed-off-by: Daniel Borkmann Cc: David Bauer Cc: Ido Schimmel Cc: Nikolay Aleksandrov Cc: Martin KaFai Lau Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Reviewed-by: David Bauer Signed-off-by: David S. Miller --- drivers/net/vxlan/vxlan_core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index f78dd0438843b1..567cb3faab709c 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1446,6 +1446,10 @@ static bool vxlan_snoop(struct net_device *dev, struct vxlan_fdb *f; u32 ifindex = 0; + /* Ignore packets from invalid src-address */ + if (!is_valid_ether_addr(src_mac)) + return true; + #if IS_ENABLED(CONFIG_IPV6) if (src_ip->sa.sa_family == AF_INET6 && (ipv6_addr_type(&src_ip->sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL)) @@ -1616,10 +1620,6 @@ static bool vxlan_set_mac(struct vxlan_dev *vxlan, if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr)) return false; - /* Ignore packets from invalid src-address */ - if (!is_valid_ether_addr(eth_hdr(skb)->h_source)) - return false; - /* Get address from the outer IP header */ if (vxlan_get_sk_family(vs) == AF_INET) { saddr.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; From 712115a24b1a5318c10fc757d48d8f33815a6bfa Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 3 Jun 2024 17:30:19 +0800 Subject: [PATCH 0605/1578] selftests: hsr: add missing config for CONFIG_BRIDGE hsr_redbox.sh test need to create bridge for testing. Add the missing config CONFIG_BRIDGE in config file. Fixes: eafbf0574e05 ("test: hsr: Extend the hsr_redbox.sh to have more SAN devices connected") Signed-off-by: Hangbin Liu Tested-by: Simon Horman Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- tools/testing/selftests/net/hsr/config | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/net/hsr/config b/tools/testing/selftests/net/hsr/config index 22061204fb6912..241542441c5177 100644 --- a/tools/testing/selftests/net/hsr/config +++ b/tools/testing/selftests/net/hsr/config @@ -2,3 +2,4 @@ CONFIG_IPV6=y CONFIG_NET_SCH_NETEM=m CONFIG_HSR=y CONFIG_VETH=y +CONFIG_BRIDGE=y From 14951beaec93696b092a906baa0f29322cf34004 Mon Sep 17 00:00:00 2001 From: Wei Li Date: Tue, 23 Apr 2024 17:35:01 +0800 Subject: [PATCH 0606/1578] arm64: armv8_deprecated: Fix warning in isndep cpuhp starting process The function run_all_insn_set_hw_mode() is registered as startup callback of 'CPUHP_AP_ARM64_ISNDEP_STARTING', it invokes set_hw_mode() methods of all emulated instructions. As the STARTING callbacks are not expected to fail, if one of the set_hw_mode() fails, e.g. due to el0 mixed-endian is not supported for 'setend', it will report a warning: ``` CPU[2] cannot support the emulation of setend CPU 2 UP state arm64/isndep:starting (136) failed (-22) CPU2: Booted secondary processor 0x0000000002 [0x414fd0c1] ``` To fix it, add a check for INSN_UNAVAILABLE status and skip the process. Signed-off-by: Wei Li Tested-by: Huisong Li Link: https://lore.kernel.org/r/20240423093501.3460764-1-liwei391@huawei.com Signed-off-by: Will Deacon --- arch/arm64/kernel/armv8_deprecated.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index dd6ce86d4332be..b776e7424fe914 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -462,6 +462,9 @@ static int run_all_insn_set_hw_mode(unsigned int cpu) for (int i = 0; i < ARRAY_SIZE(insn_emulations); i++) { struct insn_emulation *insn = insn_emulations[i]; bool enable = READ_ONCE(insn->current_mode) == INSN_HW; + if (insn->status == INSN_UNAVAILABLE) + continue; + if (insn->set_hw_mode && insn->set_hw_mode(enable)) { pr_warn("CPU[%u] cannot support the emulation of %s", cpu, insn->name); From db574f2f96d0c9a245a9e787e3d9ec288fb2b445 Mon Sep 17 00:00:00 2001 From: Tao Su Date: Tue, 28 May 2024 18:22:34 +0800 Subject: [PATCH 0607/1578] KVM: x86/mmu: Don't save mmu_invalidate_seq after checking private attr Drop the second snapshot of mmu_invalidate_seq in kvm_faultin_pfn(). Before checking the mismatch of private vs. shared, mmu_invalidate_seq is saved to fault->mmu_seq, which can be used to detect an invalidation related to the gfn occurred, i.e. KVM will not install a mapping in page table if fault->mmu_seq != mmu_invalidate_seq. Currently there is a second snapshot of mmu_invalidate_seq, which may not be same as the first snapshot in kvm_faultin_pfn(), i.e. the gfn attribute may be changed between the two snapshots, but the gfn may be mapped in page table without hindrance. Therefore, drop the second snapshot as it has no obvious benefits. Fixes: f6adeae81f35 ("KVM: x86/mmu: Handle no-slot faults at the beginning of kvm_faultin_pfn()") Signed-off-by: Tao Su Message-ID: <20240528102234.2162763-1-tao1.su@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index b7b4426a7221df..8d74bdef68c1d3 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4411,9 +4411,6 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, return RET_PF_EMULATE; } - fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq; - smp_rmb(); - /* * Check for a relevant mmu_notifier invalidation event before getting * the pfn from the primary MMU, and before acquiring mmu_lock. From a46d0ea5c94205f40ecf912d1bb7806a8a64704f Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Tue, 4 Jun 2024 01:02:16 +0800 Subject: [PATCH 0608/1578] tcp: count CLOSE-WAIT sockets for TCP_MIB_CURRESTAB According to RFC 1213, we should also take CLOSE-WAIT sockets into consideration: "tcpCurrEstab OBJECT-TYPE ... The number of TCP connections for which the current state is either ESTABLISHED or CLOSE- WAIT." After this, CurrEstab counter will display the total number of ESTABLISHED and CLOSE-WAIT sockets. The logic of counting When we increment the counter? a) if we change the state to ESTABLISHED. b) if we change the state from SYN-RECEIVED to CLOSE-WAIT. When we decrement the counter? a) if the socket leaves ESTABLISHED and will never go into CLOSE-WAIT, say, on the client side, changing from ESTABLISHED to FIN-WAIT-1. b) if the socket leaves CLOSE-WAIT, say, on the server side, changing from CLOSE-WAIT to LAST-ACK. Please note: there are two chances that old state of socket can be changed to CLOSE-WAIT in tcp_fin(). One is SYN-RECV, the other is ESTABLISHED. So we have to take care of the former case. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Jason Xing Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4d8cc2ebb64c7d..e6790ea7487738 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2649,6 +2649,10 @@ void tcp_set_state(struct sock *sk, int state) if (oldstate != TCP_ESTABLISHED) TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); break; + case TCP_CLOSE_WAIT: + if (oldstate == TCP_SYN_RECV) + TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); + break; case TCP_CLOSE: if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED) @@ -2660,7 +2664,7 @@ void tcp_set_state(struct sock *sk, int state) inet_put_port(sk); fallthrough; default: - if (oldstate == TCP_ESTABLISHED) + if (oldstate == TCP_ESTABLISHED || oldstate == TCP_CLOSE_WAIT) TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); } From 9633e9377e6af0244f7381e86b9aac5276f5be97 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Tue, 4 Jun 2024 01:02:17 +0800 Subject: [PATCH 0609/1578] mptcp: count CLOSE-WAIT sockets for MPTCP_MIB_CURRESTAB Like previous patch does in TCP, we need to adhere to RFC 1213: "tcpCurrEstab OBJECT-TYPE ... The number of TCP connections for which the current state is either ESTABLISHED or CLOSE- WAIT." So let's consider CLOSE-WAIT sockets. The logic of counting When we increment the counter? a) Only if we change the state to ESTABLISHED. When we decrement the counter? a) if the socket leaves ESTABLISHED and will never go into CLOSE-WAIT, say, on the client side, changing from ESTABLISHED to FIN-WAIT-1. b) if the socket leaves CLOSE-WAIT, say, on the server side, changing from CLOSE-WAIT to LAST-ACK. Fixes: d9cd27b8cd19 ("mptcp: add CurrEstab MIB counter support") Signed-off-by: Jason Xing Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 7d44196ec5b630..96b113854bd3ce 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2916,9 +2916,14 @@ void mptcp_set_state(struct sock *sk, int state) if (oldstate != TCP_ESTABLISHED) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_CURRESTAB); break; - + case TCP_CLOSE_WAIT: + /* Unlike TCP, MPTCP sk would not have the TCP_SYN_RECV state: + * MPTCP "accepted" sockets will be created later on. So no + * transition from TCP_SYN_RECV to TCP_CLOSE_WAIT. + */ + break; default: - if (oldstate == TCP_ESTABLISHED) + if (oldstate == TCP_ESTABLISHED || oldstate == TCP_CLOSE_WAIT) MPTCP_DEC_STATS(sock_net(sk), MPTCP_MIB_CURRESTAB); } From 5b4b62a169e10401cca34a6e7ac39161986f5605 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 3 Jun 2024 11:48:26 -0700 Subject: [PATCH 0610/1578] rtnetlink: make the "split" NLM_DONE handling generic Jaroslav reports Dell's OMSA Systems Management Data Engine expects NLM_DONE in a separate recvmsg(), both for rtnl_dump_ifinfo() and inet_dump_ifaddr(). We already added a similar fix previously in commit 460b0d33cf10 ("inet: bring NLM_DONE out to a separate recv() again") Instead of modifying all the dump handlers, and making them look different than modern for_each_netdev_dump()-based dump handlers - put the workaround in rtnetlink code. This will also help us move the custom rtnl-locking from af_netlink in the future (in net-next). Note that this change is not touching rtnl_dump_all(). rtnl_dump_all() is different kettle of fish and a potential problem. We now mix families in a single recvmsg(), but NLM_DONE is not coalesced. Tested: ./cli.py --dbg-small-recv 4096 --spec netlink/specs/rt_addr.yaml \ --dump getaddr --json '{"ifa-family": 2}' ./cli.py --dbg-small-recv 4096 --spec netlink/specs/rt_route.yaml \ --dump getroute --json '{"rtm-family": 2}' ./cli.py --dbg-small-recv 4096 --spec netlink/specs/rt_link.yaml \ --dump getlink Fixes: 3e41af90767d ("rtnetlink: use xarray iterator to implement rtnl_dump_ifinfo()") Fixes: cdb2f80f1c10 ("inet: use xa_array iterator to implement inet_dump_ifaddr()") Reported-by: Jaroslav Pulchart Link: https://lore.kernel.org/all/CAK8fFZ7MKoFSEzMBDAOjoUt+vTZRRQgLDNXEOfdCCXSoXXKE0g@mail.gmail.com Signed-off-by: Jakub Kicinski Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/rtnetlink.h | 1 + net/core/rtnetlink.c | 44 +++++++++++++++++++++++++++++++++++++++-- net/ipv4/devinet.c | 2 +- net/ipv4/fib_frontend.c | 7 +------ 4 files changed, 45 insertions(+), 9 deletions(-) diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 3bfb80bad1739d..b45d57b5968af4 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -13,6 +13,7 @@ enum rtnl_link_flags { RTNL_FLAG_DOIT_UNLOCKED = BIT(0), RTNL_FLAG_BULK_DEL_SUPPORTED = BIT(1), RTNL_FLAG_DUMP_UNLOCKED = BIT(2), + RTNL_FLAG_DUMP_SPLIT_NLM_DONE = BIT(3), /* legacy behavior */ }; enum rtnl_kinds { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b86b0a87367dd5..4668d671804070 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -6484,6 +6484,46 @@ static int rtnl_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, /* Process one rtnetlink message. */ +static int rtnl_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + rtnl_dumpit_func dumpit = cb->data; + int err; + + /* Previous iteration have already finished, avoid calling->dumpit() + * again, it may not expect to be called after it reached the end. + */ + if (!dumpit) + return 0; + + err = dumpit(skb, cb); + + /* Old dump handlers used to send NLM_DONE as in a separate recvmsg(). + * Some applications which parse netlink manually depend on this. + */ + if (cb->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE) { + if (err < 0 && err != -EMSGSIZE) + return err; + if (!err) + cb->data = NULL; + + return skb->len; + } + return err; +} + +static int rtnetlink_dump_start(struct sock *ssk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct netlink_dump_control *control) +{ + if (control->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE) { + WARN_ON(control->data); + control->data = control->dump; + control->dump = rtnl_dumpit; + } + + return netlink_dump_start(ssk, skb, nlh, control); +} + static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -6548,7 +6588,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, .module = owner, .flags = flags, }; - err = netlink_dump_start(rtnl, skb, nlh, &c); + err = rtnetlink_dump_start(rtnl, skb, nlh, &c); /* netlink_dump_start() will keep a reference on * module if dump is still in progress. */ @@ -6694,7 +6734,7 @@ void __init rtnetlink_init(void) register_netdevice_notifier(&rtnetlink_dev_notifier); rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink, - rtnl_dump_ifinfo, 0); + rtnl_dump_ifinfo, RTNL_FLAG_DUMP_SPLIT_NLM_DONE); rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, 0); rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, 0); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index f3892ee9dfb33f..d09f557eaa7790 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2805,7 +2805,7 @@ void __init devinet_init(void) rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, 0); rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, 0); rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, - RTNL_FLAG_DUMP_UNLOCKED); + RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE); rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf, inet_netconf_dump_devconf, RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index c484b1c0fc00a7..7ad2cafb927634 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1050,11 +1050,6 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) e++; } } - - /* Don't let NLM_DONE coalesce into a message, even if it could. - * Some user space expects NLM_DONE in a separate recv(). - */ - err = skb->len; out: cb->args[1] = e; @@ -1665,5 +1660,5 @@ void __init ip_fib_init(void) rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, 0); rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, 0); rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, - RTNL_FLAG_DUMP_UNLOCKED); + RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE); } From ccd8d753f0fe8f16745fa2b6be5946349731d901 Mon Sep 17 00:00:00 2001 From: Alibek Omarov Date: Tue, 4 Jun 2024 21:47:52 +0300 Subject: [PATCH 0611/1578] ASoC: rockchip: i2s-tdm: Fix trcm mode by setting clock on right mclk When TRCM mode is enabled, I2S RX and TX clocks are synchronized through selected clock source. Without this fix BCLK and LRCK might get parented to an uninitialized MCLK and the DAI will receive data at wrong pace. However, unlike in original i2s-tdm driver, there is no need to manually synchronize mclk_rx and mclk_tx, as only one gets used anyway. Tested on a board with RK3568 SoC and Silergy SY24145S codec with enabled and disabled TRCM mode. Fixes: 9e2ab4b18ebd ("ASoC: rockchip: i2s-tdm: Fix inaccurate sampling rates") Signed-off-by: Alibek Omarov Reviewed-by: Luca Ceresoli Link: https://msgid.link/r/20240604184752.697313-1-a1ba.omarov@gmail.com Signed-off-by: Mark Brown --- sound/soc/rockchip/rockchip_i2s_tdm.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/sound/soc/rockchip/rockchip_i2s_tdm.c b/sound/soc/rockchip/rockchip_i2s_tdm.c index 9fa020ef7eab9b..ee517d7b5b7bb4 100644 --- a/sound/soc/rockchip/rockchip_i2s_tdm.c +++ b/sound/soc/rockchip/rockchip_i2s_tdm.c @@ -655,8 +655,17 @@ static int rockchip_i2s_tdm_hw_params(struct snd_pcm_substream *substream, int err; if (i2s_tdm->is_master_mode) { - struct clk *mclk = (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) ? - i2s_tdm->mclk_tx : i2s_tdm->mclk_rx; + struct clk *mclk; + + if (i2s_tdm->clk_trcm == TRCM_TX) { + mclk = i2s_tdm->mclk_tx; + } else if (i2s_tdm->clk_trcm == TRCM_RX) { + mclk = i2s_tdm->mclk_rx; + } else if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) { + mclk = i2s_tdm->mclk_tx; + } else { + mclk = i2s_tdm->mclk_rx; + } err = clk_set_rate(mclk, DEFAULT_MCLK_FS * params_rate(params)); if (err) From 97d8613679eb53bd0c07d0fbd3d8471e46ba46c1 Mon Sep 17 00:00:00 2001 From: Hsin-Te Yuan Date: Fri, 31 May 2024 08:37:54 +0000 Subject: [PATCH 0612/1578] ASoC: mediatek: mt8183-da7219-max98357: Fix kcontrol name collision Since "Headphone Switch" kcontrol name has already been used by da7219, rename the control name from "Headphone" to "Headphones" to prevent the colision. Also, this change makes kcontrol name align with the one in mt8186-mt6366-da7219-max98357.c. Fixes: 9c7388baa2053 ("ASoC: mediatek: mt8183-da7219-max98357: Map missing jack kcontrols") Change-Id: I9ae69a4673cd04786b247cc514fdd20f878ef009 Signed-off-by: Hsin-Te Yuan Reviewed-by: Chen-Yu Tsai Link: https://msgid.link/r/20240531-da7219-v1-1-ac3343f3ae6a@chromium.org Signed-off-by: Mark Brown --- sound/soc/mediatek/mt8183/mt8183-da7219-max98357.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sound/soc/mediatek/mt8183/mt8183-da7219-max98357.c b/sound/soc/mediatek/mt8183/mt8183-da7219-max98357.c index acaf81fd6c9b59..f848e14b091a15 100644 --- a/sound/soc/mediatek/mt8183/mt8183-da7219-max98357.c +++ b/sound/soc/mediatek/mt8183/mt8183-da7219-max98357.c @@ -31,7 +31,7 @@ struct mt8183_da7219_max98357_priv { static struct snd_soc_jack_pin mt8183_da7219_max98357_jack_pins[] = { { - .pin = "Headphone", + .pin = "Headphones", .mask = SND_JACK_HEADPHONE, }, { @@ -626,7 +626,7 @@ static struct snd_soc_codec_conf mt6358_codec_conf[] = { }; static const struct snd_kcontrol_new mt8183_da7219_max98357_snd_controls[] = { - SOC_DAPM_PIN_SWITCH("Headphone"), + SOC_DAPM_PIN_SWITCH("Headphones"), SOC_DAPM_PIN_SWITCH("Headset Mic"), SOC_DAPM_PIN_SWITCH("Speakers"), SOC_DAPM_PIN_SWITCH("Line Out"), @@ -634,7 +634,7 @@ static const struct snd_kcontrol_new mt8183_da7219_max98357_snd_controls[] = { static const struct snd_soc_dapm_widget mt8183_da7219_max98357_dapm_widgets[] = { - SND_SOC_DAPM_HP("Headphone", NULL), + SND_SOC_DAPM_HP("Headphones", NULL), SND_SOC_DAPM_MIC("Headset Mic", NULL), SND_SOC_DAPM_SPK("Speakers", NULL), SND_SOC_DAPM_SPK("Line Out", NULL), @@ -680,7 +680,7 @@ static struct snd_soc_codec_conf mt8183_da7219_rt1015_codec_conf[] = { }; static const struct snd_kcontrol_new mt8183_da7219_rt1015_snd_controls[] = { - SOC_DAPM_PIN_SWITCH("Headphone"), + SOC_DAPM_PIN_SWITCH("Headphones"), SOC_DAPM_PIN_SWITCH("Headset Mic"), SOC_DAPM_PIN_SWITCH("Left Spk"), SOC_DAPM_PIN_SWITCH("Right Spk"), @@ -689,7 +689,7 @@ static const struct snd_kcontrol_new mt8183_da7219_rt1015_snd_controls[] = { static const struct snd_soc_dapm_widget mt8183_da7219_rt1015_dapm_widgets[] = { - SND_SOC_DAPM_HP("Headphone", NULL), + SND_SOC_DAPM_HP("Headphones", NULL), SND_SOC_DAPM_MIC("Headset Mic", NULL), SND_SOC_DAPM_SPK("Left Spk", NULL), SND_SOC_DAPM_SPK("Right Spk", NULL), From 5c40e428aea644c9d924e491b1bc22fa9f272bcc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 4 Jun 2024 22:59:57 +0200 Subject: [PATCH 0613/1578] arm64/io: add constant-argument check In some configurations __const_iowrite32_copy() does not get inlined and gcc runs into the BUILD_BUG(): In file included from : In function '__const_memcpy_toio_aligned32', inlined from '__const_iowrite32_copy' at arch/arm64/include/asm/io.h:203:3, inlined from '__const_iowrite32_copy' at arch/arm64/include/asm/io.h:199:20: include/linux/compiler_types.h:487:45: error: call to '__compiletime_assert_538' declared with attribute error: BUILD_BUG failed 487 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) | ^ include/linux/compiler_types.h:468:25: note: in definition of macro '__compiletime_assert' 468 | prefix ## suffix(); \ | ^~~~~~ include/linux/compiler_types.h:487:9: note: in expansion of macro '_compiletime_assert' 487 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) | ^~~~~~~~~~~~~~~~~~~ include/linux/build_bug.h:39:37: note: in expansion of macro 'compiletime_assert' 39 | #define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg) | ^~~~~~~~~~~~~~~~~~ include/linux/build_bug.h:59:21: note: in expansion of macro 'BUILD_BUG_ON_MSG' 59 | #define BUILD_BUG() BUILD_BUG_ON_MSG(1, "BUILD_BUG failed") | ^~~~~~~~~~~~~~~~ arch/arm64/include/asm/io.h:193:17: note: in expansion of macro 'BUILD_BUG' 193 | BUILD_BUG(); | ^~~~~~~~~ Move the check for constant arguments into the inline function to ensure it is still constant if the compiler decides against inlining it, and mark them as __always_inline to override the logic that sometimes leads to the compiler not producing the simplified output. Note that either the __always_inline annotation or the check for a constant value are sufficient here, but combining the two looks cleaner as it also avoids the macro. With clang-8 and older, the macro was still needed, but all versions of gcc and clang can reliably perform constant folding here. Fixes: ead79118dae6 ("arm64/io: Provide a WC friendly __iowriteXX_copy()") Signed-off-by: Arnd Bergmann Reviewed-by: Mark Rutland Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240604210006.668912-1-arnd@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/io.h | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 4ff0ae3f6d6690..41fd90895dfc3d 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -153,8 +153,9 @@ extern void __memset_io(volatile void __iomem *, int, size_t); * emit the large TLP from the CPU. */ -static inline void __const_memcpy_toio_aligned32(volatile u32 __iomem *to, - const u32 *from, size_t count) +static __always_inline void +__const_memcpy_toio_aligned32(volatile u32 __iomem *to, const u32 *from, + size_t count) { switch (count) { case 8: @@ -196,24 +197,22 @@ static inline void __const_memcpy_toio_aligned32(volatile u32 __iomem *to, void __iowrite32_copy_full(void __iomem *to, const void *from, size_t count); -static inline void __const_iowrite32_copy(void __iomem *to, const void *from, - size_t count) +static __always_inline void +__iowrite32_copy(void __iomem *to, const void *from, size_t count) { - if (count == 8 || count == 4 || count == 2 || count == 1) { + if (__builtin_constant_p(count) && + (count == 8 || count == 4 || count == 2 || count == 1)) { __const_memcpy_toio_aligned32(to, from, count); dgh(); } else { __iowrite32_copy_full(to, from, count); } } +#define __iowrite32_copy __iowrite32_copy -#define __iowrite32_copy(to, from, count) \ - (__builtin_constant_p(count) ? \ - __const_iowrite32_copy(to, from, count) : \ - __iowrite32_copy_full(to, from, count)) - -static inline void __const_memcpy_toio_aligned64(volatile u64 __iomem *to, - const u64 *from, size_t count) +static __always_inline void +__const_memcpy_toio_aligned64(volatile u64 __iomem *to, const u64 *from, + size_t count) { switch (count) { case 8: @@ -255,21 +254,18 @@ static inline void __const_memcpy_toio_aligned64(volatile u64 __iomem *to, void __iowrite64_copy_full(void __iomem *to, const void *from, size_t count); -static inline void __const_iowrite64_copy(void __iomem *to, const void *from, - size_t count) +static __always_inline void +__iowrite64_copy(void __iomem *to, const void *from, size_t count) { - if (count == 8 || count == 4 || count == 2 || count == 1) { + if (__builtin_constant_p(count) && + (count == 8 || count == 4 || count == 2 || count == 1)) { __const_memcpy_toio_aligned64(to, from, count); dgh(); } else { __iowrite64_copy_full(to, from, count); } } - -#define __iowrite64_copy(to, from, count) \ - (__builtin_constant_p(count) ? \ - __const_iowrite64_copy(to, from, count) : \ - __iowrite64_copy_full(to, from, count)) +#define __iowrite64_copy __iowrite64_copy /* * I/O memory mapping functions. From c57e558194430d10d5e5f4acd8a8655b68dade13 Mon Sep 17 00:00:00 2001 From: Frank Wunderlich Date: Mon, 3 Jun 2024 21:25:05 +0200 Subject: [PATCH 0614/1578] net: ethernet: mtk_eth_soc: handle dma buffer size soc specific The mainline MTK ethernet driver suffers long time from rarly but annoying tx queue timeouts. We think that this is caused by fixed dma sizes hardcoded for all SoCs. We suspect this problem arises from a low level of free TX DMADs, the TX Ring alomost full. The transmit timeout is caused by the Tx queue not waking up. The Tx queue stops when the free counter is less than ring->thres, and it will wake up once the free counter is greater than ring->thres. If the CPU is too late to wake up the Tx queues, it may cause a transmit timeout. Therefore, we increased the TX and RX DMADs to improve this error situation. Use the dma-size implementation from SDK in a per SoC manner. In difference to SDK we have no RSS feature yet, so all RX/TX sizes should be raised from 512 to 2048 byte except fqdma on mt7988 to avoid the tx timeout issue. Fixes: 656e705243fd ("net-next: mediatek: add support for MT7623 ethernet") Suggested-by: Daniel Golle Signed-off-by: Frank Wunderlich Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 104 +++++++++++++------- drivers/net/ethernet/mediatek/mtk_eth_soc.h | 9 +- 2 files changed, 77 insertions(+), 36 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index cae46290a7aee2..c84ce54a84a00e 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -1131,9 +1131,9 @@ static int mtk_init_fq_dma(struct mtk_eth *eth) { const struct mtk_soc_data *soc = eth->soc; dma_addr_t phy_ring_tail; - int cnt = MTK_QDMA_RING_SIZE; + int cnt = soc->tx.fq_dma_size; dma_addr_t dma_addr; - int i; + int i, j, len; if (MTK_HAS_CAPS(eth->soc->caps, MTK_SRAM)) eth->scratch_ring = eth->sram_base; @@ -1142,40 +1142,46 @@ static int mtk_init_fq_dma(struct mtk_eth *eth) cnt * soc->tx.desc_size, ð->phy_scratch_ring, GFP_KERNEL); + if (unlikely(!eth->scratch_ring)) return -ENOMEM; - eth->scratch_head = kcalloc(cnt, MTK_QDMA_PAGE_SIZE, GFP_KERNEL); - if (unlikely(!eth->scratch_head)) - return -ENOMEM; + phy_ring_tail = eth->phy_scratch_ring + soc->tx.desc_size * (cnt - 1); - dma_addr = dma_map_single(eth->dma_dev, - eth->scratch_head, cnt * MTK_QDMA_PAGE_SIZE, - DMA_FROM_DEVICE); - if (unlikely(dma_mapping_error(eth->dma_dev, dma_addr))) - return -ENOMEM; + for (j = 0; j < DIV_ROUND_UP(soc->tx.fq_dma_size, MTK_FQ_DMA_LENGTH); j++) { + len = min_t(int, cnt - j * MTK_FQ_DMA_LENGTH, MTK_FQ_DMA_LENGTH); + eth->scratch_head[j] = kcalloc(len, MTK_QDMA_PAGE_SIZE, GFP_KERNEL); - phy_ring_tail = eth->phy_scratch_ring + soc->tx.desc_size * (cnt - 1); + if (unlikely(!eth->scratch_head[j])) + return -ENOMEM; - for (i = 0; i < cnt; i++) { - dma_addr_t addr = dma_addr + i * MTK_QDMA_PAGE_SIZE; - struct mtk_tx_dma_v2 *txd; + dma_addr = dma_map_single(eth->dma_dev, + eth->scratch_head[j], len * MTK_QDMA_PAGE_SIZE, + DMA_FROM_DEVICE); - txd = eth->scratch_ring + i * soc->tx.desc_size; - txd->txd1 = addr; - if (i < cnt - 1) - txd->txd2 = eth->phy_scratch_ring + - (i + 1) * soc->tx.desc_size; + if (unlikely(dma_mapping_error(eth->dma_dev, dma_addr))) + return -ENOMEM; - txd->txd3 = TX_DMA_PLEN0(MTK_QDMA_PAGE_SIZE); - if (MTK_HAS_CAPS(soc->caps, MTK_36BIT_DMA)) - txd->txd3 |= TX_DMA_PREP_ADDR64(addr); - txd->txd4 = 0; - if (mtk_is_netsys_v2_or_greater(eth)) { - txd->txd5 = 0; - txd->txd6 = 0; - txd->txd7 = 0; - txd->txd8 = 0; + for (i = 0; i < cnt; i++) { + struct mtk_tx_dma_v2 *txd; + + txd = eth->scratch_ring + (j * MTK_FQ_DMA_LENGTH + i) * soc->tx.desc_size; + txd->txd1 = dma_addr + i * MTK_QDMA_PAGE_SIZE; + if (j * MTK_FQ_DMA_LENGTH + i < cnt) + txd->txd2 = eth->phy_scratch_ring + + (j * MTK_FQ_DMA_LENGTH + i + 1) * soc->tx.desc_size; + + txd->txd3 = TX_DMA_PLEN0(MTK_QDMA_PAGE_SIZE); + if (MTK_HAS_CAPS(soc->caps, MTK_36BIT_DMA)) + txd->txd3 |= TX_DMA_PREP_ADDR64(dma_addr + i * MTK_QDMA_PAGE_SIZE); + + txd->txd4 = 0; + if (mtk_is_netsys_v2_or_greater(eth)) { + txd->txd5 = 0; + txd->txd6 = 0; + txd->txd7 = 0; + txd->txd8 = 0; + } } } @@ -2457,7 +2463,7 @@ static int mtk_tx_alloc(struct mtk_eth *eth) if (MTK_HAS_CAPS(soc->caps, MTK_QDMA)) ring_size = MTK_QDMA_RING_SIZE; else - ring_size = MTK_DMA_SIZE; + ring_size = soc->tx.dma_size; ring->buf = kcalloc(ring_size, sizeof(*ring->buf), GFP_KERNEL); @@ -2465,8 +2471,8 @@ static int mtk_tx_alloc(struct mtk_eth *eth) goto no_tx_mem; if (MTK_HAS_CAPS(soc->caps, MTK_SRAM)) { - ring->dma = eth->sram_base + ring_size * sz; - ring->phys = eth->phy_scratch_ring + ring_size * (dma_addr_t)sz; + ring->dma = eth->sram_base + soc->tx.fq_dma_size * sz; + ring->phys = eth->phy_scratch_ring + soc->tx.fq_dma_size * (dma_addr_t)sz; } else { ring->dma = dma_alloc_coherent(eth->dma_dev, ring_size * sz, &ring->phys, GFP_KERNEL); @@ -2588,6 +2594,7 @@ static void mtk_tx_clean(struct mtk_eth *eth) static int mtk_rx_alloc(struct mtk_eth *eth, int ring_no, int rx_flag) { const struct mtk_reg_map *reg_map = eth->soc->reg_map; + const struct mtk_soc_data *soc = eth->soc; struct mtk_rx_ring *ring; int rx_data_len, rx_dma_size, tx_ring_size; int i; @@ -2595,7 +2602,7 @@ static int mtk_rx_alloc(struct mtk_eth *eth, int ring_no, int rx_flag) if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) tx_ring_size = MTK_QDMA_RING_SIZE; else - tx_ring_size = MTK_DMA_SIZE; + tx_ring_size = soc->tx.dma_size; if (rx_flag == MTK_RX_FLAGS_QDMA) { if (ring_no) @@ -2610,7 +2617,7 @@ static int mtk_rx_alloc(struct mtk_eth *eth, int ring_no, int rx_flag) rx_dma_size = MTK_HW_LRO_DMA_SIZE; } else { rx_data_len = ETH_DATA_LEN; - rx_dma_size = MTK_DMA_SIZE; + rx_dma_size = soc->rx.dma_size; } ring->frag_size = mtk_max_frag_size(rx_data_len); @@ -3139,7 +3146,10 @@ static void mtk_dma_free(struct mtk_eth *eth) mtk_rx_clean(eth, ð->rx_ring[i], false); } - kfree(eth->scratch_head); + for (i = 0; i < DIV_ROUND_UP(soc->tx.fq_dma_size, MTK_FQ_DMA_LENGTH); i++) { + kfree(eth->scratch_head[i]); + eth->scratch_head[i] = NULL; + } } static bool mtk_hw_reset_check(struct mtk_eth *eth) @@ -5052,11 +5062,14 @@ static const struct mtk_soc_data mt2701_data = { .desc_size = sizeof(struct mtk_tx_dma), .dma_max_len = MTK_TX_DMA_BUF_LEN, .dma_len_offset = 16, + .dma_size = MTK_DMA_SIZE(2K), + .fq_dma_size = MTK_DMA_SIZE(2K), }, .rx = { .desc_size = sizeof(struct mtk_rx_dma), .irq_done_mask = MTK_RX_DONE_INT, .dma_l4_valid = RX_DMA_L4_VALID, + .dma_size = MTK_DMA_SIZE(2K), .dma_max_len = MTK_TX_DMA_BUF_LEN, .dma_len_offset = 16, }, @@ -5076,11 +5089,14 @@ static const struct mtk_soc_data mt7621_data = { .desc_size = sizeof(struct mtk_tx_dma), .dma_max_len = MTK_TX_DMA_BUF_LEN, .dma_len_offset = 16, + .dma_size = MTK_DMA_SIZE(2K), + .fq_dma_size = MTK_DMA_SIZE(2K), }, .rx = { .desc_size = sizeof(struct mtk_rx_dma), .irq_done_mask = MTK_RX_DONE_INT, .dma_l4_valid = RX_DMA_L4_VALID, + .dma_size = MTK_DMA_SIZE(2K), .dma_max_len = MTK_TX_DMA_BUF_LEN, .dma_len_offset = 16, }, @@ -5102,11 +5118,14 @@ static const struct mtk_soc_data mt7622_data = { .desc_size = sizeof(struct mtk_tx_dma), .dma_max_len = MTK_TX_DMA_BUF_LEN, .dma_len_offset = 16, + .dma_size = MTK_DMA_SIZE(2K), + .fq_dma_size = MTK_DMA_SIZE(2K), }, .rx = { .desc_size = sizeof(struct mtk_rx_dma), .irq_done_mask = MTK_RX_DONE_INT, .dma_l4_valid = RX_DMA_L4_VALID, + .dma_size = MTK_DMA_SIZE(2K), .dma_max_len = MTK_TX_DMA_BUF_LEN, .dma_len_offset = 16, }, @@ -5127,11 +5146,14 @@ static const struct mtk_soc_data mt7623_data = { .desc_size = sizeof(struct mtk_tx_dma), .dma_max_len = MTK_TX_DMA_BUF_LEN, .dma_len_offset = 16, + .dma_size = MTK_DMA_SIZE(2K), + .fq_dma_size = MTK_DMA_SIZE(2K), }, .rx = { .desc_size = sizeof(struct mtk_rx_dma), .irq_done_mask = MTK_RX_DONE_INT, .dma_l4_valid = RX_DMA_L4_VALID, + .dma_size = MTK_DMA_SIZE(2K), .dma_max_len = MTK_TX_DMA_BUF_LEN, .dma_len_offset = 16, }, @@ -5150,11 +5172,14 @@ static const struct mtk_soc_data mt7629_data = { .desc_size = sizeof(struct mtk_tx_dma), .dma_max_len = MTK_TX_DMA_BUF_LEN, .dma_len_offset = 16, + .dma_size = MTK_DMA_SIZE(2K), + .fq_dma_size = MTK_DMA_SIZE(2K), }, .rx = { .desc_size = sizeof(struct mtk_rx_dma), .irq_done_mask = MTK_RX_DONE_INT, .dma_l4_valid = RX_DMA_L4_VALID, + .dma_size = MTK_DMA_SIZE(2K), .dma_max_len = MTK_TX_DMA_BUF_LEN, .dma_len_offset = 16, }, @@ -5176,6 +5201,8 @@ static const struct mtk_soc_data mt7981_data = { .desc_size = sizeof(struct mtk_tx_dma_v2), .dma_max_len = MTK_TX_DMA_BUF_LEN_V2, .dma_len_offset = 8, + .dma_size = MTK_DMA_SIZE(2K), + .fq_dma_size = MTK_DMA_SIZE(2K), }, .rx = { .desc_size = sizeof(struct mtk_rx_dma), @@ -5183,6 +5210,7 @@ static const struct mtk_soc_data mt7981_data = { .dma_l4_valid = RX_DMA_L4_VALID_V2, .dma_max_len = MTK_TX_DMA_BUF_LEN, .dma_len_offset = 16, + .dma_size = MTK_DMA_SIZE(2K), }, }; @@ -5202,6 +5230,8 @@ static const struct mtk_soc_data mt7986_data = { .desc_size = sizeof(struct mtk_tx_dma_v2), .dma_max_len = MTK_TX_DMA_BUF_LEN_V2, .dma_len_offset = 8, + .dma_size = MTK_DMA_SIZE(2K), + .fq_dma_size = MTK_DMA_SIZE(2K), }, .rx = { .desc_size = sizeof(struct mtk_rx_dma), @@ -5209,6 +5239,7 @@ static const struct mtk_soc_data mt7986_data = { .dma_l4_valid = RX_DMA_L4_VALID_V2, .dma_max_len = MTK_TX_DMA_BUF_LEN, .dma_len_offset = 16, + .dma_size = MTK_DMA_SIZE(2K), }, }; @@ -5228,6 +5259,8 @@ static const struct mtk_soc_data mt7988_data = { .desc_size = sizeof(struct mtk_tx_dma_v2), .dma_max_len = MTK_TX_DMA_BUF_LEN_V2, .dma_len_offset = 8, + .dma_size = MTK_DMA_SIZE(2K), + .fq_dma_size = MTK_DMA_SIZE(4K), }, .rx = { .desc_size = sizeof(struct mtk_rx_dma_v2), @@ -5235,6 +5268,7 @@ static const struct mtk_soc_data mt7988_data = { .dma_l4_valid = RX_DMA_L4_VALID_V2, .dma_max_len = MTK_TX_DMA_BUF_LEN_V2, .dma_len_offset = 8, + .dma_size = MTK_DMA_SIZE(2K), }, }; @@ -5249,6 +5283,7 @@ static const struct mtk_soc_data rt5350_data = { .desc_size = sizeof(struct mtk_tx_dma), .dma_max_len = MTK_TX_DMA_BUF_LEN, .dma_len_offset = 16, + .dma_size = MTK_DMA_SIZE(2K), }, .rx = { .desc_size = sizeof(struct mtk_rx_dma), @@ -5256,6 +5291,7 @@ static const struct mtk_soc_data rt5350_data = { .dma_l4_valid = RX_DMA_L4_VALID_PDMA, .dma_max_len = MTK_TX_DMA_BUF_LEN, .dma_len_offset = 16, + .dma_size = MTK_DMA_SIZE(2K), }, }; diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.h b/drivers/net/ethernet/mediatek/mtk_eth_soc.h index 4eab30b4407063..f5174f6cb1bbec 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h @@ -32,7 +32,9 @@ #define MTK_TX_DMA_BUF_LEN 0x3fff #define MTK_TX_DMA_BUF_LEN_V2 0xffff #define MTK_QDMA_RING_SIZE 2048 -#define MTK_DMA_SIZE 512 +#define MTK_DMA_SIZE(x) (SZ_##x) +#define MTK_FQ_DMA_HEAD 32 +#define MTK_FQ_DMA_LENGTH 2048 #define MTK_RX_ETH_HLEN (ETH_HLEN + ETH_FCS_LEN) #define MTK_RX_HLEN (NET_SKB_PAD + MTK_RX_ETH_HLEN + NET_IP_ALIGN) #define MTK_DMA_DUMMY_DESC 0xffffffff @@ -1176,6 +1178,8 @@ struct mtk_soc_data { u32 desc_size; u32 dma_max_len; u32 dma_len_offset; + u32 dma_size; + u32 fq_dma_size; } tx; struct { u32 desc_size; @@ -1183,6 +1187,7 @@ struct mtk_soc_data { u32 dma_l4_valid; u32 dma_max_len; u32 dma_len_offset; + u32 dma_size; } rx; }; @@ -1264,7 +1269,7 @@ struct mtk_eth { struct napi_struct rx_napi; void *scratch_ring; dma_addr_t phy_scratch_ring; - void *scratch_head; + void *scratch_head[MTK_FQ_DMA_HEAD]; struct clk *clks[MTK_CLK_MAX]; struct mii_bus *mii_bus; From 33afbfcc105a572159750f2ebee834a8a70fdd96 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Tue, 4 Jun 2024 00:04:42 +0300 Subject: [PATCH 0615/1578] net/mlx5: Stop waiting for PCI if pci channel is offline In case pci channel becomes offline the driver should not wait for PCI reads during health dump and recovery flow. The driver has timeout for each of these loops trying to read PCI, so it would fail anyway. However, in case of recovery waiting till timeout may cause the pci error_detected() callback fail to meet pci_dpc_recovered() wait timeout. Fixes: b3bd076f7501 ("net/mlx5: Report devlink health on FW fatal issues") Signed-off-by: Moshe Shemesh Reviewed-by: Shay Drori Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/fw.c | 4 ++++ drivers/net/ethernet/mellanox/mlx5/core/health.c | 8 ++++++++ drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c | 4 ++++ 3 files changed, 16 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index 2d95a9b7b44e19..b61b7d96611413 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -373,6 +373,10 @@ int mlx5_cmd_fast_teardown_hca(struct mlx5_core_dev *dev) do { if (mlx5_get_nic_state(dev) == MLX5_INITIAL_SEG_NIC_INTERFACE_DISABLED) break; + if (pci_channel_offline(dev->pdev)) { + mlx5_core_err(dev, "PCI channel offline, stop waiting for NIC IFC\n"); + return -EACCES; + } cond_resched(); } while (!time_after(jiffies, end)); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index ad38e31822df10..a6329ca2d9bffb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -248,6 +248,10 @@ void mlx5_error_sw_reset(struct mlx5_core_dev *dev) do { if (mlx5_get_nic_state(dev) == MLX5_INITIAL_SEG_NIC_INTERFACE_DISABLED) break; + if (pci_channel_offline(dev->pdev)) { + mlx5_core_err(dev, "PCI channel offline, stop waiting for NIC IFC\n"); + goto unlock; + } msleep(20); } while (!time_after(jiffies, end)); @@ -317,6 +321,10 @@ int mlx5_health_wait_pci_up(struct mlx5_core_dev *dev) mlx5_core_warn(dev, "device is being removed, stop waiting for PCI\n"); return -ENODEV; } + if (pci_channel_offline(dev->pdev)) { + mlx5_core_err(dev, "PCI channel offline, stop waiting for PCI\n"); + return -EACCES; + } msleep(100); } return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c index 6b774e0c276659..d0b595ba611014 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c @@ -74,6 +74,10 @@ int mlx5_vsc_gw_lock(struct mlx5_core_dev *dev) ret = -EBUSY; goto pci_unlock; } + if (pci_channel_offline(dev->pdev)) { + ret = -EACCES; + goto pci_unlock; + } /* Check if semaphore is already locked */ ret = vsc_read(dev, VSC_SEMAPHORE_OFFSET, &lock_val); From c8b3f38d2dae0397944814d691a419c451f9906f Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 4 Jun 2024 00:04:43 +0300 Subject: [PATCH 0616/1578] net/mlx5: Always stop health timer during driver removal Currently, if teardown_hca fails to execute during driver removal, mlx5 does not stop the health timer. Afterwards, mlx5 continue with driver teardown. This may lead to a UAF bug, which results in page fault Oops[1], since the health timer invokes after resources were freed. Hence, stop the health monitor even if teardown_hca fails. [1] mlx5_core 0000:18:00.0: E-Switch: Unload vfs: mode(LEGACY), nvfs(0), necvfs(0), active vports(0) mlx5_core 0000:18:00.0: E-Switch: Disable: mode(LEGACY), nvfs(0), necvfs(0), active vports(0) mlx5_core 0000:18:00.0: E-Switch: Disable: mode(LEGACY), nvfs(0), necvfs(0), active vports(0) mlx5_core 0000:18:00.0: E-Switch: cleanup mlx5_core 0000:18:00.0: wait_func:1155:(pid 1967079): TEARDOWN_HCA(0x103) timeout. Will cause a leak of a command resource mlx5_core 0000:18:00.0: mlx5_function_close:1288:(pid 1967079): tear_down_hca failed, skip cleanup BUG: unable to handle page fault for address: ffffa26487064230 PGD 100c00067 P4D 100c00067 PUD 100e5a067 PMD 105ed7067 PTE 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 0 PID: 0 Comm: swapper/0 Tainted: G OE ------- --- 6.7.0-68.fc38.x86_64 #1 Hardware name: Intel Corporation S2600WFT/S2600WFT, BIOS SE5C620.86B.02.01.0013.121520200651 12/15/2020 RIP: 0010:ioread32be+0x34/0x60 RSP: 0018:ffffa26480003e58 EFLAGS: 00010292 RAX: ffffa26487064200 RBX: ffff9042d08161a0 RCX: ffff904c108222c0 RDX: 000000010bbf1b80 RSI: ffffffffc055ddb0 RDI: ffffa26487064230 RBP: ffff9042d08161a0 R08: 0000000000000022 R09: ffff904c108222e8 R10: 0000000000000004 R11: 0000000000000441 R12: ffffffffc055ddb0 R13: ffffa26487064200 R14: ffffa26480003f00 R15: ffff904c108222c0 FS: 0000000000000000(0000) GS:ffff904c10800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffa26487064230 CR3: 00000002c4420006 CR4: 00000000007706f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? __die+0x23/0x70 ? page_fault_oops+0x171/0x4e0 ? exc_page_fault+0x175/0x180 ? asm_exc_page_fault+0x26/0x30 ? __pfx_poll_health+0x10/0x10 [mlx5_core] ? __pfx_poll_health+0x10/0x10 [mlx5_core] ? ioread32be+0x34/0x60 mlx5_health_check_fatal_sensors+0x20/0x100 [mlx5_core] ? __pfx_poll_health+0x10/0x10 [mlx5_core] poll_health+0x42/0x230 [mlx5_core] ? __next_timer_interrupt+0xbc/0x110 ? __pfx_poll_health+0x10/0x10 [mlx5_core] call_timer_fn+0x21/0x130 ? __pfx_poll_health+0x10/0x10 [mlx5_core] __run_timers+0x222/0x2c0 run_timer_softirq+0x1d/0x40 __do_softirq+0xc9/0x2c8 __irq_exit_rcu+0xa6/0xc0 sysvec_apic_timer_interrupt+0x72/0x90 asm_sysvec_apic_timer_interrupt+0x1a/0x20 RIP: 0010:cpuidle_enter_state+0xcc/0x440 ? cpuidle_enter_state+0xbd/0x440 cpuidle_enter+0x2d/0x40 do_idle+0x20d/0x270 cpu_startup_entry+0x2a/0x30 rest_init+0xd0/0xd0 arch_call_rest_init+0xe/0x30 start_kernel+0x709/0xa90 x86_64_start_reservations+0x18/0x30 x86_64_start_kernel+0x96/0xa0 secondary_startup_64_no_verify+0x18f/0x19b ---[ end trace 0000000000000000 ]--- Fixes: 9b98d395b85d ("net/mlx5: Start health poll at earlier stage of driver load") Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 6574c145dc1e2d..459a836a5d9c15 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1298,6 +1298,9 @@ static int mlx5_function_teardown(struct mlx5_core_dev *dev, bool boot) if (!err) mlx5_function_disable(dev, boot); + else + mlx5_stop_health_poll(dev, boot); + return err; } From d21b3c60d6e3917f7388db6bcc455f01c99ee42b Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 23 May 2024 16:41:02 +0100 Subject: [PATCH 0617/1578] KVM: selftests: Fix shift of 32 bit unsigned int more than 32 bits Currrentl a 32 bit 1u value is being shifted more than 32 bits causing overflow and incorrect checking of bits 32-63. Fix this by using the BIT_ULL macro for shifting bits. Detected by cppcheck: sev_init2_tests.c:108:34: error: Shifting 32-bit value by 63 bits is undefined behaviour [shiftTooManyBits] Fixes: dfc083a181ba ("selftests: kvm: add tests for KVM_SEV_INIT2") Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20240523154102.2236133-1-colin.i.king@gmail.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86_64/sev_init2_tests.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c b/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c index 7a4a61be119b15..3fb967f40c6a16 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c +++ b/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c @@ -105,11 +105,11 @@ void test_features(uint32_t vm_type, uint64_t supported_features) int i; for (i = 0; i < 64; i++) { - if (!(supported_features & (1u << i))) + if (!(supported_features & BIT_ULL(i))) test_init2_invalid(vm_type, &(struct kvm_sev_init){ .vmsa_features = BIT_ULL(i) }, "unknown feature"); - else if (KNOWN_FEATURES & (1u << i)) + else if (KNOWN_FEATURES & BIT_ULL(i)) test_init2(vm_type, &(struct kvm_sev_init){ .vmsa_features = BIT_ULL(i) }); } From 980b8bc01938c8bcc9742c1051f64b5f0ed178ac Mon Sep 17 00:00:00 2001 From: Tao Su Date: Mon, 13 May 2024 09:40:03 +0800 Subject: [PATCH 0618/1578] KVM: selftests: x86: Prioritize getting max_gfn from GuestPhysBits Use the max mappable GPA via GuestPhysBits advertised by KVM to calculate max_gfn. Currently some selftests (e.g. access_tracking_perf_test, dirty_log_test...) add RAM regions close to max_gfn, so guest may access GPA beyond its mappable range and cause infinite loop. Adjust max_gfn in vm_compute_max_gfn() since x86 selftests already overrides vm_compute_max_gfn() specifically to deal with goofy edge cases. Reported-by: Yi Lai Signed-off-by: Tao Su Tested-by: Yi Lai Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20240513014003.104593-1-tao1.su@linux.intel.com [sean: tweak name, add comment and sanity check] Signed-off-by: Sean Christopherson --- .../selftests/kvm/include/x86_64/processor.h | 1 + .../testing/selftests/kvm/lib/x86_64/processor.c | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 8eb57de0b5876d..c0c7c1fe93f987 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -277,6 +277,7 @@ struct kvm_x86_cpu_property { #define X86_PROPERTY_MAX_EXT_LEAF KVM_X86_CPU_PROPERTY(0x80000000, 0, EAX, 0, 31) #define X86_PROPERTY_MAX_PHY_ADDR KVM_X86_CPU_PROPERTY(0x80000008, 0, EAX, 0, 7) #define X86_PROPERTY_MAX_VIRT_ADDR KVM_X86_CPU_PROPERTY(0x80000008, 0, EAX, 8, 15) +#define X86_PROPERTY_GUEST_MAX_PHY_ADDR KVM_X86_CPU_PROPERTY(0x80000008, 0, EAX, 16, 23) #define X86_PROPERTY_SEV_C_BIT KVM_X86_CPU_PROPERTY(0x8000001F, 0, EBX, 0, 5) #define X86_PROPERTY_PHYS_ADDR_REDUCTION KVM_X86_CPU_PROPERTY(0x8000001F, 0, EBX, 6, 11) diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index c664e446136bc6..594b061aef5217 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1247,9 +1247,20 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm) { const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */ unsigned long ht_gfn, max_gfn, max_pfn; - uint8_t maxphyaddr; + uint8_t maxphyaddr, guest_maxphyaddr; - max_gfn = (1ULL << (vm->pa_bits - vm->page_shift)) - 1; + /* + * Use "guest MAXPHYADDR" from KVM if it's available. Guest MAXPHYADDR + * enumerates the max _mappable_ GPA, which can be less than the raw + * MAXPHYADDR, e.g. if MAXPHYADDR=52, KVM is using TDP, and the CPU + * doesn't support 5-level TDP. + */ + guest_maxphyaddr = kvm_cpu_property(X86_PROPERTY_GUEST_MAX_PHY_ADDR); + guest_maxphyaddr = guest_maxphyaddr ?: vm->pa_bits; + TEST_ASSERT(guest_maxphyaddr <= vm->pa_bits, + "Guest MAXPHYADDR should never be greater than raw MAXPHYADDR"); + + max_gfn = (1ULL << (guest_maxphyaddr - vm->page_shift)) - 1; /* Avoid reserved HyperTransport region on AMD processors. */ if (!host_cpu_is_amd) From 49f683b41f28918df3e51ddc0d928cb2e934ccdb Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 10 May 2024 02:23:52 -0700 Subject: [PATCH 0619/1578] KVM: Fix a data race on last_boosted_vcpu in kvm_vcpu_on_spin() Use {READ,WRITE}_ONCE() to access kvm->last_boosted_vcpu to ensure the loads and stores are atomic. In the extremely unlikely scenario the compiler tears the stores, it's theoretically possible for KVM to attempt to get a vCPU using an out-of-bounds index, e.g. if the write is split into multiple 8-bit stores, and is paired with a 32-bit load on a VM with 257 vCPUs: CPU0 CPU1 last_boosted_vcpu = 0xff; (last_boosted_vcpu = 0x100) last_boosted_vcpu[15:8] = 0x01; i = (last_boosted_vcpu = 0x1ff) last_boosted_vcpu[7:0] = 0x00; vcpu = kvm->vcpu_array[0x1ff]; As detected by KCSAN: BUG: KCSAN: data-race in kvm_vcpu_on_spin [kvm] / kvm_vcpu_on_spin [kvm] write to 0xffffc90025a92344 of 4 bytes by task 4340 on cpu 16: kvm_vcpu_on_spin (arch/x86/kvm/../../../virt/kvm/kvm_main.c:4112) kvm handle_pause (arch/x86/kvm/vmx/vmx.c:5929) kvm_intel vmx_handle_exit (arch/x86/kvm/vmx/vmx.c:? arch/x86/kvm/vmx/vmx.c:6606) kvm_intel vcpu_run (arch/x86/kvm/x86.c:11107 arch/x86/kvm/x86.c:11211) kvm kvm_arch_vcpu_ioctl_run (arch/x86/kvm/x86.c:?) kvm kvm_vcpu_ioctl (arch/x86/kvm/../../../virt/kvm/kvm_main.c:?) kvm __se_sys_ioctl (fs/ioctl.c:52 fs/ioctl.c:904 fs/ioctl.c:890) __x64_sys_ioctl (fs/ioctl.c:890) x64_sys_call (arch/x86/entry/syscall_64.c:33) do_syscall_64 (arch/x86/entry/common.c:?) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) read to 0xffffc90025a92344 of 4 bytes by task 4342 on cpu 4: kvm_vcpu_on_spin (arch/x86/kvm/../../../virt/kvm/kvm_main.c:4069) kvm handle_pause (arch/x86/kvm/vmx/vmx.c:5929) kvm_intel vmx_handle_exit (arch/x86/kvm/vmx/vmx.c:? arch/x86/kvm/vmx/vmx.c:6606) kvm_intel vcpu_run (arch/x86/kvm/x86.c:11107 arch/x86/kvm/x86.c:11211) kvm kvm_arch_vcpu_ioctl_run (arch/x86/kvm/x86.c:?) kvm kvm_vcpu_ioctl (arch/x86/kvm/../../../virt/kvm/kvm_main.c:?) kvm __se_sys_ioctl (fs/ioctl.c:52 fs/ioctl.c:904 fs/ioctl.c:890) __x64_sys_ioctl (fs/ioctl.c:890) x64_sys_call (arch/x86/entry/syscall_64.c:33) do_syscall_64 (arch/x86/entry/common.c:?) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) value changed: 0x00000012 -> 0x00000000 Fixes: 217ece6129f2 ("KVM: use yield_to instead of sleep in kvm_vcpu_on_spin") Cc: stable@vger.kernel.org Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20240510092353.2261824-1-leitao@debian.org Signed-off-by: Sean Christopherson --- virt/kvm/kvm_main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 14841acb8b959b..843aa68cbcd050 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4025,12 +4025,13 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) { struct kvm *kvm = me->kvm; struct kvm_vcpu *vcpu; - int last_boosted_vcpu = me->kvm->last_boosted_vcpu; + int last_boosted_vcpu; unsigned long i; int yielded = 0; int try = 3; int pass; + last_boosted_vcpu = READ_ONCE(kvm->last_boosted_vcpu); kvm_vcpu_set_in_spin_loop(me, true); /* * We boost the priority of a VCPU that is runnable but not @@ -4068,7 +4069,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) yielded = kvm_vcpu_yield_to(vcpu); if (yielded > 0) { - kvm->last_boosted_vcpu = i; + WRITE_ONCE(kvm->last_boosted_vcpu, i); break; } else if (yielded < 0) { try--; From 74751ef5c1912ebd3e65c3b65f45587e05ce5d36 Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Mon, 13 May 2024 10:39:48 +0000 Subject: [PATCH 0620/1578] perf/core: Fix missing wakeup when waiting for context reference In our production environment, we found many hung tasks which are blocked for more than 18 hours. Their call traces are like this: [346278.191038] __schedule+0x2d8/0x890 [346278.191046] schedule+0x4e/0xb0 [346278.191049] perf_event_free_task+0x220/0x270 [346278.191056] ? init_wait_var_entry+0x50/0x50 [346278.191060] copy_process+0x663/0x18d0 [346278.191068] kernel_clone+0x9d/0x3d0 [346278.191072] __do_sys_clone+0x5d/0x80 [346278.191076] __x64_sys_clone+0x25/0x30 [346278.191079] do_syscall_64+0x5c/0xc0 [346278.191083] ? syscall_exit_to_user_mode+0x27/0x50 [346278.191086] ? do_syscall_64+0x69/0xc0 [346278.191088] ? irqentry_exit_to_user_mode+0x9/0x20 [346278.191092] ? irqentry_exit+0x19/0x30 [346278.191095] ? exc_page_fault+0x89/0x160 [346278.191097] ? asm_exc_page_fault+0x8/0x30 [346278.191102] entry_SYSCALL_64_after_hwframe+0x44/0xae The task was waiting for the refcount become to 1, but from the vmcore, we found the refcount has already been 1. It seems that the task didn't get woken up by perf_event_release_kernel() and got stuck forever. The below scenario may cause the problem. Thread A Thread B ... ... perf_event_free_task perf_event_release_kernel ... acquire event->child_mutex ... get_ctx ... release event->child_mutex acquire ctx->mutex ... perf_free_event (acquire/release event->child_mutex) ... release ctx->mutex wait_var_event acquire ctx->mutex acquire event->child_mutex # move existing events to free_list release event->child_mutex release ctx->mutex put_ctx ... ... In this case, all events of the ctx have been freed, so we couldn't find the ctx in free_list and Thread A will miss the wakeup. It's thus necessary to add a wakeup after dropping the reference. Fixes: 1cf8dfe8a661 ("perf/core: Fix race between close() and fork()") Signed-off-by: Haifeng Xu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Frederic Weisbecker Acked-by: Mark Rutland Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20240513103948.33570-1-haifeng.xu@shopee.com --- kernel/events/core.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/kernel/events/core.c b/kernel/events/core.c index f0128c5ff27896..8f908f0779354d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5384,6 +5384,7 @@ int perf_event_release_kernel(struct perf_event *event) again: mutex_lock(&event->child_mutex); list_for_each_entry(child, &event->child_list, child_list) { + void *var = NULL; /* * Cannot change, child events are not migrated, see the @@ -5424,11 +5425,23 @@ int perf_event_release_kernel(struct perf_event *event) * this can't be the last reference. */ put_event(event); + } else { + var = &ctx->refcount; } mutex_unlock(&event->child_mutex); mutex_unlock(&ctx->mutex); put_ctx(ctx); + + if (var) { + /* + * If perf_event_free_task() has deleted all events from the + * ctx while the child_mutex got released above, make sure to + * notify about the preceding put_ctx(). + */ + smp_mb(); /* pairs with wait_var_event() */ + wake_up_var(var); + } goto again; } mutex_unlock(&event->child_mutex); From f92a59f6d12e31ead999fee9585471b95a8ae8a3 Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Wed, 15 May 2024 13:37:10 +0000 Subject: [PATCH 0621/1578] locking/atomic: scripts: fix ${atomic}_sub_and_test() kerneldoc For ${atomic}_sub_and_test() the @i parameter is the value to subtract, not add. Fix the typo in the kerneldoc template and generate the headers with this update. Fixes: ad8110706f38 ("locking/atomic: scripts: generate kerneldoc comments") Suggested-by: Mark Rutland Signed-off-by: Carlos Llamas Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Reviewed-by: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20240515133844.3502360-1-cmllamas@google.com --- include/linux/atomic/atomic-arch-fallback.h | 6 +++--- include/linux/atomic/atomic-instrumented.h | 8 ++++---- include/linux/atomic/atomic-long.h | 4 ++-- scripts/atomic/kerneldoc/sub_and_test | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/atomic/atomic-arch-fallback.h b/include/linux/atomic/atomic-arch-fallback.h index 956bcba5dbf2f8..2f9d36b72bd89b 100644 --- a/include/linux/atomic/atomic-arch-fallback.h +++ b/include/linux/atomic/atomic-arch-fallback.h @@ -2242,7 +2242,7 @@ raw_atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) /** * raw_atomic_sub_and_test() - atomic subtract and test if zero with full ordering - * @i: int value to add + * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. @@ -4368,7 +4368,7 @@ raw_atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) /** * raw_atomic64_sub_and_test() - atomic subtract and test if zero with full ordering - * @i: s64 value to add + * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. @@ -4690,4 +4690,4 @@ raw_atomic64_dec_if_positive(atomic64_t *v) } #endif /* _LINUX_ATOMIC_FALLBACK_H */ -// 14850c0b0db20c62fdc78ccd1d42b98b88d76331 +// b565db590afeeff0d7c9485ccbca5bb6e155749f diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h index debd487fe97160..9409a6ddf3e0df 100644 --- a/include/linux/atomic/atomic-instrumented.h +++ b/include/linux/atomic/atomic-instrumented.h @@ -1349,7 +1349,7 @@ atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) /** * atomic_sub_and_test() - atomic subtract and test if zero with full ordering - * @i: int value to add + * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. @@ -2927,7 +2927,7 @@ atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) /** * atomic64_sub_and_test() - atomic subtract and test if zero with full ordering - * @i: s64 value to add + * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. @@ -4505,7 +4505,7 @@ atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new) /** * atomic_long_sub_and_test() - atomic subtract and test if zero with full ordering - * @i: long value to add + * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. @@ -5050,4 +5050,4 @@ atomic_long_dec_if_positive(atomic_long_t *v) #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */ -// ce5b65e0f1f8a276268b667194581d24bed219d4 +// 8829b337928e9508259079d32581775ececd415b diff --git a/include/linux/atomic/atomic-long.h b/include/linux/atomic/atomic-long.h index 3ef844b3ab8a3d..f86b29d9087752 100644 --- a/include/linux/atomic/atomic-long.h +++ b/include/linux/atomic/atomic-long.h @@ -1535,7 +1535,7 @@ raw_atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new) /** * raw_atomic_long_sub_and_test() - atomic subtract and test if zero with full ordering - * @i: long value to add + * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. @@ -1809,4 +1809,4 @@ raw_atomic_long_dec_if_positive(atomic_long_t *v) } #endif /* _LINUX_ATOMIC_LONG_H */ -// 1c4a26fc77f345342953770ebe3c4d08e7ce2f9a +// eadf183c3600b8b92b91839dd3be6bcc560c752d diff --git a/scripts/atomic/kerneldoc/sub_and_test b/scripts/atomic/kerneldoc/sub_and_test index d3760f7749d4e7..96615e50836b0a 100644 --- a/scripts/atomic/kerneldoc/sub_and_test +++ b/scripts/atomic/kerneldoc/sub_and_test @@ -1,7 +1,7 @@ cat < Date: Tue, 4 Jun 2024 11:00:22 -0300 Subject: [PATCH 0622/1578] Revert "perf record: Reduce memory for recording PERF_RECORD_LOST_SAMPLES event" This reverts commit 7d1405c71df21f6c394b8a885aa8a133f749fa22. This causes segfaults in some cases, as reported by Milian: ``` sudo /usr/bin/perf record -z --call-graph dwarf -e cycles -e raw_syscalls:sys_enter ls ... [ perf record: Woken up 3 times to write data ] malloc(): invalid next size (unsorted) Aborted ``` Backtrace with GDB + debuginfod: ``` malloc(): invalid next size (unsorted) Thread 1 "perf" received signal SIGABRT, Aborted. __pthread_kill_implementation (threadid=, signo=signo@entry=6, no_tid=no_tid@entry=0) at pthread_kill.c:44 Downloading source file /usr/src/debug/glibc/glibc/nptl/pthread_kill.c 44 return INTERNAL_SYSCALL_ERROR_P (ret) ? INTERNAL_SYSCALL_ERRNO (ret) : 0; (gdb) bt #0 __pthread_kill_implementation (threadid=, signo=signo@entry=6, no_tid=no_tid@entry=0) at pthread_kill.c:44 #1 0x00007ffff6ea8eb3 in __pthread_kill_internal (threadid=, signo=6) at pthread_kill.c:78 #2 0x00007ffff6e50a30 in __GI_raise (sig=sig@entry=6) at ../sysdeps/posix/ raise.c:26 #3 0x00007ffff6e384c3 in __GI_abort () at abort.c:79 #4 0x00007ffff6e39354 in __libc_message_impl (fmt=fmt@entry=0x7ffff6fc22ea "%s\n") at ../sysdeps/posix/libc_fatal.c:132 #5 0x00007ffff6eb3085 in malloc_printerr (str=str@entry=0x7ffff6fc5850 "malloc(): invalid next size (unsorted)") at malloc.c:5772 #6 0x00007ffff6eb657c in _int_malloc (av=av@entry=0x7ffff6ff6ac0 , bytes=bytes@entry=368) at malloc.c:4081 #7 0x00007ffff6eb877e in __libc_calloc (n=, elem_size=) at malloc.c:3754 #8 0x000055555569bdb6 in perf_session.do_write_header () #9 0x00005555555a373a in __cmd_record.constprop.0 () #10 0x00005555555a6846 in cmd_record () #11 0x000055555564db7f in run_builtin () #12 0x000055555558ed77 in main () ``` Valgrind memcheck: ``` ==45136== Invalid write of size 8 ==45136== at 0x2B38A5: perf_event__synthesize_id_sample (in /usr/bin/perf) ==45136== by 0x157069: __cmd_record.constprop.0 (in /usr/bin/perf) ==45136== by 0x15A845: cmd_record (in /usr/bin/perf) ==45136== by 0x201B7E: run_builtin (in /usr/bin/perf) ==45136== by 0x142D76: main (in /usr/bin/perf) ==45136== Address 0x6a866a8 is 0 bytes after a block of size 40 alloc'd ==45136== at 0x4849BF3: calloc (vg_replace_malloc.c:1675) ==45136== by 0x3574AB: zalloc (in /usr/bin/perf) ==45136== by 0x1570E0: __cmd_record.constprop.0 (in /usr/bin/perf) ==45136== by 0x15A845: cmd_record (in /usr/bin/perf) ==45136== by 0x201B7E: run_builtin (in /usr/bin/perf) ==45136== by 0x142D76: main (in /usr/bin/perf) ==45136== ==45136== Syscall param write(buf) points to unaddressable byte(s) ==45136== at 0x575953D: __libc_write (write.c:26) ==45136== by 0x575953D: write (write.c:24) ==45136== by 0x35761F: ion (in /usr/bin/perf) ==45136== by 0x357778: writen (in /usr/bin/perf) ==45136== by 0x1548F7: record__write (in /usr/bin/perf) ==45136== by 0x15708A: __cmd_record.constprop.0 (in /usr/bin/perf) ==45136== by 0x15A845: cmd_record (in /usr/bin/perf) ==45136== by 0x201B7E: run_builtin (in /usr/bin/perf) ==45136== by 0x142D76: main (in /usr/bin/perf) ==45136== Address 0x6a866a8 is 0 bytes after a block of size 40 alloc'd ==45136== at 0x4849BF3: calloc (vg_replace_malloc.c:1675) ==45136== by 0x3574AB: zalloc (in /usr/bin/perf) ==45136== by 0x1570E0: __cmd_record.constprop.0 (in /usr/bin/perf) ==45136== by 0x15A845: cmd_record (in /usr/bin/perf) ==45136== by 0x201B7E: run_builtin (in /usr/bin/perf) ==45136== by 0x142D76: main (in /usr/bin/perf) ==45136== ----- Closes: https://lore.kernel.org/linux-perf-users/23879991.0LEYPuXRzz@milian-workstation/ Reported-by: Milian Wolff Tested-by: Milian Wolff Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: stable@kernel.org # 6.8+ Link: https://lore.kernel.org/lkml/Zl9ksOlHJHnKM70p@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 66a3de8ac66186..0a8ba1323d64be 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1956,8 +1956,7 @@ static void record__read_lost_samples(struct record *rec) if (count.lost) { if (!lost) { - lost = zalloc(sizeof(*lost) + - session->machines.host.id_hdr_size); + lost = zalloc(PERF_SAMPLE_MAX_SIZE); if (!lost) { pr_debug("Memory allocation failed\n"); return; @@ -1973,8 +1972,7 @@ static void record__read_lost_samples(struct record *rec) lost_count = perf_bpf_filter__lost_count(evsel); if (lost_count) { if (!lost) { - lost = zalloc(sizeof(*lost) + - session->machines.host.id_hdr_size); + lost = zalloc(PERF_SAMPLE_MAX_SIZE); if (!lost) { pr_debug("Memory allocation failed\n"); return; From ca9680821dfec73c9100860bda4fab1f1309722e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 21 May 2024 10:07:40 -0700 Subject: [PATCH 0623/1578] perf bpf: Fix handling of minimal vmlinux.h file when interrupting the build Ingo reported that he was seeing these when hitting Control+C during a perf tools build: Makefile.perf:1149: *** Missing bpftool input for generating vmlinux.h. Stop. The failure happens when you don't have vmlinux.h or vmlinux with BTF. ifeq ($(VMLINUX_H),) ifeq ($(VMLINUX_BTF),) $(error Missing bpftool input for generating vmlinux.h) endif endif VMLINUX_BTF can be empty if you didn't build a kernel or it doesn't have a BTF section and the current kernel also has no BTF. This is totally ok. But VMLINUX_H should be set to the minimal version in the source tree (unless you overwrite it manually) when you don't pass GEN_VMLINUX_H=1 (which requires VMLINUX_BTF should not be empty). The problem is that it's defined in Makefile.config which is not included for `make clean`. Reported-by: Ingo Molnar Signed-off-by: Namhyung Kim Tested-by: Ingo Molnar Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Link: http://lore.kernel.org/lkml/CAM9d7ch5HTr+k+_GpbMrX0HUo5BZ11byh1xq0Two7B7RQACuNw@mail.gmail.com Link: http://lore.kernel.org/lkml/ZjssGrj+abyC6mYP@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 5c35c0d8930696..e6d56b55536958 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -214,6 +214,7 @@ NON_CONFIG_TARGETS := clean python-clean TAGS tags cscope help ifdef MAKECMDGOALS ifeq ($(filter-out $(NON_CONFIG_TARGETS),$(MAKECMDGOALS)),) + VMLINUX_H=$(src-perf)/util/bpf_skel/vmlinux/vmlinux.h config := 0 endif endif From 9a64e1bfd8a10c015c41fe0d289e89862486c50d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 2 Jun 2024 22:52:24 -0400 Subject: [PATCH 0624/1578] bcachefs: Fix GFP_KERNEL allocation in break_cycle() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index c3e9b0cc7bbdc9..d66fff22109ae7 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -215,6 +215,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) if (unlikely(!best)) { struct printbuf buf = PRINTBUF; + buf.atomic++; prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks")); From fdccb24352e589bb59c9ba90f23c4e0994b90518 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 2 Jun 2024 22:25:18 -0400 Subject: [PATCH 0625/1578] bcachefs: Rereplicate now moves data off of durability=0 devices This fixes an issue where setting a device to durability=0 after it's been used makes it impossible to remove. Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 8171f947fac8f8..2706d82423450c 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -920,7 +920,20 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg, ? c->opts.metadata_replicas : io_opts->data_replicas; - if (!nr_good || nr_good >= replicas) + rcu_read_lock(); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + unsigned i = 0; + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ptr->cached && + (!ca || !ca->mi.durability)) + data_opts->kill_ptrs |= BIT(i); + i++; + } + rcu_read_unlock(); + + if (!data_opts->kill_ptrs && + (!nr_good || nr_good >= replicas)) return false; data_opts->target = 0; From 319fef29e96524966bb8593117ce0c5867846eea Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 3 Jun 2024 18:00:48 -0400 Subject: [PATCH 0626/1578] bcachefs: Fix trans->locked assert in bch2_move_data_btree, we might start with the trans unlocked from a previous loop iteration - we need a trans_begin() before iter_init(). Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 2706d82423450c..6e477fadaa2a5b 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -547,6 +547,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, ctxt->stats->pos = BBPOS(btree_id, start); } + bch2_trans_begin(trans); bch2_trans_iter_init(trans, &iter, btree_id, start, BTREE_ITER_prefetch| BTREE_ITER_all_snapshots); From d38e48563c1f70460503de7ffff9a7f46b54b67d Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Mon, 3 Jun 2024 21:03:42 +0200 Subject: [PATCH 0627/1578] s390/crash: Do not use VM info if os_info does not have it The virtual memory information stored in os_info area is required for creation of the kernel image PT_LOAD program header for kernels since commit a2ec5bec56dd ("s390/mm: uncouple physical vs virtual address spaces"). By contrast, if such information in os_info is absent the PT_LOAD program header should not be created. Currently the proper PT_LOAD program header is created for kernels that contain the virtual memory information, but for kernels without one an invalid header of zero size is created. That in turn leads to stand-alone dump failures. Use OS_INFO_KASLR_OFFSET variable to check whether os_info is present or not (same as crash and makedumpfile tools do) and based on that create or do not create the kernel image PT_LOAD program header. Fixes: f4cac27dc0d6 ("s390/crash: Use old os_info to create PT_LOAD headers") Tested-by: Mikhail Zaslonko Acked-by: Mikhail Zaslonko Acked-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/kernel/crash_dump.c | 54 +++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index 9863ebe75019a6..edae1341619600 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -451,7 +451,7 @@ static void *nt_final(void *ptr) /* * Initialize ELF header (new kernel) */ -static void *ehdr_init(Elf64_Ehdr *ehdr, int mem_chunk_cnt) +static void *ehdr_init(Elf64_Ehdr *ehdr, int phdr_count) { memset(ehdr, 0, sizeof(*ehdr)); memcpy(ehdr->e_ident, ELFMAG, SELFMAG); @@ -465,11 +465,8 @@ static void *ehdr_init(Elf64_Ehdr *ehdr, int mem_chunk_cnt) ehdr->e_phoff = sizeof(Elf64_Ehdr); ehdr->e_ehsize = sizeof(Elf64_Ehdr); ehdr->e_phentsize = sizeof(Elf64_Phdr); - /* - * Number of memory chunk PT_LOAD program headers plus one kernel - * image PT_LOAD program header plus one PT_NOTE program header. - */ - ehdr->e_phnum = mem_chunk_cnt + 1 + 1; + /* Number of PT_LOAD program headers plus PT_NOTE program header */ + ehdr->e_phnum = phdr_count + 1; return ehdr + 1; } @@ -503,12 +500,14 @@ static int get_mem_chunk_cnt(void) /* * Initialize ELF loads (new kernel) */ -static void loads_init(Elf64_Phdr *phdr) +static void loads_init(Elf64_Phdr *phdr, bool os_info_has_vm) { - unsigned long old_identity_base = os_info_old_value(OS_INFO_IDENTITY_BASE); + unsigned long old_identity_base = 0; phys_addr_t start, end; u64 idx; + if (os_info_has_vm) + old_identity_base = os_info_old_value(OS_INFO_IDENTITY_BASE); for_each_physmem_range(idx, &oldmem_type, &start, &end) { phdr->p_type = PT_LOAD; phdr->p_vaddr = old_identity_base + start; @@ -522,6 +521,11 @@ static void loads_init(Elf64_Phdr *phdr) } } +static bool os_info_has_vm(void) +{ + return os_info_old_value(OS_INFO_KASLR_OFFSET); +} + /* * Prepare PT_LOAD type program header for kernel image region */ @@ -566,7 +570,7 @@ static void *notes_init(Elf64_Phdr *phdr, void *ptr, u64 notes_offset) return ptr; } -static size_t get_elfcorehdr_size(int mem_chunk_cnt) +static size_t get_elfcorehdr_size(int phdr_count) { size_t size; @@ -581,10 +585,8 @@ static size_t get_elfcorehdr_size(int mem_chunk_cnt) size += nt_vmcoreinfo_size(); /* nt_final */ size += sizeof(Elf64_Nhdr); - /* PT_LOAD type program header for kernel text region */ - size += sizeof(Elf64_Phdr); /* PT_LOADS */ - size += mem_chunk_cnt * sizeof(Elf64_Phdr); + size += phdr_count * sizeof(Elf64_Phdr); return size; } @@ -595,8 +597,8 @@ static size_t get_elfcorehdr_size(int mem_chunk_cnt) int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) { Elf64_Phdr *phdr_notes, *phdr_loads, *phdr_text; + int mem_chunk_cnt, phdr_text_cnt; size_t alloc_size; - int mem_chunk_cnt; void *ptr, *hdr; u64 hdr_off; @@ -615,12 +617,14 @@ int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) } mem_chunk_cnt = get_mem_chunk_cnt(); + phdr_text_cnt = os_info_has_vm() ? 1 : 0; - alloc_size = get_elfcorehdr_size(mem_chunk_cnt); + alloc_size = get_elfcorehdr_size(mem_chunk_cnt + phdr_text_cnt); hdr = kzalloc(alloc_size, GFP_KERNEL); - /* Without elfcorehdr /proc/vmcore cannot be created. Thus creating + /* + * Without elfcorehdr /proc/vmcore cannot be created. Thus creating * a dump with this crash kernel will fail. Panic now to allow other * dump mechanisms to take over. */ @@ -628,21 +632,23 @@ int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) panic("s390 kdump allocating elfcorehdr failed"); /* Init elf header */ - ptr = ehdr_init(hdr, mem_chunk_cnt); + phdr_notes = ehdr_init(hdr, mem_chunk_cnt + phdr_text_cnt); /* Init program headers */ - phdr_notes = ptr; - ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr)); - phdr_text = ptr; - ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr)); - phdr_loads = ptr; - ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr) * mem_chunk_cnt); + if (phdr_text_cnt) { + phdr_text = phdr_notes + 1; + phdr_loads = phdr_text + 1; + } else { + phdr_loads = phdr_notes + 1; + } + ptr = PTR_ADD(phdr_loads, sizeof(Elf64_Phdr) * mem_chunk_cnt); /* Init notes */ hdr_off = PTR_DIFF(ptr, hdr); ptr = notes_init(phdr_notes, ptr, ((unsigned long) hdr) + hdr_off); /* Init kernel text program header */ - text_init(phdr_text); + if (phdr_text_cnt) + text_init(phdr_text); /* Init loads */ - loads_init(phdr_loads); + loads_init(phdr_loads, phdr_text_cnt); /* Finalize program headers */ hdr_off = PTR_DIFF(ptr, hdr); *addr = (unsigned long long) hdr; From 01c51a32dc18f128d2e55a7b2128b77fc01a2285 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 12 Apr 2024 10:43:29 +0200 Subject: [PATCH 0628/1578] KVM: s390x: selftests: Add shared zeropage test Let's test that we can have shared zeropages in our process as long as storage keys are not getting used, that shared zeropages are properly unshared (replaced by anonymous pages) once storage keys are enabled, and that no new shared zeropages are populated after storage keys were enabled. We require the new pagemap interface to detect the shared zeropage. On an old kernel (zeropages always disabled): # ./s390x/shared_zeropage_test TAP version 13 1..3 not ok 1 Shared zeropages should be enabled ok 2 Shared zeropage should be gone ok 3 Shared zeropages should be disabled # Totals: pass:2 fail:1 xfail:0 xpass:0 skip:0 error:0 On a fixed kernel: # ./s390x/shared_zeropage_test TAP version 13 1..3 ok 1 Shared zeropages should be enabled ok 2 Shared zeropage should be gone ok 3 Shared zeropages should be disabled # Totals: pass:3 fail:0 xfail:0 xpass:0 skip:0 error:0 Testing of UFFDIO_ZEROPAGE can be added later. [ agordeev: Fixed checkpatch complaint, added ucall_common.h include ] Cc: Christian Borntraeger Cc: Janosch Frank Cc: Claudio Imbrenda Cc: Thomas Huth Cc: Alexander Gordeev Cc: Paolo Bonzini Cc: Shuah Khan Signed-off-by: David Hildenbrand Acked-by: Christian Borntraeger Acked-by: Muhammad Usama Anjum Tested-by: Alexander Gordeev Link: https://lore.kernel.org/r/20240412084329.30315-1-david@redhat.com Signed-off-by: Alexander Gordeev --- tools/testing/selftests/kvm/Makefile | 1 + .../kvm/s390x/shared_zeropage_test.c | 111 ++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100644 tools/testing/selftests/kvm/s390x/shared_zeropage_test.c diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index ce8ff8e8ce3a25..ac280dcba996df 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -183,6 +183,7 @@ TEST_GEN_PROGS_s390x += s390x/sync_regs_test TEST_GEN_PROGS_s390x += s390x/tprot TEST_GEN_PROGS_s390x += s390x/cmma_test TEST_GEN_PROGS_s390x += s390x/debug_test +TEST_GEN_PROGS_s390x += s390x/shared_zeropage_test TEST_GEN_PROGS_s390x += demand_paging_test TEST_GEN_PROGS_s390x += dirty_log_test TEST_GEN_PROGS_s390x += guest_print_test diff --git a/tools/testing/selftests/kvm/s390x/shared_zeropage_test.c b/tools/testing/selftests/kvm/s390x/shared_zeropage_test.c new file mode 100644 index 00000000000000..bba0d9a6dcc874 --- /dev/null +++ b/tools/testing/selftests/kvm/s390x/shared_zeropage_test.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Test shared zeropage handling (with/without storage keys) + * + * Copyright (C) 2024, Red Hat, Inc. + */ +#include + +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "kselftest.h" +#include "ucall_common.h" + +static void set_storage_key(void *addr, uint8_t skey) +{ + asm volatile("sske %0,%1" : : "d" (skey), "a" (addr)); +} + +static void guest_code(void) +{ + /* Issue some storage key instruction. */ + set_storage_key((void *)0, 0x98); + GUEST_DONE(); +} + +/* + * Returns 1 if the shared zeropage is mapped, 0 if something else is mapped. + * Returns < 0 on error or if nothing is mapped. + */ +static int maps_shared_zeropage(int pagemap_fd, void *addr) +{ + struct page_region region; + struct pm_scan_arg arg = { + .start = (uintptr_t)addr, + .end = (uintptr_t)addr + 4096, + .vec = (uintptr_t)®ion, + .vec_len = 1, + .size = sizeof(struct pm_scan_arg), + .category_mask = PAGE_IS_PFNZERO, + .category_anyof_mask = PAGE_IS_PRESENT, + .return_mask = PAGE_IS_PFNZERO, + }; + return ioctl(pagemap_fd, PAGEMAP_SCAN, &arg); +} + +int main(int argc, char *argv[]) +{ + char *mem, *page0, *page1, *page2, tmp; + const size_t pagesize = getpagesize(); + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + int pagemap_fd; + + ksft_print_header(); + ksft_set_plan(3); + + /* + * We'll use memory that is not mapped into the VM for simplicity. + * Shared zeropages are enabled/disabled per-process. + */ + mem = mmap(0, 3 * pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0); + TEST_ASSERT(mem != MAP_FAILED, "mmap() failed"); + + /* Disable THP. Ignore errors on older kernels. */ + madvise(mem, 3 * pagesize, MADV_NOHUGEPAGE); + + page0 = mem; + page1 = page0 + pagesize; + page2 = page1 + pagesize; + + /* Can we even detect shared zeropages? */ + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + TEST_REQUIRE(pagemap_fd >= 0); + + tmp = *page0; + asm volatile("" : "+r" (tmp)); + TEST_REQUIRE(maps_shared_zeropage(pagemap_fd, page0) == 1); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + /* Verify that we get the shared zeropage after VM creation. */ + tmp = *page1; + asm volatile("" : "+r" (tmp)); + ksft_test_result(maps_shared_zeropage(pagemap_fd, page1) == 1, + "Shared zeropages should be enabled\n"); + + /* + * Let our VM execute a storage key instruction that should + * unshare all shared zeropages. + */ + vcpu_run(vcpu); + get_ucall(vcpu, &uc); + TEST_ASSERT_EQ(uc.cmd, UCALL_DONE); + + /* Verify that we don't have a shared zeropage anymore. */ + ksft_test_result(!maps_shared_zeropage(pagemap_fd, page1), + "Shared zeropage should be gone\n"); + + /* Verify that we don't get any new shared zeropages. */ + tmp = *page2; + asm volatile("" : "+r" (tmp)); + ksft_test_result(!maps_shared_zeropage(pagemap_fd, page2), + "Shared zeropages should be disabled\n"); + + kvm_vm_free(vm); + + ksft_finished(); +} From 0841ea4a3b416554be401a91aa267b7de838de8b Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 3 Jun 2024 19:22:22 +0800 Subject: [PATCH 0629/1578] iomap: keep on increasing i_size in iomap_write_end() Commit '943bc0882ceb ("iomap: don't increase i_size if it's not a write operation")' breaks xfs with realtime device on generic/561, the problem is when unaligned truncate down a xfs realtime inode with rtextsize > 1 fs block, xfs only zero out the EOF block but doesn't zero out the tail blocks that aligned to rtextsize, so if we don't increase i_size in iomap_write_end(), it could expose stale data after we do an append write beyond the aligned EOF block. xfs should zero out the tail blocks when truncate down, but before we finish that, let's fix the issue by just revert the changes in iomap_write_end(). Fixes: 943bc0882ceb ("iomap: don't increase i_size if it's not a write operation") Reported-by: Chandan Babu R Link: https://lore.kernel.org/linux-xfs/0b92a215-9d9b-3788-4504-a520778953c2@huaweicloud.com Signed-off-by: Zhang Yi Link: https://lore.kernel.org/r/20240603112222.2109341-1-yi.zhang@huaweicloud.com Tested-by: Chandan Babu R Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 53 +++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index c5802a4593345a..bd70fcbc168ea1 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -877,22 +877,37 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, size_t copied, struct folio *folio) { const struct iomap *srcmap = iomap_iter_srcmap(iter); + loff_t old_size = iter->inode->i_size; + size_t written; if (srcmap->type == IOMAP_INLINE) { iomap_write_end_inline(iter, folio, pos, copied); - return true; + written = copied; + } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) { + written = block_write_end(NULL, iter->inode->i_mapping, pos, + len, copied, &folio->page, NULL); + WARN_ON_ONCE(written != copied && written != 0); + } else { + written = __iomap_write_end(iter->inode, pos, len, copied, + folio) ? copied : 0; } - if (srcmap->flags & IOMAP_F_BUFFER_HEAD) { - size_t bh_written; - - bh_written = block_write_end(NULL, iter->inode->i_mapping, pos, - len, copied, &folio->page, NULL); - WARN_ON_ONCE(bh_written != copied && bh_written != 0); - return bh_written == copied; + /* + * Update the in-memory inode size after copying the data into the page + * cache. It's up to the file system to write the updated size to disk, + * preferably after I/O completion so that no stale data is exposed. + * Only once that's done can we unlock and release the folio. + */ + if (pos + written > old_size) { + i_size_write(iter->inode, pos + written); + iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; } + __iomap_put_folio(iter, pos, written, folio); - return __iomap_write_end(iter->inode, pos, len, copied, folio); + if (old_size < pos) + pagecache_isize_extended(iter->inode, old_size, pos); + + return written == copied; } static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) @@ -907,7 +922,6 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) do { struct folio *folio; - loff_t old_size; size_t offset; /* Offset into folio */ size_t bytes; /* Bytes to write to folio */ size_t copied; /* Bytes copied from user */ @@ -959,23 +973,6 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) written = iomap_write_end(iter, pos, bytes, copied, folio) ? copied : 0; - /* - * Update the in-memory inode size after copying the data into - * the page cache. It's up to the file system to write the - * updated size to disk, preferably after I/O completion so that - * no stale data is exposed. Only once that's done can we - * unlock and release the folio. - */ - old_size = iter->inode->i_size; - if (pos + written > old_size) { - i_size_write(iter->inode, pos + written); - iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; - } - __iomap_put_folio(iter, pos, written, folio); - - if (old_size < pos) - pagecache_isize_extended(iter->inode, old_size, pos); - cond_resched(); if (unlikely(written == 0)) { /* @@ -1346,7 +1343,6 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) bytes = folio_size(folio) - offset; ret = iomap_write_end(iter, pos, bytes, bytes, folio); - __iomap_put_folio(iter, pos, bytes, folio); if (WARN_ON_ONCE(!ret)) return -EIO; @@ -1412,7 +1408,6 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) folio_mark_accessed(folio); ret = iomap_write_end(iter, pos, bytes, bytes, folio); - __iomap_put_folio(iter, pos, bytes, folio); if (WARN_ON_ONCE(!ret)) return -EIO; From f5ceb1bbc98c69536d4673a97315e8427e67de1b Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Tue, 7 May 2024 14:25:42 +0530 Subject: [PATCH 0630/1578] iomap: Fix iomap_adjust_read_range for plen calculation If the extent spans the block that contains i_size, we need to handle both halves separately so that we properly zero data in the page cache for blocks that are entirely outside of i_size. But this is needed only when i_size is within the current folio under processing. "orig_pos + length > isize" can be true for all folios if the mapped extent length is greater than the folio size. That is making plen to break for every folio instead of only the last folio. So use orig_plen for checking if "orig_pos + orig_plen > isize". Signed-off-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/a32e5f9a4fcfdb99077300c4020ed7ae61d6e0f9.1715067055.git.ritesh.list@gmail.com Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Jan Kara cc: Ojaswin Mujoo Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index bd70fcbc168ea1..d4655899027905 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -241,6 +241,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, unsigned block_size = (1 << block_bits); size_t poff = offset_in_folio(folio, *pos); size_t plen = min_t(loff_t, folio_size(folio) - poff, length); + size_t orig_plen = plen; unsigned first = poff >> block_bits; unsigned last = (poff + plen - 1) >> block_bits; @@ -277,7 +278,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, * handle both halves separately so that we properly zero data in the * page cache for blocks that are entirely outside of i_size. */ - if (orig_pos <= isize && orig_pos + length > isize) { + if (orig_pos <= isize && orig_pos + orig_plen > isize) { unsigned end = offset_in_folio(folio, isize - 1) >> block_bits; if (first <= end && last > end) From 9d274c19a71b3a276949933859610721a453946b Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 24 May 2024 13:58:11 -0700 Subject: [PATCH 0631/1578] btrfs: fix crash on racing fsync and size-extending write into prealloc We have been seeing crashes on duplicate keys in btrfs_set_item_key_safe(): BTRFS critical (device vdb): slot 4 key (450 108 8192) new key (450 108 8192) ------------[ cut here ]------------ kernel BUG at fs/btrfs/ctree.c:2620! invalid opcode: 0000 [#1] PREEMPT SMP PTI CPU: 0 PID: 3139 Comm: xfs_io Kdump: loaded Not tainted 6.9.0 #6 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-2.fc40 04/01/2014 RIP: 0010:btrfs_set_item_key_safe+0x11f/0x290 [btrfs] With the following stack trace: #0 btrfs_set_item_key_safe (fs/btrfs/ctree.c:2620:4) #1 btrfs_drop_extents (fs/btrfs/file.c:411:4) #2 log_one_extent (fs/btrfs/tree-log.c:4732:9) #3 btrfs_log_changed_extents (fs/btrfs/tree-log.c:4955:9) #4 btrfs_log_inode (fs/btrfs/tree-log.c:6626:9) #5 btrfs_log_inode_parent (fs/btrfs/tree-log.c:7070:8) #6 btrfs_log_dentry_safe (fs/btrfs/tree-log.c:7171:8) #7 btrfs_sync_file (fs/btrfs/file.c:1933:8) #8 vfs_fsync_range (fs/sync.c:188:9) #9 vfs_fsync (fs/sync.c:202:9) #10 do_fsync (fs/sync.c:212:9) #11 __do_sys_fdatasync (fs/sync.c:225:9) #12 __se_sys_fdatasync (fs/sync.c:223:1) #13 __x64_sys_fdatasync (fs/sync.c:223:1) #14 do_syscall_x64 (arch/x86/entry/common.c:52:14) #15 do_syscall_64 (arch/x86/entry/common.c:83:7) #16 entry_SYSCALL_64+0xaf/0x14c (arch/x86/entry/entry_64.S:121) So we're logging a changed extent from fsync, which is splitting an extent in the log tree. But this split part already exists in the tree, triggering the BUG(). This is the state of the log tree at the time of the crash, dumped with drgn (https://github.com/osandov/drgn/blob/main/contrib/btrfs_tree.py) to get more details than btrfs_print_leaf() gives us: >>> print_extent_buffer(prog.crashed_thread().stack_trace()[0]["eb"]) leaf 33439744 level 0 items 72 generation 9 owner 18446744073709551610 leaf 33439744 flags 0x100000000000000 fs uuid e5bd3946-400c-4223-8923-190ef1f18677 chunk uuid d58cb17e-6d02-494a-829a-18b7d8a399da item 0 key (450 INODE_ITEM 0) itemoff 16123 itemsize 160 generation 7 transid 9 size 8192 nbytes 8473563889606862198 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 sequence 204 flags 0x10(PREALLOC) atime 1716417703.220000000 (2024-05-22 15:41:43) ctime 1716417704.983333333 (2024-05-22 15:41:44) mtime 1716417704.983333333 (2024-05-22 15:41:44) otime 17592186044416.000000000 (559444-03-08 01:40:16) item 1 key (450 INODE_REF 256) itemoff 16110 itemsize 13 index 195 namelen 3 name: 193 item 2 key (450 XATTR_ITEM 1640047104) itemoff 16073 itemsize 37 location key (0 UNKNOWN.0 0) type XATTR transid 7 data_len 1 name_len 6 name: user.a data a item 3 key (450 EXTENT_DATA 0) itemoff 16020 itemsize 53 generation 9 type 1 (regular) extent data disk byte 303144960 nr 12288 extent data offset 0 nr 4096 ram 12288 extent compression 0 (none) item 4 key (450 EXTENT_DATA 4096) itemoff 15967 itemsize 53 generation 9 type 2 (prealloc) prealloc data disk byte 303144960 nr 12288 prealloc data offset 4096 nr 8192 item 5 key (450 EXTENT_DATA 8192) itemoff 15914 itemsize 53 generation 9 type 2 (prealloc) prealloc data disk byte 303144960 nr 12288 prealloc data offset 8192 nr 4096 ... So the real problem happened earlier: notice that items 4 (4k-12k) and 5 (8k-12k) overlap. Both are prealloc extents. Item 4 straddles i_size and item 5 starts at i_size. Here is the state of the filesystem tree at the time of the crash: >>> root = prog.crashed_thread().stack_trace()[2]["inode"].root >>> ret, nodes, slots = btrfs_search_slot(root, BtrfsKey(450, 0, 0)) >>> print_extent_buffer(nodes[0]) leaf 30425088 level 0 items 184 generation 9 owner 5 leaf 30425088 flags 0x100000000000000 fs uuid e5bd3946-400c-4223-8923-190ef1f18677 chunk uuid d58cb17e-6d02-494a-829a-18b7d8a399da ... item 179 key (450 INODE_ITEM 0) itemoff 4907 itemsize 160 generation 7 transid 7 size 4096 nbytes 12288 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 sequence 6 flags 0x10(PREALLOC) atime 1716417703.220000000 (2024-05-22 15:41:43) ctime 1716417703.220000000 (2024-05-22 15:41:43) mtime 1716417703.220000000 (2024-05-22 15:41:43) otime 1716417703.220000000 (2024-05-22 15:41:43) item 180 key (450 INODE_REF 256) itemoff 4894 itemsize 13 index 195 namelen 3 name: 193 item 181 key (450 XATTR_ITEM 1640047104) itemoff 4857 itemsize 37 location key (0 UNKNOWN.0 0) type XATTR transid 7 data_len 1 name_len 6 name: user.a data a item 182 key (450 EXTENT_DATA 0) itemoff 4804 itemsize 53 generation 9 type 1 (regular) extent data disk byte 303144960 nr 12288 extent data offset 0 nr 8192 ram 12288 extent compression 0 (none) item 183 key (450 EXTENT_DATA 8192) itemoff 4751 itemsize 53 generation 9 type 2 (prealloc) prealloc data disk byte 303144960 nr 12288 prealloc data offset 8192 nr 4096 Item 5 in the log tree corresponds to item 183 in the filesystem tree, but nothing matches item 4. Furthermore, item 183 is the last item in the leaf. btrfs_log_prealloc_extents() is responsible for logging prealloc extents beyond i_size. It first truncates any previously logged prealloc extents that start beyond i_size. Then, it walks the filesystem tree and copies the prealloc extent items to the log tree. If it hits the end of a leaf, then it calls btrfs_next_leaf(), which unlocks the tree and does another search. However, while the filesystem tree is unlocked, an ordered extent completion may modify the tree. In particular, it may insert an extent item that overlaps with an extent item that was already copied to the log tree. This may manifest in several ways depending on the exact scenario, including an EEXIST error that is silently translated to a full sync, overlapping items in the log tree, or this crash. This particular crash is triggered by the following sequence of events: - Initially, the file has i_size=4k, a regular extent from 0-4k, and a prealloc extent beyond i_size from 4k-12k. The prealloc extent item is the last item in its B-tree leaf. - The file is fsync'd, which copies its inode item and both extent items to the log tree. - An xattr is set on the file, which sets the BTRFS_INODE_COPY_EVERYTHING flag. - The range 4k-8k in the file is written using direct I/O. i_size is extended to 8k, but the ordered extent is still in flight. - The file is fsync'd. Since BTRFS_INODE_COPY_EVERYTHING is set, this calls copy_inode_items_to_log(), which calls btrfs_log_prealloc_extents(). - btrfs_log_prealloc_extents() finds the 4k-12k prealloc extent in the filesystem tree. Since it starts before i_size, it skips it. Since it is the last item in its B-tree leaf, it calls btrfs_next_leaf(). - btrfs_next_leaf() unlocks the path. - The ordered extent completion runs, which converts the 4k-8k part of the prealloc extent to written and inserts the remaining prealloc part from 8k-12k. - btrfs_next_leaf() does a search and finds the new prealloc extent 8k-12k. - btrfs_log_prealloc_extents() copies the 8k-12k prealloc extent into the log tree. Note that it overlaps with the 4k-12k prealloc extent that was copied to the log tree by the first fsync. - fsync calls btrfs_log_changed_extents(), which tries to log the 4k-8k extent that was written. - This tries to drop the range 4k-8k in the log tree, which requires adjusting the start of the 4k-12k prealloc extent in the log tree to 8k. - btrfs_set_item_key_safe() sees that there is already an extent starting at 8k in the log tree and calls BUG(). Fix this by detecting when we're about to insert an overlapping file extent item in the log tree and truncating the part that would overlap. CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Filipe Manana Signed-off-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 5146387b416bfe..26a2e5aa08e9ca 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4860,18 +4860,23 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, path->slots[0]++; continue; } - if (!dropped_extents) { - /* - * Avoid logging extent items logged in past fsync calls - * and leading to duplicate keys in the log tree. - */ + /* + * Avoid overlapping items in the log tree. The first time we + * get here, get rid of everything from a past fsync. After + * that, if the current extent starts before the end of the last + * extent we copied, truncate the last one. This can happen if + * an ordered extent completion modifies the subvolume tree + * while btrfs_next_leaf() has the tree unlocked. + */ + if (!dropped_extents || key.offset < truncate_offset) { ret = truncate_inode_items(trans, root->log_root, inode, - truncate_offset, + min(key.offset, truncate_offset), BTRFS_EXTENT_DATA_KEY); if (ret) goto out; dropped_extents = true; } + truncate_offset = btrfs_file_extent_end(path); if (ins_nr == 0) start_slot = slot; ins_nr++; From fb33eb2ef0d88e75564983ef057b44c5b7e4fded Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 3 Jun 2024 12:49:08 +0100 Subject: [PATCH 0632/1578] btrfs: fix leak of qgroup extent records after transaction abort Qgroup extent records are created when delayed ref heads are created and then released after accounting extents at btrfs_qgroup_account_extents(), called during the transaction commit path. If a transaction is aborted we free the qgroup records by calling btrfs_qgroup_destroy_extent_records() at btrfs_destroy_delayed_refs(), unless we don't have delayed references. We are incorrectly assuming that no delayed references means we don't have qgroup extents records. We can currently have no delayed references because we ran them all during a transaction commit and the transaction was aborted after that due to some error in the commit path. So fix this by ensuring we btrfs_qgroup_destroy_extent_records() at btrfs_destroy_delayed_refs() even if we don't have any delayed references. Reported-by: syzbot+0fecc032fa134afd49df@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/0000000000004e7f980619f91835@google.com/ Fixes: 81f7eb00ff5b ("btrfs: destroy qgroup extent records on transaction abort") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Josef Bacik Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a91a8056758a32..242ada7e47b405 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4538,18 +4538,10 @@ static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info) { struct rb_node *node; - struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs; struct btrfs_delayed_ref_node *ref; - delayed_refs = &trans->delayed_refs; - spin_lock(&delayed_refs->lock); - if (atomic_read(&delayed_refs->num_entries) == 0) { - spin_unlock(&delayed_refs->lock); - btrfs_debug(fs_info, "delayed_refs has NO entry"); - return; - } - while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) { struct btrfs_delayed_ref_head *head; struct rb_node *n; From 267cace556e8a53d703119f7435ab556209e5b6a Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Sun, 26 May 2024 07:59:08 -0500 Subject: [PATCH 0633/1578] drm/amd: Fix shutdown (again) on some SMU v13.0.4/11 platforms commit cd94d1b182d2 ("dm/amd/pm: Fix problems with reboot/shutdown for some SMU 13.0.4/13.0.11 users") attempted to fix shutdown issues that were reported since commit 31729e8c21ec ("drm/amd/pm: fixes a random hang in S4 for SMU v13.0.4/11") but caused issues for some people. Adjust the workaround flow to properly only apply in the S4 case: -> For shutdown go through SMU_MSG_PrepareMp1ForUnload -> For S4 go through SMU_MSG_GfxDeviceDriverReset and SMU_MSG_PrepareMp1ForUnload Reported-and-tested-by: lectrode Closes: https://github.com/void-linux/void-packages/issues/50417 Cc: stable@vger.kernel.org Fixes: cd94d1b182d2 ("dm/amd/pm: Fix problems with reboot/shutdown for some SMU 13.0.4/13.0.11 users") Reviewed-by: Tim Huang Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher --- .../drm/amd/pm/swsmu/smu13/smu_v13_0_4_ppt.c | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_4_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_4_ppt.c index bc241b593db155..b6257f34a7c657 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_4_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_4_ppt.c @@ -226,15 +226,17 @@ static int smu_v13_0_4_system_features_control(struct smu_context *smu, bool en) struct amdgpu_device *adev = smu->adev; int ret = 0; - if (!en && adev->in_s4) { - /* Adds a GFX reset as workaround just before sending the - * MP1_UNLOAD message to prevent GC/RLC/PMFW from entering - * an invalid state. - */ - ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_GfxDeviceDriverReset, - SMU_RESET_MODE_2, NULL); - if (ret) - return ret; + if (!en && !adev->in_s0ix) { + if (adev->in_s4) { + /* Adds a GFX reset as workaround just before sending the + * MP1_UNLOAD message to prevent GC/RLC/PMFW from entering + * an invalid state. + */ + ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_GfxDeviceDriverReset, + SMU_RESET_MODE_2, NULL); + if (ret) + return ret; + } ret = smu_cmn_send_smc_msg(smu, SMU_MSG_PrepareMp1ForUnload, NULL); } From c6c4dd54012551cce5cde408b35468f2c62b0cce Mon Sep 17 00:00:00 2001 From: Tasos Sahanidis Date: Fri, 31 May 2024 19:23:30 +0300 Subject: [PATCH 0634/1578] drm/amdgpu/pptable: Fix UBSAN array-index-out-of-bounds Flexible arrays used [1] instead of []. Replace the former with the latter to resolve multiple UBSAN warnings observed on boot with a BONAIRE card. In addition, use the __counted_by attribute where possible to hint the length of the arrays to the compiler and any sanitizers. Signed-off-by: Tasos Sahanidis Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/include/pptable.h | 91 ++++++++++++++------------- 1 file changed, 49 insertions(+), 42 deletions(-) diff --git a/drivers/gpu/drm/amd/include/pptable.h b/drivers/gpu/drm/amd/include/pptable.h index 2e8e6c9875f6c6..f83ace2d7ec30f 100644 --- a/drivers/gpu/drm/amd/include/pptable.h +++ b/drivers/gpu/drm/amd/include/pptable.h @@ -477,31 +477,30 @@ typedef struct _ATOM_PPLIB_STATE_V2 } ATOM_PPLIB_STATE_V2; typedef struct _StateArray{ - //how many states we have - UCHAR ucNumEntries; - - ATOM_PPLIB_STATE_V2 states[1]; + //how many states we have + UCHAR ucNumEntries; + + ATOM_PPLIB_STATE_V2 states[] /* __counted_by(ucNumEntries) */; }StateArray; typedef struct _ClockInfoArray{ - //how many clock levels we have - UCHAR ucNumEntries; - - //sizeof(ATOM_PPLIB_CLOCK_INFO) - UCHAR ucEntrySize; - - UCHAR clockInfo[]; + //how many clock levels we have + UCHAR ucNumEntries; + + //sizeof(ATOM_PPLIB_CLOCK_INFO) + UCHAR ucEntrySize; + + UCHAR clockInfo[]; }ClockInfoArray; typedef struct _NonClockInfoArray{ + //how many non-clock levels we have. normally should be same as number of states + UCHAR ucNumEntries; + //sizeof(ATOM_PPLIB_NONCLOCK_INFO) + UCHAR ucEntrySize; - //how many non-clock levels we have. normally should be same as number of states - UCHAR ucNumEntries; - //sizeof(ATOM_PPLIB_NONCLOCK_INFO) - UCHAR ucEntrySize; - - ATOM_PPLIB_NONCLOCK_INFO nonClockInfo[]; + ATOM_PPLIB_NONCLOCK_INFO nonClockInfo[] __counted_by(ucNumEntries); }NonClockInfoArray; typedef struct _ATOM_PPLIB_Clock_Voltage_Dependency_Record @@ -513,8 +512,10 @@ typedef struct _ATOM_PPLIB_Clock_Voltage_Dependency_Record typedef struct _ATOM_PPLIB_Clock_Voltage_Dependency_Table { - UCHAR ucNumEntries; // Number of entries. - ATOM_PPLIB_Clock_Voltage_Dependency_Record entries[1]; // Dynamically allocate entries. + // Number of entries. + UCHAR ucNumEntries; + // Dynamically allocate entries. + ATOM_PPLIB_Clock_Voltage_Dependency_Record entries[] __counted_by(ucNumEntries); }ATOM_PPLIB_Clock_Voltage_Dependency_Table; typedef struct _ATOM_PPLIB_Clock_Voltage_Limit_Record @@ -529,8 +530,10 @@ typedef struct _ATOM_PPLIB_Clock_Voltage_Limit_Record typedef struct _ATOM_PPLIB_Clock_Voltage_Limit_Table { - UCHAR ucNumEntries; // Number of entries. - ATOM_PPLIB_Clock_Voltage_Limit_Record entries[1]; // Dynamically allocate entries. + // Number of entries. + UCHAR ucNumEntries; + // Dynamically allocate entries. + ATOM_PPLIB_Clock_Voltage_Limit_Record entries[] __counted_by(ucNumEntries); }ATOM_PPLIB_Clock_Voltage_Limit_Table; union _ATOM_PPLIB_CAC_Leakage_Record @@ -553,8 +556,10 @@ typedef union _ATOM_PPLIB_CAC_Leakage_Record ATOM_PPLIB_CAC_Leakage_Record; typedef struct _ATOM_PPLIB_CAC_Leakage_Table { - UCHAR ucNumEntries; // Number of entries. - ATOM_PPLIB_CAC_Leakage_Record entries[1]; // Dynamically allocate entries. + // Number of entries. + UCHAR ucNumEntries; + // Dynamically allocate entries. + ATOM_PPLIB_CAC_Leakage_Record entries[] __counted_by(ucNumEntries); }ATOM_PPLIB_CAC_Leakage_Table; typedef struct _ATOM_PPLIB_PhaseSheddingLimits_Record @@ -568,8 +573,10 @@ typedef struct _ATOM_PPLIB_PhaseSheddingLimits_Record typedef struct _ATOM_PPLIB_PhaseSheddingLimits_Table { - UCHAR ucNumEntries; // Number of entries. - ATOM_PPLIB_PhaseSheddingLimits_Record entries[1]; // Dynamically allocate entries. + // Number of entries. + UCHAR ucNumEntries; + // Dynamically allocate entries. + ATOM_PPLIB_PhaseSheddingLimits_Record entries[] __counted_by(ucNumEntries); }ATOM_PPLIB_PhaseSheddingLimits_Table; typedef struct _VCEClockInfo{ @@ -580,8 +587,8 @@ typedef struct _VCEClockInfo{ }VCEClockInfo; typedef struct _VCEClockInfoArray{ - UCHAR ucNumEntries; - VCEClockInfo entries[1]; + UCHAR ucNumEntries; + VCEClockInfo entries[] __counted_by(ucNumEntries); }VCEClockInfoArray; typedef struct _ATOM_PPLIB_VCE_Clock_Voltage_Limit_Record @@ -592,8 +599,8 @@ typedef struct _ATOM_PPLIB_VCE_Clock_Voltage_Limit_Record typedef struct _ATOM_PPLIB_VCE_Clock_Voltage_Limit_Table { - UCHAR numEntries; - ATOM_PPLIB_VCE_Clock_Voltage_Limit_Record entries[1]; + UCHAR numEntries; + ATOM_PPLIB_VCE_Clock_Voltage_Limit_Record entries[] __counted_by(numEntries); }ATOM_PPLIB_VCE_Clock_Voltage_Limit_Table; typedef struct _ATOM_PPLIB_VCE_State_Record @@ -604,8 +611,8 @@ typedef struct _ATOM_PPLIB_VCE_State_Record typedef struct _ATOM_PPLIB_VCE_State_Table { - UCHAR numEntries; - ATOM_PPLIB_VCE_State_Record entries[1]; + UCHAR numEntries; + ATOM_PPLIB_VCE_State_Record entries[] __counted_by(numEntries); }ATOM_PPLIB_VCE_State_Table; @@ -626,8 +633,8 @@ typedef struct _UVDClockInfo{ }UVDClockInfo; typedef struct _UVDClockInfoArray{ - UCHAR ucNumEntries; - UVDClockInfo entries[1]; + UCHAR ucNumEntries; + UVDClockInfo entries[] __counted_by(ucNumEntries); }UVDClockInfoArray; typedef struct _ATOM_PPLIB_UVD_Clock_Voltage_Limit_Record @@ -638,8 +645,8 @@ typedef struct _ATOM_PPLIB_UVD_Clock_Voltage_Limit_Record typedef struct _ATOM_PPLIB_UVD_Clock_Voltage_Limit_Table { - UCHAR numEntries; - ATOM_PPLIB_UVD_Clock_Voltage_Limit_Record entries[1]; + UCHAR numEntries; + ATOM_PPLIB_UVD_Clock_Voltage_Limit_Record entries[] __counted_by(numEntries); }ATOM_PPLIB_UVD_Clock_Voltage_Limit_Table; typedef struct _ATOM_PPLIB_UVD_Table @@ -657,8 +664,8 @@ typedef struct _ATOM_PPLIB_SAMClk_Voltage_Limit_Record }ATOM_PPLIB_SAMClk_Voltage_Limit_Record; typedef struct _ATOM_PPLIB_SAMClk_Voltage_Limit_Table{ - UCHAR numEntries; - ATOM_PPLIB_SAMClk_Voltage_Limit_Record entries[]; + UCHAR numEntries; + ATOM_PPLIB_SAMClk_Voltage_Limit_Record entries[] __counted_by(numEntries); }ATOM_PPLIB_SAMClk_Voltage_Limit_Table; typedef struct _ATOM_PPLIB_SAMU_Table @@ -675,8 +682,8 @@ typedef struct _ATOM_PPLIB_ACPClk_Voltage_Limit_Record }ATOM_PPLIB_ACPClk_Voltage_Limit_Record; typedef struct _ATOM_PPLIB_ACPClk_Voltage_Limit_Table{ - UCHAR numEntries; - ATOM_PPLIB_ACPClk_Voltage_Limit_Record entries[1]; + UCHAR numEntries; + ATOM_PPLIB_ACPClk_Voltage_Limit_Record entries[] __counted_by(numEntries); }ATOM_PPLIB_ACPClk_Voltage_Limit_Table; typedef struct _ATOM_PPLIB_ACP_Table @@ -743,9 +750,9 @@ typedef struct ATOM_PPLIB_VQ_Budgeting_Record{ } ATOM_PPLIB_VQ_Budgeting_Record; typedef struct ATOM_PPLIB_VQ_Budgeting_Table { - UCHAR revid; - UCHAR numEntries; - ATOM_PPLIB_VQ_Budgeting_Record entries[1]; + UCHAR revid; + UCHAR numEntries; + ATOM_PPLIB_VQ_Budgeting_Record entries[] __counted_by(numEntries); } ATOM_PPLIB_VQ_Budgeting_Table; #pragma pack() From c462ecd659b5fce731f1d592285832fd6ad54053 Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Mon, 3 Jun 2024 21:26:45 +0200 Subject: [PATCH 0635/1578] null_blk: fix validation of block size Block size should be between 512 and PAGE_SIZE and be a power of 2. The current check does not validate this, so update the check. Without this patch, null_blk would Oops due to a null pointer deref when loaded with bs=1536 [1]. Link: https://lore.kernel.org/all/87wmn8mocd.fsf@metaspace.dk/ Signed-off-by: Andreas Hindborg Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20240603192645.977968-1-nmi@metaspace.dk [axboe: remove unnecessary braces and != 0 check] Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 631dca2e4e8442..75f189e42f885d 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1824,8 +1824,8 @@ static int null_validate_conf(struct nullb_device *dev) dev->queue_mode = NULL_Q_MQ; } - dev->blocksize = round_down(dev->blocksize, 512); - dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096); + if (blk_validate_block_size(dev->blocksize)) + return -EINVAL; if (dev->use_per_node_hctx) { if (dev->submit_queues != nr_online_nodes) From c625dabbf1c4a8e77e4734014f2fde7aa9071a1f Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 3 Apr 2023 16:42:44 +0000 Subject: [PATCH 0636/1578] x86/amd_nb: Check for invalid SMN reads AMD Zen-based systems use a System Management Network (SMN) that provides access to implementation-specific registers. SMN accesses are done indirectly through an index/data pair in PCI config space. The PCI config access may fail and return an error code. This would prevent the "read" value from being updated. However, the PCI config access may succeed, but the return value may be invalid. This is in similar fashion to PCI bad reads, i.e. return all bits set. Most systems will return 0 for SMN addresses that are not accessible. This is in line with AMD convention that unavailable registers are Read-as-Zero/Writes-Ignored. However, some systems will return a "PCI Error Response" instead. This value, along with an error code of 0 from the PCI config access, will confuse callers of the amd_smn_read() function. Check for this condition, clear the return value, and set a proper error code. Fixes: ddfe43cdc0da ("x86/amd_nb: Add SMN and Indirect Data Fabric access for AMD Fam17h") Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230403164244.471141-1-yazen.ghannam@amd.com --- arch/x86/kernel/amd_nb.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 3cf156f7085917..027a8c7a2c9e64 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -215,7 +215,14 @@ static int __amd_smn_rw(u16 node, u32 address, u32 *value, bool write) int amd_smn_read(u16 node, u32 address, u32 *value) { - return __amd_smn_rw(node, address, value, false); + int err = __amd_smn_rw(node, address, value, false); + + if (PCI_POSSIBLE_ERROR(*value)) { + err = -ENODEV; + *value = 0; + } + + return err; } EXPORT_SYMBOL_GPL(amd_smn_read); From 229bedbf62b13af5aba6525ad10b62ad38d9ccb5 Mon Sep 17 00:00:00 2001 From: Aleksandr Mishin Date: Tue, 4 Jun 2024 13:05:52 +0300 Subject: [PATCH 0637/1578] net/mlx5: Fix tainted pointer delete is case of flow rules creation fail In case of flow rule creation fail in mlx5_lag_create_port_sel_table(), instead of previously created rules, the tainted pointer is deleted deveral times. Fix this bug by using correct flow rules pointers. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 352899f384d4 ("net/mlx5: Lag, use buckets in hash mode") Signed-off-by: Aleksandr Mishin Reviewed-by: Jacob Keller Reviewed-by: Tariq Toukan Link: https://lore.kernel.org/r/20240604100552.25201-1-amishin@t-argos.ru Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c index c16b462ddedf7e..ab2717012b79b5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c @@ -88,9 +88,13 @@ static int mlx5_lag_create_port_sel_table(struct mlx5_lag *ldev, &dest, 1); if (IS_ERR(lag_definer->rules[idx])) { err = PTR_ERR(lag_definer->rules[idx]); - while (i--) - while (j--) + do { + while (j--) { + idx = i * ldev->buckets + j; mlx5_del_flow_rules(lag_definer->rules[idx]); + } + j = ldev->buckets; + } while (i--); goto destroy_fg; } } From f921a58ae20852d188f70842431ce6519c4fdc36 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 4 Jun 2024 18:15:11 +0000 Subject: [PATCH 0638/1578] net/sched: taprio: always validate TCA_TAPRIO_ATTR_PRIOMAP If one TCA_TAPRIO_ATTR_PRIOMAP attribute has been provided, taprio_parse_mqprio_opt() must validate it, or userspace can inject arbitrary data to the kernel, the second time taprio_change() is called. First call (with valid attributes) sets dev->num_tc to a non zero value. Second call (with arbitrary mqprio attributes) returns early from taprio_parse_mqprio_opt() and bad things can happen. Fixes: a3d43c0d56f1 ("taprio: Add support adding an admin schedule") Reported-by: Noam Rathaus Signed-off-by: Eric Dumazet Acked-by: Vinicius Costa Gomes Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/20240604181511.769870-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 937a0c513c174b..b284a06b5a75fa 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1176,16 +1176,13 @@ static int taprio_parse_mqprio_opt(struct net_device *dev, { bool allow_overlapping_txqs = TXTIME_ASSIST_IS_ENABLED(taprio_flags); - if (!qopt && !dev->num_tc) { - NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary"); - return -EINVAL; - } - - /* If num_tc is already set, it means that the user already - * configured the mqprio part - */ - if (dev->num_tc) + if (!qopt) { + if (!dev->num_tc) { + NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary"); + return -EINVAL; + } return 0; + } /* taprio imposes that traffic classes map 1:n to tx queues */ if (qopt->num_tc > dev->num_tx_queues) { From 323a359f9b077f382f4483023d096a4d316fd135 Mon Sep 17 00:00:00 2001 From: Karol Kolacinski Date: Tue, 4 Jun 2024 14:05:27 +0200 Subject: [PATCH 0639/1578] ptp: Fix error message on failed pin verification On failed verification of PTP clock pin, error message prints channel number instead of pin index after "pin", which is incorrect. Fix error message by adding channel number to the message and printing pin number instead of channel number. Fixes: 6092315dfdec ("ptp: introduce programmable pins.") Signed-off-by: Karol Kolacinski Acked-by: Richard Cochran Link: https://lore.kernel.org/r/20240604120555.16643-1-karol.kolacinski@intel.com Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_chardev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 7513018c9f9ac7..2067b0120d083d 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -85,7 +85,8 @@ int ptp_set_pinfunc(struct ptp_clock *ptp, unsigned int pin, } if (info->verify(info, pin, func, chan)) { - pr_err("driver cannot use function %u on pin %u\n", func, chan); + pr_err("driver cannot use function %u and channel %u on pin %u\n", + func, chan, pin); return -EOPNOTSUPP; } From 7926d51f73e0434a6250c2fd1a0555f98d9a62da Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 4 Jun 2024 22:25:21 -0400 Subject: [PATCH 0640/1578] scsi: sd: Use READ(16) when reading block zero on large capacity disks Commit 321da3dc1f3c ("scsi: sd: usb_storage: uas: Access media prior to querying device properties") triggered a read to LBA 0 before attempting to inquire about device characteristics. This was done because some protocol bridge devices will return generic values until an attached storage device's media has been accessed. Pierre Tomon reported that this change caused problems on a large capacity external drive connected via a bridge device. The bridge in question does not appear to implement the READ(10) command. Issue a READ(16) instead of READ(10) when a device has been identified as preferring 16-byte commands (use_16_for_rw heuristic). Link: https://bugzilla.kernel.org/show_bug.cgi?id=218890 Link: https://lore.kernel.org/r/70dd7ae0-b6b1-48e1-bb59-53b7c7f18274@rowland.harvard.edu Link: https://lore.kernel.org/r/20240605022521.3960956-1-martin.petersen@oracle.com Fixes: 321da3dc1f3c ("scsi: sd: usb_storage: uas: Access media prior to querying device properties") Cc: stable@vger.kernel.org Reported-by: Pierre Tomon Suggested-by: Alan Stern Tested-by: Pierre Tomon Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 332eb9dac22d91..fbc11046bbf601 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3565,16 +3565,23 @@ static bool sd_validate_opt_xfer_size(struct scsi_disk *sdkp, static void sd_read_block_zero(struct scsi_disk *sdkp) { - unsigned int buf_len = sdkp->device->sector_size; - char *buffer, cmd[10] = { }; + struct scsi_device *sdev = sdkp->device; + unsigned int buf_len = sdev->sector_size; + u8 *buffer, cmd[16] = { }; buffer = kmalloc(buf_len, GFP_KERNEL); if (!buffer) return; - cmd[0] = READ_10; - put_unaligned_be32(0, &cmd[2]); /* Logical block address 0 */ - put_unaligned_be16(1, &cmd[7]); /* Transfer 1 logical block */ + if (sdev->use_16_for_rw) { + cmd[0] = READ_16; + put_unaligned_be64(0, &cmd[2]); /* Logical block address 0 */ + put_unaligned_be32(1, &cmd[10]);/* Transfer 1 logical block */ + } else { + cmd[0] = READ_10; + put_unaligned_be32(0, &cmd[2]); /* Logical block address 0 */ + put_unaligned_be16(1, &cmd[7]); /* Transfer 1 logical block */ + } scsi_execute_cmd(sdkp->device, cmd, REQ_OP_DRV_IN, buffer, buf_len, SD_TIMEOUT, sdkp->max_retries, NULL); From 4254dfeda82f20844299dca6c38cbffcfd499f41 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 5 Jun 2024 01:55:29 -0700 Subject: [PATCH 0641/1578] scsi: mpt3sas: Avoid test/set_bit() operating in non-allocated memory There is a potential out-of-bounds access when using test_bit() on a single word. The test_bit() and set_bit() functions operate on long values, and when testing or setting a single word, they can exceed the word boundary. KASAN detects this issue and produces a dump: BUG: KASAN: slab-out-of-bounds in _scsih_add_device.constprop.0 (./arch/x86/include/asm/bitops.h:60 ./include/asm-generic/bitops/instrumented-atomic.h:29 drivers/scsi/mpt3sas/mpt3sas_scsih.c:7331) mpt3sas Write of size 8 at addr ffff8881d26e3c60 by task kworker/u1536:2/2965 For full log, please look at [1]. Make the allocation at least the size of sizeof(unsigned long) so that set_bit() and test_bit() have sufficient room for read/write operations without overwriting unallocated memory. [1] Link: https://lore.kernel.org/all/ZkNcALr3W3KGYYJG@gmail.com/ Fixes: c696f7b83ede ("scsi: mpt3sas: Implement device_remove_in_progress check in IOCTL path") Cc: stable@vger.kernel.org Suggested-by: Keith Busch Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20240605085530.499432-1-leitao@debian.org Reviewed-by: Keith Busch Signed-off-by: Martin K. Petersen --- drivers/scsi/mpt3sas/mpt3sas_base.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c index 258647fc6bddb4..1092497563b22b 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.c +++ b/drivers/scsi/mpt3sas/mpt3sas_base.c @@ -8512,6 +8512,12 @@ mpt3sas_base_attach(struct MPT3SAS_ADAPTER *ioc) ioc->pd_handles_sz = (ioc->facts.MaxDevHandle / 8); if (ioc->facts.MaxDevHandle % 8) ioc->pd_handles_sz++; + /* + * pd_handles_sz should have, at least, the minimal room for + * set_bit()/test_bit(), otherwise out-of-memory touch may occur. + */ + ioc->pd_handles_sz = ALIGN(ioc->pd_handles_sz, sizeof(unsigned long)); + ioc->pd_handles = kzalloc(ioc->pd_handles_sz, GFP_KERNEL); if (!ioc->pd_handles) { @@ -8529,6 +8535,13 @@ mpt3sas_base_attach(struct MPT3SAS_ADAPTER *ioc) ioc->pend_os_device_add_sz = (ioc->facts.MaxDevHandle / 8); if (ioc->facts.MaxDevHandle % 8) ioc->pend_os_device_add_sz++; + + /* + * pend_os_device_add_sz should have, at least, the minimal room for + * set_bit()/test_bit(), otherwise out-of-memory may occur. + */ + ioc->pend_os_device_add_sz = ALIGN(ioc->pend_os_device_add_sz, + sizeof(unsigned long)); ioc->pend_os_device_add = kzalloc(ioc->pend_os_device_add_sz, GFP_KERNEL); if (!ioc->pend_os_device_add) { @@ -8820,6 +8833,12 @@ _base_check_ioc_facts_changes(struct MPT3SAS_ADAPTER *ioc) if (ioc->facts.MaxDevHandle % 8) pd_handles_sz++; + /* + * pd_handles should have, at least, the minimal room for + * set_bit()/test_bit(), otherwise out-of-memory touch may + * occur. + */ + pd_handles_sz = ALIGN(pd_handles_sz, sizeof(unsigned long)); pd_handles = krealloc(ioc->pd_handles, pd_handles_sz, GFP_KERNEL); if (!pd_handles) { From 426826933109093503e7ef15d49348fc5ab505fe Mon Sep 17 00:00:00 2001 From: Ian Forbes Date: Tue, 21 May 2024 13:47:17 -0500 Subject: [PATCH 0642/1578] drm/vmwgfx: Filter modes which exceed graphics memory SVGA requires individual surfaces to fit within graphics memory (max_mob_pages) which means that modes with a final buffer size that would exceed graphics memory must be pruned otherwise creation will fail. Additionally llvmpipe requires its buffer height and width to be a multiple of its tile size which is 64. As a result we have to anticipate that llvmpipe will round up the mode size passed to it by the compositor when it creates buffers and filter modes where this rounding exceeds graphics memory. This fixes an issue where VMs with low graphics memory (< 64MiB) configured with high resolution mode boot to a black screen because surface creation fails. Fixes: d947d1b71deb ("drm/vmwgfx: Add and connect connector helper function") Signed-off-by: Ian Forbes Signed-off-by: Zack Rusin Link: https://patchwork.freedesktop.org/patch/msgid/20240521184720.767-2-ian.forbes@broadcom.com --- drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c | 45 ++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c b/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c index 2041c4d48daa53..52e8ec3b235758 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c @@ -43,7 +43,14 @@ #define vmw_connector_to_stdu(x) \ container_of(x, struct vmw_screen_target_display_unit, base.connector) - +/* + * Some renderers such as llvmpipe will align the width and height of their + * buffers to match their tile size. We need to keep this in mind when exposing + * modes to userspace so that this possible over-allocation will not exceed + * graphics memory. 64x64 pixels seems to be a reasonable upper bound for the + * tile size of current renderers. + */ +#define GPU_TILE_SIZE 64 enum stdu_content_type { SAME_AS_DISPLAY = 0, @@ -829,7 +836,41 @@ static void vmw_stdu_connector_destroy(struct drm_connector *connector) vmw_stdu_destroy(vmw_connector_to_stdu(connector)); } +static enum drm_mode_status +vmw_stdu_connector_mode_valid(struct drm_connector *connector, + struct drm_display_mode *mode) +{ + enum drm_mode_status ret; + struct drm_device *dev = connector->dev; + struct vmw_private *dev_priv = vmw_priv(dev); + u64 assumed_cpp = dev_priv->assume_16bpp ? 2 : 4; + /* Align width and height to account for GPU tile over-alignment */ + u64 required_mem = ALIGN(mode->hdisplay, GPU_TILE_SIZE) * + ALIGN(mode->vdisplay, GPU_TILE_SIZE) * + assumed_cpp; + required_mem = ALIGN(required_mem, PAGE_SIZE); + + ret = drm_mode_validate_size(mode, dev_priv->stdu_max_width, + dev_priv->stdu_max_height); + if (ret != MODE_OK) + return ret; + ret = drm_mode_validate_size(mode, dev_priv->texture_max_width, + dev_priv->texture_max_height); + if (ret != MODE_OK) + return ret; + + if (required_mem > dev_priv->max_primary_mem) + return MODE_MEM; + + if (required_mem > dev_priv->max_mob_pages * PAGE_SIZE) + return MODE_MEM; + + if (required_mem > dev_priv->max_mob_size) + return MODE_MEM; + + return MODE_OK; +} static const struct drm_connector_funcs vmw_stdu_connector_funcs = { .dpms = vmw_du_connector_dpms, @@ -845,7 +886,7 @@ static const struct drm_connector_funcs vmw_stdu_connector_funcs = { static const struct drm_connector_helper_funcs vmw_stdu_connector_helper_funcs = { .get_modes = vmw_connector_get_modes, - .mode_valid = vmw_connector_mode_valid + .mode_valid = vmw_stdu_connector_mode_valid }; From fb5e19d2dd03eb995ccd468d599b2337f7f66555 Mon Sep 17 00:00:00 2001 From: Ian Forbes Date: Tue, 21 May 2024 13:47:18 -0500 Subject: [PATCH 0643/1578] drm/vmwgfx: 3D disabled should not effect STDU memory limits This limit became a hard cap starting with the change referenced below. Surface creation on the device will fail if the requested size is larger than this limit so altering the value arbitrarily will expose modes that are too large for the device's hard limits. Fixes: 7ebb47c9f9ab ("drm/vmwgfx: Read new register for GB memory when available") Signed-off-by: Ian Forbes Signed-off-by: Zack Rusin Link: https://patchwork.freedesktop.org/patch/msgid/20240521184720.767-3-ian.forbes@broadcom.com --- drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c index 8f1730aeacc9eb..18a1f05c6fa857 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c @@ -960,13 +960,6 @@ static int vmw_driver_load(struct vmw_private *dev_priv, u32 pci_id) vmw_read(dev_priv, SVGA_REG_SUGGESTED_GBOBJECT_MEM_SIZE_KB); - /* - * Workaround for low memory 2D VMs to compensate for the - * allocation taken by fbdev - */ - if (!(dev_priv->capabilities & SVGA_CAP_3D)) - mem_size *= 3; - dev_priv->max_mob_pages = mem_size * 1024 / PAGE_SIZE; dev_priv->max_primary_mem = vmw_read(dev_priv, SVGA_REG_MAX_PRIMARY_MEM); From dde1de06bd7248fd83c4ce5cf0dbe9e4e95bbb91 Mon Sep 17 00:00:00 2001 From: Ian Forbes Date: Tue, 21 May 2024 13:47:19 -0500 Subject: [PATCH 0644/1578] drm/vmwgfx: Remove STDU logic from generic mode_valid function STDU has its own mode_valid function now so this logic can be removed from the generic version. Fixes: 935f795045a6 ("drm/vmwgfx: Refactor drm connector probing for display modes") Signed-off-by: Ian Forbes Signed-off-by: Zack Rusin Link: https://patchwork.freedesktop.org/patch/msgid/20240521184720.767-4-ian.forbes@broadcom.com --- drivers/gpu/drm/vmwgfx/vmwgfx_drv.h | 3 --- drivers/gpu/drm/vmwgfx/vmwgfx_kms.c | 26 +++++++++----------------- 2 files changed, 9 insertions(+), 20 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h index 4ecaea0026fccd..a1ce41e1c4684e 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h @@ -1043,9 +1043,6 @@ void vmw_kms_cursor_snoop(struct vmw_surface *srf, int vmw_kms_write_svga(struct vmw_private *vmw_priv, unsigned width, unsigned height, unsigned pitch, unsigned bpp, unsigned depth); -bool vmw_kms_validate_mode_vram(struct vmw_private *dev_priv, - uint32_t pitch, - uint32_t height); int vmw_kms_present(struct vmw_private *dev_priv, struct drm_file *file_priv, struct vmw_framebuffer *vfb, diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c index 13b2820cae51d9..9532258a0848c0 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c @@ -2171,13 +2171,12 @@ int vmw_kms_write_svga(struct vmw_private *vmw_priv, return 0; } +static bool vmw_kms_validate_mode_vram(struct vmw_private *dev_priv, - uint32_t pitch, - uint32_t height) + u64 pitch, + u64 height) { - return ((u64) pitch * (u64) height) < (u64) - ((dev_priv->active_display_unit == vmw_du_screen_target) ? - dev_priv->max_primary_mem : dev_priv->vram_size); + return (pitch * height) < (u64)dev_priv->vram_size; } /** @@ -2873,25 +2872,18 @@ int vmw_du_helper_plane_update(struct vmw_du_update_plane *update) enum drm_mode_status vmw_connector_mode_valid(struct drm_connector *connector, struct drm_display_mode *mode) { + enum drm_mode_status ret; struct drm_device *dev = connector->dev; struct vmw_private *dev_priv = vmw_priv(dev); - u32 max_width = dev_priv->texture_max_width; - u32 max_height = dev_priv->texture_max_height; u32 assumed_cpp = 4; if (dev_priv->assume_16bpp) assumed_cpp = 2; - if (dev_priv->active_display_unit == vmw_du_screen_target) { - max_width = min(dev_priv->stdu_max_width, max_width); - max_height = min(dev_priv->stdu_max_height, max_height); - } - - if (max_width < mode->hdisplay) - return MODE_BAD_HVALUE; - - if (max_height < mode->vdisplay) - return MODE_BAD_VVALUE; + ret = drm_mode_validate_size(mode, dev_priv->texture_max_width, + dev_priv->texture_max_height); + if (ret != MODE_OK) + return ret; if (!vmw_kms_validate_mode_vram(dev_priv, mode->hdisplay * assumed_cpp, From a54a200f3dc710db0572aba45c5c06b12b74489a Mon Sep 17 00:00:00 2001 From: Ian Forbes Date: Tue, 21 May 2024 13:47:20 -0500 Subject: [PATCH 0645/1578] drm/vmwgfx: Standardize use of kibibytes when logging Use the same standard abbreviation KiB instead of incorrect variants. Signed-off-by: Ian Forbes Signed-off-by: Zack Rusin Link: https://patchwork.freedesktop.org/patch/msgid/20240521184720.767-5-ian.forbes@broadcom.com --- drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 12 ++++++------ drivers/gpu/drm/vmwgfx/vmwgfx_gmrid_manager.c | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c index 18a1f05c6fa857..823d8d2da17c3f 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c @@ -746,7 +746,7 @@ static int vmw_setup_pci_resources(struct vmw_private *dev, dev->vram_size = pci_resource_len(pdev, 2); drm_info(&dev->drm, - "Register MMIO at 0x%pa size is %llu kiB\n", + "Register MMIO at 0x%pa size is %llu KiB\n", &rmmio_start, (uint64_t)rmmio_size / 1024); dev->rmmio = devm_ioremap(dev->drm.dev, rmmio_start, @@ -765,7 +765,7 @@ static int vmw_setup_pci_resources(struct vmw_private *dev, fifo_size = pci_resource_len(pdev, 2); drm_info(&dev->drm, - "FIFO at %pa size is %llu kiB\n", + "FIFO at %pa size is %llu KiB\n", &fifo_start, (uint64_t)fifo_size / 1024); dev->fifo_mem = devm_memremap(dev->drm.dev, fifo_start, @@ -790,7 +790,7 @@ static int vmw_setup_pci_resources(struct vmw_private *dev, * SVGA_REG_VRAM_SIZE. */ drm_info(&dev->drm, - "VRAM at %pa size is %llu kiB\n", + "VRAM at %pa size is %llu KiB\n", &dev->vram_start, (uint64_t)dev->vram_size / 1024); return 0; @@ -984,13 +984,13 @@ static int vmw_driver_load(struct vmw_private *dev_priv, u32 pci_id) dev_priv->max_primary_mem = dev_priv->vram_size; } drm_info(&dev_priv->drm, - "Legacy memory limits: VRAM = %llu kB, FIFO = %llu kB, surface = %u kB\n", + "Legacy memory limits: VRAM = %llu KiB, FIFO = %llu KiB, surface = %u KiB\n", (u64)dev_priv->vram_size / 1024, (u64)dev_priv->fifo_mem_size / 1024, dev_priv->memory_size / 1024); drm_info(&dev_priv->drm, - "MOB limits: max mob size = %u kB, max mob pages = %u\n", + "MOB limits: max mob size = %u KiB, max mob pages = %u\n", dev_priv->max_mob_size / 1024, dev_priv->max_mob_pages); ret = vmw_dma_masks(dev_priv); @@ -1008,7 +1008,7 @@ static int vmw_driver_load(struct vmw_private *dev_priv, u32 pci_id) (unsigned)dev_priv->max_gmr_pages); } drm_info(&dev_priv->drm, - "Maximum display memory size is %llu kiB\n", + "Maximum display memory size is %llu KiB\n", (uint64_t)dev_priv->max_primary_mem / 1024); /* Need mmio memory to check for fifo pitchlock cap. */ diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_gmrid_manager.c b/drivers/gpu/drm/vmwgfx/vmwgfx_gmrid_manager.c index a0b47c9b33f552..5bd967fbcf5547 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_gmrid_manager.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_gmrid_manager.c @@ -94,14 +94,14 @@ static int vmw_gmrid_man_get_node(struct ttm_resource_manager *man, } else new_max_pages = gman->max_gmr_pages * 2; if (new_max_pages > gman->max_gmr_pages && new_max_pages >= gman->used_gmr_pages) { - DRM_WARN("vmwgfx: increasing guest mob limits to %u kB.\n", + DRM_WARN("vmwgfx: increasing guest mob limits to %u KiB.\n", ((new_max_pages) << (PAGE_SHIFT - 10))); gman->max_gmr_pages = new_max_pages; } else { char buf[256]; snprintf(buf, sizeof(buf), - "vmwgfx, error: guest graphics is out of memory (mob limit at: %ukB).\n", + "vmwgfx, error: guest graphics is out of memory (mob limit at: %u KiB).\n", ((gman->max_gmr_pages) << (PAGE_SHIFT - 10))); vmw_host_printf(buf); DRM_WARN("%s", buf); From 7ef91dcba172441582962602ff6899bfec6078b7 Mon Sep 17 00:00:00 2001 From: Ian Forbes Date: Fri, 31 May 2024 15:33:58 -0500 Subject: [PATCH 0646/1578] drm/vmwgfx: Don't destroy Screen Target when CRTC is enabled but inactive drm_crtc_helper_funcs::atomic_disable can be called even when the CRTC is still enabled. This can occur when the mode changes or the CRTC is set as inactive. In the case where the CRTC is being set as inactive we only want to blank the screen. The Screen Target should remain intact as long as the mode has not changed and CRTC is enabled. This fixes a bug with GDM where locking the screen results in a permanent black screen because the Screen Target is no longer defined. Fixes: 7b0062036c3b ("drm/vmwgfx: Implement virtual crc generation") Signed-off-by: Ian Forbes Reviewed-by: Martin Krastev Signed-off-by: Zack Rusin Link: https://patchwork.freedesktop.org/patch/msgid/20240531203358.26677-1-ian.forbes@broadcom.com --- drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c b/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c index 52e8ec3b235758..afc0bc4ef4e1e0 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c @@ -421,6 +421,7 @@ static void vmw_stdu_crtc_atomic_disable(struct drm_crtc *crtc, { struct vmw_private *dev_priv; struct vmw_screen_target_display_unit *stdu; + struct drm_crtc_state *new_crtc_state; int ret; if (!crtc) { @@ -430,6 +431,7 @@ static void vmw_stdu_crtc_atomic_disable(struct drm_crtc *crtc, stdu = vmw_crtc_to_stdu(crtc); dev_priv = vmw_priv(crtc->dev); + new_crtc_state = drm_atomic_get_new_crtc_state(state, crtc); if (dev_priv->vkms_enabled) drm_crtc_vblank_off(crtc); @@ -441,6 +443,14 @@ static void vmw_stdu_crtc_atomic_disable(struct drm_crtc *crtc, (void) vmw_stdu_update_st(dev_priv, stdu); + /* Don't destroy the Screen Target if we are only setting the + * display as inactive + */ + if (new_crtc_state->enable && + !new_crtc_state->active && + !new_crtc_state->mode_changed) + return; + ret = vmw_stdu_destroy_st(dev_priv, stdu); if (ret) DRM_ERROR("Failed to destroy Screen Target\n"); From 0d648dd5c899f33154b98a6aef6e3dab0f4de613 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 23 May 2024 10:36:39 +0800 Subject: [PATCH 0647/1578] mm: drop the 'anon_' prefix for swap-out mTHP counters The mTHP swap related counters: 'anon_swpout' and 'anon_swpout_fallback' are confusing with an 'anon_' prefix, since the shmem can swap out non-anonymous pages. So drop the 'anon_' prefix to keep consistent with the old swap counter names. This is needed in 6.10-rcX to avoid having an inconsistent ABI out in the field. Link: https://lkml.kernel.org/r/7a8989c13299920d7589007a30065c3e2c19f0e0.1716431702.git.baolin.wang@linux.alibaba.com Fixes: d0f048ac39f6 ("mm: add per-order mTHP anon_swpout and anon_swpout_fallback counters") Fixes: 42248b9d34ea ("mm: add docs for per-order mTHP counters and transhuge_page ABI") Signed-off-by: Baolin Wang Suggested-by: "Huang, Ying" Acked-by: Barry Song Cc: David Hildenbrand Cc: Lance Yang Cc: Matthew Wilcox (Oracle) Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 4 ++-- include/linux/huge_mm.h | 4 ++-- mm/huge_memory.c | 8 ++++---- mm/page_io.c | 2 +- mm/vmscan.c | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 076443cc10a6c7..d414d3f5592a8f 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -467,11 +467,11 @@ anon_fault_fallback_charge instead falls back to using huge pages with lower orders or small pages even though the allocation was successful. -anon_swpout +swpout is incremented every time a huge page is swapped out in one piece without splitting. -anon_swpout_fallback +swpout_fallback is incremented if a huge page has to be split before swapout. Usually because failed to allocate some continuous swap space for the huge page. diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index c8d3ec116e291b..8c72d378658382 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -269,8 +269,8 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_ALLOC, MTHP_STAT_ANON_FAULT_FALLBACK, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, - MTHP_STAT_ANON_SWPOUT, - MTHP_STAT_ANON_SWPOUT_FALLBACK, + MTHP_STAT_SWPOUT, + MTHP_STAT_SWPOUT_FALLBACK, __MTHP_STAT_COUNT }; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 317de2afd37181..89932fd0f62e86 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -558,15 +558,15 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); -DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT); -DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, MTHP_STAT_ANON_SWPOUT_FALLBACK); +DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); +DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); static struct attribute *stats_attrs[] = { &anon_fault_alloc_attr.attr, &anon_fault_fallback_attr.attr, &anon_fault_fallback_charge_attr.attr, - &anon_swpout_attr.attr, - &anon_swpout_fallback_attr.attr, + &swpout_attr.attr, + &swpout_fallback_attr.attr, NULL, }; diff --git a/mm/page_io.c b/mm/page_io.c index 46c603dddf043d..0a150c240bf4fd 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -217,7 +217,7 @@ static inline void count_swpout_vm_event(struct folio *folio) count_memcg_folio_events(folio, THP_SWPOUT, 1); count_vm_event(THP_SWPOUT); } - count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_SWPOUT); + count_mthp_stat(folio_order(folio), MTHP_STAT_SWPOUT); #endif count_vm_events(PSWPOUT, folio_nr_pages(folio)); } diff --git a/mm/vmscan.c b/mm/vmscan.c index d55e8d07ffc4a7..2e34de9cd0d4f7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1227,7 +1227,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, THP_SWPOUT_FALLBACK, 1); count_vm_event(THP_SWPOUT_FALLBACK); } - count_mthp_stat(order, MTHP_STAT_ANON_SWPOUT_FALLBACK); + count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); #endif if (!add_to_swap(folio)) goto activate_locked_split; From 94d46bf17916965e918bd2f3d2eec057f7c5578d Mon Sep 17 00:00:00 2001 From: Barry Song Date: Fri, 24 May 2024 08:50:48 +1200 Subject: [PATCH 0648/1578] mm: huge_mm: fix undefined reference to `mthp_stats' for CONFIG_SYSFS=n if CONFIG_SYSFS is not enabled in config, we get the below error, All errors (new ones prefixed by >>): s390-linux-ld: mm/memory.o: in function `count_mthp_stat': >> include/linux/huge_mm.h:285:(.text+0x191c): undefined reference to `mthp_stats' s390-linux-ld: mm/huge_memory.o:(.rodata+0x10): undefined reference to `mthp_stats' vim +285 include/linux/huge_mm.h 279 280 static inline void count_mthp_stat(int order, enum mthp_stat_item item) 281 { 282 if (order <= 0 || order > PMD_ORDER) 283 return; 284 > 285 this_cpu_inc(mthp_stats.stats[order][item]); 286 } 287 Link: https://lkml.kernel.org/r/20240523210045.40444-1-21cnbao@gmail.com Fixes: ec33687c6749 ("mm: add per-order mTHP anon_fault_alloc and anon_fault_fallback counters") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202405231728.tCAogiSI-lkp@intel.com/ Signed-off-by: Barry Song Tested-by: Yujie Liu Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 8c72d378658382..2aa986a5cd1b55 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -278,6 +278,7 @@ struct mthp_stat { unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT]; }; +#ifdef CONFIG_SYSFS DECLARE_PER_CPU(struct mthp_stat, mthp_stats); static inline void count_mthp_stat(int order, enum mthp_stat_item item) @@ -287,6 +288,11 @@ static inline void count_mthp_stat(int order, enum mthp_stat_item item) this_cpu_inc(mthp_stats.stats[order][item]); } +#else +static inline void count_mthp_stat(int order, enum mthp_stat_item item) +{ +} +#endif #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ From 6434e69814b159608a23135ca2be36024f402717 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Fri, 24 May 2024 12:54:44 +1200 Subject: [PATCH 0649/1578] mm: arm64: fix the out-of-bounds issue in contpte_clear_young_dirty_ptes We are passing a huge nr to __clear_young_dirty_ptes() right now. While we should pass the number of pages, we are actually passing CONT_PTE_SIZE. This is causing lots of crashes of MADV_FREE, panic oops could vary everytime. Link: https://lkml.kernel.org/r/20240524005444.135417-1-21cnbao@gmail.com Fixes: 89e86854fb0a ("mm/arm64: override clear_young_dirty_ptes() batch helper") Signed-off-by: Barry Song Reviewed-by: Baolin Wang Acked-by: Lance Yang Acked-by: David Hildenbrand Acked-by: Chris Li Cc: Barry Song <21cnbao@gmail.com> Cc: Ryan Roberts Cc: Jeff Xie Cc: Kefeng Wang Cc: Michal Hocko Cc: Minchan Kim Cc: Muchun Song Cc: Peter Xu Cc: Yang Shi Cc: Yin Fengwei Cc: Zach O'Keefe Cc: Catalin Marinas Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/mm/contpte.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index 9f9486de0004dd..a3edced29ac138 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -376,7 +376,7 @@ void contpte_clear_young_dirty_ptes(struct vm_area_struct *vma, * clearing access/dirty for the whole block. */ unsigned long start = addr; - unsigned long end = start + nr; + unsigned long end = start + nr * PAGE_SIZE; if (pte_cont(__ptep_get(ptep + nr - 1))) end = ALIGN(end, CONT_PTE_SIZE); @@ -386,7 +386,7 @@ void contpte_clear_young_dirty_ptes(struct vm_area_struct *vma, ptep = contpte_align_down(ptep); } - __clear_young_dirty_ptes(vma, start, ptep, end - start, flags); + __clear_young_dirty_ptes(vma, start, ptep, (end - start) / PAGE_SIZE, flags); } EXPORT_SYMBOL_GPL(contpte_clear_young_dirty_ptes); From 36eef400c2d571d05b6e41a9a61f874d4a7b82c5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 28 May 2024 16:13:41 +0200 Subject: [PATCH 0650/1578] memcg: remove the lockdep assert from __mod_objcg_mlstate() The assert was introduced in the commit cited below as an insurance that the semantic is the same after the local_irq_save() has been removed and the function has been made static. The original requirement to disable interrupt was due the modification of per-CPU counters which require interrupts to be disabled because the counter update operation is not atomic and some of the counters are updated from interrupt context. All callers of __mod_objcg_mlstate() acquire a lock (memcg_stock.stock_lock) which disables interrupts on !PREEMPT_RT and the lockdep assert is satisfied. On PREEMPT_RT the interrupts are not disabled and the assert triggers. The safety of the counter update is already ensured by VM_WARN_ON_IRQS_ENABLED() which is part of __mod_memcg_lruvec_state() and does not require yet another check. Remove the lockdep assert from __mod_objcg_mlstate(). Link: https://lkml.kernel.org/r/20240528141341.rz_rytN_@linutronix.de Fixes: 91882c1617c1 ("memcg: simple cleanup of stats update functions") Signed-off-by: Sebastian Andrzej Siewior Acked-by: Vlastimil Babka Acked-by: Shakeel Butt Acked-by: Michal Hocko Reviewed-by: Vlastimil Babka Cc: Johannes Weiner Cc: Muchun Song Cc: Roman Gushchin Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- mm/memcontrol.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7fad15b2290c70..36793e509f4708 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3147,8 +3147,6 @@ static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg, struct mem_cgroup *memcg; struct lruvec *lruvec; - lockdep_assert_irqs_disabled(); - rcu_read_lock(); memcg = obj_cgroup_memcg(objcg); lruvec = mem_cgroup_lruvec(memcg, pgdat); From a4ca369ca221bb7e06c725792ac107f0e48e82e7 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 30 May 2024 23:15:56 +0900 Subject: [PATCH 0651/1578] nilfs2: fix potential kernel bug due to lack of writeback flag waiting Destructive writes to a block device on which nilfs2 is mounted can cause a kernel bug in the folio/page writeback start routine or writeback end routine (__folio_start_writeback in the log below): kernel BUG at mm/page-writeback.c:3070! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI ... RIP: 0010:__folio_start_writeback+0xbaa/0x10e0 Code: 25 ff 0f 00 00 0f 84 18 01 00 00 e8 40 ca c6 ff e9 17 f6 ff ff e8 36 ca c6 ff 4c 89 f7 48 c7 c6 80 c0 12 84 e8 e7 b3 0f 00 90 <0f> 0b e8 1f ca c6 ff 4c 89 f7 48 c7 c6 a0 c6 12 84 e8 d0 b3 0f 00 ... Call Trace: nilfs_segctor_do_construct+0x4654/0x69d0 [nilfs2] nilfs_segctor_construct+0x181/0x6b0 [nilfs2] nilfs_segctor_thread+0x548/0x11c0 [nilfs2] kthread+0x2f0/0x390 ret_from_fork+0x4b/0x80 ret_from_fork_asm+0x1a/0x30 This is because when the log writer starts a writeback for segment summary blocks or a super root block that use the backing device's page cache, it does not wait for the ongoing folio/page writeback, resulting in an inconsistent writeback state. Fix this issue by waiting for ongoing writebacks when putting folios/pages on the backing device into writeback state. Link: https://lkml.kernel.org/r/20240530141556.4411-1-konishi.ryusuke@gmail.com Fixes: 9ff05123e3bf ("nilfs2: segment constructor") Signed-off-by: Ryusuke Konishi Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/segment.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 60d4f59f7665d7..6ea81f1d509441 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1652,6 +1652,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci) if (bh->b_folio != bd_folio) { if (bd_folio) { folio_lock(bd_folio); + folio_wait_writeback(bd_folio); folio_clear_dirty_for_io(bd_folio); folio_start_writeback(bd_folio); folio_unlock(bd_folio); @@ -1665,6 +1666,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci) if (bh == segbuf->sb_super_root) { if (bh->b_folio != bd_folio) { folio_lock(bd_folio); + folio_wait_writeback(bd_folio); folio_clear_dirty_for_io(bd_folio); folio_start_writeback(bd_folio); folio_unlock(bd_folio); @@ -1681,6 +1683,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci) } if (bd_folio) { folio_lock(bd_folio); + folio_wait_writeback(bd_folio); folio_clear_dirty_for_io(bd_folio); folio_start_writeback(bd_folio); folio_unlock(bd_folio); From 7cc5a5d65011983952a9c62f170f5b79e24b1239 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 29 May 2024 18:18:12 -0400 Subject: [PATCH 0652/1578] mm: page_alloc: fix highatomic typing in multi-block buddies Christoph reports a page allocator splat triggered by xfstests: generic/176 214s ... [ 1204.507931] run fstests generic/176 at 2024-05-27 12:52:30 XFS (nvme0n1): Mounting V5 Filesystem cd936307-415f-48a3-b99d-a2d52ae1f273 XFS (nvme0n1): Ending clean mount XFS (nvme1n1): Mounting V5 Filesystem ab3ee1a4-af62-4934-9a6a-6c2fde321850 XFS (nvme1n1): Ending clean mount XFS (nvme1n1): Unmounting Filesystem ab3ee1a4-af62-4934-9a6a-6c2fde321850 XFS (nvme1n1): Mounting V5 Filesystem 7099b02d-9c58-4d1d-be1d-2cc472d12cd9 XFS (nvme1n1): Ending clean mount ------------[ cut here ]------------ page type is 3, passed migratetype is 1 (nr=512) WARNING: CPU: 0 PID: 509870 at mm/page_alloc.c:645 expand+0x1c5/0x1f0 Modules linked in: i2c_i801 crc32_pclmul i2c_smbus [last unloaded: scsi_debug] CPU: 0 PID: 509870 Comm: xfs_io Not tainted 6.10.0-rc1+ #2437 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 RIP: 0010:expand+0x1c5/0x1f0 Code: 05 16 70 bf 02 01 e8 ca fc ff ff 8b 54 24 34 44 89 e1 48 c7 c7 80 a2 28 83 48 89 c6 b8 01 00 3 RSP: 0018:ffffc90003b2b968 EFLAGS: 00010082 RAX: 0000000000000000 RBX: ffffffff83fa9480 RCX: 0000000000000000 RDX: 0000000000000005 RSI: 0000000000000027 RDI: 00000000ffffffff RBP: 00000000001f2600 R08: 00000000fffeffff R09: 0000000000000001 R10: 0000000000000000 R11: ffffffff83676200 R12: 0000000000000009 R13: 0000000000000200 R14: 0000000000000001 R15: ffffea0007c98000 FS: 00007f72ca3d5780(0000) GS:ffff8881f9c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f72ca1fff38 CR3: 00000001aa0c6002 CR4: 0000000000770ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff07f0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? __warn+0x7b/0x120 ? expand+0x1c5/0x1f0 ? report_bug+0x191/0x1c0 ? handle_bug+0x3c/0x80 ? exc_invalid_op+0x17/0x70 ? asm_exc_invalid_op+0x1a/0x20 ? expand+0x1c5/0x1f0 ? expand+0x1c5/0x1f0 __rmqueue_pcplist+0x3a9/0x730 get_page_from_freelist+0x7a0/0xf00 __alloc_pages_noprof+0x153/0x2e0 __folio_alloc_noprof+0x10/0xa0 __filemap_get_folio+0x16b/0x370 iomap_write_begin+0x496/0x680 While trying to service a movable allocation (page type 1), the page allocator runs into a two-pageblock buddy on the movable freelist whose second block is typed as highatomic (page type 3). This inconsistency is caused by the highatomic reservation system operating on single pageblocks, while MAX_ORDER can be bigger than that - in this configuration, pageblock_order is 9 while MAX_PAGE_ORDER is 10. The test case is observed to make several adjacent order-3 requests with __GFP_DIRECT_RECLAIM cleared, which marks the surrounding block as highatomic. Upon freeing, the blocks merge into an order-10 buddy. When the highatomic pool is drained later on, this order-10 buddy gets moved back to the movable list, but only the first pageblock is marked movable again. A subsequent expand() of this buddy warns about the tail being of a different type. This is a long-standing bug that's surfaced by the recent block type warnings added to the allocator. The consequences seem mostly benign, it just results in odd behavior: the highatomic tail blocks are not properly drained, instead they end up on the movable list first, then go back to the highatomic list after an alloc-free cycle. To fix this, make the highatomic reservation code aware that allocations/buddies can be larger than a pageblock. While it's an old quirk, the recently added type consistency warnings seem to be the most prominent consequence of it. Set the Fixes: tag accordingly to highlight this backporting dependency. Link: https://lkml.kernel.org/r/20240530114203.GA1222079@cmpxchg.org Fixes: e0932b6c1f94 ("mm: page_alloc: consolidate free page accounting") Signed-off-by: Johannes Weiner Reported-by: Christoph Hellwig Reviewed-by: Zi Yan Tested-by: Christoph Hellwig Cc: Andy Shevchenko Cc: Baolin Wang Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_alloc.c | 50 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2e22ce5675ca1a..222299b5c0e6af 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1955,10 +1955,12 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, } /* - * Reserve a pageblock for exclusive use of high-order atomic allocations if - * there are no empty page blocks that contain a page with a suitable order + * Reserve the pageblock(s) surrounding an allocation request for + * exclusive use of high-order atomic allocations if there are no + * empty page blocks that contain a page with a suitable order */ -static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) +static void reserve_highatomic_pageblock(struct page *page, int order, + struct zone *zone) { int mt; unsigned long max_managed, flags; @@ -1984,10 +1986,17 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) /* Yoink! */ mt = get_pageblock_migratetype(page); /* Only reserve normal pageblocks (i.e., they can merge with others) */ - if (migratetype_is_mergeable(mt)) - if (move_freepages_block(zone, page, mt, - MIGRATE_HIGHATOMIC) != -1) - zone->nr_reserved_highatomic += pageblock_nr_pages; + if (!migratetype_is_mergeable(mt)) + goto out_unlock; + + if (order < pageblock_order) { + if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1) + goto out_unlock; + zone->nr_reserved_highatomic += pageblock_nr_pages; + } else { + change_pageblock_range(page, order, MIGRATE_HIGHATOMIC); + zone->nr_reserved_highatomic += 1 << order; + } out_unlock: spin_unlock_irqrestore(&zone->lock, flags); @@ -1999,7 +2008,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) * intense memory pressure but failed atomic allocations should be easier * to recover from than an OOM. * - * If @force is true, try to unreserve a pageblock even though highatomic + * If @force is true, try to unreserve pageblocks even though highatomic * pageblock is exhausted. */ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, @@ -2041,6 +2050,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * adjust the count once. */ if (is_migrate_highatomic(mt)) { + unsigned long size; /* * It should never happen but changes to * locking could inadvertently allow a per-cpu @@ -2048,9 +2058,9 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * while unreserving so be safe and watch for * underflows. */ - zone->nr_reserved_highatomic -= min( - pageblock_nr_pages, - zone->nr_reserved_highatomic); + size = max(pageblock_nr_pages, 1UL << order); + size = min(size, zone->nr_reserved_highatomic); + zone->nr_reserved_highatomic -= size; } /* @@ -2062,11 +2072,19 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * of pageblocks that cannot be completely freed * may increase. */ - ret = move_freepages_block(zone, page, mt, - ac->migratetype); + if (order < pageblock_order) + ret = move_freepages_block(zone, page, mt, + ac->migratetype); + else { + move_to_free_list(page, zone, order, mt, + ac->migratetype); + change_pageblock_range(page, order, + ac->migratetype); + ret = 1; + } /* - * Reserving this block already succeeded, so this should - * not fail on zone boundaries. + * Reserving the block(s) already succeeded, + * so this should not fail on zone boundaries. */ WARN_ON_ONCE(ret == -1); if (ret > 0) { @@ -3406,7 +3424,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, * if the pageblock should be reserved for the future */ if (unlikely(alloc_flags & ALLOC_HIGHATOMIC)) - reserve_highatomic_pageblock(page, zone); + reserve_highatomic_pageblock(page, order, zone); return page; } else { From 0105eaabb27f31d9b8d340aca6fb6a3420cab30f Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 28 May 2024 09:08:38 -0700 Subject: [PATCH 0653/1578] vmalloc: check CONFIG_EXECMEM in is_vmalloc_or_module_addr() After commit 2c9e5d4a0082 ("bpf: remove CONFIG_BPF_JIT dependency on CONFIG_MODULES of") CONFIG_BPF_JIT does not depend on CONFIG_MODULES any more and bpf jit also uses the [MODULES_VADDR, MODULES_END] memory region. But is_vmalloc_or_module_addr() still checks CONFIG_MODULES, which then returns false for a bpf jit memory region when CONFIG_MODULES is not defined. It leads to the following kernel BUG: [ 1.567023] ------------[ cut here ]------------ [ 1.567883] kernel BUG at mm/vmalloc.c:745! [ 1.568477] Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI [ 1.569367] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.9.0+ #448 [ 1.570247] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014 [ 1.570786] RIP: 0010:vmalloc_to_page+0x48/0x1ec [ 1.570786] Code: 0f 00 00 e8 eb 1a 05 00 b8 37 00 00 00 48 ba fe ff ff ff ff 1f 00 00 4c 03 25 76 49 c6 02 48 c1 e0 28 48 01 e8 48 39 d0 76 02 <0f> 0b 4c 89 e7 e8 bf 1a 05 00 49 8b 04 24 48 a9 9f ff ff ff 0f 84 [ 1.570786] RSP: 0018:ffff888007787960 EFLAGS: 00010212 [ 1.570786] RAX: 000036ffa0000000 RBX: 0000000000000640 RCX: ffffffff8147e93c [ 1.570786] RDX: 00001ffffffffffe RSI: dffffc0000000000 RDI: ffffffff840e32c8 [ 1.570786] RBP: ffffffffa0000000 R08: 0000000000000000 R09: 0000000000000000 [ 1.570786] R10: ffff888007787a88 R11: ffffffff8475d8e7 R12: ffffffff83e80ff8 [ 1.570786] R13: 0000000000000640 R14: 0000000000000640 R15: 0000000000000640 [ 1.570786] FS: 0000000000000000(0000) GS:ffff88806cc00000(0000) knlGS:0000000000000000 [ 1.570786] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1.570786] CR2: ffff888006a01000 CR3: 0000000003e80000 CR4: 0000000000350ef0 [ 1.570786] Call Trace: [ 1.570786] [ 1.570786] ? __die_body+0x1b/0x58 [ 1.570786] ? die+0x31/0x4b [ 1.570786] ? do_trap+0x9d/0x138 [ 1.570786] ? vmalloc_to_page+0x48/0x1ec [ 1.570786] ? do_error_trap+0xcd/0x102 [ 1.570786] ? vmalloc_to_page+0x48/0x1ec [ 1.570786] ? vmalloc_to_page+0x48/0x1ec [ 1.570786] ? handle_invalid_op+0x2f/0x38 [ 1.570786] ? vmalloc_to_page+0x48/0x1ec [ 1.570786] ? exc_invalid_op+0x2b/0x41 [ 1.570786] ? asm_exc_invalid_op+0x16/0x20 [ 1.570786] ? vmalloc_to_page+0x26/0x1ec [ 1.570786] ? vmalloc_to_page+0x48/0x1ec [ 1.570786] __text_poke+0xb6/0x458 [ 1.570786] ? __pfx_text_poke_memcpy+0x10/0x10 [ 1.570786] ? __pfx___mutex_lock+0x10/0x10 [ 1.570786] ? __pfx___text_poke+0x10/0x10 [ 1.570786] ? __pfx_get_random_u32+0x10/0x10 [ 1.570786] ? srso_return_thunk+0x5/0x5f [ 1.570786] text_poke_copy_locked+0x70/0x84 [ 1.570786] text_poke_copy+0x32/0x4f [ 1.570786] bpf_arch_text_copy+0xf/0x27 [ 1.570786] bpf_jit_binary_pack_finalize+0x26/0x5a [ 1.570786] bpf_int_jit_compile+0x576/0x8ad [ 1.570786] ? __pfx_bpf_int_jit_compile+0x10/0x10 [ 1.570786] ? srso_return_thunk+0x5/0x5f [ 1.570786] ? __kmalloc_node_track_caller+0x2b5/0x2e0 [ 1.570786] bpf_prog_select_runtime+0x7c/0x199 [ 1.570786] bpf_prepare_filter+0x1e9/0x25b [ 1.570786] ? __pfx_bpf_prepare_filter+0x10/0x10 [ 1.570786] ? srso_return_thunk+0x5/0x5f [ 1.570786] ? _find_next_bit+0x29/0x7e [ 1.570786] bpf_prog_create+0xb8/0xe0 [ 1.570786] ptp_classifier_init+0x75/0xa1 [ 1.570786] ? __pfx_ptp_classifier_init+0x10/0x10 [ 1.570786] ? srso_return_thunk+0x5/0x5f [ 1.570786] ? register_pernet_subsys+0x36/0x42 [ 1.570786] ? srso_return_thunk+0x5/0x5f [ 1.570786] sock_init+0x99/0xa3 [ 1.570786] ? __pfx_sock_init+0x10/0x10 [ 1.570786] do_one_initcall+0x104/0x2c4 [ 1.570786] ? __pfx_do_one_initcall+0x10/0x10 [ 1.570786] ? parameq+0x25/0x2d [ 1.570786] ? rcu_is_watching+0x1c/0x3c [ 1.570786] ? trace_kmalloc+0x81/0xb2 [ 1.570786] ? srso_return_thunk+0x5/0x5f [ 1.570786] ? __kmalloc+0x29c/0x2c7 [ 1.570786] ? srso_return_thunk+0x5/0x5f [ 1.570786] do_initcalls+0xf9/0x123 [ 1.570786] kernel_init_freeable+0x24f/0x289 [ 1.570786] ? __pfx_kernel_init+0x10/0x10 [ 1.570786] kernel_init+0x19/0x13a [ 1.570786] ret_from_fork+0x24/0x41 [ 1.570786] ? __pfx_kernel_init+0x10/0x10 [ 1.570786] ret_from_fork_asm+0x1a/0x30 [ 1.570786] [ 1.570819] ---[ end trace 0000000000000000 ]--- [ 1.571463] RIP: 0010:vmalloc_to_page+0x48/0x1ec [ 1.572111] Code: 0f 00 00 e8 eb 1a 05 00 b8 37 00 00 00 48 ba fe ff ff ff ff 1f 00 00 4c 03 25 76 49 c6 02 48 c1 e0 28 48 01 e8 48 39 d0 76 02 <0f> 0b 4c 89 e7 e8 bf 1a 05 00 49 8b 04 24 48 a9 9f ff ff ff 0f 84 [ 1.574632] RSP: 0018:ffff888007787960 EFLAGS: 00010212 [ 1.575129] RAX: 000036ffa0000000 RBX: 0000000000000640 RCX: ffffffff8147e93c [ 1.576097] RDX: 00001ffffffffffe RSI: dffffc0000000000 RDI: ffffffff840e32c8 [ 1.577084] RBP: ffffffffa0000000 R08: 0000000000000000 R09: 0000000000000000 [ 1.578077] R10: ffff888007787a88 R11: ffffffff8475d8e7 R12: ffffffff83e80ff8 [ 1.578810] R13: 0000000000000640 R14: 0000000000000640 R15: 0000000000000640 [ 1.579823] FS: 0000000000000000(0000) GS:ffff88806cc00000(0000) knlGS:0000000000000000 [ 1.580992] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1.581869] CR2: ffff888006a01000 CR3: 0000000003e80000 CR4: 0000000000350ef0 [ 1.582800] Kernel panic - not syncing: Fatal exception [ 1.583765] ---[ end Kernel panic - not syncing: Fatal exception ]--- Fix this by checking CONFIG_EXECMEM instead. Link: https://lkml.kernel.org/r/20240528160838.102223-1-xiyou.wangcong@gmail.com Fixes: 2c9e5d4a0082 ("bpf: remove CONFIG_BPF_JIT dependency on CONFIG_MODULES of") Signed-off-by: Cong Wang Acked-by: Mike Rapoport (IBM) Cc: Luis Chamberlain Cc: Daniel Borkmann Signed-off-by: Andrew Morton --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5d3aa2dc88a83f..45e1506d58c3a4 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -722,7 +722,7 @@ int is_vmalloc_or_module_addr(const void *x) * and fall back on vmalloc() if that fails. Others * just put it in the vmalloc space. */ -#if defined(CONFIG_MODULES) && defined(MODULES_VADDR) +#if defined(CONFIG_EXECMEM) && defined(MODULES_VADDR) unsigned long addr = (unsigned long)kasan_reset_tag(x); if (addr >= MODULES_VADDR && addr < MODULES_END) return 1; From 2ef3cec44c60ae171b287db7fc2aa341586d65ba Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Tue, 28 May 2024 12:48:06 +0200 Subject: [PATCH 0654/1578] kmsan: do not wipe out origin when doing partial unpoisoning As noticed by Brian, KMSAN should not be zeroing the origin when unpoisoning parts of a four-byte uninitialized value, e.g.: char a[4]; kmsan_unpoison_memory(a, 1); This led to false negatives, as certain poisoned values could receive zero origins, preventing those values from being reported. To fix the problem, check that kmsan_internal_set_shadow_origin() writes zero origins only to slots which have zero shadow. Link: https://lkml.kernel.org/r/20240528104807.738758-1-glider@google.com Fixes: f80be4571b19 ("kmsan: add KMSAN runtime core") Signed-off-by: Alexander Potapenko Reported-by: Brian Johannesmeyer Link: https://lore.kernel.org/lkml/20240524232804.1984355-1-bjohannesmeyer@gmail.com/T/ Reviewed-by: Marco Elver Tested-by: Brian Johannesmeyer Cc: Dmitry Vyukov Cc: Kees Cook Cc: Signed-off-by: Andrew Morton --- mm/kmsan/core.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c index cf2d70e9c9a5ff..95f859e38c533d 100644 --- a/mm/kmsan/core.c +++ b/mm/kmsan/core.c @@ -196,8 +196,7 @@ void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b, u32 origin, bool checked) { u64 address = (u64)addr; - void *shadow_start; - u32 *origin_start; + u32 *shadow_start, *origin_start; size_t pad = 0; KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(addr, size)); @@ -225,8 +224,16 @@ void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b, origin_start = (u32 *)kmsan_get_metadata((void *)address, KMSAN_META_ORIGIN); - for (int i = 0; i < size / KMSAN_ORIGIN_SIZE; i++) - origin_start[i] = origin; + /* + * If the new origin is non-zero, assume that the shadow byte is also non-zero, + * and unconditionally overwrite the old origin slot. + * If the new origin is zero, overwrite the old origin slot iff the + * corresponding shadow slot is zero. + */ + for (int i = 0; i < size / KMSAN_ORIGIN_SIZE; i++) { + if (origin || !shadow_start[i]) + origin_start[i] = origin; + } } struct page *kmsan_vmalloc_to_page_or_null(void *vaddr) From 730cdc2c72c6905a2eda2fccbbf67dcef1206590 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Tue, 28 May 2024 13:15:21 +0800 Subject: [PATCH 0655/1578] mm/ksm: fix ksm_pages_scanned accounting Patch series "mm/ksm: fix some accounting problems", v3. We encountered some abnormal ksm_pages_scanned and ksm_zero_pages during some random tests. 1. ksm_pages_scanned unchanged even ksmd scanning has progress. 2. ksm_zero_pages maybe -1 in some rare cases. This patch (of 2): During testing, I found ksm_pages_scanned is unchanged although the scan_get_next_rmap_item() did return valid rmap_item that is not NULL. The reason is the scan_get_next_rmap_item() will return NULL after a full scan, so ksm_do_scan() just return without accounting of the ksm_pages_scanned. Fix it by just putting ksm_pages_scanned accounting in that loop, and it will be accounted more timely if that loop would last for a long time. Link: https://lkml.kernel.org/r/20240528-b4-ksm-counters-v3-0-34bb358fdc13@linux.dev Link: https://lkml.kernel.org/r/20240528-b4-ksm-counters-v3-1-34bb358fdc13@linux.dev Fixes: b348b5fe2b5f ("mm/ksm: add pages scanned metric") Signed-off-by: Chengming Zhou Acked-by: David Hildenbrand Reviewed-by: xu xin Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Ran Xiaokai Cc: Stefan Roesch Cc: Yang Yang Cc: Signed-off-by: Andrew Morton --- mm/ksm.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 452ac8346e6e28..9e99cb12d33008 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2754,18 +2754,16 @@ static void ksm_do_scan(unsigned int scan_npages) { struct ksm_rmap_item *rmap_item; struct page *page; - unsigned int npages = scan_npages; - while (npages-- && likely(!freezing(current))) { + while (scan_npages-- && likely(!freezing(current))) { cond_resched(); rmap_item = scan_get_next_rmap_item(&page); if (!rmap_item) return; cmp_and_merge_page(page, rmap_item); put_page(page); + ksm_pages_scanned++; } - - ksm_pages_scanned += scan_npages - npages; } static int ksmd_should_run(void) From c2dc78b86e0821ecf9a9d0c35dba2618279a5bb6 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Tue, 28 May 2024 13:15:22 +0800 Subject: [PATCH 0656/1578] mm/ksm: fix ksm_zero_pages accounting We normally ksm_zero_pages++ in ksmd when page is merged with zero page, but ksm_zero_pages-- is done from page tables side, where there is no any accessing protection of ksm_zero_pages. So we can read very exceptional value of ksm_zero_pages in rare cases, such as -1, which is very confusing to users. Fix it by changing to use atomic_long_t, and the same case with the mm->ksm_zero_pages. Link: https://lkml.kernel.org/r/20240528-b4-ksm-counters-v3-2-34bb358fdc13@linux.dev Fixes: e2942062e01d ("ksm: count all zero pages placed by KSM") Fixes: 6080d19f0704 ("ksm: add ksm zero pages for each process") Signed-off-by: Chengming Zhou Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Ran Xiaokai Cc: Stefan Roesch Cc: xu xin Cc: Yang Yang Cc: Signed-off-by: Andrew Morton --- fs/proc/base.c | 2 +- include/linux/ksm.h | 17 ++++++++++++++--- include/linux/mm_types.h | 2 +- mm/ksm.c | 11 +++++------ 4 files changed, 21 insertions(+), 11 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 18550c071d71c7..72a1acd03675cc 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3214,7 +3214,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, mm = get_task_mm(task); if (mm) { seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); - seq_printf(m, "ksm_zero_pages %lu\n", mm->ksm_zero_pages); + seq_printf(m, "ksm_zero_pages %ld\n", mm_ksm_zero_pages(mm)); seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages); seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm)); mmput(mm); diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 52c63a9c5a9cdf..11690dacd98684 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -33,16 +33,27 @@ void __ksm_exit(struct mm_struct *mm); */ #define is_ksm_zero_pte(pte) (is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte)) -extern unsigned long ksm_zero_pages; +extern atomic_long_t ksm_zero_pages; + +static inline void ksm_map_zero_page(struct mm_struct *mm) +{ + atomic_long_inc(&ksm_zero_pages); + atomic_long_inc(&mm->ksm_zero_pages); +} static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte) { if (is_ksm_zero_pte(pte)) { - ksm_zero_pages--; - mm->ksm_zero_pages--; + atomic_long_dec(&ksm_zero_pages); + atomic_long_dec(&mm->ksm_zero_pages); } } +static inline long mm_ksm_zero_pages(struct mm_struct *mm) +{ + return atomic_long_read(&mm->ksm_zero_pages); +} + static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 24323c7d0bd484..af3a0256fa93be 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -985,7 +985,7 @@ struct mm_struct { * Represent how many empty pages are merged with kernel zero * pages when enabling KSM use_zero_pages. */ - unsigned long ksm_zero_pages; + atomic_long_t ksm_zero_pages; #endif /* CONFIG_KSM */ #ifdef CONFIG_LRU_GEN_WALKS_MMU struct { diff --git a/mm/ksm.c b/mm/ksm.c index 9e99cb12d33008..34c4820e0d3dfa 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -296,7 +296,7 @@ static bool ksm_use_zero_pages __read_mostly; static bool ksm_smart_scan = true; /* The number of zero pages which is placed by KSM */ -unsigned long ksm_zero_pages; +atomic_long_t ksm_zero_pages = ATOMIC_LONG_INIT(0); /* The number of pages that have been skipped due to "smart scanning" */ static unsigned long ksm_pages_skipped; @@ -1429,8 +1429,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, * the dirty bit in zero page's PTE is set. */ newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot))); - ksm_zero_pages++; - mm->ksm_zero_pages++; + ksm_map_zero_page(mm); /* * We're replacing an anonymous page with a zero page, which is * not anonymous. We need to do proper accounting otherwise we @@ -3374,7 +3373,7 @@ static void wait_while_offlining(void) #ifdef CONFIG_PROC_FS long ksm_process_profit(struct mm_struct *mm) { - return (long)(mm->ksm_merging_pages + mm->ksm_zero_pages) * PAGE_SIZE - + return (long)(mm->ksm_merging_pages + mm_ksm_zero_pages(mm)) * PAGE_SIZE - mm->ksm_rmap_items * sizeof(struct ksm_rmap_item); } #endif /* CONFIG_PROC_FS */ @@ -3663,7 +3662,7 @@ KSM_ATTR_RO(pages_skipped); static ssize_t ksm_zero_pages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sysfs_emit(buf, "%ld\n", ksm_zero_pages); + return sysfs_emit(buf, "%ld\n", atomic_long_read(&ksm_zero_pages)); } KSM_ATTR_RO(ksm_zero_pages); @@ -3672,7 +3671,7 @@ static ssize_t general_profit_show(struct kobject *kobj, { long general_profit; - general_profit = (ksm_pages_sharing + ksm_zero_pages) * PAGE_SIZE - + general_profit = (ksm_pages_sharing + atomic_long_read(&ksm_zero_pages)) * PAGE_SIZE - ksm_rmap_items * sizeof(struct ksm_rmap_item); return sysfs_emit(buf, "%ld\n", general_profit); From 8daf9c702ee7f825f0de8600abff764acfedea13 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 28 May 2024 22:53:23 +0200 Subject: [PATCH 0657/1578] mm/hugetlb: do not call vma_add_reservation upon ENOMEM sysbot reported a splat [1] on __unmap_hugepage_range(). This is because vma_needs_reservation() can return -ENOMEM if allocate_file_region_entries() fails to allocate the file_region struct for the reservation. Check for that and do not call vma_add_reservation() if that is the case, otherwise region_abort() and region_del() will see that we do not have any file_regions. If we detect that vma_needs_reservation() returned -ENOMEM, we clear the hugetlb_restore_reserve flag as if this reservation was still consumed, so free_huge_folio() will not increment the resv count. [1] https://lore.kernel.org/linux-mm/0000000000004096100617c58d54@google.com/T/#ma5983bc1ab18a54910da83416b3f89f3c7ee43aa Link: https://lkml.kernel.org/r/20240528205323.20439-1-osalvador@suse.de Fixes: df7a6d1f6405 ("mm/hugetlb: restore the reservation if needed") Signed-off-by: Oscar Salvador Reported-and-tested-by: syzbot+d3fe2dc5ffe9380b714b@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-mm/0000000000004096100617c58d54@google.com/ Cc: Breno Leitao Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- mm/hugetlb.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6be78e7d4f6e05..f35abff8be60f8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5768,8 +5768,20 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * do_exit() will not see it, and will keep the reservation * forever. */ - if (adjust_reservation && vma_needs_reservation(h, vma, address)) - vma_add_reservation(h, vma, address); + if (adjust_reservation) { + int rc = vma_needs_reservation(h, vma, address); + + if (rc < 0) + /* Pressumably allocate_file_region_entries failed + * to allocate a file_region struct. Clear + * hugetlb_restore_reserve so that global reserve + * count will not be incremented by free_huge_folio. + * Act as if we consumed the reservation. + */ + folio_clear_hugetlb_restore_reserve(page_folio(page)); + else if (rc) + vma_add_reservation(h, vma, address); + } tlb_remove_page_size(tlb, page, huge_page_size(h)); /* From 3f0c44c8c21cfa3bb6b756b939491b7a60932cd1 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Mon, 27 May 2024 15:30:06 -0300 Subject: [PATCH 0658/1578] codetag: avoid race at alloc_slab_obj_exts When CONFIG_MEM_ALLOC_PROFILING_DEBUG is enabled, the following warning may be noticed: [ 48.299584] ------------[ cut here ]------------ [ 48.300092] alloc_tag was not set [ 48.300528] WARNING: CPU: 2 PID: 1361 at include/linux/alloc_tag.h:130 alloc_tagging_slab_free_hook+0x84/0xc7 [ 48.301305] Modules linked in: [ 48.301553] CPU: 2 PID: 1361 Comm: systemd-udevd Not tainted 6.10.0-rc1-00003-gac8755535862 #176 [ 48.302196] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 [ 48.302752] RIP: 0010:alloc_tagging_slab_free_hook+0x84/0xc7 [ 48.303169] Code: 8d 1c c4 48 85 db 74 4d 48 83 3b 00 75 1e 80 3d 65 02 86 04 00 75 15 48 c7 c7 11 48 1d 85 c6 05 55 02 86 04 01 e8 64 44 a5 ff <0f> 0b 48 8b 03 48 85 c0 74 21 48 83 f8 01 74 14 48 8b 50 20 48 f7 [ 48.304411] RSP: 0018:ffff8880111b7d40 EFLAGS: 00010282 [ 48.304916] RAX: 0000000000000000 RBX: ffff88800fcc9008 RCX: 0000000000000000 [ 48.305455] RDX: 0000000080000000 RSI: ffff888014060000 RDI: ffffed1002236f97 [ 48.305979] RBP: 0000000000001100 R08: fffffbfff0aa73a1 R09: 0000000000000000 [ 48.306473] R10: ffffffff814515e5 R11: 0000000000000003 R12: ffff88800fcc9000 [ 48.306943] R13: ffff88800b2e5cc0 R14: ffff8880111b7d90 R15: 0000000000000000 [ 48.307529] FS: 00007faf5d1908c0(0000) GS:ffff88806cf00000(0000) knlGS:0000000000000000 [ 48.308223] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 48.308710] CR2: 000058fb220c9118 CR3: 00000000110cc000 CR4: 0000000000750ef0 [ 48.309274] PKRU: 55555554 [ 48.309804] Call Trace: [ 48.310029] [ 48.310290] ? show_regs+0x84/0x8d [ 48.310722] ? alloc_tagging_slab_free_hook+0x84/0xc7 [ 48.311298] ? __warn+0x13b/0x2ff [ 48.311580] ? alloc_tagging_slab_free_hook+0x84/0xc7 [ 48.311987] ? report_bug+0x2ce/0x3ab [ 48.312292] ? handle_bug+0x8c/0x107 [ 48.312563] ? exc_invalid_op+0x34/0x6f [ 48.312842] ? asm_exc_invalid_op+0x1a/0x20 [ 48.313173] ? this_cpu_in_panic+0x1c/0x72 [ 48.313503] ? alloc_tagging_slab_free_hook+0x84/0xc7 [ 48.313880] ? putname+0x143/0x14e [ 48.314152] kmem_cache_free+0xe9/0x214 [ 48.314454] putname+0x143/0x14e [ 48.314712] do_unlinkat+0x413/0x45e [ 48.315001] ? __pfx_do_unlinkat+0x10/0x10 [ 48.315388] ? __check_object_size+0x4d7/0x525 [ 48.315744] ? __sanitizer_cov_trace_pc+0x20/0x4a [ 48.316167] ? __sanitizer_cov_trace_pc+0x20/0x4a [ 48.316757] ? getname_flags+0x4ed/0x500 [ 48.317261] __x64_sys_unlink+0x42/0x4a [ 48.317741] do_syscall_64+0xe2/0x149 [ 48.318171] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 48.318602] RIP: 0033:0x7faf5d8850ab [ 48.318891] Code: fd ff ff e8 27 dd 01 00 0f 1f 80 00 00 00 00 f3 0f 1e fa b8 5f 00 00 00 0f 05 c3 0f 1f 40 00 f3 0f 1e fa b8 57 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 05 c3 0f 1f 40 00 48 8b 15 41 2d 0e 00 f7 d8 [ 48.320649] RSP: 002b:00007ffc44982b38 EFLAGS: 00000246 ORIG_RAX: 0000000000000057 [ 48.321182] RAX: ffffffffffffffda RBX: 00005ba344a44680 RCX: 00007faf5d8850ab [ 48.321667] RDX: 0000000000000000 RSI: 00005ba344a44430 RDI: 00007ffc44982b40 [ 48.322139] RBP: 00007ffc44982c00 R08: 0000000000000000 R09: 0000000000000007 [ 48.322598] R10: 00005ba344a44430 R11: 0000000000000246 R12: 0000000000000000 [ 48.323071] R13: 00007ffc44982b40 R14: 0000000000000000 R15: 0000000000000000 [ 48.323596] This is due to a race when two objects are allocated from the same slab, which did not have an obj_exts allocated for. In such a case, the two threads will notice the NULL obj_exts and after one assigns slab->obj_exts, the second one will happily do the exchange if it reads this new assigned value. In order to avoid that, verify that the read obj_exts does not point to an allocated obj_exts before doing the exchange. Link: https://lkml.kernel.org/r/20240527183007.1595037-1-cascardo@igalia.com Fixes: 09c46563ff6d ("codetag: debug: introduce OBJEXTS_ALLOC_FAIL to mark failed slab_ext allocations") Signed-off-by: Thadeu Lima de Souza Cascardo Acked-by: Vlastimil Babka Cc: Suren Baghdasaryan Cc: Christoph Lameter Cc: David Rientjes Cc: Gustavo A. R. Silva Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Kees Cook Cc: Pekka Enberg Cc: Roman Gushchin Cc: Thadeu Lima de Souza Cascardo Cc: Kent Overstreet Signed-off-by: Andrew Morton --- mm/slub.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 0809760cf789d1..1373ac365a46f4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1952,7 +1952,7 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, #ifdef CONFIG_MEMCG new_exts |= MEMCG_DATA_OBJEXTS; #endif - old_exts = slab->obj_exts; + old_exts = READ_ONCE(slab->obj_exts); handle_failed_objexts_alloc(old_exts, vec, objects); if (new_slab) { /* @@ -1961,7 +1961,8 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, * be simply assigned. */ slab->obj_exts = new_exts; - } else if (cmpxchg(&slab->obj_exts, old_exts, new_exts) != old_exts) { + } else if ((old_exts & ~OBJEXTS_FLAGS_MASK) || + cmpxchg(&slab->obj_exts, old_exts, new_exts) != old_exts) { /* * If the slab is already in use, somebody can allocate and * assign slabobj_exts in parallel. In this case the existing From 9415983599413f847ec9f081e9f9e5ed6cdeb342 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 31 May 2024 13:53:50 -0700 Subject: [PATCH 0659/1578] mm: fix xyz_noprof functions calling profiled functions Grepping /proc/allocinfo for "noprof" reveals several xyz_noprof functions, which means internally they are calling profiled functions. This should never happen as such calls move allocation charge from a higher level location where it should be accounted for into these lower level helpers. Fix this by replacing profiled function calls with noprof ones. Link: https://lkml.kernel.org/r/20240531205350.3973009-1-surenb@google.com Fixes: b951aaff5035 ("mm: enable page allocation tagging") Fixes: e26d8769da6d ("mempool: hook up to memory allocation profiling") Fixes: 88ae5fb755b0 ("mm: vmalloc: enable memory allocation profiling") Signed-off-by: Suren Baghdasaryan Cc: Kent Overstreet Reviewed-by: Kees Cook Acked-by: Vlastimil Babka Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- mm/filemap.c | 2 +- mm/mempool.c | 2 +- mm/util.c | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 382c3d06bfb10c..876cc64aadd7ce 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1000,7 +1000,7 @@ struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order) do { cpuset_mems_cookie = read_mems_allowed_begin(); n = cpuset_mem_spread_node(); - folio = __folio_alloc_node(gfp, order, n); + folio = __folio_alloc_node_noprof(gfp, order, n); } while (!folio && read_mems_allowed_retry(cpuset_mems_cookie)); return folio; diff --git a/mm/mempool.c b/mm/mempool.c index 6ece63a00acf58..3223337135d0a4 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -273,7 +273,7 @@ mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn, { mempool_t *pool; - pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id); + pool = kmalloc_node_noprof(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id); if (!pool) return NULL; diff --git a/mm/util.c b/mm/util.c index c9e519e6811f55..6c3e6710e4defa 100644 --- a/mm/util.c +++ b/mm/util.c @@ -705,7 +705,7 @@ void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flag if (oldsize >= newsize) return (void *)p; - newp = kvmalloc(newsize, flags); + newp = kvmalloc_noprof(newsize, flags); if (!newp) return NULL; memcpy(newp, p, oldsize); @@ -726,7 +726,7 @@ void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; - return __vmalloc(bytes, flags); + return __vmalloc_noprof(bytes, flags); } EXPORT_SYMBOL(__vmalloc_array_noprof); @@ -737,7 +737,7 @@ EXPORT_SYMBOL(__vmalloc_array_noprof); */ void *vmalloc_array_noprof(size_t n, size_t size) { - return __vmalloc_array(n, size, GFP_KERNEL); + return __vmalloc_array_noprof(n, size, GFP_KERNEL); } EXPORT_SYMBOL(vmalloc_array_noprof); @@ -749,7 +749,7 @@ EXPORT_SYMBOL(vmalloc_array_noprof); */ void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) { - return __vmalloc_array(n, size, flags | __GFP_ZERO); + return __vmalloc_array_noprof(n, size, flags | __GFP_ZERO); } EXPORT_SYMBOL(__vcalloc_noprof); @@ -760,7 +760,7 @@ EXPORT_SYMBOL(__vcalloc_noprof); */ void *vcalloc_noprof(size_t n, size_t size) { - return __vmalloc_array(n, size, GFP_KERNEL | __GFP_ZERO); + return __vmalloc_array_noprof(n, size, GFP_KERNEL | __GFP_ZERO); } EXPORT_SYMBOL(vcalloc_noprof); From 7373a51e7998b508af7136530f3a997b286ce81c Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Tue, 4 Jun 2024 22:42:55 +0900 Subject: [PATCH 0660/1578] nilfs2: fix nilfs_empty_dir() misjudgment and long loop on I/O errors The error handling in nilfs_empty_dir() when a directory folio/page read fails is incorrect, as in the old ext2 implementation, and if the folio/page cannot be read or nilfs_check_folio() fails, it will falsely determine the directory as empty and corrupt the file system. In addition, since nilfs_empty_dir() does not immediately return on a failed folio/page read, but continues to loop, this can cause a long loop with I/O if i_size of the directory's inode is also corrupted, causing the log writer thread to wait and hang, as reported by syzbot. Fix these issues by making nilfs_empty_dir() immediately return a false value (0) if it fails to get a directory folio/page. Link: https://lkml.kernel.org/r/20240604134255.7165-1-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Reported-by: syzbot+c8166c541d3971bf6c87@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=c8166c541d3971bf6c87 Fixes: 2ba466d74ed7 ("nilfs2: directory entry operations") Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index a002a44ff16175..52e50b1b7f22db 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -607,7 +607,7 @@ int nilfs_empty_dir(struct inode *inode) kaddr = nilfs_get_folio(inode, i, &folio); if (IS_ERR(kaddr)) - continue; + return 0; de = (struct nilfs_dir_entry *)kaddr; kaddr += nilfs_last_byte(inode, i) - NILFS_DIR_REC_LEN(1); From b91e05f1fcf755c9d2c4ca10907383e39def05bd Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 18 May 2024 00:28:58 +0100 Subject: [PATCH 0661/1578] drm/vmwgfx: remove unused struct 'vmw_stdu_dma' 'vmw_stdu_dma' is unused since commit 39985eea5a6d ("drm/vmwgfx: Abstract placement selection") Remove it. Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Zack Rusin Link: https://patchwork.freedesktop.org/patch/msgid/20240517232858.230860-1-linux@treblig.org --- drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c b/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c index afc0bc4ef4e1e0..a04e0736318da6 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c @@ -92,11 +92,6 @@ struct vmw_stdu_update { SVGA3dCmdUpdateGBScreenTarget body; }; -struct vmw_stdu_dma { - SVGA3dCmdHeader header; - SVGA3dCmdSurfaceDMA body; -}; - struct vmw_stdu_surface_copy { SVGA3dCmdHeader header; SVGA3dCmdSurfaceCopy body; From 03e4a092be8ce3de7c1baa7ae14e68b64e3ea644 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 3 Jun 2024 14:42:30 -0700 Subject: [PATCH 0662/1578] ice: fix iteration of TLVs in Preserved Fields Area The ice_get_pfa_module_tlv() function iterates over the Type-Length-Value structures in the Preserved Fields Area (PFA) of the NVM. This is used by the driver to access data such as the Part Board Assembly identifier. The function uses simple logic to iterate over the PFA. First, the pointer to the PFA in the NVM is read. Then the total length of the PFA is read from the first word. A pointer to the first TLV is initialized, and a simple loop iterates over each TLV. The pointer is moved forward through the NVM until it exceeds the PFA area. The logic seems sound, but it is missing a key detail. The Preserved Fields Area length includes one additional final word. This is documented in the device data sheet as a dummy word which contains 0xFFFF. All NVMs have this extra word. If the driver tries to scan for a TLV that is not in the PFA, it will read past the size of the PFA. It reads and interprets the last dummy word of the PFA as a TLV with type 0xFFFF. It then reads the word following the PFA as a length. The PFA resides within the Shadow RAM portion of the NVM, which is relatively small. All of its offsets are within a 16-bit size. The PFA pointer and TLV pointer are stored by the driver as 16-bit values. In almost all cases, the word following the PFA will be such that interpreting it as a length will result in 16-bit arithmetic overflow. Once overflowed, the new next_tlv value is now below the maximum offset of the PFA. Thus, the driver will continue to iterate the data as TLVs. In the worst case, the driver hits on a sequence of reads which loop back to reading the same offsets in an endless loop. To fix this, we need to correct the loop iteration check to account for this extra word at the end of the PFA. This alone is sufficient to resolve the known cases of this issue in the field. However, it is plausible that an NVM could be misconfigured or have corrupt data which results in the same kind of overflow. Protect against this by using check_add_overflow when calculating both the maximum offset of the TLVs, and when calculating the next_tlv offset at the end of each loop iteration. This ensures that the driver will not get stuck in an infinite loop when scanning the PFA. Fixes: e961b679fb0b ("ice: add board identifier info to devlink .info_get") Co-developed-by: Paul Greenwalt Signed-off-by: Paul Greenwalt Reviewed-by: Przemek Kitszel Tested-by: Pucha Himasekhar Reddy Signed-off-by: Jacob Keller Link: https://lore.kernel.org/r/20240603-net-2024-05-30-intel-net-fixes-v2-1-e3563aa89b0c@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_nvm.c | 28 ++++++++++++++++++------ 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_nvm.c b/drivers/net/ethernet/intel/ice/ice_nvm.c index 84eab92dc03cfe..ea7cbdf8492d6a 100644 --- a/drivers/net/ethernet/intel/ice/ice_nvm.c +++ b/drivers/net/ethernet/intel/ice/ice_nvm.c @@ -440,8 +440,7 @@ int ice_get_pfa_module_tlv(struct ice_hw *hw, u16 *module_tlv, u16 *module_tlv_len, u16 module_type) { - u16 pfa_len, pfa_ptr; - u16 next_tlv; + u16 pfa_len, pfa_ptr, next_tlv, max_tlv; int status; status = ice_read_sr_word(hw, ICE_SR_PFA_PTR, &pfa_ptr); @@ -454,11 +453,23 @@ ice_get_pfa_module_tlv(struct ice_hw *hw, u16 *module_tlv, u16 *module_tlv_len, ice_debug(hw, ICE_DBG_INIT, "Failed to read PFA length.\n"); return status; } + + /* The Preserved Fields Area contains a sequence of Type-Length-Value + * structures which define its contents. The PFA length includes all + * of the TLVs, plus the initial length word itself, *and* one final + * word at the end after all of the TLVs. + */ + if (check_add_overflow(pfa_ptr, pfa_len - 1, &max_tlv)) { + dev_warn(ice_hw_to_dev(hw), "PFA starts at offset %u. PFA length of %u caused 16-bit arithmetic overflow.\n", + pfa_ptr, pfa_len); + return -EINVAL; + } + /* Starting with first TLV after PFA length, iterate through the list * of TLVs to find the requested one. */ next_tlv = pfa_ptr + 1; - while (next_tlv < pfa_ptr + pfa_len) { + while (next_tlv < max_tlv) { u16 tlv_sub_module_type; u16 tlv_len; @@ -482,10 +493,13 @@ ice_get_pfa_module_tlv(struct ice_hw *hw, u16 *module_tlv, u16 *module_tlv_len, } return -EINVAL; } - /* Check next TLV, i.e. current TLV pointer + length + 2 words - * (for current TLV's type and length) - */ - next_tlv = next_tlv + tlv_len + 2; + + if (check_add_overflow(next_tlv, 2, &next_tlv) || + check_add_overflow(next_tlv, tlv_len, &next_tlv)) { + dev_warn(ice_hw_to_dev(hw), "TLV of type %u and length 0x%04x caused 16-bit arithmetic overflow. The PFA starts at 0x%04x and has length of 0x%04x\n", + tlv_sub_module_type, tlv_len, pfa_ptr, pfa_len); + return -EINVAL; + } } /* Module does not exist */ return -ENOENT; From cfa747a66e5da34793ac08c26b814e7709613fab Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 3 Jun 2024 14:42:31 -0700 Subject: [PATCH 0663/1578] ice: fix reads from NVM Shadow RAM on E830 and E825-C devices The ice driver reads data from the Shadow RAM portion of the NVM during initialization, including data used to identify the NVM image and device, such as the ETRACK ID used to populate devlink dev info fw.bundle. Currently it is using a fixed offset defined by ICE_CSS_HEADER_LENGTH to compute the appropriate offset. This worked fine for E810 and E822 devices which both have CSS header length of 330 words. Other devices, including both E825-C and E830 devices have different sizes for their CSS header. The use of a hard coded value results in the driver reading from the wrong block in the NVM when attempting to access the Shadow RAM copy. This results in the driver reporting the fw.bundle as 0x0 in both the devlink dev info and ethtool -i output. The first E830 support was introduced by commit ba20ecb1d1bb ("ice: Hook up 4 E830 devices by adding their IDs") and the first E825-C support was introducted by commit f64e18944233 ("ice: introduce new E825C devices family") The NVM actually contains the CSS header length embedded in it. Remove the hard coded value and replace it with logic to read the length from the NVM directly. This is more resilient against all existing and future hardware, vs looking up the expected values from a table. It ensures the driver will read from the appropriate place when determining the ETRACK ID value used for populating the fw.bundle_id and for reporting in ethtool -i. The CSS header length for both the active and inactive flash bank is stored in the ice_bank_info structure to avoid unnecessary duplicate work when accessing multiple words of the Shadow RAM. Both banks are read in the unlikely event that the header length is different for the NVM in the inactive bank, rather than being different only by the overall device family. Fixes: ba20ecb1d1bb ("ice: Hook up 4 E830 devices by adding their IDs") Co-developed-by: Paul Greenwalt Signed-off-by: Paul Greenwalt Reviewed-by: Przemek Kitszel Tested-by: Pucha Himasekhar Reddy Signed-off-by: Jacob Keller Link: https://lore.kernel.org/r/20240603-net-2024-05-30-intel-net-fixes-v2-2-e3563aa89b0c@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_nvm.c | 88 ++++++++++++++++++++++- drivers/net/ethernet/intel/ice/ice_type.h | 14 ++-- 2 files changed, 93 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_nvm.c b/drivers/net/ethernet/intel/ice/ice_nvm.c index ea7cbdf8492d6a..59e8879ac0598a 100644 --- a/drivers/net/ethernet/intel/ice/ice_nvm.c +++ b/drivers/net/ethernet/intel/ice/ice_nvm.c @@ -374,11 +374,25 @@ ice_read_nvm_module(struct ice_hw *hw, enum ice_bank_select bank, u32 offset, u1 * * Read the specified word from the copy of the Shadow RAM found in the * specified NVM module. + * + * Note that the Shadow RAM copy is always located after the CSS header, and + * is aligned to 64-byte (32-word) offsets. */ static int ice_read_nvm_sr_copy(struct ice_hw *hw, enum ice_bank_select bank, u32 offset, u16 *data) { - return ice_read_nvm_module(hw, bank, ICE_NVM_SR_COPY_WORD_OFFSET + offset, data); + u32 sr_copy; + + switch (bank) { + case ICE_ACTIVE_FLASH_BANK: + sr_copy = roundup(hw->flash.banks.active_css_hdr_len, 32); + break; + case ICE_INACTIVE_FLASH_BANK: + sr_copy = roundup(hw->flash.banks.inactive_css_hdr_len, 32); + break; + } + + return ice_read_nvm_module(hw, bank, sr_copy + offset, data); } /** @@ -1023,6 +1037,72 @@ static int ice_determine_active_flash_banks(struct ice_hw *hw) return 0; } +/** + * ice_get_nvm_css_hdr_len - Read the CSS header length from the NVM CSS header + * @hw: pointer to the HW struct + * @bank: whether to read from the active or inactive flash bank + * @hdr_len: storage for header length in words + * + * Read the CSS header length from the NVM CSS header and add the Authentication + * header size, and then convert to words. + * + * Return: zero on success, or a negative error code on failure. + */ +static int +ice_get_nvm_css_hdr_len(struct ice_hw *hw, enum ice_bank_select bank, + u32 *hdr_len) +{ + u16 hdr_len_l, hdr_len_h; + u32 hdr_len_dword; + int status; + + status = ice_read_nvm_module(hw, bank, ICE_NVM_CSS_HDR_LEN_L, + &hdr_len_l); + if (status) + return status; + + status = ice_read_nvm_module(hw, bank, ICE_NVM_CSS_HDR_LEN_H, + &hdr_len_h); + if (status) + return status; + + /* CSS header length is in DWORD, so convert to words and add + * authentication header size + */ + hdr_len_dword = hdr_len_h << 16 | hdr_len_l; + *hdr_len = (hdr_len_dword * 2) + ICE_NVM_AUTH_HEADER_LEN; + + return 0; +} + +/** + * ice_determine_css_hdr_len - Discover CSS header length for the device + * @hw: pointer to the HW struct + * + * Determine the size of the CSS header at the start of the NVM module. This + * is useful for locating the Shadow RAM copy in the NVM, as the Shadow RAM is + * always located just after the CSS header. + * + * Return: zero on success, or a negative error code on failure. + */ +static int ice_determine_css_hdr_len(struct ice_hw *hw) +{ + struct ice_bank_info *banks = &hw->flash.banks; + int status; + + status = ice_get_nvm_css_hdr_len(hw, ICE_ACTIVE_FLASH_BANK, + &banks->active_css_hdr_len); + if (status) + return status; + + status = ice_get_nvm_css_hdr_len(hw, ICE_INACTIVE_FLASH_BANK, + &banks->inactive_css_hdr_len); + if (status) + return status; + + return 0; +} + /** * ice_init_nvm - initializes NVM setting * @hw: pointer to the HW struct @@ -1069,6 +1149,12 @@ int ice_init_nvm(struct ice_hw *hw) return status; } + status = ice_determine_css_hdr_len(hw); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to determine Shadow RAM copy offsets.\n"); + return status; + } + status = ice_get_nvm_ver_info(hw, ICE_ACTIVE_FLASH_BANK, &flash->nvm); if (status) { ice_debug(hw, ICE_DBG_INIT, "Failed to read NVM info.\n"); diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h index f0796a93f4287e..eef397e5baa07d 100644 --- a/drivers/net/ethernet/intel/ice/ice_type.h +++ b/drivers/net/ethernet/intel/ice/ice_type.h @@ -482,6 +482,8 @@ struct ice_bank_info { u32 orom_size; /* Size of OROM bank */ u32 netlist_ptr; /* Pointer to 1st Netlist bank */ u32 netlist_size; /* Size of Netlist bank */ + u32 active_css_hdr_len; /* Active CSS header length */ + u32 inactive_css_hdr_len; /* Inactive CSS header length */ enum ice_flash_bank nvm_bank; /* Active NVM bank */ enum ice_flash_bank orom_bank; /* Active OROM bank */ enum ice_flash_bank netlist_bank; /* Active Netlist bank */ @@ -1087,17 +1089,13 @@ struct ice_aq_get_set_rss_lut_params { #define ICE_SR_SECTOR_SIZE_IN_WORDS 0x800 /* CSS Header words */ +#define ICE_NVM_CSS_HDR_LEN_L 0x02 +#define ICE_NVM_CSS_HDR_LEN_H 0x03 #define ICE_NVM_CSS_SREV_L 0x14 #define ICE_NVM_CSS_SREV_H 0x15 -/* Length of CSS header section in words */ -#define ICE_CSS_HEADER_LENGTH 330 - -/* Offset of Shadow RAM copy in the NVM bank area. */ -#define ICE_NVM_SR_COPY_WORD_OFFSET roundup(ICE_CSS_HEADER_LENGTH, 32) - -/* Size in bytes of Option ROM trailer */ -#define ICE_NVM_OROM_TRAILER_LENGTH (2 * ICE_CSS_HEADER_LENGTH) +/* Length of Authentication header section in words */ +#define ICE_NVM_AUTH_HEADER_LEN 0x08 /* The Link Topology Netlist section is stored as a series of words. It is * stored in the NVM as a TLV, with the first two words containing the type From adbf5a42341f6ea038d3626cd4437d9f0ad0b2dd Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Mon, 3 Jun 2024 14:42:32 -0700 Subject: [PATCH 0664/1578] ice: remove af_xdp_zc_qps bitmap Referenced commit has introduced a bitmap to distinguish between ZC and copy-mode AF_XDP queues, because xsk_get_pool_from_qid() does not do this for us. The bitmap would be especially useful when restoring previous state after rebuild, if only it was not reallocated in the process. This leads to e.g. xdpsock dying after changing number of queues. Instead of preserving the bitmap during the rebuild, remove it completely and distinguish between ZC and copy-mode queues based on the presence of a device associated with the pool. Fixes: e102db780e1c ("ice: track AF_XDP ZC enabled queues in bitmap") Reviewed-by: Przemek Kitszel Signed-off-by: Larysa Zaremba Reviewed-by: Simon Horman Tested-by: Chandan Kumar Rout Signed-off-by: Jacob Keller Link: https://lore.kernel.org/r/20240603-net-2024-05-30-intel-net-fixes-v2-3-e3563aa89b0c@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice.h | 32 ++++++++++++++++-------- drivers/net/ethernet/intel/ice/ice_lib.c | 8 ------ drivers/net/ethernet/intel/ice/ice_xsk.c | 13 +++++----- 3 files changed, 27 insertions(+), 26 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index 6ad8002b22e14a..5e51cb84b60069 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -409,7 +409,6 @@ struct ice_vsi { struct ice_tc_cfg tc_cfg; struct bpf_prog *xdp_prog; struct ice_tx_ring **xdp_rings; /* XDP ring array */ - unsigned long *af_xdp_zc_qps; /* tracks AF_XDP ZC enabled qps */ u16 num_xdp_txq; /* Used XDP queues */ u8 xdp_mapping_mode; /* ICE_MAP_MODE_[CONTIG|SCATTER] */ @@ -746,6 +745,25 @@ static inline void ice_set_ring_xdp(struct ice_tx_ring *ring) ring->flags |= ICE_TX_FLAGS_RING_XDP; } +/** + * ice_get_xp_from_qid - get ZC XSK buffer pool bound to a queue ID + * @vsi: pointer to VSI + * @qid: index of a queue to look at XSK buff pool presence + * + * Return: A pointer to xsk_buff_pool structure if there is a buffer pool + * attached and configured as zero-copy, NULL otherwise. + */ +static inline struct xsk_buff_pool *ice_get_xp_from_qid(struct ice_vsi *vsi, + u16 qid) +{ + struct xsk_buff_pool *pool = xsk_get_pool_from_qid(vsi->netdev, qid); + + if (!ice_is_xdp_ena_vsi(vsi)) + return NULL; + + return (pool && pool->dev) ? pool : NULL; +} + /** * ice_xsk_pool - get XSK buffer pool bound to a ring * @ring: Rx ring to use @@ -758,10 +776,7 @@ static inline struct xsk_buff_pool *ice_xsk_pool(struct ice_rx_ring *ring) struct ice_vsi *vsi = ring->vsi; u16 qid = ring->q_index; - if (!ice_is_xdp_ena_vsi(vsi) || !test_bit(qid, vsi->af_xdp_zc_qps)) - return NULL; - - return xsk_get_pool_from_qid(vsi->netdev, qid); + return ice_get_xp_from_qid(vsi, qid); } /** @@ -786,12 +801,7 @@ static inline void ice_tx_xsk_pool(struct ice_vsi *vsi, u16 qid) if (!ring) return; - if (!ice_is_xdp_ena_vsi(vsi) || !test_bit(qid, vsi->af_xdp_zc_qps)) { - ring->xsk_pool = NULL; - return; - } - - ring->xsk_pool = xsk_get_pool_from_qid(vsi->netdev, qid); + ring->xsk_pool = ice_get_xp_from_qid(vsi, qid); } /** diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index 5371e91f6bbb4b..c0a7ff6c7e8779 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -114,14 +114,8 @@ static int ice_vsi_alloc_arrays(struct ice_vsi *vsi) if (!vsi->q_vectors) goto err_vectors; - vsi->af_xdp_zc_qps = bitmap_zalloc(max_t(int, vsi->alloc_txq, vsi->alloc_rxq), GFP_KERNEL); - if (!vsi->af_xdp_zc_qps) - goto err_zc_qps; - return 0; -err_zc_qps: - devm_kfree(dev, vsi->q_vectors); err_vectors: devm_kfree(dev, vsi->rxq_map); err_rxq_map: @@ -309,8 +303,6 @@ static void ice_vsi_free_arrays(struct ice_vsi *vsi) dev = ice_pf_to_dev(pf); - bitmap_free(vsi->af_xdp_zc_qps); - vsi->af_xdp_zc_qps = NULL; /* free the ring and vector containers */ devm_kfree(dev, vsi->q_vectors); vsi->q_vectors = NULL; diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 7541f223bf4f69..a65955eb23c0bd 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -269,7 +269,6 @@ static int ice_xsk_pool_disable(struct ice_vsi *vsi, u16 qid) if (!pool) return -EINVAL; - clear_bit(qid, vsi->af_xdp_zc_qps); xsk_pool_dma_unmap(pool, ICE_RX_DMA_ATTR); return 0; @@ -300,8 +299,6 @@ ice_xsk_pool_enable(struct ice_vsi *vsi, struct xsk_buff_pool *pool, u16 qid) if (err) return err; - set_bit(qid, vsi->af_xdp_zc_qps); - return 0; } @@ -349,11 +346,13 @@ ice_realloc_rx_xdp_bufs(struct ice_rx_ring *rx_ring, bool pool_present) int ice_realloc_zc_buf(struct ice_vsi *vsi, bool zc) { struct ice_rx_ring *rx_ring; - unsigned long q; + uint i; + + ice_for_each_rxq(vsi, i) { + rx_ring = vsi->rx_rings[i]; + if (!rx_ring->xsk_pool) + continue; - for_each_set_bit(q, vsi->af_xdp_zc_qps, - max_t(int, vsi->alloc_txq, vsi->alloc_rxq)) { - rx_ring = vsi->rx_rings[q]; if (ice_realloc_rx_xdp_bufs(rx_ring, zc)) return -ENOMEM; } From 744d197162c2070a6045a71e2666ed93a57cc65d Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Mon, 3 Jun 2024 14:42:33 -0700 Subject: [PATCH 0665/1578] ice: add flag to distinguish reset from .ndo_bpf in XDP rings config Commit 6624e780a577 ("ice: split ice_vsi_setup into smaller functions") has placed ice_vsi_free_q_vectors() after ice_destroy_xdp_rings() in the rebuild process. The behaviour of the XDP rings config functions is context-dependent, so the change of order has led to ice_destroy_xdp_rings() doing additional work and removing XDP prog, when it was supposed to be preserved. Also, dependency on the PF state reset flags creates an additional, fortunately less common problem: * PFR is requested e.g. by tx_timeout handler * .ndo_bpf() is asked to delete the program, calls ice_destroy_xdp_rings(), but reset flag is set, so rings are destroyed without deleting the program * ice_vsi_rebuild tries to delete non-existent XDP rings, because the program is still on the VSI * system crashes With a similar race, when requested to attach a program, ice_prepare_xdp_rings() can actually skip setting the program in the VSI and nevertheless report success. Instead of reverting to the old order of function calls, add an enum argument to both ice_prepare_xdp_rings() and ice_destroy_xdp_rings() in order to distinguish between calls from rebuild and .ndo_bpf(). Fixes: efc2214b6047 ("ice: Add support for XDP") Reviewed-by: Igor Bagnucki Signed-off-by: Larysa Zaremba Reviewed-by: Simon Horman Tested-by: Chandan Kumar Rout Signed-off-by: Jacob Keller Link: https://lore.kernel.org/r/20240603-net-2024-05-30-intel-net-fixes-v2-4-e3563aa89b0c@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice.h | 11 +++++++++-- drivers/net/ethernet/intel/ice/ice_lib.c | 5 +++-- drivers/net/ethernet/intel/ice/ice_main.c | 22 ++++++++++++---------- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index 5e51cb84b60069..a5de6ef9c07ef6 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -930,9 +930,16 @@ int ice_down(struct ice_vsi *vsi); int ice_down_up(struct ice_vsi *vsi); int ice_vsi_cfg_lan(struct ice_vsi *vsi); struct ice_vsi *ice_lb_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi); + +enum ice_xdp_cfg { + ICE_XDP_CFG_FULL, /* Fully apply new config in .ndo_bpf() */ + ICE_XDP_CFG_PART, /* Save/use part of config in VSI rebuild */ +}; + int ice_vsi_determine_xdp_res(struct ice_vsi *vsi); -int ice_prepare_xdp_rings(struct ice_vsi *vsi, struct bpf_prog *prog); -int ice_destroy_xdp_rings(struct ice_vsi *vsi); +int ice_prepare_xdp_rings(struct ice_vsi *vsi, struct bpf_prog *prog, + enum ice_xdp_cfg cfg_type); +int ice_destroy_xdp_rings(struct ice_vsi *vsi, enum ice_xdp_cfg cfg_type); int ice_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags); diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index c0a7ff6c7e8779..dd8b374823eeca 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -2285,7 +2285,8 @@ static int ice_vsi_cfg_def(struct ice_vsi *vsi) ret = ice_vsi_determine_xdp_res(vsi); if (ret) goto unroll_vector_base; - ret = ice_prepare_xdp_rings(vsi, vsi->xdp_prog); + ret = ice_prepare_xdp_rings(vsi, vsi->xdp_prog, + ICE_XDP_CFG_PART); if (ret) goto unroll_vector_base; } @@ -2429,7 +2430,7 @@ void ice_vsi_decfg(struct ice_vsi *vsi) /* return value check can be skipped here, it always returns * 0 if reset is in progress */ - ice_destroy_xdp_rings(vsi); + ice_destroy_xdp_rings(vsi, ICE_XDP_CFG_PART); ice_vsi_clear_rings(vsi); ice_vsi_free_q_vectors(vsi); diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index f60c022f796096..2a270aacd24a9a 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -2711,10 +2711,12 @@ static void ice_vsi_assign_bpf_prog(struct ice_vsi *vsi, struct bpf_prog *prog) * ice_prepare_xdp_rings - Allocate, configure and setup Tx rings for XDP * @vsi: VSI to bring up Tx rings used by XDP * @prog: bpf program that will be assigned to VSI + * @cfg_type: create from scratch or restore the existing configuration * * Return 0 on success and negative value on error */ -int ice_prepare_xdp_rings(struct ice_vsi *vsi, struct bpf_prog *prog) +int ice_prepare_xdp_rings(struct ice_vsi *vsi, struct bpf_prog *prog, + enum ice_xdp_cfg cfg_type) { u16 max_txqs[ICE_MAX_TRAFFIC_CLASS] = { 0 }; int xdp_rings_rem = vsi->num_xdp_txq; @@ -2790,7 +2792,7 @@ int ice_prepare_xdp_rings(struct ice_vsi *vsi, struct bpf_prog *prog) * taken into account at the end of ice_vsi_rebuild, where * ice_cfg_vsi_lan is being called */ - if (ice_is_reset_in_progress(pf->state)) + if (cfg_type == ICE_XDP_CFG_PART) return 0; /* tell the Tx scheduler that right now we have @@ -2842,22 +2844,21 @@ int ice_prepare_xdp_rings(struct ice_vsi *vsi, struct bpf_prog *prog) /** * ice_destroy_xdp_rings - undo the configuration made by ice_prepare_xdp_rings * @vsi: VSI to remove XDP rings + * @cfg_type: disable XDP permanently or allow it to be restored later * * Detach XDP rings from irq vectors, clean up the PF bitmap and free * resources */ -int ice_destroy_xdp_rings(struct ice_vsi *vsi) +int ice_destroy_xdp_rings(struct ice_vsi *vsi, enum ice_xdp_cfg cfg_type) { u16 max_txqs[ICE_MAX_TRAFFIC_CLASS] = { 0 }; struct ice_pf *pf = vsi->back; int i, v_idx; /* q_vectors are freed in reset path so there's no point in detaching - * rings; in case of rebuild being triggered not from reset bits - * in pf->state won't be set, so additionally check first q_vector - * against NULL + * rings */ - if (ice_is_reset_in_progress(pf->state) || !vsi->q_vectors[0]) + if (cfg_type == ICE_XDP_CFG_PART) goto free_qmap; ice_for_each_q_vector(vsi, v_idx) { @@ -2898,7 +2899,7 @@ int ice_destroy_xdp_rings(struct ice_vsi *vsi) if (static_key_enabled(&ice_xdp_locking_key)) static_branch_dec(&ice_xdp_locking_key); - if (ice_is_reset_in_progress(pf->state) || !vsi->q_vectors[0]) + if (cfg_type == ICE_XDP_CFG_PART) return 0; ice_vsi_assign_bpf_prog(vsi, NULL); @@ -3009,7 +3010,8 @@ ice_xdp_setup_prog(struct ice_vsi *vsi, struct bpf_prog *prog, if (xdp_ring_err) { NL_SET_ERR_MSG_MOD(extack, "Not enough Tx resources for XDP"); } else { - xdp_ring_err = ice_prepare_xdp_rings(vsi, prog); + xdp_ring_err = ice_prepare_xdp_rings(vsi, prog, + ICE_XDP_CFG_FULL); if (xdp_ring_err) NL_SET_ERR_MSG_MOD(extack, "Setting up XDP Tx resources failed"); } @@ -3020,7 +3022,7 @@ ice_xdp_setup_prog(struct ice_vsi *vsi, struct bpf_prog *prog, NL_SET_ERR_MSG_MOD(extack, "Setting up XDP Rx resources failed"); } else if (ice_is_xdp_ena_vsi(vsi) && !prog) { xdp_features_clear_redirect_target(vsi->netdev); - xdp_ring_err = ice_destroy_xdp_rings(vsi); + xdp_ring_err = ice_destroy_xdp_rings(vsi, ICE_XDP_CFG_FULL); if (xdp_ring_err) NL_SET_ERR_MSG_MOD(extack, "Freeing XDP Tx resources failed"); /* reallocate Rx queues that were used for zero-copy */ From f3df4044254c98128890b512bf19cc05588f1fe5 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Mon, 3 Jun 2024 14:42:34 -0700 Subject: [PATCH 0666/1578] ice: map XDP queues to vectors in ice_vsi_map_rings_to_vectors() ice_pf_dcb_recfg() re-maps queues to vectors with ice_vsi_map_rings_to_vectors(), which does not restore the previous state for XDP queues. This leads to no AF_XDP traffic after rebuild. Map XDP queues to vectors in ice_vsi_map_rings_to_vectors(). Also, move the code around, so XDP queues are mapped independently only through .ndo_bpf(). Fixes: 6624e780a577 ("ice: split ice_vsi_setup into smaller functions") Reviewed-by: Przemek Kitszel Signed-off-by: Larysa Zaremba Reviewed-by: Simon Horman Tested-by: Chandan Kumar Rout Signed-off-by: Jacob Keller Link: https://lore.kernel.org/r/20240603-net-2024-05-30-intel-net-fixes-v2-5-e3563aa89b0c@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice.h | 1 + drivers/net/ethernet/intel/ice/ice_base.c | 3 + drivers/net/ethernet/intel/ice/ice_lib.c | 14 ++-- drivers/net/ethernet/intel/ice/ice_main.c | 96 ++++++++++++++--------- 4 files changed, 68 insertions(+), 46 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index a5de6ef9c07ef6..99a75a59078ef3 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -940,6 +940,7 @@ int ice_vsi_determine_xdp_res(struct ice_vsi *vsi); int ice_prepare_xdp_rings(struct ice_vsi *vsi, struct bpf_prog *prog, enum ice_xdp_cfg cfg_type); int ice_destroy_xdp_rings(struct ice_vsi *vsi, enum ice_xdp_cfg cfg_type); +void ice_map_xdp_rings(struct ice_vsi *vsi); int ice_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags); diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index 687f6cb2b917af..5d396c1a773148 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -842,6 +842,9 @@ void ice_vsi_map_rings_to_vectors(struct ice_vsi *vsi) } rx_rings_rem -= rx_rings_per_v; } + + if (ice_is_xdp_ena_vsi(vsi)) + ice_map_xdp_rings(vsi); } /** diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index dd8b374823eeca..7629b0190578b3 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -2274,13 +2274,6 @@ static int ice_vsi_cfg_def(struct ice_vsi *vsi) if (ret) goto unroll_vector_base; - ice_vsi_map_rings_to_vectors(vsi); - - /* Associate q_vector rings to napi */ - ice_vsi_set_napi_queues(vsi); - - vsi->stat_offsets_loaded = false; - if (ice_is_xdp_ena_vsi(vsi)) { ret = ice_vsi_determine_xdp_res(vsi); if (ret) @@ -2291,6 +2284,13 @@ static int ice_vsi_cfg_def(struct ice_vsi *vsi) goto unroll_vector_base; } + ice_vsi_map_rings_to_vectors(vsi); + + /* Associate q_vector rings to napi */ + ice_vsi_set_napi_queues(vsi); + + vsi->stat_offsets_loaded = false; + /* ICE_VSI_CTRL does not need RSS so skip RSS processing */ if (vsi->type != ICE_VSI_CTRL) /* Do not exit if configuring RSS had an issue, at diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 2a270aacd24a9a..1b61ca3a6eb6e1 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -2707,6 +2707,60 @@ static void ice_vsi_assign_bpf_prog(struct ice_vsi *vsi, struct bpf_prog *prog) bpf_prog_put(old_prog); } +static struct ice_tx_ring *ice_xdp_ring_from_qid(struct ice_vsi *vsi, int qid) +{ + struct ice_q_vector *q_vector; + struct ice_tx_ring *ring; + + if (static_key_enabled(&ice_xdp_locking_key)) + return vsi->xdp_rings[qid % vsi->num_xdp_txq]; + + q_vector = vsi->rx_rings[qid]->q_vector; + ice_for_each_tx_ring(ring, q_vector->tx) + if (ice_ring_is_xdp(ring)) + return ring; + + return NULL; +} + +/** + * ice_map_xdp_rings - Map XDP rings to interrupt vectors + * @vsi: the VSI with XDP rings being configured + * + * Map XDP rings to interrupt vectors and perform the configuration steps + * dependent on the mapping. + */ +void ice_map_xdp_rings(struct ice_vsi *vsi) +{ + int xdp_rings_rem = vsi->num_xdp_txq; + int v_idx, q_idx; + + /* follow the logic from ice_vsi_map_rings_to_vectors */ + ice_for_each_q_vector(vsi, v_idx) { + struct ice_q_vector *q_vector = vsi->q_vectors[v_idx]; + int xdp_rings_per_v, q_id, q_base; + + xdp_rings_per_v = DIV_ROUND_UP(xdp_rings_rem, + vsi->num_q_vectors - v_idx); + q_base = vsi->num_xdp_txq - xdp_rings_rem; + + for (q_id = q_base; q_id < (q_base + xdp_rings_per_v); q_id++) { + struct ice_tx_ring *xdp_ring = vsi->xdp_rings[q_id]; + + xdp_ring->q_vector = q_vector; + xdp_ring->next = q_vector->tx.tx_ring; + q_vector->tx.tx_ring = xdp_ring; + } + xdp_rings_rem -= xdp_rings_per_v; + } + + ice_for_each_rxq(vsi, q_idx) { + vsi->rx_rings[q_idx]->xdp_ring = ice_xdp_ring_from_qid(vsi, + q_idx); + ice_tx_xsk_pool(vsi, q_idx); + } +} + /** * ice_prepare_xdp_rings - Allocate, configure and setup Tx rings for XDP * @vsi: VSI to bring up Tx rings used by XDP @@ -2719,7 +2773,6 @@ int ice_prepare_xdp_rings(struct ice_vsi *vsi, struct bpf_prog *prog, enum ice_xdp_cfg cfg_type) { u16 max_txqs[ICE_MAX_TRAFFIC_CLASS] = { 0 }; - int xdp_rings_rem = vsi->num_xdp_txq; struct ice_pf *pf = vsi->back; struct ice_qs_cfg xdp_qs_cfg = { .qs_mutex = &pf->avail_q_mutex, @@ -2732,8 +2785,7 @@ int ice_prepare_xdp_rings(struct ice_vsi *vsi, struct bpf_prog *prog, .mapping_mode = ICE_VSI_MAP_CONTIG }; struct device *dev; - int i, v_idx; - int status; + int status, i; dev = ice_pf_to_dev(pf); vsi->xdp_rings = devm_kcalloc(dev, vsi->num_xdp_txq, @@ -2752,42 +2804,6 @@ int ice_prepare_xdp_rings(struct ice_vsi *vsi, struct bpf_prog *prog, if (ice_xdp_alloc_setup_rings(vsi)) goto clear_xdp_rings; - /* follow the logic from ice_vsi_map_rings_to_vectors */ - ice_for_each_q_vector(vsi, v_idx) { - struct ice_q_vector *q_vector = vsi->q_vectors[v_idx]; - int xdp_rings_per_v, q_id, q_base; - - xdp_rings_per_v = DIV_ROUND_UP(xdp_rings_rem, - vsi->num_q_vectors - v_idx); - q_base = vsi->num_xdp_txq - xdp_rings_rem; - - for (q_id = q_base; q_id < (q_base + xdp_rings_per_v); q_id++) { - struct ice_tx_ring *xdp_ring = vsi->xdp_rings[q_id]; - - xdp_ring->q_vector = q_vector; - xdp_ring->next = q_vector->tx.tx_ring; - q_vector->tx.tx_ring = xdp_ring; - } - xdp_rings_rem -= xdp_rings_per_v; - } - - ice_for_each_rxq(vsi, i) { - if (static_key_enabled(&ice_xdp_locking_key)) { - vsi->rx_rings[i]->xdp_ring = vsi->xdp_rings[i % vsi->num_xdp_txq]; - } else { - struct ice_q_vector *q_vector = vsi->rx_rings[i]->q_vector; - struct ice_tx_ring *ring; - - ice_for_each_tx_ring(ring, q_vector->tx) { - if (ice_ring_is_xdp(ring)) { - vsi->rx_rings[i]->xdp_ring = ring; - break; - } - } - } - ice_tx_xsk_pool(vsi, i); - } - /* omit the scheduler update if in reset path; XDP queues will be * taken into account at the end of ice_vsi_rebuild, where * ice_cfg_vsi_lan is being called @@ -2795,6 +2811,8 @@ int ice_prepare_xdp_rings(struct ice_vsi *vsi, struct bpf_prog *prog, if (cfg_type == ICE_XDP_CFG_PART) return 0; + ice_map_xdp_rings(vsi); + /* tell the Tx scheduler that right now we have * additional queues */ From 7d67d11fbe194f71298263f48e33ae2afa38197e Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Mon, 3 Jun 2024 14:42:35 -0700 Subject: [PATCH 0667/1578] igc: Fix Energy Efficient Ethernet support declaration The commit 01cf893bf0f4 ("net: intel: i40e/igc: Remove setting Autoneg in EEE capabilities") removed SUPPORTED_Autoneg field but left inappropriate ethtool_keee structure initialization. When "ethtool --show " (get_eee) invoke, the 'ethtool_keee' structure was accidentally overridden. Remove the 'ethtool_keee' overriding and add EEE declaration as per IEEE specification that allows reporting Energy Efficient Ethernet capabilities. Examples: Before fix: ethtool --show-eee enp174s0 EEE settings for enp174s0: EEE status: not supported After fix: EEE settings for enp174s0: EEE status: disabled Tx LPI: disabled Supported EEE link modes: 100baseT/Full 1000baseT/Full 2500baseT/Full Fixes: 01cf893bf0f4 ("net: intel: i40e/igc: Remove setting Autoneg in EEE capabilities") Suggested-by: Dima Ruinskiy Signed-off-by: Sasha Neftin Tested-by: Naama Meir Signed-off-by: Jacob Keller Link: https://lore.kernel.org/r/20240603-net-2024-05-30-intel-net-fixes-v2-6-e3563aa89b0c@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/igc/igc_ethtool.c | 9 +++++++-- drivers/net/ethernet/intel/igc/igc_main.c | 4 ++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c index f2c4f1966bb041..0cd2bd695db1df 100644 --- a/drivers/net/ethernet/intel/igc/igc_ethtool.c +++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c @@ -1629,12 +1629,17 @@ static int igc_ethtool_get_eee(struct net_device *netdev, struct igc_hw *hw = &adapter->hw; u32 eeer; + linkmode_set_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT, + edata->supported); + linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, + edata->supported); + linkmode_set_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, + edata->supported); + if (hw->dev_spec._base.eee_enable) mii_eee_cap1_mod_linkmode_t(edata->advertised, adapter->eee_advert); - *edata = adapter->eee; - eeer = rd32(IGC_EEER); /* EEE status on negotiated link */ diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 12f004f46082cd..305e05294a2659 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -12,6 +12,7 @@ #include #include #include +#include #include @@ -4975,6 +4976,9 @@ void igc_up(struct igc_adapter *adapter) /* start the watchdog. */ hw->mac.get_link_status = true; schedule_work(&adapter->watchdog_task); + + adapter->eee_advert = MDIO_EEE_100TX | MDIO_EEE_1000T | + MDIO_EEE_2_5GT; } /** From 5703fc058efdafcdd6b70776ee562478f0753acb Mon Sep 17 00:00:00 2001 From: Ian Forbes Date: Thu, 28 Mar 2024 14:07:16 -0500 Subject: [PATCH 0668/1578] drm/vmwgfx: Don't memcmp equivalent pointers These pointers are frequently the same and memcmp does not compare the pointers before comparing their contents so this was wasting cycles comparing 16 KiB of memory which will always be equal. Fixes: bb6780aa5a1d ("drm/vmwgfx: Diff cursors when using cmds") Signed-off-by: Ian Forbes Signed-off-by: Zack Rusin Link: https://patchwork.freedesktop.org/patch/msgid/20240328190716.27367-1-ian.forbes@broadcom.com --- drivers/gpu/drm/vmwgfx/vmwgfx_kms.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c index 9532258a0848c0..00c4ff68413012 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c @@ -224,7 +224,7 @@ static bool vmw_du_cursor_plane_has_changed(struct vmw_plane_state *old_vps, new_image = vmw_du_cursor_plane_acquire_image(new_vps); changed = false; - if (old_image && new_image) + if (old_image && new_image && old_image != new_image) changed = memcmp(old_image, new_image, size) != 0; return changed; From 831bcbcead6668ebf20b64fdb27518f1362ace3a Mon Sep 17 00:00:00 2001 From: Aditya Nagesh Date: Fri, 31 May 2024 03:48:41 -0700 Subject: [PATCH 0669/1578] Drivers: hv: Cosmetic changes for hv.c and balloon.c Fix issues reported by checkpatch.pl script in hv.c and balloon.c - Remove unnecessary parentheses - Remove extra newlines - Remove extra spaces - Add spaces between comparison operators - Remove comparison with NULL in if statements No functional changes intended Signed-off-by: Aditya Nagesh Reviewed-by: Saurabh Sengar Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/1717152521-6439-1-git-send-email-adityanagesh@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1717152521-6439-1-git-send-email-adityanagesh@linux.microsoft.com> --- drivers/hv/hv.c | 37 ++++++++-------- drivers/hv/hv_balloon.c | 98 +++++++++++++++-------------------------- 2 files changed, 53 insertions(+), 82 deletions(-) diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index a8ad728354cb09..e0d676c74f1478 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -45,8 +45,8 @@ int hv_init(void) * This involves a hypercall. */ int hv_post_message(union hv_connection_id connection_id, - enum hv_message_type message_type, - void *payload, size_t payload_size) + enum hv_message_type message_type, + void *payload, size_t payload_size) { struct hv_input_post_message *aligned_msg; unsigned long flags; @@ -86,7 +86,7 @@ int hv_post_message(union hv_connection_id connection_id, status = HV_STATUS_INVALID_PARAMETER; } else { status = hv_do_hypercall(HVCALL_POST_MESSAGE, - aligned_msg, NULL); + aligned_msg, NULL); } local_irq_restore(flags); @@ -111,7 +111,7 @@ int hv_synic_alloc(void) hv_context.hv_numa_map = kcalloc(nr_node_ids, sizeof(struct cpumask), GFP_KERNEL); - if (hv_context.hv_numa_map == NULL) { + if (!hv_context.hv_numa_map) { pr_err("Unable to allocate NUMA map\n"); goto err; } @@ -120,11 +120,11 @@ int hv_synic_alloc(void) hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); tasklet_init(&hv_cpu->msg_dpc, - vmbus_on_msg_dpc, (unsigned long) hv_cpu); + vmbus_on_msg_dpc, (unsigned long)hv_cpu); if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) { hv_cpu->post_msg_page = (void *)get_zeroed_page(GFP_ATOMIC); - if (hv_cpu->post_msg_page == NULL) { + if (!hv_cpu->post_msg_page) { pr_err("Unable to allocate post msg page\n"); goto err; } @@ -147,14 +147,14 @@ int hv_synic_alloc(void) if (!ms_hyperv.paravisor_present && !hv_root_partition) { hv_cpu->synic_message_page = (void *)get_zeroed_page(GFP_ATOMIC); - if (hv_cpu->synic_message_page == NULL) { + if (!hv_cpu->synic_message_page) { pr_err("Unable to allocate SYNIC message page\n"); goto err; } hv_cpu->synic_event_page = (void *)get_zeroed_page(GFP_ATOMIC); - if (hv_cpu->synic_event_page == NULL) { + if (!hv_cpu->synic_event_page) { pr_err("Unable to allocate SYNIC event page\n"); free_page((unsigned long)hv_cpu->synic_message_page); @@ -203,14 +203,13 @@ int hv_synic_alloc(void) return ret; } - void hv_synic_free(void) { int cpu, ret; for_each_present_cpu(cpu) { - struct hv_per_cpu_context *hv_cpu - = per_cpu_ptr(hv_context.cpu_context, cpu); + struct hv_per_cpu_context *hv_cpu = + per_cpu_ptr(hv_context.cpu_context, cpu); /* It's better to leak the page if the encryption fails. */ if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) { @@ -262,8 +261,8 @@ void hv_synic_free(void) */ void hv_synic_enable_regs(unsigned int cpu) { - struct hv_per_cpu_context *hv_cpu - = per_cpu_ptr(hv_context.cpu_context, cpu); + struct hv_per_cpu_context *hv_cpu = + per_cpu_ptr(hv_context.cpu_context, cpu); union hv_synic_simp simp; union hv_synic_siefp siefp; union hv_synic_sint shared_sint; @@ -277,8 +276,8 @@ void hv_synic_enable_regs(unsigned int cpu) /* Mask out vTOM bit. ioremap_cache() maps decrypted */ u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) & ~ms_hyperv.shared_gpa_boundary; - hv_cpu->synic_message_page - = (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); + hv_cpu->synic_message_page = + (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); if (!hv_cpu->synic_message_page) pr_err("Fail to map synic message page.\n"); } else { @@ -296,8 +295,8 @@ void hv_synic_enable_regs(unsigned int cpu) /* Mask out vTOM bit. ioremap_cache() maps decrypted */ u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) & ~ms_hyperv.shared_gpa_boundary; - hv_cpu->synic_event_page - = (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); + hv_cpu->synic_event_page = + (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); if (!hv_cpu->synic_event_page) pr_err("Fail to map synic event page.\n"); } else { @@ -348,8 +347,8 @@ int hv_synic_init(unsigned int cpu) */ void hv_synic_disable_regs(unsigned int cpu) { - struct hv_per_cpu_context *hv_cpu - = per_cpu_ptr(hv_context.cpu_context, cpu); + struct hv_per_cpu_context *hv_cpu = + per_cpu_ptr(hv_context.cpu_context, cpu); union hv_synic_sint shared_sint; union hv_synic_simp simp; union hv_synic_siefp siefp; diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 4370ad31b5b381..0e7427c2baf588 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -42,8 +42,6 @@ * Begin protocol definitions. */ - - /* * Protocol versions. The low word is the minor version, the high word the major * version. @@ -72,8 +70,6 @@ enum { DYNMEM_PROTOCOL_VERSION_CURRENT = DYNMEM_PROTOCOL_VERSION_WIN10 }; - - /* * Message Types */ @@ -102,7 +98,6 @@ enum dm_message_type { DM_VERSION_1_MAX = 12 }; - /* * Structures defining the dynamic memory management * protocol. @@ -116,7 +111,6 @@ union dm_version { __u32 version; } __packed; - union dm_caps { struct { __u64 balloon:1; @@ -149,8 +143,6 @@ union dm_mem_page_range { __u64 page_range; } __packed; - - /* * The header for all dynamic memory messages: * @@ -175,7 +167,6 @@ struct dm_message { __u8 data[]; /* enclosed message */ } __packed; - /* * Specific message types supporting the dynamic memory protocol. */ @@ -272,7 +263,6 @@ struct dm_status { __u32 io_diff; } __packed; - /* * Message to ask the guest to allocate memory - balloon up message. * This message is sent from the host to the guest. The guest may not be @@ -287,14 +277,13 @@ struct dm_balloon { __u32 reservedz; } __packed; - /* * Balloon response message; this message is sent from the guest * to the host in response to the balloon message. * * reservedz: Reserved; must be set to zero. * more_pages: If FALSE, this is the last message of the transaction. - * if TRUE there will atleast one more message from the guest. + * if TRUE there will be at least one more message from the guest. * * range_count: The number of ranges in the range array. * @@ -315,7 +304,7 @@ struct dm_balloon_response { * to the guest to give guest more memory. * * more_pages: If FALSE, this is the last message of the transaction. - * if TRUE there will atleast one more message from the guest. + * if TRUE there will be at least one more message from the guest. * * reservedz: Reserved; must be set to zero. * @@ -343,7 +332,6 @@ struct dm_unballoon_response { struct dm_header hdr; } __packed; - /* * Hot add request message. Message sent from the host to the guest. * @@ -391,7 +379,6 @@ enum dm_info_type { MAX_INFO_TYPE }; - /* * Header for the information message. */ @@ -481,10 +468,10 @@ static unsigned long last_post_time; static int hv_hypercall_multi_failure; -module_param(hot_add, bool, (S_IRUGO | S_IWUSR)); +module_param(hot_add, bool, 0644); MODULE_PARM_DESC(hot_add, "If set attempt memory hot_add"); -module_param(pressure_report_delay, uint, (S_IRUGO | S_IWUSR)); +module_param(pressure_report_delay, uint, 0644); MODULE_PARM_DESC(pressure_report_delay, "Delay in secs in reporting pressure"); static atomic_t trans_id = ATOMIC_INIT(0); @@ -503,7 +490,6 @@ enum hv_dm_state { DM_INIT_ERROR }; - static __u8 recv_buffer[HV_HYP_PAGE_SIZE]; static __u8 balloon_up_send_buffer[HV_HYP_PAGE_SIZE]; @@ -599,12 +585,12 @@ static inline bool has_pfn_is_backed(struct hv_hotadd_state *has, struct hv_hotadd_gap *gap; /* The page is not backed. */ - if ((pfn < has->covered_start_pfn) || (pfn >= has->covered_end_pfn)) + if (pfn < has->covered_start_pfn || pfn >= has->covered_end_pfn) return false; /* Check for gaps. */ list_for_each_entry(gap, &has->gap_list, list) { - if ((pfn >= gap->start_pfn) && (pfn < gap->end_pfn)) + if (pfn >= gap->start_pfn && pfn < gap->end_pfn) return false; } @@ -784,8 +770,8 @@ static void hv_online_page(struct page *pg, unsigned int order) guard(spinlock_irqsave)(&dm_device.ha_lock); list_for_each_entry(has, &dm_device.ha_region_list, list) { /* The page belongs to a different HAS. */ - if ((pfn < has->start_pfn) || - (pfn + (1UL << order) > has->end_pfn)) + if (pfn < has->start_pfn || + (pfn + (1UL << order) > has->end_pfn)) continue; hv_bring_pgs_online(has, pfn, 1UL << order); @@ -846,7 +832,7 @@ static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) } static unsigned long handle_pg_range(unsigned long pg_start, - unsigned long pg_count) + unsigned long pg_count) { unsigned long start_pfn = pg_start; unsigned long pfn_cnt = pg_count; @@ -857,7 +843,7 @@ static unsigned long handle_pg_range(unsigned long pg_start, unsigned long res = 0, flags; pr_debug("Hot adding %lu pages starting at pfn 0x%lx.\n", pg_count, - pg_start); + pg_start); spin_lock_irqsave(&dm_device.ha_lock, flags); list_for_each_entry(has, &dm_device.ha_region_list, list) { @@ -893,10 +879,9 @@ static unsigned long handle_pg_range(unsigned long pg_start, if (start_pfn > has->start_pfn && online_section_nr(pfn_to_section_nr(start_pfn))) hv_bring_pgs_online(has, start_pfn, pgs_ol); - } - if ((has->ha_end_pfn < has->end_pfn) && (pfn_cnt > 0)) { + if (has->ha_end_pfn < has->end_pfn && pfn_cnt > 0) { /* * We have some residual hot add range * that needs to be hot added; hot add @@ -999,7 +984,7 @@ static void hot_add_req(struct work_struct *dummy) rg_start = dm->ha_wrk.ha_region_range.finfo.start_page; rg_sz = dm->ha_wrk.ha_region_range.finfo.page_cnt; - if ((rg_start == 0) && (!dm->host_specified_ha_region)) { + if (rg_start == 0 && !dm->host_specified_ha_region) { /* * The host has not specified the hot-add region. * Based on the hot-add page range being specified, @@ -1013,7 +998,7 @@ static void hot_add_req(struct work_struct *dummy) if (do_hot_add) resp.page_count = process_hot_add(pg_start, pfn_cnt, - rg_start, rg_sz); + rg_start, rg_sz); dm->num_pages_added += resp.page_count; #endif @@ -1191,11 +1176,10 @@ static void post_status(struct hv_dynmem_device *dm) sizeof(struct dm_status), (unsigned long)NULL, VM_PKT_DATA_INBAND, 0); - } static void free_balloon_pages(struct hv_dynmem_device *dm, - union dm_mem_page_range *range_array) + union dm_mem_page_range *range_array) { int num_pages = range_array->finfo.page_cnt; __u64 start_frame = range_array->finfo.start_page; @@ -1211,8 +1195,6 @@ static void free_balloon_pages(struct hv_dynmem_device *dm, } } - - static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm, unsigned int num_pages, struct dm_balloon_response *bl_resp, @@ -1258,7 +1240,6 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm, page_to_pfn(pg); bl_resp->range_array[i].finfo.page_cnt = alloc_unit; bl_resp->hdr.size += sizeof(union dm_mem_page_range); - } return i * alloc_unit; @@ -1312,7 +1293,7 @@ static void balloon_up(struct work_struct *dummy) if (num_ballooned == 0 || num_ballooned == num_pages) { pr_debug("Ballooned %u out of %u requested pages.\n", - num_pages, dm_device.balloon_wrk.num_pages); + num_pages, dm_device.balloon_wrk.num_pages); bl_resp->more_pages = 0; done = true; @@ -1346,16 +1327,15 @@ static void balloon_up(struct work_struct *dummy) for (i = 0; i < bl_resp->range_count; i++) free_balloon_pages(&dm_device, - &bl_resp->range_array[i]); + &bl_resp->range_array[i]); done = true; } } - } static void balloon_down(struct hv_dynmem_device *dm, - struct dm_unballoon_request *req) + struct dm_unballoon_request *req) { union dm_mem_page_range *range_array = req->range_array; int range_count = req->range_count; @@ -1369,7 +1349,7 @@ static void balloon_down(struct hv_dynmem_device *dm, } pr_debug("Freed %u ballooned pages.\n", - prev_pages_ballooned - dm->num_pages_ballooned); + prev_pages_ballooned - dm->num_pages_ballooned); if (req->more_pages == 1) return; @@ -1394,8 +1374,7 @@ static int dm_thread_func(void *dm_dev) struct hv_dynmem_device *dm = dm_dev; while (!kthread_should_stop()) { - wait_for_completion_interruptible_timeout( - &dm_device.config_event, 1*HZ); + wait_for_completion_interruptible_timeout(&dm_device.config_event, 1 * HZ); /* * The host expects us to post information on the memory * pressure every second. @@ -1419,9 +1398,8 @@ static int dm_thread_func(void *dm_dev) return 0; } - static void version_resp(struct hv_dynmem_device *dm, - struct dm_version_response *vresp) + struct dm_version_response *vresp) { struct dm_version_request version_req; int ret; @@ -1482,7 +1460,7 @@ static void version_resp(struct hv_dynmem_device *dm, } static void cap_resp(struct hv_dynmem_device *dm, - struct dm_capabilities_resp_msg *cap_resp) + struct dm_capabilities_resp_msg *cap_resp) { if (!cap_resp->is_accepted) { pr_err("Capabilities not accepted by host\n"); @@ -1515,7 +1493,7 @@ static void balloon_onchannelcallback(void *context) switch (dm_hdr->type) { case DM_VERSION_RESPONSE: version_resp(dm, - (struct dm_version_response *)dm_msg); + (struct dm_version_response *)dm_msg); break; case DM_CAPABILITIES_RESPONSE: @@ -1545,7 +1523,7 @@ static void balloon_onchannelcallback(void *context) dm->state = DM_BALLOON_DOWN; balloon_down(dm, - (struct dm_unballoon_request *)recv_buffer); + (struct dm_unballoon_request *)recv_buffer); break; case DM_MEM_HOT_ADD_REQUEST: @@ -1583,17 +1561,15 @@ static void balloon_onchannelcallback(void *context) default: pr_warn_ratelimited("Unhandled message: type: %d\n", dm_hdr->type); - } } - } #define HV_LARGE_REPORTING_ORDER 9 #define HV_LARGE_REPORTING_LEN (HV_HYP_PAGE_SIZE << \ HV_LARGE_REPORTING_ORDER) static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info, - struct scatterlist *sgl, unsigned int nents) + struct scatterlist *sgl, unsigned int nents) { unsigned long flags; struct hv_memory_hint *hint; @@ -1628,7 +1604,7 @@ static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info, */ /* page reporting for pages 2MB or higher */ - if (order >= HV_LARGE_REPORTING_ORDER ) { + if (order >= HV_LARGE_REPORTING_ORDER) { range->page.largepage = 1; range->page_size = HV_GPA_PAGE_RANGE_PAGE_SIZE_2MB; range->base_large_pfn = page_to_hvpfn( @@ -1642,23 +1618,21 @@ static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info, range->page.additional_pages = (sg->length / HV_HYP_PAGE_SIZE) - 1; } - } status = hv_do_rep_hypercall(HV_EXT_CALL_MEMORY_HEAT_HINT, nents, 0, hint, NULL); local_irq_restore(flags); if (!hv_result_success(status)) { - pr_err("Cold memory discard hypercall failed with status %llx\n", - status); + status); if (hv_hypercall_multi_failure > 0) hv_hypercall_multi_failure++; if (hv_result(status) == HV_STATUS_INVALID_PARAMETER) { pr_err("Underlying Hyper-V does not support order less than 9. Hypercall failed\n"); pr_err("Defaulting to page_reporting_order %d\n", - pageblock_order); + pageblock_order); page_reporting_order = pageblock_order; hv_hypercall_multi_failure++; return -EINVAL; @@ -1692,7 +1666,7 @@ static void enable_page_reporting(void) pr_err("Failed to enable cold memory discard: %d\n", ret); } else { pr_info("Cold memory discard hint enabled with order %d\n", - page_reporting_order); + page_reporting_order); } } @@ -1775,7 +1749,7 @@ static int balloon_connect_vsp(struct hv_device *dev) if (ret) goto out; - t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ); + t = wait_for_completion_timeout(&dm_device.host_event, 5 * HZ); if (t == 0) { ret = -ETIMEDOUT; goto out; @@ -1833,7 +1807,7 @@ static int balloon_connect_vsp(struct hv_device *dev) if (ret) goto out; - t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ); + t = wait_for_completion_timeout(&dm_device.host_event, 5 * HZ); if (t == 0) { ret = -ETIMEDOUT; goto out; @@ -1874,8 +1848,8 @@ static int hv_balloon_debug_show(struct seq_file *f, void *offset) char *sname; seq_printf(f, "%-22s: %u.%u\n", "host_version", - DYNMEM_MAJOR_VERSION(dm->version), - DYNMEM_MINOR_VERSION(dm->version)); + DYNMEM_MAJOR_VERSION(dm->version), + DYNMEM_MINOR_VERSION(dm->version)); seq_printf(f, "%-22s:", "capabilities"); if (ballooning_enabled()) @@ -1924,10 +1898,10 @@ static int hv_balloon_debug_show(struct seq_file *f, void *offset) seq_printf(f, "%-22s: %u\n", "pages_ballooned", dm->num_pages_ballooned); seq_printf(f, "%-22s: %lu\n", "total_pages_committed", - get_pages_committed(dm)); + get_pages_committed(dm)); seq_printf(f, "%-22s: %llu\n", "max_dynamic_page_count", - dm->max_dynamic_page_count); + dm->max_dynamic_page_count); return 0; } @@ -1937,7 +1911,7 @@ DEFINE_SHOW_ATTRIBUTE(hv_balloon_debug); static void hv_balloon_debugfs_init(struct hv_dynmem_device *b) { debugfs_create_file("hv-balloon", 0444, NULL, b, - &hv_balloon_debug_fops); + &hv_balloon_debug_fops); } static void hv_balloon_debugfs_exit(struct hv_dynmem_device *b) @@ -2095,7 +2069,6 @@ static int balloon_suspend(struct hv_device *hv_dev) tasklet_enable(&hv_dev->channel->callback_event); return 0; - } static int balloon_resume(struct hv_device *dev) @@ -2154,7 +2127,6 @@ static struct hv_driver balloon_drv = { static int __init init_balloon_drv(void) { - return vmbus_driver_register(&balloon_drv); } From 0d92e4a7ffd5c42b9fa864692f82476c0bf8bcc8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 5 Jun 2024 18:56:37 +0100 Subject: [PATCH 0670/1578] KVM: arm64: Disassociate vcpus from redistributor region on teardown When tearing down a redistributor region, make sure we don't have any dangling pointer to that region stored in a vcpu. Fixes: e5a35635464b ("kvm: arm64: vgic-v3: Introduce vgic_v3_free_redist_region()") Reported-by: Alexander Potapenko Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20240605175637.1635653-1-maz@kernel.org Cc: stable@vger.kernel.org --- arch/arm64/kvm/vgic/vgic-init.c | 2 +- arch/arm64/kvm/vgic/vgic-mmio-v3.c | 15 +++++++++++++-- arch/arm64/kvm/vgic/vgic.h | 2 +- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 8f5b7a3e7009e3..7f68cf58b978fb 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -391,7 +391,7 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm) if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list) - vgic_v3_free_redist_region(rdreg); + vgic_v3_free_redist_region(kvm, rdreg); INIT_LIST_HEAD(&dist->rd_regions); } else { dist->vgic_cpu_base = VGIC_ADDR_UNDEF; diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c index a3983a631b5ad1..9e50928f5d7dfd 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c @@ -919,8 +919,19 @@ static int vgic_v3_alloc_redist_region(struct kvm *kvm, uint32_t index, return ret; } -void vgic_v3_free_redist_region(struct vgic_redist_region *rdreg) +void vgic_v3_free_redist_region(struct kvm *kvm, struct vgic_redist_region *rdreg) { + struct kvm_vcpu *vcpu; + unsigned long c; + + lockdep_assert_held(&kvm->arch.config_lock); + + /* Garbage collect the region */ + kvm_for_each_vcpu(c, vcpu, kvm) { + if (vcpu->arch.vgic_cpu.rdreg == rdreg) + vcpu->arch.vgic_cpu.rdreg = NULL; + } + list_del(&rdreg->list); kfree(rdreg); } @@ -945,7 +956,7 @@ int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count) mutex_lock(&kvm->arch.config_lock); rdreg = vgic_v3_rdist_region_from_index(kvm, index); - vgic_v3_free_redist_region(rdreg); + vgic_v3_free_redist_region(kvm, rdreg); mutex_unlock(&kvm->arch.config_lock); return ret; } diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 6106ebd5ba4299..03d356a123771f 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -316,7 +316,7 @@ vgic_v3_rd_region_size(struct kvm *kvm, struct vgic_redist_region *rdreg) struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm, u32 index); -void vgic_v3_free_redist_region(struct vgic_redist_region *rdreg); +void vgic_v3_free_redist_region(struct kvm *kvm, struct vgic_redist_region *rdreg); bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size); From b0c9a26435413b81799047a7be53255640432547 Mon Sep 17 00:00:00 2001 From: Aleksandr Mishin Date: Tue, 4 Jun 2024 11:25:00 +0300 Subject: [PATCH 0671/1578] net: wwan: iosm: Fix tainted pointer delete is case of region creation fail In case of region creation fail in ipc_devlink_create_region(), previously created regions delete process starts from tainted pointer which actually holds error code value. Fix this bug by decreasing region index before delete. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 4dcd183fbd67 ("net: wwan: iosm: devlink registration") Signed-off-by: Aleksandr Mishin Acked-by: Sergey Ryazanov Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240604082500.20769-1-amishin@t-argos.ru Signed-off-by: Paolo Abeni --- drivers/net/wwan/iosm/iosm_ipc_devlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wwan/iosm/iosm_ipc_devlink.c b/drivers/net/wwan/iosm/iosm_ipc_devlink.c index bef6819986e939..33d6342124bc33 100644 --- a/drivers/net/wwan/iosm/iosm_ipc_devlink.c +++ b/drivers/net/wwan/iosm/iosm_ipc_devlink.c @@ -211,7 +211,7 @@ static int ipc_devlink_create_region(struct iosm_devlink *devlink) rc = PTR_ERR(devlink->cd_regions[i]); dev_err(devlink->dev, "Devlink region fail,err %d", rc); /* Delete previously created regions */ - for ( ; i >= 0; i--) + for (i--; i >= 0; i--) devlink_region_destroy(devlink->cd_regions[i]); goto region_create_fail; } From 0fc670d07d5de36a54f061f457743c9cde1d8b46 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Mon, 3 Jun 2024 14:20:46 +0200 Subject: [PATCH 0672/1578] KVM: selftests: Fix RISC-V compilation Due to commit 2b7deea3ec7c ("Revert "kvm: selftests: move base kvm_util.h declarations to kvm_util_base.h"") kvm selftests now requires explicitly including ucall_common.h when needed. The commit added the directives everywhere they were needed at the time, but, by merge time, new places had been merged for RISC-V. Add those now to fix RISC-V's compilation. Fixes: dee7ea42a1eb ("Merge tag 'kvm-x86-selftests_utils-6.10' of https://github.com/kvm-x86/linux into HEAD") Signed-off-by: Andrew Jones Link: https://lore.kernel.org/r/20240603122045.323064-2-ajones@ventanamicro.com Signed-off-by: Anup Patel --- tools/testing/selftests/kvm/lib/riscv/ucall.c | 1 + tools/testing/selftests/kvm/riscv/ebreak_test.c | 1 + tools/testing/selftests/kvm/riscv/sbi_pmu_test.c | 1 + 3 files changed, 3 insertions(+) diff --git a/tools/testing/selftests/kvm/lib/riscv/ucall.c b/tools/testing/selftests/kvm/lib/riscv/ucall.c index 14ee17151a590b..b5035c63d5163b 100644 --- a/tools/testing/selftests/kvm/lib/riscv/ucall.c +++ b/tools/testing/selftests/kvm/lib/riscv/ucall.c @@ -9,6 +9,7 @@ #include "kvm_util.h" #include "processor.h" +#include "sbi.h" void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu) { diff --git a/tools/testing/selftests/kvm/riscv/ebreak_test.c b/tools/testing/selftests/kvm/riscv/ebreak_test.c index 823c132069b46a..0e071285495384 100644 --- a/tools/testing/selftests/kvm/riscv/ebreak_test.c +++ b/tools/testing/selftests/kvm/riscv/ebreak_test.c @@ -6,6 +6,7 @@ * */ #include "kvm_util.h" +#include "ucall_common.h" #define LABEL_ADDRESS(v) ((uint64_t)&(v)) diff --git a/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c b/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c index 69bb94e6b22762..f299cbfd23ca09 100644 --- a/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c +++ b/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c @@ -15,6 +15,7 @@ #include "processor.h" #include "sbi.h" #include "arch_timer.h" +#include "ucall_common.h" /* Maximum counters(firmware + hardware) */ #define RISCV_MAX_PMU_COUNTERS 64 From 26bfb8b57063f52b867f9b6c8d1742fcb5bd656c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 4 Jun 2024 09:52:27 -0700 Subject: [PATCH 0673/1578] af_unix: Set sk->sk_state under unix_state_lock() for truly disconencted peer. When a SOCK_DGRAM socket connect()s to another socket, the both sockets' sk->sk_state are changed to TCP_ESTABLISHED so that we can register them to BPF SOCKMAP. When the socket disconnects from the peer by connect(AF_UNSPEC), the state is set back to TCP_CLOSE. Then, the peer's state is also set to TCP_CLOSE, but the update is done locklessly and unconditionally. Let's say socket A connect()ed to B, B connect()ed to C, and A disconnects from B. After the first two connect()s, all three sockets' sk->sk_state are TCP_ESTABLISHED: $ ss -xa Netid State Recv-Q Send-Q Local Address:Port Peer Address:PortProcess u_dgr ESTAB 0 0 @A 641 * 642 u_dgr ESTAB 0 0 @B 642 * 643 u_dgr ESTAB 0 0 @C 643 * 0 And after the disconnect, B's state is TCP_CLOSE even though it's still connected to C and C's state is TCP_ESTABLISHED. $ ss -xa Netid State Recv-Q Send-Q Local Address:Port Peer Address:PortProcess u_dgr UNCONN 0 0 @A 641 * 0 u_dgr UNCONN 0 0 @B 642 * 643 u_dgr ESTAB 0 0 @C 643 * 0 In this case, we cannot register B to SOCKMAP. So, when a socket disconnects from the peer, we should not set TCP_CLOSE to the peer if the peer is connected to yet another socket, and this must be done under unix_state_lock(). Note that we use WRITE_ONCE() for sk->sk_state as there are many lockless readers. These data-races will be fixed in the following patches. Fixes: 83301b5367a9 ("af_unix: Set TCP_ESTABLISHED for datagram sockets too") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 25b49efc0926b0..b162164b7a4205 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -570,7 +570,6 @@ static void unix_dgram_disconnected(struct sock *sk, struct sock *other) sk_error_report(other); } } - other->sk_state = TCP_CLOSE; } static void unix_sock_destructor(struct sock *sk) @@ -1424,8 +1423,15 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, unix_state_double_unlock(sk, other); - if (other != old_peer) + if (other != old_peer) { unix_dgram_disconnected(sk, old_peer); + + unix_state_lock(old_peer); + if (!unix_peer(old_peer)) + WRITE_ONCE(old_peer->sk_state, TCP_CLOSE); + unix_state_unlock(old_peer); + } + sock_put(old_peer); } else { unix_peer(sk) = other; From 942238f9735a4a4ebf8274b218d9a910158941d1 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 4 Jun 2024 09:52:28 -0700 Subject: [PATCH 0674/1578] af_unix: Annodate data-races around sk->sk_state for writers. sk->sk_state is changed under unix_state_lock(), but it's read locklessly in many places. This patch adds WRITE_ONCE() on the writer side. We will add READ_ONCE() to the lockless readers in the following patches. Fixes: 83301b5367a9 ("af_unix: Set TCP_ESTABLISHED for datagram sockets too") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index b162164b7a4205..424d021a4d7d76 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -616,7 +616,7 @@ static void unix_release_sock(struct sock *sk, int embrion) u->path.dentry = NULL; u->path.mnt = NULL; state = sk->sk_state; - sk->sk_state = TCP_CLOSE; + WRITE_ONCE(sk->sk_state, TCP_CLOSE); skpair = unix_peer(sk); unix_peer(sk) = NULL; @@ -738,7 +738,8 @@ static int unix_listen(struct socket *sock, int backlog) if (backlog > sk->sk_max_ack_backlog) wake_up_interruptible_all(&u->peer_wait); sk->sk_max_ack_backlog = backlog; - sk->sk_state = TCP_LISTEN; + WRITE_ONCE(sk->sk_state, TCP_LISTEN); + /* set credentials so connect can copy them */ init_peercred(sk); err = 0; @@ -1401,7 +1402,8 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, if (err) goto out_unlock; - sk->sk_state = other->sk_state = TCP_ESTABLISHED; + WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED); + WRITE_ONCE(other->sk_state, TCP_ESTABLISHED); } else { /* * 1003.1g breaking connected state with AF_UNSPEC @@ -1418,7 +1420,7 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, unix_peer(sk) = other; if (!other) - sk->sk_state = TCP_CLOSE; + WRITE_ONCE(sk->sk_state, TCP_CLOSE); unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer); unix_state_double_unlock(sk, other); @@ -1639,7 +1641,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, copy_peercred(sk, other); sock->state = SS_CONNECTED; - sk->sk_state = TCP_ESTABLISHED; + WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED); sock_hold(newsk); smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */ @@ -2050,7 +2052,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, unix_peer(sk) = NULL; unix_dgram_peer_wake_disconnect_wakeup(sk, other); - sk->sk_state = TCP_CLOSE; + WRITE_ONCE(sk->sk_state, TCP_CLOSE); unix_state_unlock(sk); unix_dgram_disconnected(sk, other); From 3a0f38eb285c8c2eead4b3230c7ac2983707599d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 4 Jun 2024 09:52:29 -0700 Subject: [PATCH 0675/1578] af_unix: Annotate data-race of sk->sk_state in unix_inq_len(). ioctl(SIOCINQ) calls unix_inq_len() that checks sk->sk_state first and returns -EINVAL if it's TCP_LISTEN. Then, for SOCK_STREAM sockets, unix_inq_len() returns the number of bytes in recvq. However, unix_inq_len() does not hold unix_state_lock(), and the concurrent listen() might change the state after checking sk->sk_state. If the race occurs, 0 is returned for the listener, instead of -EINVAL, because the length of skb with embryo is 0. We could hold unix_state_lock() in unix_inq_len(), but it's overkill given the result is true for pre-listen() TCP_CLOSE state. So, let's use READ_ONCE() for sk->sk_state in unix_inq_len(). Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 424d021a4d7d76..b37b53767b29b6 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -3017,7 +3017,7 @@ long unix_inq_len(struct sock *sk) struct sk_buff *skb; long amount = 0; - if (sk->sk_state == TCP_LISTEN) + if (READ_ONCE(sk->sk_state) == TCP_LISTEN) return -EINVAL; spin_lock(&sk->sk_receive_queue.lock); From eb0718fb3e97ad0d6f4529b810103451c90adf94 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 4 Jun 2024 09:52:30 -0700 Subject: [PATCH 0676/1578] af_unix: Annotate data-races around sk->sk_state in unix_write_space() and poll(). unix_poll() and unix_dgram_poll() read sk->sk_state locklessly and calls unix_writable() which also reads sk->sk_state without holding unix_state_lock(). Let's use READ_ONCE() in unix_poll() and unix_dgram_poll() and pass it to unix_writable(). While at it, we remove TCP_SYN_SENT check in unix_dgram_poll() as that state does not exist for AF_UNIX socket since the code was added. Fixes: 1586a5877db9 ("af_unix: do not report POLLOUT on listeners") Fixes: 3c73419c09a5 ("af_unix: fix 'poll for write'/ connected DGRAM sockets") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index b37b53767b29b6..3bacf47cb94deb 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -530,9 +530,9 @@ static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other) return 0; } -static int unix_writable(const struct sock *sk) +static int unix_writable(const struct sock *sk, unsigned char state) { - return sk->sk_state != TCP_LISTEN && + return state != TCP_LISTEN && (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf; } @@ -541,7 +541,7 @@ static void unix_write_space(struct sock *sk) struct socket_wq *wq; rcu_read_lock(); - if (unix_writable(sk)) { + if (unix_writable(sk, READ_ONCE(sk->sk_state))) { wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, @@ -3129,12 +3129,14 @@ static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned lon static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; + unsigned char state; __poll_t mask; u8 shutdown; sock_poll_wait(file, sock, wait); mask = 0; shutdown = READ_ONCE(sk->sk_shutdown); + state = READ_ONCE(sk->sk_state); /* exceptional events? */ if (READ_ONCE(sk->sk_err)) @@ -3156,14 +3158,14 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa /* Connection-based need to check for termination and startup */ if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && - sk->sk_state == TCP_CLOSE) + state == TCP_CLOSE) mask |= EPOLLHUP; /* * we set writable also when the other side has shut down the * connection. This prevents stuck sockets. */ - if (unix_writable(sk)) + if (unix_writable(sk, state)) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; return mask; @@ -3174,12 +3176,14 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, { struct sock *sk = sock->sk, *other; unsigned int writable; + unsigned char state; __poll_t mask; u8 shutdown; sock_poll_wait(file, sock, wait); mask = 0; shutdown = READ_ONCE(sk->sk_shutdown); + state = READ_ONCE(sk->sk_state); /* exceptional events? */ if (READ_ONCE(sk->sk_err) || @@ -3199,19 +3203,14 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ - if (sk->sk_type == SOCK_SEQPACKET) { - if (sk->sk_state == TCP_CLOSE) - mask |= EPOLLHUP; - /* connection hasn't started yet? */ - if (sk->sk_state == TCP_SYN_SENT) - return mask; - } + if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE) + mask |= EPOLLHUP; /* No write status requested, avoid expensive OUT tests. */ if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) return mask; - writable = unix_writable(sk); + writable = unix_writable(sk, state); if (writable) { unix_state_lock(sk); From a9bf9c7dc6a5899c01cb8f6e773a66315a5cd4b7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 4 Jun 2024 09:52:31 -0700 Subject: [PATCH 0677/1578] af_unix: Annotate data-race of sk->sk_state in unix_stream_connect(). As small optimisation, unix_stream_connect() prefetches the client's sk->sk_state without unix_state_lock() and checks if it's TCP_CLOSE. Later, sk->sk_state is checked again under unix_state_lock(). Let's use READ_ONCE() for the first check and TCP_CLOSE directly for the second check. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3bacf47cb94deb..84552826530df0 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1481,7 +1481,6 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, struct sk_buff *skb = NULL; long timeo; int err; - int st; err = unix_validate_addr(sunaddr, addr_len); if (err) @@ -1571,9 +1570,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, Well, and we have to recheck the state after socket locked. */ - st = sk->sk_state; - - switch (st) { + switch (READ_ONCE(sk->sk_state)) { case TCP_CLOSE: /* This is ok... continue with connect */ break; @@ -1588,7 +1585,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, unix_state_lock_nested(sk, U_LOCK_SECOND); - if (sk->sk_state != st) { + if (sk->sk_state != TCP_CLOSE) { unix_state_unlock(sk); unix_state_unlock(other); sock_put(other); From 1b536948e805aab61a48c5aa5db10c9afee880bd Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 4 Jun 2024 09:52:32 -0700 Subject: [PATCH 0678/1578] af_unix: Annotate data-race of sk->sk_state in unix_accept(). Once sk->sk_state is changed to TCP_LISTEN, it never changes. unix_accept() takes the advantage and reads sk->sk_state without holding unix_state_lock(). Let's use READ_ONCE() there. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 84552826530df0..4763c26ae480a1 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1710,7 +1710,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, goto out; arg->err = -EINVAL; - if (sk->sk_state != TCP_LISTEN) + if (READ_ONCE(sk->sk_state) != TCP_LISTEN) goto out; /* If socket state is TCP_LISTEN it cannot change (for now...), From 8a34d4e8d9742a24f74998f45a6a98edd923319b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 4 Jun 2024 09:52:33 -0700 Subject: [PATCH 0679/1578] af_unix: Annotate data-races around sk->sk_state in sendmsg() and recvmsg(). The following functions read sk->sk_state locklessly and proceed only if the state is TCP_ESTABLISHED. * unix_stream_sendmsg * unix_stream_read_generic * unix_seqpacket_sendmsg * unix_seqpacket_recvmsg Let's use READ_ONCE() there. Fixes: a05d2ad1c1f3 ("af_unix: Only allow recv on connected seqpacket sockets.") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 4763c26ae480a1..4ef9c21783a5a1 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2226,7 +2226,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, } if (msg->msg_namelen) { - err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; + err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; goto out_err; } else { err = -ENOTCONN; @@ -2340,7 +2340,7 @@ static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, if (err) return err; - if (sk->sk_state != TCP_ESTABLISHED) + if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) return -ENOTCONN; if (msg->msg_namelen) @@ -2354,7 +2354,7 @@ static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, { struct sock *sk = sock->sk; - if (sk->sk_state != TCP_ESTABLISHED) + if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) return -ENOTCONN; return unix_dgram_recvmsg(sock, msg, size, flags); @@ -2683,7 +2683,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, size_t size = state->size; unsigned int last_len; - if (unlikely(sk->sk_state != TCP_ESTABLISHED)) { + if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) { err = -EINVAL; goto out; } From af4c733b6b1aded4dc808fafece7dfe6e9d2ebb3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 4 Jun 2024 09:52:34 -0700 Subject: [PATCH 0680/1578] af_unix: Annotate data-race of sk->sk_state in unix_stream_read_skb(). unix_stream_read_skb() is called from sk->sk_data_ready() context where unix_state_lock() is not held. Let's use READ_ONCE() there. Fixes: 77462de14a43 ("af_unix: Add read_sock for stream socket types") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 4ef9c21783a5a1..e7b74207aa3bda 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2659,7 +2659,7 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { - if (unlikely(sk->sk_state != TCP_ESTABLISHED)) + if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) return -ENOTCONN; return unix_read_skb(sk, recv_actor); From 0aa3be7b3e1f8f997312cc4705f8165e02806f8f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 4 Jun 2024 09:52:35 -0700 Subject: [PATCH 0681/1578] af_unix: Annotate data-races around sk->sk_state in UNIX_DIAG. While dumping AF_UNIX sockets via UNIX_DIAG, sk->sk_state is read locklessly. Let's use READ_ONCE() there. Note that the result could be inconsistent if the socket is dumped during the state change. This is common for other SOCK_DIAG and similar interfaces. Fixes: c9da99e6475f ("unix_diag: Fixup RQLEN extension report") Fixes: 2aac7a2cb0d9 ("unix_diag: Pending connections IDs NLA") Fixes: 45a96b9be6ec ("unix_diag: Dumping all sockets core") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/diag.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/unix/diag.c b/net/unix/diag.c index ae39538c5042b3..116cf508aea4ab 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -65,7 +65,7 @@ static int sk_diag_dump_icons(struct sock *sk, struct sk_buff *nlskb) u32 *buf; int i; - if (sk->sk_state == TCP_LISTEN) { + if (READ_ONCE(sk->sk_state) == TCP_LISTEN) { spin_lock(&sk->sk_receive_queue.lock); attr = nla_reserve(nlskb, UNIX_DIAG_ICONS, @@ -103,7 +103,7 @@ static int sk_diag_show_rqlen(struct sock *sk, struct sk_buff *nlskb) { struct unix_diag_rqlen rql; - if (sk->sk_state == TCP_LISTEN) { + if (READ_ONCE(sk->sk_state) == TCP_LISTEN) { rql.udiag_rqueue = sk->sk_receive_queue.qlen; rql.udiag_wqueue = sk->sk_max_ack_backlog; } else { @@ -136,7 +136,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r rep = nlmsg_data(nlh); rep->udiag_family = AF_UNIX; rep->udiag_type = sk->sk_type; - rep->udiag_state = sk->sk_state; + rep->udiag_state = READ_ONCE(sk->sk_state); rep->pad = 0; rep->udiag_ino = sk_ino; sock_diag_save_cookie(sk, rep->udiag_cookie); @@ -215,7 +215,7 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) sk_for_each(sk, &net->unx.table.buckets[slot]) { if (num < s_num) goto next; - if (!(req->udiag_states & (1 << sk->sk_state))) + if (!(req->udiag_states & (1 << READ_ONCE(sk->sk_state)))) goto next; if (sk_diag_dump(sk, skb, req, sk_user_ns(skb->sk), NETLINK_CB(cb->skb).portid, From b0632e53e0da8054e36bc973f0eec69d30f1b7c6 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 4 Jun 2024 09:52:36 -0700 Subject: [PATCH 0682/1578] af_unix: Annotate data-races around sk->sk_sndbuf. sk_setsockopt() changes sk->sk_sndbuf under lock_sock(), but it's not used in af_unix.c. Let's use READ_ONCE() to read sk->sk_sndbuf in unix_writable(), unix_dgram_sendmsg(), and unix_stream_sendmsg(). Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e7b74207aa3bda..de2a5e334a88eb 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -533,7 +533,7 @@ static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other) static int unix_writable(const struct sock *sk, unsigned char state) { return state != TCP_LISTEN && - (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf; + (refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf); } static void unix_write_space(struct sock *sk) @@ -1967,7 +1967,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, } err = -EMSGSIZE; - if (len > sk->sk_sndbuf - 32) + if (len > READ_ONCE(sk->sk_sndbuf) - 32) goto out; if (len > SKB_MAX_ALLOC) { @@ -2247,7 +2247,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, &err, 0); } else { /* Keep two messages in the pipe so it schedules better */ - size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64); + size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64); /* allow fallback to order-0 allocations */ size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); From bd9f2d05731f6a112d0c7391a0d537bfc588dbe6 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 4 Jun 2024 09:52:37 -0700 Subject: [PATCH 0683/1578] af_unix: Annotate data-race of net->unx.sysctl_max_dgram_qlen. net->unx.sysctl_max_dgram_qlen is exposed as a sysctl knob and can be changed concurrently. Let's use READ_ONCE() in unix_create1(). Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index de2a5e334a88eb..623ee39657e235 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -976,7 +976,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sk->sk_hash = unix_unbound_hash(sk); sk->sk_allocation = GFP_KERNEL_ACCOUNT; sk->sk_write_space = unix_write_space; - sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; + sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen); sk->sk_destruct = unix_sock_destructor; u = unix_sk(sk); u->listener = NULL; From 45d872f0e65593176d880ec148f41ad7c02e40a7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 4 Jun 2024 09:52:38 -0700 Subject: [PATCH 0684/1578] af_unix: Use unix_recvq_full_lockless() in unix_stream_connect(). Once sk->sk_state is changed to TCP_LISTEN, it never changes. unix_accept() takes advantage of this characteristics; it does not hold the listener's unix_state_lock() and only acquires recvq lock to pop one skb. It means unix_state_lock() does not prevent the queue length from changing in unix_stream_connect(). Thus, we need to use unix_recvq_full_lockless() to avoid data-race. Now we remove unix_recvq_full() as no one uses it. Note that we can remove READ_ONCE() for sk->sk_max_ack_backlog in unix_recvq_full_lockless() because of the following reasons: (1) For SOCK_DGRAM, it is a written-once field in unix_create1() (2) For SOCK_STREAM and SOCK_SEQPACKET, it is changed under the listener's unix_state_lock() in unix_listen(), and we hold the lock in unix_stream_connect() Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 623ee39657e235..eb3ba3448ed34f 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -221,15 +221,9 @@ static inline int unix_may_send(struct sock *sk, struct sock *osk) return unix_peer(osk) == NULL || unix_our_peer(sk, osk); } -static inline int unix_recvq_full(const struct sock *sk) -{ - return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog; -} - static inline int unix_recvq_full_lockless(const struct sock *sk) { - return skb_queue_len_lockless(&sk->sk_receive_queue) > - READ_ONCE(sk->sk_max_ack_backlog); + return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog; } struct sock *unix_peer_get(struct sock *s) @@ -1545,7 +1539,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, if (other->sk_shutdown & RCV_SHUTDOWN) goto out_unlock; - if (unix_recvq_full(other)) { + if (unix_recvq_full_lockless(other)) { err = -EAGAIN; if (!timeo) goto out_unlock; From 83690b82d228b3570565ebd0b41873933238b97f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 4 Jun 2024 09:52:39 -0700 Subject: [PATCH 0685/1578] af_unix: Use skb_queue_empty_lockless() in unix_release_sock(). If the socket type is SOCK_STREAM or SOCK_SEQPACKET, unix_release_sock() checks the length of the peer socket's recvq under unix_state_lock(). However, unix_stream_read_generic() calls skb_unlink() after releasing the lock. Also, for SOCK_SEQPACKET, __skb_try_recv_datagram() unlinks skb without unix_state_lock(). Thues, unix_state_lock() does not protect qlen. Let's use skb_queue_empty_lockless() in unix_release_sock(). Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index eb3ba3448ed34f..80846279de9f3b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -631,7 +631,7 @@ static void unix_release_sock(struct sock *sk, int embrion) unix_state_lock(skpair); /* No more writes */ WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK); - if (!skb_queue_empty(&sk->sk_receive_queue) || embrion) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion) WRITE_ONCE(skpair->sk_err, ECONNRESET); unix_state_unlock(skpair); skpair->sk_state_change(skpair); From 5d915e584d8408211d4567c22685aae8820bfc55 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 4 Jun 2024 09:52:40 -0700 Subject: [PATCH 0686/1578] af_unix: Use skb_queue_len_lockless() in sk_diag_show_rqlen(). We can dump the socket queue length via UNIX_DIAG by specifying UDIAG_SHOW_RQLEN. If sk->sk_state is TCP_LISTEN, we return the recv queue length, but here we do not hold recvq lock. Let's use skb_queue_len_lockless() in sk_diag_show_rqlen(). Fixes: c9da99e6475f ("unix_diag: Fixup RQLEN extension report") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/diag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/unix/diag.c b/net/unix/diag.c index 116cf508aea4ab..321336f91a0afb 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -104,7 +104,7 @@ static int sk_diag_show_rqlen(struct sock *sk, struct sk_buff *nlskb) struct unix_diag_rqlen rql; if (READ_ONCE(sk->sk_state) == TCP_LISTEN) { - rql.udiag_rqueue = sk->sk_receive_queue.qlen; + rql.udiag_rqueue = skb_queue_len_lockless(&sk->sk_receive_queue); rql.udiag_wqueue = sk->sk_max_ack_backlog; } else { rql.udiag_rqueue = (u32) unix_inq_len(sk); From efaf24e30ec39ebbea9112227485805a48b0ceb1 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 4 Jun 2024 09:52:41 -0700 Subject: [PATCH 0687/1578] af_unix: Annotate data-race of sk->sk_shutdown in sk_diag_fill(). While dumping sockets via UNIX_DIAG, we do not hold unix_state_lock(). Let's use READ_ONCE() to read sk->sk_shutdown. Fixes: e4e541a84863 ("sock-diag: Report shutdown for inet and unix sockets (v2)") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/diag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/unix/diag.c b/net/unix/diag.c index 321336f91a0afb..937edf4afed413 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -165,7 +165,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r sock_diag_put_meminfo(sk, skb, UNIX_DIAG_MEMINFO)) goto out_nlmsg_trim; - if (nla_put_u8(skb, UNIX_DIAG_SHUTDOWN, sk->sk_shutdown)) + if (nla_put_u8(skb, UNIX_DIAG_SHUTDOWN, READ_ONCE(sk->sk_shutdown))) goto out_nlmsg_trim; if ((req->udiag_show & UDIAG_SHOW_UID) && From c181689bc479d3b2300f91fc4d53e089d7631898 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 2 Jun 2024 03:20:38 +0900 Subject: [PATCH 0688/1578] kconfig: remove unneeded code for user-supplied values being out of range This is a leftover from commit ce1fc9345a59 ("kconfig: do not clear SYMBOL_DEF_USER when the value is out of range"). This code is now redundant because if a user-supplied value is out of range, the value adjusted by sym_validate_range() differs, and conf_unsaved has already been incremented a few lines above. Signed-off-by: Masahiro Yamada --- scripts/kconfig/confdata.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/scripts/kconfig/confdata.c b/scripts/kconfig/confdata.c index 387503daf0f73c..85b53069ba7a40 100644 --- a/scripts/kconfig/confdata.c +++ b/scripts/kconfig/confdata.c @@ -533,19 +533,6 @@ int conf_read(const char *name) */ if (sym->visible == no && !conf_unsaved) sym->flags &= ~SYMBOL_DEF_USER; - switch (sym->type) { - case S_STRING: - case S_INT: - case S_HEX: - /* Reset a string value if it's out of range */ - if (sym_string_within_range(sym, sym->def[S_DEF_USER].val)) - break; - sym->flags &= ~SYMBOL_VALID; - conf_unsaved++; - break; - default: - break; - } } } From 46edf4372e336ef3a61c3126e49518099d2e2e6d Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 2 Jun 2024 03:20:40 +0900 Subject: [PATCH 0689/1578] kconfig: gconf: give a proper initial state to the Save button Currently, the initial state of the "Save" button is always active. If none of the CONFIG options are changed while loading the .config file, the "Save" button should be greyed out. This can be fixed by calling conf_read() after widget initialization. Signed-off-by: Masahiro Yamada --- scripts/kconfig/gconf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/kconfig/gconf.c b/scripts/kconfig/gconf.c index cc400ffe66150b..e04dbafd3add9a 100644 --- a/scripts/kconfig/gconf.c +++ b/scripts/kconfig/gconf.c @@ -1422,7 +1422,6 @@ int main(int ac, char *av[]) conf_parse(name); fixup_rootmenu(&rootmenu); - conf_read(NULL); /* Load the interface and connect signals */ init_main_window(glade_file); @@ -1430,6 +1429,8 @@ int main(int ac, char *av[]) init_left_tree(); init_right_tree(); + conf_read(NULL); + switch (view_mode) { case SINGLE_VIEW: display_tree_part(); From bf83266a1eef8251e2f126dba635039de069104a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 4 Jun 2024 01:19:02 +0900 Subject: [PATCH 0690/1578] kconfig: doc: fix a typo in the note about 'imply' This sentence does not make sense due to a typo. Fix it. Fixes: def2fbffe62c ("kconfig: allow symbols implied by y to become m") Signed-off-by: Masahiro Yamada Reviewed-by: Randy Dunlap --- Documentation/kbuild/kconfig-language.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/kbuild/kconfig-language.rst b/Documentation/kbuild/kconfig-language.rst index 555c2f8399697c..86be5b857cc498 100644 --- a/Documentation/kbuild/kconfig-language.rst +++ b/Documentation/kbuild/kconfig-language.rst @@ -184,7 +184,7 @@ applicable everywhere (see syntax). ability to hook into a secondary subsystem while allowing the user to configure that subsystem out without also having to unset these drivers. - Note: If the combination of FOO=y and BAR=m causes a link error, + Note: If the combination of FOO=y and BAZ=m causes a link error, you can guard the function call with IS_REACHABLE():: foo_init() From 45c7f555bf5e716d9c6ffb737e97d4cc9b4c21ef Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 4 Jun 2024 01:19:03 +0900 Subject: [PATCH 0691/1578] kconfig: doc: document behavior of 'select' and 'imply' followed by 'if' Documentation/kbuild/kconfig-language.rst explains the behavior of 'select' as follows: reverse dependencies can be used to force a lower limit of another symbol. The value of the current menu symbol is used as the minimal value can be set to. This is not true when the 'select' property is followed by 'if'. [Test Code] config MODULES def_bool y modules config A def_tristate y select C if B config B def_tristate m config C tristate [Result] CONFIG_MODULES=y CONFIG_A=y CONFIG_B=m CONFIG_C=m If "the value of A is used as the minimal value C can be set to", C must be 'y'. The actual behavior is "C is selected by (A && B)". The lower limit of C is downgraded due to B being 'm'. This behavior is kind of weird, and this has arisen several times in the mailing list. I do not know whether it is a bug or intended behavior. Anyway, it is not feasible to change it now because many Kconfig files are written based on this behavior. The same applies to 'imply'. Document this (but reserve the possibility for a future change). Signed-off-by: Masahiro Yamada Reviewed-by: Randy Dunlap --- Documentation/kbuild/kconfig-language.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Documentation/kbuild/kconfig-language.rst b/Documentation/kbuild/kconfig-language.rst index 86be5b857cc498..1fb3f5e6193c33 100644 --- a/Documentation/kbuild/kconfig-language.rst +++ b/Documentation/kbuild/kconfig-language.rst @@ -150,6 +150,12 @@ applicable everywhere (see syntax). That will limit the usefulness but on the other hand avoid the illegal configurations all over. + If "select" is followed by "if" , will be + selected by the logical AND of the value of the current menu symbol + and . This means, the lower limit can be downgraded due to the + presence of "if" . This behavior may seem weird, but we rely on + it. (The future of this behavior is undecided.) + - weak reverse dependencies: "imply" ["if" ] This is similar to "select" as it enforces a lower limit on another @@ -202,6 +208,10 @@ applicable everywhere (see syntax). imply BAR imply BAZ + Note: If "imply" is followed by "if" , the default of + will be the logical AND of the value of the current menu symbol and . + (The future of this behavior is undecided.) + - limiting menu display: "visible if" This attribute is only applicable to menu blocks, if the condition is From b01e1c030770ff3b4fe37fc7cc6bca03f594133f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 4 Jun 2024 19:35:49 +0000 Subject: [PATCH 0692/1578] ipv6: fix possible race in __fib6_drop_pcpu_from() syzbot found a race in __fib6_drop_pcpu_from() [1] If compiler reads more than once (*ppcpu_rt), second read could read NULL, if another cpu clears the value in rt6_get_pcpu_route(). Add a READ_ONCE() to prevent this race. Also add rcu_read_lock()/rcu_read_unlock() because we rely on RCU protection while dereferencing pcpu_rt. [1] Oops: general protection fault, probably for non-canonical address 0xdffffc0000000012: 0000 [#1] PREEMPT SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000090-0x0000000000000097] CPU: 0 PID: 7543 Comm: kworker/u8:17 Not tainted 6.10.0-rc1-syzkaller-00013-g2bfcfd584ff5 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/02/2024 Workqueue: netns cleanup_net RIP: 0010:__fib6_drop_pcpu_from.part.0+0x10a/0x370 net/ipv6/ip6_fib.c:984 Code: f8 48 c1 e8 03 80 3c 28 00 0f 85 16 02 00 00 4d 8b 3f 4d 85 ff 74 31 e8 74 a7 fa f7 49 8d bf 90 00 00 00 48 89 f8 48 c1 e8 03 <80> 3c 28 00 0f 85 1e 02 00 00 49 8b 87 90 00 00 00 48 8b 0c 24 48 RSP: 0018:ffffc900040df070 EFLAGS: 00010206 RAX: 0000000000000012 RBX: 0000000000000001 RCX: ffffffff89932e16 RDX: ffff888049dd1e00 RSI: ffffffff89932d7c RDI: 0000000000000091 RBP: dffffc0000000000 R08: 0000000000000005 R09: 0000000000000007 R10: 0000000000000001 R11: 0000000000000006 R12: ffff88807fa080b8 R13: fffffbfff1a9a07d R14: ffffed100ff41022 R15: 0000000000000001 FS: 0000000000000000(0000) GS:ffff8880b9200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b32c26000 CR3: 000000005d56e000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __fib6_drop_pcpu_from net/ipv6/ip6_fib.c:966 [inline] fib6_drop_pcpu_from net/ipv6/ip6_fib.c:1027 [inline] fib6_purge_rt+0x7f2/0x9f0 net/ipv6/ip6_fib.c:1038 fib6_del_route net/ipv6/ip6_fib.c:1998 [inline] fib6_del+0xa70/0x17b0 net/ipv6/ip6_fib.c:2043 fib6_clean_node+0x426/0x5b0 net/ipv6/ip6_fib.c:2205 fib6_walk_continue+0x44f/0x8d0 net/ipv6/ip6_fib.c:2127 fib6_walk+0x182/0x370 net/ipv6/ip6_fib.c:2175 fib6_clean_tree+0xd7/0x120 net/ipv6/ip6_fib.c:2255 __fib6_clean_all+0x100/0x2d0 net/ipv6/ip6_fib.c:2271 rt6_sync_down_dev net/ipv6/route.c:4906 [inline] rt6_disable_ip+0x7ed/0xa00 net/ipv6/route.c:4911 addrconf_ifdown.isra.0+0x117/0x1b40 net/ipv6/addrconf.c:3855 addrconf_notify+0x223/0x19e0 net/ipv6/addrconf.c:3778 notifier_call_chain+0xb9/0x410 kernel/notifier.c:93 call_netdevice_notifiers_info+0xbe/0x140 net/core/dev.c:1992 call_netdevice_notifiers_extack net/core/dev.c:2030 [inline] call_netdevice_notifiers net/core/dev.c:2044 [inline] dev_close_many+0x333/0x6a0 net/core/dev.c:1585 unregister_netdevice_many_notify+0x46d/0x19f0 net/core/dev.c:11193 unregister_netdevice_many net/core/dev.c:11276 [inline] default_device_exit_batch+0x85b/0xae0 net/core/dev.c:11759 ops_exit_list+0x128/0x180 net/core/net_namespace.c:178 cleanup_net+0x5b7/0xbf0 net/core/net_namespace.c:640 process_one_work+0x9fb/0x1b60 kernel/workqueue.c:3231 process_scheduled_works kernel/workqueue.c:3312 [inline] worker_thread+0x6c8/0xf70 kernel/workqueue.c:3393 kthread+0x2c1/0x3a0 kernel/kthread.c:389 ret_from_fork+0x45/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Fixes: d52d3997f843 ("ipv6: Create percpu rt6_info") Signed-off-by: Eric Dumazet Cc: Martin KaFai Lau Link: https://lore.kernel.org/r/20240604193549.981839-1-edumazet@google.com Signed-off-by: Paolo Abeni --- net/ipv6/ip6_fib.c | 6 +++++- net/ipv6/route.c | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 31d77885bcae3e..6e57c03e3255f0 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -966,6 +966,7 @@ static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, if (!fib6_nh->rt6i_pcpu) return; + rcu_read_lock(); /* release the reference to this fib entry from * all of its cached pcpu routes */ @@ -974,7 +975,9 @@ static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, struct rt6_info *pcpu_rt; ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); - pcpu_rt = *ppcpu_rt; + + /* Paired with xchg() in rt6_get_pcpu_route() */ + pcpu_rt = READ_ONCE(*ppcpu_rt); /* only dropping the 'from' reference if the cached route * is using 'match'. The cached pcpu_rt->from only changes @@ -988,6 +991,7 @@ static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, fib6_info_release(from); } } + rcu_read_unlock(); } struct fib6_nh_pcpu_arg { diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a504b88ec06b5a..f083d9faba6b1e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1409,6 +1409,7 @@ static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res) struct rt6_info *prev, **p; p = this_cpu_ptr(res->nh->rt6i_pcpu); + /* Paired with READ_ONCE() in __fib6_drop_pcpu_from() */ prev = xchg(p, NULL); if (prev) { dst_dev_put(&prev->dst); From 77a92660d8fe8d29503fae768d9f5eb529c88b36 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 4 Jun 2024 01:19:04 +0900 Subject: [PATCH 0693/1578] kconfig: remove wrong expr_trans_bool() expr_trans_bool() performs an incorrect transformation. [Test Code] config MODULES def_bool y modules config A def_bool y select C if B != n config B def_tristate m config C tristate [Result] CONFIG_MODULES=y CONFIG_A=y CONFIG_B=m CONFIG_C=m This output is incorrect because CONFIG_C=y is expected. Documentation/kbuild/kconfig-language.rst clearly explains the function of the '!=' operator: If the values of both symbols are equal, it returns 'n', otherwise 'y'. Therefore, the statement: select C if B != n should be equivalent to: select C if y Or, more simply: select C Hence, the symbol C should be selected by the value of A, which is 'y'. However, expr_trans_bool() wrongly transforms it to: select C if B Therefore, the symbol C is selected by (A && B), which is 'm'. The comment block of expr_trans_bool() correctly explains its intention: * bool FOO!=n => FOO ^^^^ If FOO is bool, FOO!=n can be simplified into FOO. This is correct. However, the actual code performs this transformation when FOO is tristate: if (e->left.sym->type == S_TRISTATE) { ^^^^^^^^^^ While it can be fixed to S_BOOLEAN, there is no point in doing so because expr_tranform() already transforms FOO!=n to FOO when FOO is bool. (see the "case E_UNEQUAL" part) expr_trans_bool() is wrong and unnecessary. Signed-off-by: Masahiro Yamada Acked-by: Randy Dunlap --- scripts/kconfig/expr.c | 29 ----------------------------- scripts/kconfig/expr.h | 1 - scripts/kconfig/menu.c | 2 -- 3 files changed, 32 deletions(-) diff --git a/scripts/kconfig/expr.c b/scripts/kconfig/expr.c index 4d95fce5f9a7ab..fcc190b67b6f65 100644 --- a/scripts/kconfig/expr.c +++ b/scripts/kconfig/expr.c @@ -396,35 +396,6 @@ static struct expr *expr_eliminate_yn(struct expr *e) return e; } -/* - * bool FOO!=n => FOO - */ -struct expr *expr_trans_bool(struct expr *e) -{ - if (!e) - return NULL; - switch (e->type) { - case E_AND: - case E_OR: - case E_NOT: - e->left.expr = expr_trans_bool(e->left.expr); - e->right.expr = expr_trans_bool(e->right.expr); - break; - case E_UNEQUAL: - // FOO!=n -> FOO - if (e->left.sym->type == S_TRISTATE) { - if (e->right.sym == &symbol_no) { - e->type = E_SYMBOL; - e->right.sym = NULL; - } - } - break; - default: - ; - } - return e; -} - /* * e1 || e2 -> ? */ diff --git a/scripts/kconfig/expr.h b/scripts/kconfig/expr.h index fa50fc45622e38..7c0c242318bcdc 100644 --- a/scripts/kconfig/expr.h +++ b/scripts/kconfig/expr.h @@ -284,7 +284,6 @@ void expr_free(struct expr *e); void expr_eliminate_eq(struct expr **ep1, struct expr **ep2); int expr_eq(struct expr *e1, struct expr *e2); tristate expr_calc_value(struct expr *e); -struct expr *expr_trans_bool(struct expr *e); struct expr *expr_eliminate_dups(struct expr *e); struct expr *expr_transform(struct expr *e); int expr_contains_symbol(struct expr *dep, struct symbol *sym); diff --git a/scripts/kconfig/menu.c b/scripts/kconfig/menu.c index 53151c5a60281b..eef9b63cdf1156 100644 --- a/scripts/kconfig/menu.c +++ b/scripts/kconfig/menu.c @@ -398,8 +398,6 @@ static void _menu_finalize(struct menu *parent, bool inside_choice) dep = expr_transform(dep); dep = expr_alloc_and(expr_copy(basedep), dep); dep = expr_eliminate_dups(dep); - if (menu->sym && menu->sym->type != S_TRISTATE) - dep = expr_trans_bool(dep); prop->visible.expr = dep; /* From 0dcc53abf58d572d34c5313de85f607cd33fc691 Mon Sep 17 00:00:00 2001 From: Su Hui Date: Wed, 5 Jun 2024 11:47:43 +0800 Subject: [PATCH 0694/1578] net: ethtool: fix the error condition in ethtool_get_phy_stats_ethtool() Clang static checker (scan-build) warning: net/ethtool/ioctl.c:line 2233, column 2 Called function pointer is null (null dereference). Return '-EOPNOTSUPP' when 'ops->get_ethtool_phy_stats' is NULL to fix this typo error. Fixes: 201ed315f967 ("net/ethtool/ioctl: split ethtool_get_phy_stats into multiple helpers") Signed-off-by: Su Hui Reviewed-by: Przemek Kitszel Reviewed-by: Hariprasad Kelam Link: https://lore.kernel.org/r/20240605034742.921751-1-suhui@nfschina.com Signed-off-by: Paolo Abeni --- net/ethtool/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 5a55270aa86e88..e645d751a5e899 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -2220,7 +2220,7 @@ static int ethtool_get_phy_stats_ethtool(struct net_device *dev, const struct ethtool_ops *ops = dev->ethtool_ops; int n_stats, ret; - if (!ops || !ops->get_sset_count || ops->get_ethtool_phy_stats) + if (!ops || !ops->get_sset_count || !ops->get_ethtool_phy_stats) return -EOPNOTSUPP; n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS); From 09fe2bfa6b83f865126ce3964744863f69a4a030 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 6 Jun 2024 21:14:45 +1000 Subject: [PATCH 0695/1578] ata: pata_macio: Fix max_segment_size with PAGE_SIZE == 64K The pata_macio driver advertises a max_segment_size of 0xff00, because the hardware doesn't cope with requests >= 64K. However the SCSI core requires max_segment_size to be at least PAGE_SIZE, which is a problem for pata_macio when the kernel is built with 64K pages. In older kernels the SCSI core would just increase the segment size to be equal to PAGE_SIZE, however since the commit tagged below it causes a warning and the device fails to probe: WARNING: CPU: 0 PID: 26 at block/blk-settings.c:202 .blk_validate_limits+0x2f8/0x35c CPU: 0 PID: 26 Comm: kworker/u4:1 Not tainted 6.10.0-rc1 #1 Hardware name: PowerMac7,2 PPC970 0x390202 PowerMac ... NIP .blk_validate_limits+0x2f8/0x35c LR .blk_alloc_queue+0xc0/0x2f8 Call Trace: .blk_alloc_queue+0xc0/0x2f8 .blk_mq_alloc_queue+0x60/0xf8 .scsi_alloc_sdev+0x208/0x3c0 .scsi_probe_and_add_lun+0x314/0x52c .__scsi_add_device+0x170/0x1a4 .ata_scsi_scan_host+0x2bc/0x3e4 .async_port_probe+0x6c/0xa0 .async_run_entry_fn+0x60/0x1bc .process_one_work+0x228/0x510 .worker_thread+0x360/0x530 .kthread+0x134/0x13c .start_kernel_thread+0x10/0x14 ... scsi_alloc_sdev: Allocation failure during SCSI scanning, some SCSI devices might not be configured Although the hardware can't cope with a 64K segment, the driver already deals with that internally by splitting large requests in pata_macio_qc_prep(). That is how the driver has managed to function until now on 64K kernels. So fix the driver to advertise a max_segment_size of 64K, which avoids the warning and keeps the SCSI core happy. Fixes: afd53a3d8528 ("scsi: core: Initialize scsi midlayer limits before allocating the queue") Reported-by: Guenter Roeck Closes: https://lore.kernel.org/all/ce2bf6af-4382-4fe1-b392-cc6829f5ceb2@roeck-us.net/ Reported-by: Doru Iorgulescu Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218858 Signed-off-by: Michael Ellerman Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: John Garry Signed-off-by: Niklas Cassel --- drivers/ata/pata_macio.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/ata/pata_macio.c b/drivers/ata/pata_macio.c index 817838e2f70e09..3cb455a32d9266 100644 --- a/drivers/ata/pata_macio.c +++ b/drivers/ata/pata_macio.c @@ -915,10 +915,13 @@ static const struct scsi_host_template pata_macio_sht = { .sg_tablesize = MAX_DCMDS, /* We may not need that strict one */ .dma_boundary = ATA_DMA_BOUNDARY, - /* Not sure what the real max is but we know it's less than 64K, let's - * use 64K minus 256 + /* + * The SCSI core requires the segment size to cover at least a page, so + * for 64K page size kernels this must be at least 64K. However the + * hardware can't handle 64K, so pata_macio_qc_prep() will split large + * requests. */ - .max_segment_size = MAX_DBDMA_SEG, + .max_segment_size = SZ_64K, .device_configure = pata_macio_device_configure, .sdev_groups = ata_common_sdev_groups, .can_queue = ATA_DEF_QUEUE, From 2ed22161b19b11239aa742804549f63edd7c91e3 Mon Sep 17 00:00:00 2001 From: Andrei Simion Date: Tue, 4 Jun 2024 13:10:30 +0300 Subject: [PATCH 0696/1578] ASoC: atmel: atmel-classd: Re-add dai_link->platform to fix card init The removed dai_link->platform component cause a fail which is exposed at runtime. (ex: when a sound tool is used) This patch re-adds the dai_link->platform component to have a full card registered. Before this patch: :~$ aplay -l **** List of PLAYBACK Hardware Devices **** card 0: CLASSD [CLASSD], device 0: CLASSD PCM snd-soc-dummy-dai-0 [] Subdevices: 1/1 Subdevice #0: subdevice #0 :~$ speaker-test -t sine speaker-test 1.2.6 Playback device is default Stream parameters are 48000Hz, S16_LE, 1 channels Sine wave rate is 440.0000Hz Playback open error: -22,Invalid argument After this patch which restores the platform component: :~$ aplay -l **** List of PLAYBACK Hardware Devices **** card 0: CLASSD [CLASSD], device 0: CLASSD PCM snd-soc-dummy-dai-0 [CLASSD PCM snd-soc-dummy-dai-0] Subdevices: 1/1 Subdevice #0: subdevice #0 -> Resolve the playback error. Fixes: 2f650f87c03c ("ASoC: atmel: remove unnecessary dai_link->platform") Signed-off-by: Andrei Simion Acked-by: Kuninori Morimoto Link: https://msgid.link/r/20240604101030.237792-1-andrei.simion@microchip.com Signed-off-by: Mark Brown --- sound/soc/atmel/atmel-classd.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sound/soc/atmel/atmel-classd.c b/sound/soc/atmel/atmel-classd.c index 6aed1ee443b443..ba314b2799190e 100644 --- a/sound/soc/atmel/atmel-classd.c +++ b/sound/soc/atmel/atmel-classd.c @@ -473,19 +473,22 @@ static int atmel_classd_asoc_card_init(struct device *dev, if (!dai_link) return -ENOMEM; - comp = devm_kzalloc(dev, sizeof(*comp), GFP_KERNEL); + comp = devm_kzalloc(dev, 2 * sizeof(*comp), GFP_KERNEL); if (!comp) return -ENOMEM; - dai_link->cpus = comp; + dai_link->cpus = &comp[0]; dai_link->codecs = &snd_soc_dummy_dlc; + dai_link->platforms = &comp[1]; dai_link->num_cpus = 1; dai_link->num_codecs = 1; + dai_link->num_platforms = 1; dai_link->name = "CLASSD"; dai_link->stream_name = "CLASSD PCM"; dai_link->cpus->dai_name = dev_name(dev); + dai_link->platforms->name = dev_name(dev); card->dai_link = dai_link; card->num_links = 1; From 41b02ea4c0adfcc6761fbfed42c3ce6b6412d881 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 5 Jun 2024 11:21:16 +0200 Subject: [PATCH 0697/1578] selftests: net: lib: support errexit with busywait If errexit is enabled ('set -e'), loopy_wait -- or busywait and others using it -- will stop after the first failure. Note that if the returned status of loopy_wait is checked, and even if errexit is enabled, Bash will not stop at the first error. Fixes: 25ae948b4478 ("selftests/net: add lib.sh") Cc: stable@vger.kernel.org Acked-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Reviewed-by: Hangbin Liu Link: https://lore.kernel.org/r/20240605-upstream-net-20240605-selftests-net-lib-fixes-v1-1-b3afadd368c9@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index edc030e81a4649..a422e10d3d3a0b 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -67,9 +67,7 @@ loopy_wait() while true do local out - out=$("$@") - local ret=$? - if ((!ret)); then + if out=$("$@"); then echo -n "$out" return 0 fi From 79322174bcc780b99795cb89d237b26006a8b94b Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 5 Jun 2024 11:21:17 +0200 Subject: [PATCH 0698/1578] selftests: net: lib: avoid error removing empty netns name If there is an error to create the first netns with 'setup_ns()', 'cleanup_ns()' will be called with an empty string as first parameter. The consequences is that 'cleanup_ns()' will try to delete an invalid netns, and wait 20 seconds if the netns list is empty. Instead of just checking if the name is not empty, convert the string separated by spaces to an array. Manipulating the array is cleaner, and calling 'cleanup_ns()' with an empty array will be a no-op. Fixes: 25ae948b4478 ("selftests/net: add lib.sh") Cc: stable@vger.kernel.org Acked-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Reviewed-by: Petr Machata Reviewed-by: Hangbin Liu Link: https://lore.kernel.org/r/20240605-upstream-net-20240605-selftests-net-lib-fixes-v1-2-b3afadd368c9@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib.sh | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index a422e10d3d3a0b..e2f51102d7e1eb 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -15,7 +15,7 @@ ksft_xfail=2 ksft_skip=4 # namespace list created by setup_ns -NS_LIST="" +NS_LIST=() ############################################################################## # Helpers @@ -137,6 +137,7 @@ cleanup_ns() fi for ns in "$@"; do + [ -z "${ns}" ] && continue ip netns delete "${ns}" &> /dev/null if ! busywait $BUSYWAIT_TIMEOUT ip netns list \| grep -vq "^$ns$" &> /dev/null; then echo "Warn: Failed to remove namespace $ns" @@ -150,7 +151,7 @@ cleanup_ns() cleanup_all_ns() { - cleanup_ns $NS_LIST + cleanup_ns "${NS_LIST[@]}" } # setup netns with given names as prefix. e.g @@ -159,7 +160,7 @@ setup_ns() { local ns="" local ns_name="" - local ns_list="" + local ns_list=() local ns_exist= for ns_name in "$@"; do # Some test may setup/remove same netns multi times @@ -175,13 +176,13 @@ setup_ns() if ! ip netns add "$ns"; then echo "Failed to create namespace $ns_name" - cleanup_ns "$ns_list" + cleanup_ns "${ns_list[@]}" return $ksft_skip fi ip -n "$ns" link set lo up - ! $ns_exist && ns_list="$ns_list $ns" + ! $ns_exist && ns_list+=("$ns") done - NS_LIST="$NS_LIST $ns_list" + NS_LIST+=("${ns_list[@]}") } tc_rule_stats_get() From 84a8bc3ec225b28067b168e9410e452c83d706da Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 5 Jun 2024 11:21:18 +0200 Subject: [PATCH 0699/1578] selftests: net: lib: set 'i' as local Without this, the 'i' variable declared before could be overridden by accident, e.g. for i in "${@}"; do __ksft_status_merge "${i}" ## 'i' has been modified foo "${i}" ## using 'i' with an unexpected value done After a quick look, it looks like 'i' is currently not used after having been modified in __ksft_status_merge(), but still, better be safe than sorry. I saw this while modifying the same file, not because I suspected an issue somewhere. Fixes: 596c8819cb78 ("selftests: forwarding: Have RET track kselftest framework constants") Acked-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Reviewed-by: Hangbin Liu Link: https://lore.kernel.org/r/20240605-upstream-net-20240605-selftests-net-lib-fixes-v1-3-b3afadd368c9@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index e2f51102d7e1eb..9155c914c064fe 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -27,6 +27,7 @@ __ksft_status_merge() local -A weights local weight=0 + local i for i in "$@"; do weights[$i]=$((weight++)) done From 0ee14725471cea66e03e3cd4f4c582d759de502c Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Thu, 6 Jun 2024 15:46:09 +0100 Subject: [PATCH 0700/1578] mm/util: Swap kmemdup_array() arguments GCC 14.1 complains about the argument usage of kmemdup_array(): drivers/soc/tegra/fuse/fuse-tegra.c:130:65: error: 'kmemdup_array' sizes specified with 'sizeof' in the earlier argument and not in the later argument [-Werror=calloc-transposed-args] 130 | fuse->lookups = kmemdup_array(fuse->soc->lookups, sizeof(*fuse->lookups), | ^ drivers/soc/tegra/fuse/fuse-tegra.c:130:65: note: earlier argument should specify number of elements, later size of each element The annotation introduced by commit 7d78a7773355 ("string: Add additional __realloc_size() annotations for "dup" helpers") lets the compiler think that kmemdup_array() follows the same format as calloc(), with the number of elements preceding the size of one element. So we could simply swap the arguments to __realloc_size() to get rid of that warning, but it seems cleaner to instead have kmemdup_array() follow the same format as krealloc_array(), memdup_array_user(), calloc() etc. Fixes: 7d78a7773355 ("string: Add additional __realloc_size() annotations for "dup" helpers") Signed-off-by: Jean-Philippe Brucker Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240606144608.97817-2-jean-philippe@linaro.org Signed-off-by: Kees Cook --- drivers/soc/tegra/fuse/fuse-tegra.c | 4 ++-- include/linux/string.h | 2 +- lib/fortify_kunit.c | 2 +- mm/util.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/soc/tegra/fuse/fuse-tegra.c b/drivers/soc/tegra/fuse/fuse-tegra.c index b6bfd6729df390..d276672838465c 100644 --- a/drivers/soc/tegra/fuse/fuse-tegra.c +++ b/drivers/soc/tegra/fuse/fuse-tegra.c @@ -127,8 +127,8 @@ static void tegra_fuse_print_sku_info(struct tegra_sku_info *tegra_sku_info) static int tegra_fuse_add_lookups(struct tegra_fuse *fuse) { - fuse->lookups = kmemdup_array(fuse->soc->lookups, sizeof(*fuse->lookups), - fuse->soc->num_lookups, GFP_KERNEL); + fuse->lookups = kmemdup_array(fuse->soc->lookups, fuse->soc->num_lookups, + sizeof(*fuse->lookups), GFP_KERNEL); if (!fuse->lookups) return -ENOMEM; diff --git a/include/linux/string.h b/include/linux/string.h index 60168aa2af075d..9edace076ddbfd 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -289,7 +289,7 @@ extern void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp) __realloc_si extern void *kvmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp); -extern void *kmemdup_array(const void *src, size_t element_size, size_t count, gfp_t gfp) +extern void *kmemdup_array(const void *src, size_t count, size_t element_size, gfp_t gfp) __realloc_size(2, 3); /* lib/argv_split.c */ diff --git a/lib/fortify_kunit.c b/lib/fortify_kunit.c index f9cc467334ce3d..e17d520f532cfb 100644 --- a/lib/fortify_kunit.c +++ b/lib/fortify_kunit.c @@ -374,7 +374,7 @@ static const char * const test_strs[] = { for (i = 0; i < ARRAY_SIZE(test_strs); i++) { \ len = strlen(test_strs[i]); \ KUNIT_EXPECT_EQ(test, __builtin_constant_p(len), 0); \ - checker(len, kmemdup_array(test_strs[i], len, 1, gfp), \ + checker(len, kmemdup_array(test_strs[i], 1, len, gfp), \ kfree(p)); \ checker(len, kmemdup(test_strs[i], len, gfp), \ kfree(p)); \ diff --git a/mm/util.c b/mm/util.c index c9e519e6811f55..6682097372efcd 100644 --- a/mm/util.c +++ b/mm/util.c @@ -139,14 +139,14 @@ EXPORT_SYMBOL(kmemdup_noprof); * kmemdup_array - duplicate a given array. * * @src: array to duplicate. - * @element_size: size of each element of array. * @count: number of elements to duplicate from array. + * @element_size: size of each element of array. * @gfp: GFP mask to use. * * Return: duplicated array of @src or %NULL in case of error, * result is physically contiguous. Use kfree() to free. */ -void *kmemdup_array(const void *src, size_t element_size, size_t count, gfp_t gfp) +void *kmemdup_array(const void *src, size_t count, size_t element_size, gfp_t gfp) { return kmemdup(src, size_mul(element_size, count), gfp); } From f7d3b1ffc654b0435ac2c9c02a72fb2752bdb0fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20G=C3=B6ttsche?= Date: Fri, 15 Mar 2024 13:54:10 +0100 Subject: [PATCH 0701/1578] yama: document function parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Document the unused function parameter of yama_relation_cleanup() to please kernel doc warnings. Signed-off-by: Christian Göttsche Reviewed-by: Paul Moore Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20240315125418.273104-2-cgzones@googlemail.com Signed-off-by: Kees Cook --- security/yama/yama_lsm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index b6684a074a59be..39944a859ff6be 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -111,6 +111,7 @@ static void report_access(const char *access, struct task_struct *target, /** * yama_relation_cleanup - remove invalid entries from the relation list + * @work: unused * */ static void yama_relation_cleanup(struct work_struct *work) From 3ac36aa7307363b7247ccb6f6a804e11496b2b36 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 29 May 2024 09:42:05 +0200 Subject: [PATCH 0702/1578] x86/mm/numa: Use NUMA_NO_NODE when calling memblock_set_node() memblock_set_node() warns about using MAX_NUMNODES, see e0eec24e2e19 ("memblock: make memblock_set_node() also warn about use of MAX_NUMNODES") for details. Reported-by: Narasimhan V Signed-off-by: Jan Beulich Cc: stable@vger.kernel.org [bp: commit message] Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Mike Rapoport (IBM) Tested-by: Paul E. McKenney Link: https://lore.kernel.org/r/20240603141005.23261-1-bp@kernel.org Link: https://lore.kernel.org/r/abadb736-a239-49e4-ab42-ace7acdd4278@suse.com Signed-off-by: Mike Rapoport (IBM) --- arch/x86/mm/numa.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index ce84ba86e69e9a..6ce10e3c622858 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -493,7 +493,7 @@ static void __init numa_clear_kernel_node_hotplug(void) for_each_reserved_mem_region(mb_region) { int nid = memblock_get_region_node(mb_region); - if (nid != MAX_NUMNODES) + if (nid != NUMA_NO_NODE) node_set(nid, reserved_nodemask); } @@ -614,9 +614,9 @@ static int __init numa_init(int (*init_func)(void)) nodes_clear(node_online_map); memset(&numa_meminfo, 0, sizeof(numa_meminfo)); WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory, - MAX_NUMNODES)); + NUMA_NO_NODE)); WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved, - MAX_NUMNODES)); + NUMA_NO_NODE)); /* In case that parsing SRAT failed. */ WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX)); numa_reset_distance(); From f3a5367c679d31473d3fbb391675055b4792c309 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 6 Jun 2024 11:01:51 +0930 Subject: [PATCH 0703/1578] btrfs: protect folio::private when attaching extent buffer folios MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [BUG] Since v6.8 there are rare kernel crashes reported by various people, the common factor is bad page status error messages like this: BUG: Bad page state in process kswapd0 pfn:d6e840 page: refcount:0 mapcount:0 mapping:000000007512f4f2 index:0x2796c2c7c pfn:0xd6e840 aops:btree_aops ino:1 flags: 0x17ffffe0000008(uptodate|node=0|zone=2|lastcpupid=0x3fffff) page_type: 0xffffffff() raw: 0017ffffe0000008 dead000000000100 dead000000000122 ffff88826d0be4c0 raw: 00000002796c2c7c 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: non-NULL mapping [CAUSE] Commit 09e6cef19c9f ("btrfs: refactor alloc_extent_buffer() to allocate-then-attach method") changes the sequence when allocating a new extent buffer. Previously we always called grab_extent_buffer() under mapping->i_private_lock, to ensure the safety on modification on folio::private (which is a pointer to extent buffer for regular sectorsize). This can lead to the following race: Thread A is trying to allocate an extent buffer at bytenr X, with 4 4K pages, meanwhile thread B is trying to release the page at X + 4K (the second page of the extent buffer at X). Thread A | Thread B -----------------------------------+------------------------------------- | btree_release_folio() | | This is for the page at X + 4K, | | Not page X. | | alloc_extent_buffer() | |- release_extent_buffer() |- filemap_add_folio() for the | | |- atomic_dec_and_test(eb->refs) | page at bytenr X (the first | | | | page). | | | | Which returned -EEXIST. | | | | | | | |- filemap_lock_folio() | | | | Returned the first page locked. | | | | | | | |- grab_extent_buffer() | | | | |- atomic_inc_not_zero() | | | | | Returned false | | | | |- folio_detach_private() | | |- folio_detach_private() for X | |- folio_test_private() | | |- folio_test_private() | Returned true | | | Returned true |- folio_put() | |- folio_put() Now there are two puts on the same folio at folio X, leading to refcount underflow of the folio X, and eventually causing the BUG_ON() on the page->mapping. The condition is not that easy to hit: - The release must be triggered for the middle page of an eb If the release is on the same first page of an eb, page lock would kick in and prevent the race. - folio_detach_private() has a very small race window It's only between folio_test_private() and folio_clear_private(). That's exactly when mapping->i_private_lock is used to prevent such race, and commit 09e6cef19c9f ("btrfs: refactor alloc_extent_buffer() to allocate-then-attach method") screwed that up. At that time, I thought the page lock would kick in as filemap_release_folio() also requires the page to be locked, but forgot the filemap_release_folio() only locks one page, not all pages of an extent buffer. [FIX] Move all the code requiring i_private_lock into attach_eb_folio_to_filemap(), so that everything is done with proper lock protection. Furthermore to prevent future problems, add an extra lockdep_assert_locked() to ensure we're holding the proper lock. To reproducer that is able to hit the race (takes a few minutes with instrumented code inserting delays to alloc_extent_buffer()): #!/bin/sh drop_caches () { while(true); do echo 3 > /proc/sys/vm/drop_caches echo 1 > /proc/sys/vm/compact_memory done } run_tar () { while(true); do for x in `seq 1 80` ; do tar cf /dev/zero /mnt > /dev/null & done wait done } mkfs.btrfs -f -d single -m single /dev/vda mount -o noatime /dev/vda /mnt # create 200,000 files, 1K each ./simoop -n 200000 -E -f 1k /mnt drop_caches & (run_tar) Reported-by: Linus Torvalds Link: https://lore.kernel.org/linux-btrfs/CAHk-=wgt362nGfScVOOii8cgKn2LVVHeOvOA7OBwg1OwbuJQcw@mail.gmail.com/ Reported-by: Mikhail Gavrilov Link: https://lore.kernel.org/lkml/CABXGCsPktcHQOvKTbPaTwegMExije=Gpgci5NW=hqORo-s7diA@mail.gmail.com/ Reported-by: Toralf Förster Link: https://lore.kernel.org/linux-btrfs/e8b3311c-9a75-4903-907f-fc0f7a3fe423@gmx.de/ Reported-by: syzbot+f80b066392366b4af85e@syzkaller.appspotmail.com Fixes: 09e6cef19c9f ("btrfs: refactor alloc_extent_buffer() to allocate-then-attach method") CC: stable@vger.kernel.org # 6.8+ CC: Chris Mason Reviewed-by: Filipe Manana Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 60 +++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 597387e9f04007..f688fab55251ea 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3689,6 +3689,8 @@ static struct extent_buffer *grab_extent_buffer( struct folio *folio = page_folio(page); struct extent_buffer *exists; + lockdep_assert_held(&page->mapping->i_private_lock); + /* * For subpage case, we completely rely on radix tree to ensure we * don't try to insert two ebs for the same bytenr. So here we always @@ -3756,13 +3758,14 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) * The caller needs to free the existing folios and retry using the same order. */ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, + struct btrfs_subpage *prealloc, struct extent_buffer **found_eb_ret) { struct btrfs_fs_info *fs_info = eb->fs_info; struct address_space *mapping = fs_info->btree_inode->i_mapping; const unsigned long index = eb->start >> PAGE_SHIFT; - struct folio *existing_folio; + struct folio *existing_folio = NULL; int ret; ASSERT(found_eb_ret); @@ -3774,12 +3777,14 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, ret = filemap_add_folio(mapping, eb->folios[i], index + i, GFP_NOFS | __GFP_NOFAIL); if (!ret) - return 0; + goto finish; existing_folio = filemap_lock_folio(mapping, index + i); /* The page cache only exists for a very short time, just retry. */ - if (IS_ERR(existing_folio)) + if (IS_ERR(existing_folio)) { + existing_folio = NULL; goto retry; + } /* For now, we should only have single-page folios for btree inode. */ ASSERT(folio_nr_pages(existing_folio) == 1); @@ -3790,14 +3795,13 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, return -EAGAIN; } - if (fs_info->nodesize < PAGE_SIZE) { - /* - * We're going to reuse the existing page, can drop our page - * and subpage structure now. - */ +finish: + spin_lock(&mapping->i_private_lock); + if (existing_folio && fs_info->nodesize < PAGE_SIZE) { + /* We're going to reuse the existing page, can drop our folio now. */ __free_page(folio_page(eb->folios[i], 0)); eb->folios[i] = existing_folio; - } else { + } else if (existing_folio) { struct extent_buffer *existing_eb; existing_eb = grab_extent_buffer(fs_info, @@ -3805,6 +3809,7 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, if (existing_eb) { /* The extent buffer still exists, we can use it directly. */ *found_eb_ret = existing_eb; + spin_unlock(&mapping->i_private_lock); folio_unlock(existing_folio); folio_put(existing_folio); return 1; @@ -3813,6 +3818,22 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, __free_page(folio_page(eb->folios[i], 0)); eb->folios[i] = existing_folio; } + eb->folio_size = folio_size(eb->folios[i]); + eb->folio_shift = folio_shift(eb->folios[i]); + /* Should not fail, as we have preallocated the memory. */ + ret = attach_extent_buffer_folio(eb, eb->folios[i], prealloc); + ASSERT(!ret); + /* + * To inform we have an extra eb under allocation, so that + * detach_extent_buffer_page() won't release the folio private when the + * eb hasn't been inserted into radix tree yet. + * + * The ref will be decreased when the eb releases the page, in + * detach_extent_buffer_page(). Thus needs no special handling in the + * error path. + */ + btrfs_folio_inc_eb_refs(fs_info, eb->folios[i]); + spin_unlock(&mapping->i_private_lock); return 0; } @@ -3824,7 +3845,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, int attached = 0; struct extent_buffer *eb; struct extent_buffer *existing_eb = NULL; - struct address_space *mapping = fs_info->btree_inode->i_mapping; struct btrfs_subpage *prealloc = NULL; u64 lockdep_owner = owner_root; bool page_contig = true; @@ -3890,7 +3910,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, for (int i = 0; i < num_folios; i++) { struct folio *folio; - ret = attach_eb_folio_to_filemap(eb, i, &existing_eb); + ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb); if (ret > 0) { ASSERT(existing_eb); goto out; @@ -3927,24 +3947,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * and free the allocated page. */ folio = eb->folios[i]; - eb->folio_size = folio_size(folio); - eb->folio_shift = folio_shift(folio); - spin_lock(&mapping->i_private_lock); - /* Should not fail, as we have preallocated the memory */ - ret = attach_extent_buffer_folio(eb, folio, prealloc); - ASSERT(!ret); - /* - * To inform we have extra eb under allocation, so that - * detach_extent_buffer_page() won't release the folio private - * when the eb hasn't yet been inserted into radix tree. - * - * The ref will be decreased when the eb released the page, in - * detach_extent_buffer_page(). - * Thus needs no special handling in error path. - */ - btrfs_folio_inc_eb_refs(fs_info, folio); - spin_unlock(&mapping->i_private_lock); - WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len)); /* From bb93148ca831c30248270b4d81d3ccca23faa1bf Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Wed, 15 May 2024 19:22:22 +0300 Subject: [PATCH 0704/1578] MAINTAINERS: update Xe driver maintainers Because I left Intel, I'm removing myself from the list of Xe driver maintainers. Signed-off-by: Oded Gabbay Acked-by: Lucas De Marchi Link: https://patchwork.freedesktop.org/patch/msgid/20240515162222.12958-3-ogabbay@kernel.org Signed-off-by: Lucas De Marchi (cherry picked from commit 8de6625dafd202da733188f632701e9109188035) Signed-off-by: Lucas De Marchi --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index d6c90161c7bfe3..69f553485b9c2d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11035,7 +11035,6 @@ F: include/uapi/drm/i915_drm.h INTEL DRM XE DRIVER (Lunar Lake and newer) M: Lucas De Marchi -M: Oded Gabbay M: Thomas Hellström L: intel-xe@lists.freedesktop.org S: Supported From a9f9b30e1748252d158f78a0c0affdc949671dd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Sun, 2 Jun 2024 21:09:59 +0200 Subject: [PATCH 0705/1578] MAINTAINERS: Update Xe driver maintainers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Rodrigo Vivi as an Xe driver maintainer. v2: - Cc also Lucas De Marchi (Rodrigo vivi) - Remove a blank line in commit the commit message (Lucas De Marchi) Cc: David Airlie Cc: Daniel Vetter Cc: Rodrigo Vivi Cc: Lucas De Marchi Cc: dri-devel@lists.freedesktop.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Thomas Hellström Acked-by: Rodrigo Vivi Acked-by: Lucas De Marchi Acked-by: Jani Nikula Acked-by: Dave Airlie Link: https://patchwork.freedesktop.org/patch/msgid/20240602190959.2981-1-thomas.hellstrom@linux.intel.com Signed-off-by: Lucas De Marchi --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 69f553485b9c2d..c517af00df84d6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11036,6 +11036,7 @@ F: include/uapi/drm/i915_drm.h INTEL DRM XE DRIVER (Lunar Lake and newer) M: Lucas De Marchi M: Thomas Hellström +M: Rodrigo Vivi L: intel-xe@lists.freedesktop.org S: Supported W: https://drm.pages.freedesktop.org/intel-docs/ From 96c965667b9dbbd713acdffa95ebab8c225f8595 Mon Sep 17 00:00:00 2001 From: Richard Acayan Date: Wed, 5 Jun 2024 13:58:09 -0400 Subject: [PATCH 0706/1578] kbuild: explicitly run mksysmap as sed script from link-vmlinux.sh In commit b18b047002b7 ("kbuild: change scripts/mksysmap into sed script"), the mksysmap script was transformed into a sed script, made directly executable with "#!/bin/sed -f". Apparently, the path to sed is different on NixOS. The shebang can't use the env command, otherwise the "sed -f" command would be treated as a single argument. This can be solved with the -S flag, but that is a GNU extension. Explicitly use sed instead of relying on the executable shebang to fix NixOS builds without breaking build environments using Busybox. Fixes: b18b047002b7 ("kbuild: change scripts/mksysmap into sed script") Reported-by: Kent Overstreet Signed-off-by: Richard Acayan Reviewed-by: Nathan Chancellor Tested-by: Dmitry Safonov <0x7f454c46@gmail.com> Signed-off-by: Masahiro Yamada --- scripts/link-vmlinux.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 46ce5d04dbeb10..518c70b8db5078 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -193,7 +193,7 @@ kallsyms_step() mksysmap() { info NM ${2} - ${NM} -n "${1}" | "${srctree}/scripts/mksysmap" > "${2}" + ${NM} -n "${1}" | sed -f "${srctree}/scripts/mksysmap" > "${2}" } sorttable() From 9185afeac2a3dcce8300a5684291a43c2838cfd6 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 7 Jun 2024 03:36:12 +0900 Subject: [PATCH 0707/1578] modpost: do not warn about missing MODULE_DESCRIPTION() for vmlinux.o Building with W=1 incorrectly emits the following warning: WARNING: modpost: missing MODULE_DESCRIPTION() in vmlinux.o This check should apply only to modules. Fixes: 1fffe7a34c89 ("script: modpost: emit a warning when the description is missing") Signed-off-by: Masahiro Yamada Reviewed-by: Vincenzo Palazzo --- scripts/mod/modpost.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 937294ff164fc8..f48d72d22dc2a6 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -1647,10 +1647,11 @@ static void read_symbols(const char *modname) namespace = get_next_modinfo(&info, "import_ns", namespace); } + + if (extra_warn && !get_modinfo(&info, "description")) + warn("missing MODULE_DESCRIPTION() in %s\n", modname); } - if (extra_warn && !get_modinfo(&info, "description")) - warn("missing MODULE_DESCRIPTION() in %s\n", modname); for (sym = info.symtab_start; sym < info.symtab_stop; sym++) { symname = remove_dot(info.strtab + sym->st_name); From e96b2933152fd87b6a41765b2f58b158fde855b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cs=C3=B3k=C3=A1s=2C=20Bence?= Date: Wed, 5 Jun 2024 10:42:51 +0200 Subject: [PATCH 0708/1578] net: sfp: Always call `sfp_sm_mod_remove()` on remove MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the module is in SFP_MOD_ERROR, `sfp_sm_mod_remove()` will not be run. As a consequence, `sfp_hwmon_remove()` is not getting run either, leaving a stale `hwmon` device behind. `sfp_sm_mod_remove()` itself checks `sfp->sm_mod_state` anyways, so this check was not really needed in the first place. Fixes: d2e816c0293f ("net: sfp: handle module remove outside state machine") Signed-off-by: "Csókás, Bence" Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20240605084251.63502-1-csokas.bence@prolan.hu Signed-off-by: Jakub Kicinski --- drivers/net/phy/sfp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c index 3f9cbd797fd67d..a5684ef5884bda 100644 --- a/drivers/net/phy/sfp.c +++ b/drivers/net/phy/sfp.c @@ -2429,8 +2429,7 @@ static void sfp_sm_module(struct sfp *sfp, unsigned int event) /* Handle remove event globally, it resets this state machine */ if (event == SFP_E_REMOVE) { - if (sfp->sm_mod_state > SFP_MOD_PROBE) - sfp_sm_mod_remove(sfp); + sfp_sm_mod_remove(sfp); sfp_sm_mod_next(sfp, SFP_MOD_EMPTY, 0); return; } From 02c418774f76a0a36a6195c9dbf8971eb4130a15 Mon Sep 17 00:00:00 2001 From: Enzo Matsumiya Date: Thu, 6 Jun 2024 13:13:13 -0300 Subject: [PATCH 0709/1578] smb: client: fix deadlock in smb2_find_smb_tcon() Unlock cifs_tcp_ses_lock before calling cifs_put_smb_ses() to avoid such deadlock. Cc: stable@vger.kernel.org Signed-off-by: Enzo Matsumiya Reviewed-by: Shyam Prasad N Reviewed-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/smb2transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c index 02135a6053051e..1476c445cadcf0 100644 --- a/fs/smb/client/smb2transport.c +++ b/fs/smb/client/smb2transport.c @@ -216,8 +216,8 @@ smb2_find_smb_tcon(struct TCP_Server_Info *server, __u64 ses_id, __u32 tid) } tcon = smb2_find_smb_sess_tcon_unlocked(ses, tid); if (!tcon) { - cifs_put_smb_ses(ses); spin_unlock(&cifs_tcp_ses_lock); + cifs_put_smb_ses(ses); return NULL; } spin_unlock(&cifs_tcp_ses_lock); From a88d60903696c01de577558080ec4fc738a70475 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 31 May 2024 15:53:42 +0100 Subject: [PATCH 0710/1578] cifs: Don't advance the I/O iterator before terminating subrequest There's now no need to make sure subreq->io_iter is advanced to match subreq->transferred before calling one of the netfs subrequest termination functions as the check has been removed netfslib and the iterator is reset prior to retrying a subreq. Fixes: 3ee1a1fc3981 ("cifs: Cut over to using netfslib") Signed-off-by: David Howells cc: Steve French cc: Paulo Alcantara cc: Shyam Prasad N cc: Rohith Surabattula cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org Signed-off-by: Steve French --- fs/smb/client/smb2pdu.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 993ac36c3d585a..38a06e8a0f90f9 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4577,8 +4577,6 @@ smb2_readv_callback(struct mid_q_entry *mid) if (rdata->subreq.start < rdata->subreq.rreq->i_size) rdata->result = 0; } - if (rdata->result == 0 || rdata->result == -EAGAIN) - iov_iter_advance(&rdata->subreq.io_iter, rdata->got_bytes); rdata->credits.value = 0; netfs_subreq_terminated(&rdata->subreq, (rdata->result == 0 || rdata->result == -EAGAIN) ? @@ -4789,7 +4787,6 @@ smb2_writev_callback(struct mid_q_entry *mid) wdata->result = -ENOSPC; else wdata->subreq.len = written; - iov_iter_advance(&wdata->subreq.io_iter, written); break; case MID_REQUEST_SUBMITTED: case MID_RETRY_NEEDED: From 64054eb716db52e4246527dc9414377c5bc5b01d Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Thu, 6 Jun 2024 20:23:50 -0700 Subject: [PATCH 0711/1578] gpio: add missing MODULE_DESCRIPTION() macros On x86, make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/gpio/gpio-gw-pld.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/gpio/gpio-mc33880.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/gpio/gpio-pcf857x.o Add the missing invocations of the MODULE_DESCRIPTION() macro, including the one missing in gpio-pl061.c, which is not built for x86. Signed-off-by: Jeff Johnson Link: https://lore.kernel.org/r/20240606-md-drivers-gpio-v1-1-cb42d240ca5c@quicinc.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-gw-pld.c | 1 + drivers/gpio/gpio-mc33880.c | 1 + drivers/gpio/gpio-pcf857x.c | 1 + drivers/gpio/gpio-pl061.c | 1 + 4 files changed, 4 insertions(+) diff --git a/drivers/gpio/gpio-gw-pld.c b/drivers/gpio/gpio-gw-pld.c index 899335da93c78b..7e29a2d8de1aaf 100644 --- a/drivers/gpio/gpio-gw-pld.c +++ b/drivers/gpio/gpio-gw-pld.c @@ -130,5 +130,6 @@ static struct i2c_driver gw_pld_driver = { }; module_i2c_driver(gw_pld_driver); +MODULE_DESCRIPTION("Gateworks I2C PLD GPIO expander"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Linus Walleij "); diff --git a/drivers/gpio/gpio-mc33880.c b/drivers/gpio/gpio-mc33880.c index cd9b16dbe1a970..94f6fefc011bfa 100644 --- a/drivers/gpio/gpio-mc33880.c +++ b/drivers/gpio/gpio-mc33880.c @@ -168,5 +168,6 @@ static void __exit mc33880_exit(void) module_exit(mc33880_exit); MODULE_AUTHOR("Mocean Laboratories "); +MODULE_DESCRIPTION("MC33880 high-side/low-side switch GPIO driver"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/gpio/gpio-pcf857x.c b/drivers/gpio/gpio-pcf857x.c index 53b69abe678714..7c57eaeb0afeba 100644 --- a/drivers/gpio/gpio-pcf857x.c +++ b/drivers/gpio/gpio-pcf857x.c @@ -438,5 +438,6 @@ static void __exit pcf857x_exit(void) } module_exit(pcf857x_exit); +MODULE_DESCRIPTION("Driver for pcf857x, pca857x, and pca967x I2C GPIO expanders"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("David Brownell"); diff --git a/drivers/gpio/gpio-pl061.c b/drivers/gpio/gpio-pl061.c index 9fc1f3dd4190df..a211a02d4b4a63 100644 --- a/drivers/gpio/gpio-pl061.c +++ b/drivers/gpio/gpio-pl061.c @@ -438,4 +438,5 @@ static struct amba_driver pl061_gpio_driver = { }; module_amba_driver(pl061_gpio_driver); +MODULE_DESCRIPTION("Driver for the ARM PrimeCell(tm) General Purpose Input/Output (PL061)"); MODULE_LICENSE("GPL v2"); From 89e1ee118d6f0ee6bd6e80d8fe08839875daa241 Mon Sep 17 00:00:00 2001 From: Andrew Ballance Date: Sun, 2 Jun 2024 03:50:23 -0500 Subject: [PATCH 0712/1578] hid: asus: asus_report_fixup: fix potential read out of bounds syzbot reported a potential read out of bounds in asus_report_fixup. this patch adds checks so that a read out of bounds will not occur Signed-off-by: Andrew Ballance Reported-by: Closes: https://syzkaller.appspot.com/bug?extid=07762f019fd03d01f04c Fixes: 59d2f5b7392e ("HID: asus: fix more n-key report descriptors if n-key quirked") Link: https://lore.kernel.org/r/20240602085023.1720492-1-andrewjballance@gmail.com Signed-off-by: Benjamin Tissoires --- drivers/hid/hid-asus.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/hid/hid-asus.c b/drivers/hid/hid-asus.c index 02de2bf4f7907e..37e6d25593c211 100644 --- a/drivers/hid/hid-asus.c +++ b/drivers/hid/hid-asus.c @@ -1204,8 +1204,8 @@ static __u8 *asus_report_fixup(struct hid_device *hdev, __u8 *rdesc, } /* match many more n-key devices */ - if (drvdata->quirks & QUIRK_ROG_NKEY_KEYBOARD) { - for (int i = 0; i < *rsize + 1; i++) { + if (drvdata->quirks & QUIRK_ROG_NKEY_KEYBOARD && *rsize > 15) { + for (int i = 0; i < *rsize - 15; i++) { /* offset to the count from 0x5a report part always 14 */ if (rdesc[i] == 0x85 && rdesc[i + 1] == 0x5a && rdesc[i + 14] == 0x95 && rdesc[i + 15] == 0x05) { From 0c7dd00de018ff70b3452c424901816e26366a8a Mon Sep 17 00:00:00 2001 From: Aseda Aboagye Date: Tue, 4 Jun 2024 23:10:47 +0000 Subject: [PATCH 0713/1578] input: Add event code for accessibility key HUTRR116 added support for a new usage titled "System Accessibility Binding" which toggles a system-wide bound accessibility UI or command. This commit simply adds a new event code for the usage. Signed-off-by: Aseda Aboagye Acked-by: Dmitry Torokhov Link: https://lore.kernel.org/r/Zl-e97O9nvudco5z@google.com Signed-off-by: Benjamin Tissoires --- drivers/hid/hid-debug.c | 1 + drivers/hid/hid-input.c | 1 + include/uapi/linux/input-event-codes.h | 1 + 3 files changed, 3 insertions(+) diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c index 87a961cae7756b..322ed63a98abe6 100644 --- a/drivers/hid/hid-debug.c +++ b/drivers/hid/hid-debug.c @@ -3366,6 +3366,7 @@ static const char *keys[KEY_MAX + 1] = { [KEY_CAMERA_ACCESS_ENABLE] = "CameraAccessEnable", [KEY_CAMERA_ACCESS_DISABLE] = "CameraAccessDisable", [KEY_CAMERA_ACCESS_TOGGLE] = "CameraAccessToggle", + [KEY_ACCESSIBILITY] = "Accessibility", [KEY_DICTATE] = "Dictate", [KEY_MICMUTE] = "MicrophoneMute", [KEY_BRIGHTNESS_MIN] = "BrightnessMin", diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index e03d300d2bac49..d5a6e89c3086e4 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -836,6 +836,7 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel if ((usage->hid & 0xf0) == 0xa0) { /* SystemControl */ switch (usage->hid & 0xf) { case 0x9: map_key_clear(KEY_MICMUTE); break; + case 0xa: map_key_clear(KEY_ACCESSIBILITY); break; default: goto ignore; } break; diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index 03edf2ccdf6c80..39f11ec676fae9 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -618,6 +618,7 @@ #define KEY_CAMERA_ACCESS_ENABLE 0x24b /* Enables programmatic access to camera devices. (HUTRR72) */ #define KEY_CAMERA_ACCESS_DISABLE 0x24c /* Disables programmatic access to camera devices. (HUTRR72) */ #define KEY_CAMERA_ACCESS_TOGGLE 0x24d /* Toggles the current state of the camera access control. (HUTRR72) */ +#define KEY_ACCESSIBILITY 0x24e /* Toggles the system bound accessibility UI/command (HUTRR116) */ #define KEY_BRIGHTNESS_MIN 0x250 /* Set Brightness to Minimum */ #define KEY_BRIGHTNESS_MAX 0x251 /* Set Brightness to Maximum */ From 22d6d060ac77955291deb43efc2f3f4f9632c6cb Mon Sep 17 00:00:00 2001 From: Aseda Aboagye Date: Tue, 4 Jun 2024 23:16:32 +0000 Subject: [PATCH 0714/1578] input: Add support for "Do Not Disturb" HUTRR94 added support for a new usage titled "System Do Not Disturb" which toggles a system-wide Do Not Disturb setting. This commit simply adds a new event code for the usage. Signed-off-by: Aseda Aboagye Acked-by: Dmitry Torokhov Link: https://lore.kernel.org/r/Zl-gUHE70s7wCAoB@google.com Signed-off-by: Benjamin Tissoires --- drivers/hid/hid-debug.c | 1 + drivers/hid/hid-input.c | 8 ++++++++ include/uapi/linux/input-event-codes.h | 1 + 3 files changed, 10 insertions(+) diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c index 322ed63a98abe6..d5abfe652fb506 100644 --- a/drivers/hid/hid-debug.c +++ b/drivers/hid/hid-debug.c @@ -3367,6 +3367,7 @@ static const char *keys[KEY_MAX + 1] = { [KEY_CAMERA_ACCESS_DISABLE] = "CameraAccessDisable", [KEY_CAMERA_ACCESS_TOGGLE] = "CameraAccessToggle", [KEY_ACCESSIBILITY] = "Accessibility", + [KEY_DO_NOT_DISTURB] = "DoNotDisturb", [KEY_DICTATE] = "Dictate", [KEY_MICMUTE] = "MicrophoneMute", [KEY_BRIGHTNESS_MIN] = "BrightnessMin", diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index d5a6e89c3086e4..8bb16e9b94aa52 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -833,6 +833,14 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel break; } + if ((usage->hid & 0xf0) == 0x90) { /* SystemControl*/ + switch (usage->hid & 0xf) { + case 0xb: map_key_clear(KEY_DO_NOT_DISTURB); break; + default: goto ignore; + } + break; + } + if ((usage->hid & 0xf0) == 0xa0) { /* SystemControl */ switch (usage->hid & 0xf) { case 0x9: map_key_clear(KEY_MICMUTE); break; diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index 39f11ec676fae9..a4206723f50333 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -619,6 +619,7 @@ #define KEY_CAMERA_ACCESS_DISABLE 0x24c /* Disables programmatic access to camera devices. (HUTRR72) */ #define KEY_CAMERA_ACCESS_TOGGLE 0x24d /* Toggles the current state of the camera access control. (HUTRR72) */ #define KEY_ACCESSIBILITY 0x24e /* Toggles the system bound accessibility UI/command (HUTRR116) */ +#define KEY_DO_NOT_DISTURB 0x24f /* Toggles the system-wide "Do Not Disturb" control (HUTRR94)*/ #define KEY_BRIGHTNESS_MIN 0x250 /* Set Brightness to Minimum */ #define KEY_BRIGHTNESS_MAX 0x251 /* Set Brightness to Maximum */ From 8d3ae46c64336b538898af76b4f504dfbe89b886 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 7 May 2024 16:48:15 +0200 Subject: [PATCH 0715/1578] dt-bindings: HID: i2c-hid: add dedicated Ilitek ILI2901 schema The Ilitek ILI2901 touch screen controller was apparently incorrectly added to the Elan eKTH6915 schema simply because it also has a reset gpio and is currently managed by the Elan driver in Linux. The two controllers are not related even if an unfortunate wording in the commit message adding the Ilitek compatible made it sound like they were. Add a dedicated schema for the ILI2901 which does not specify the I2C address (which is likely 0x41 rather than 0x10 as for other Ilitek touch controllers) to avoid cluttering the Elan schema with unrelated devices and to make it easier to find the correct schema when adding further Ilitek controllers. Fixes: d74ac6f60a7e ("dt-bindings: HID: i2c-hid: elan: Introduce Ilitek ili2901") Cc: Zhengqiao Xia Reviewed-by: Krzysztof Kozlowski Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20240507144821.12275-2-johan+linaro@kernel.org Signed-off-by: Benjamin Tissoires --- .../bindings/input/elan,ekth6915.yaml | 1 - .../bindings/input/ilitek,ili2901.yaml | 66 +++++++++++++++++++ 2 files changed, 66 insertions(+), 1 deletion(-) create mode 100644 Documentation/devicetree/bindings/input/ilitek,ili2901.yaml diff --git a/Documentation/devicetree/bindings/input/elan,ekth6915.yaml b/Documentation/devicetree/bindings/input/elan,ekth6915.yaml index dc4ac41f244117..c1fcf8c90c24db 100644 --- a/Documentation/devicetree/bindings/input/elan,ekth6915.yaml +++ b/Documentation/devicetree/bindings/input/elan,ekth6915.yaml @@ -20,7 +20,6 @@ properties: compatible: enum: - elan,ekth6915 - - ilitek,ili2901 reg: const: 0x10 diff --git a/Documentation/devicetree/bindings/input/ilitek,ili2901.yaml b/Documentation/devicetree/bindings/input/ilitek,ili2901.yaml new file mode 100644 index 00000000000000..1abeec768d79cb --- /dev/null +++ b/Documentation/devicetree/bindings/input/ilitek,ili2901.yaml @@ -0,0 +1,66 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/input/ilitek,ili2901.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Ilitek ILI2901 touchscreen controller + +maintainers: + - Jiri Kosina + +description: + Supports the Ilitek ILI2901 touchscreen controller. + This touchscreen controller uses the i2c-hid protocol with a reset GPIO. + +allOf: + - $ref: /schemas/input/touchscreen/touchscreen.yaml# + +properties: + compatible: + enum: + - ilitek,ili2901 + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + panel: true + + reset-gpios: + maxItems: 1 + + vcc33-supply: true + + vccio-supply: true + +required: + - compatible + - reg + - interrupts + - vcc33-supply + +additionalProperties: false + +examples: + - | + #include + #include + + i2c { + #address-cells = <1>; + #size-cells = <0>; + + touchscreen@41 { + compatible = "ilitek,ili2901"; + reg = <0x41>; + + interrupt-parent = <&tlmm>; + interrupts = <9 IRQ_TYPE_LEVEL_LOW>; + + reset-gpios = <&tlmm 8 GPIO_ACTIVE_LOW>; + vcc33-supply = <&pp3300_ts>; + }; + }; From 07fc16fa55522520f5e4b2e35c72650f11079c35 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 7 May 2024 16:48:16 +0200 Subject: [PATCH 0716/1578] dt-bindings: HID: i2c-hid: elan: add Elan eKTH5015M Add a compatible string for the Elan eKTH5015M touch controller. Judging from the current binding and commit bd3cba00dcc6 ("HID: i2c-hid: elan: Add support for Elan eKTH6915 i2c-hid touchscreens"), eKTH5015M appears to be compatible with eKTH6915. Notably the power-on sequence is the same. While at it, drop a redundant label from the example. Reviewed-by: Krzysztof Kozlowski Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20240507144821.12275-3-johan+linaro@kernel.org Signed-off-by: Benjamin Tissoires --- .../devicetree/bindings/input/elan,ekth6915.yaml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/input/elan,ekth6915.yaml b/Documentation/devicetree/bindings/input/elan,ekth6915.yaml index c1fcf8c90c24db..be84f7ed0abc9d 100644 --- a/Documentation/devicetree/bindings/input/elan,ekth6915.yaml +++ b/Documentation/devicetree/bindings/input/elan,ekth6915.yaml @@ -18,8 +18,12 @@ allOf: properties: compatible: - enum: - - elan,ekth6915 + oneOf: + - items: + - enum: + - elan,ekth5015m + - const: elan,ekth6915 + - const: elan,ekth6915 reg: const: 0x10 @@ -57,8 +61,8 @@ examples: #address-cells = <1>; #size-cells = <0>; - ap_ts: touchscreen@10 { - compatible = "elan,ekth6915"; + touchscreen@10 { + compatible = "elan,ekth5015m", "elan,ekth6915"; reg = <0x10>; interrupt-parent = <&tlmm>; From e538d4b85b8f1e3534dfbb42c2273f18bbb59d6e Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 7 May 2024 16:48:17 +0200 Subject: [PATCH 0717/1578] dt-bindings: HID: i2c-hid: elan: add 'no-reset-on-power-off' property When the power supply is shared with other peripherals the reset line can be wired in such a way that it can remain deasserted regardless of whether the supply is on or not. This is important as it can be used to avoid holding the controller in reset for extended periods of time when it remains powered, something which can lead to increased power consumption. Leaving reset deasserted also avoids leaking current through the reset circuitry pull-up resistors. Add a new 'no-reset-on-power-off' devicetree property which can be used by the OS to determine when reset needs to be asserted on power down. Note that this property can also be used when the supply cannot be turned off by the OS at all. Signed-off-by: Johan Hovold Reviewed-by: Krzysztof Kozlowski Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20240507144821.12275-4-johan+linaro@kernel.org Signed-off-by: Benjamin Tissoires --- Documentation/devicetree/bindings/input/elan,ekth6915.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/devicetree/bindings/input/elan,ekth6915.yaml b/Documentation/devicetree/bindings/input/elan,ekth6915.yaml index be84f7ed0abc9d..a62916d07a08c7 100644 --- a/Documentation/devicetree/bindings/input/elan,ekth6915.yaml +++ b/Documentation/devicetree/bindings/input/elan,ekth6915.yaml @@ -36,6 +36,12 @@ properties: reset-gpios: description: Reset GPIO; not all touchscreens using eKTH6915 hook this up. + no-reset-on-power-off: + type: boolean + description: + Reset line is wired so that it can (and should) be left deasserted when + the power supply is off. + vcc33-supply: description: The 3.3V supply to the touchscreen. From 0eafc58f2194dbd01d4be40f99a697681171995b Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 7 May 2024 16:48:18 +0200 Subject: [PATCH 0718/1578] HID: i2c-hid: elan: fix reset suspend current leakage The Elan eKTH5015M touch controller found on the Lenovo ThinkPad X13s shares the VCC33 supply with other peripherals that may remain powered during suspend (e.g. when enabled as wakeup sources). The reset line is also wired so that it can be left deasserted when the supply is off. This is important as it avoids holding the controller in reset for extended periods of time when it remains powered, which can lead to increased power consumption, and also avoids leaking current through the X13s reset circuitry during suspend (and after driver unbind). Use the new 'no-reset-on-power-off' devicetree property to determine when reset needs to be asserted on power down. Notably this also avoids wasting power on machine variants without a touchscreen for which the driver would otherwise exit probe with reset asserted. Fixes: bd3cba00dcc6 ("HID: i2c-hid: elan: Add support for Elan eKTH6915 i2c-hid touchscreens") Cc: # 6.0 Cc: Douglas Anderson Tested-by: Steev Klimaszewski Signed-off-by: Johan Hovold Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20240507144821.12275-5-johan+linaro@kernel.org Signed-off-by: Benjamin Tissoires --- drivers/hid/i2c-hid/i2c-hid-of-elan.c | 59 +++++++++++++++++++++------ 1 file changed, 47 insertions(+), 12 deletions(-) diff --git a/drivers/hid/i2c-hid/i2c-hid-of-elan.c b/drivers/hid/i2c-hid/i2c-hid-of-elan.c index 5b91fb106cfc3c..091e37933225ad 100644 --- a/drivers/hid/i2c-hid/i2c-hid-of-elan.c +++ b/drivers/hid/i2c-hid/i2c-hid-of-elan.c @@ -31,6 +31,7 @@ struct i2c_hid_of_elan { struct regulator *vcc33; struct regulator *vccio; struct gpio_desc *reset_gpio; + bool no_reset_on_power_off; const struct elan_i2c_hid_chip_data *chip_data; }; @@ -40,17 +41,17 @@ static int elan_i2c_hid_power_up(struct i2chid_ops *ops) container_of(ops, struct i2c_hid_of_elan, ops); int ret; + gpiod_set_value_cansleep(ihid_elan->reset_gpio, 1); + if (ihid_elan->vcc33) { ret = regulator_enable(ihid_elan->vcc33); if (ret) - return ret; + goto err_deassert_reset; } ret = regulator_enable(ihid_elan->vccio); - if (ret) { - regulator_disable(ihid_elan->vcc33); - return ret; - } + if (ret) + goto err_disable_vcc33; if (ihid_elan->chip_data->post_power_delay_ms) msleep(ihid_elan->chip_data->post_power_delay_ms); @@ -60,6 +61,15 @@ static int elan_i2c_hid_power_up(struct i2chid_ops *ops) msleep(ihid_elan->chip_data->post_gpio_reset_on_delay_ms); return 0; + +err_disable_vcc33: + if (ihid_elan->vcc33) + regulator_disable(ihid_elan->vcc33); +err_deassert_reset: + if (ihid_elan->no_reset_on_power_off) + gpiod_set_value_cansleep(ihid_elan->reset_gpio, 0); + + return ret; } static void elan_i2c_hid_power_down(struct i2chid_ops *ops) @@ -67,7 +77,14 @@ static void elan_i2c_hid_power_down(struct i2chid_ops *ops) struct i2c_hid_of_elan *ihid_elan = container_of(ops, struct i2c_hid_of_elan, ops); - gpiod_set_value_cansleep(ihid_elan->reset_gpio, 1); + /* + * Do not assert reset when the hardware allows for it to remain + * deasserted regardless of the state of the (shared) power supply to + * avoid wasting power when the supply is left on. + */ + if (!ihid_elan->no_reset_on_power_off) + gpiod_set_value_cansleep(ihid_elan->reset_gpio, 1); + if (ihid_elan->chip_data->post_gpio_reset_off_delay_ms) msleep(ihid_elan->chip_data->post_gpio_reset_off_delay_ms); @@ -79,6 +96,7 @@ static void elan_i2c_hid_power_down(struct i2chid_ops *ops) static int i2c_hid_of_elan_probe(struct i2c_client *client) { struct i2c_hid_of_elan *ihid_elan; + int ret; ihid_elan = devm_kzalloc(&client->dev, sizeof(*ihid_elan), GFP_KERNEL); if (!ihid_elan) @@ -93,21 +111,38 @@ static int i2c_hid_of_elan_probe(struct i2c_client *client) if (IS_ERR(ihid_elan->reset_gpio)) return PTR_ERR(ihid_elan->reset_gpio); + ihid_elan->no_reset_on_power_off = of_property_read_bool(client->dev.of_node, + "no-reset-on-power-off"); + ihid_elan->vccio = devm_regulator_get(&client->dev, "vccio"); - if (IS_ERR(ihid_elan->vccio)) - return PTR_ERR(ihid_elan->vccio); + if (IS_ERR(ihid_elan->vccio)) { + ret = PTR_ERR(ihid_elan->vccio); + goto err_deassert_reset; + } ihid_elan->chip_data = device_get_match_data(&client->dev); if (ihid_elan->chip_data->main_supply_name) { ihid_elan->vcc33 = devm_regulator_get(&client->dev, ihid_elan->chip_data->main_supply_name); - if (IS_ERR(ihid_elan->vcc33)) - return PTR_ERR(ihid_elan->vcc33); + if (IS_ERR(ihid_elan->vcc33)) { + ret = PTR_ERR(ihid_elan->vcc33); + goto err_deassert_reset; + } } - return i2c_hid_core_probe(client, &ihid_elan->ops, - ihid_elan->chip_data->hid_descriptor_address, 0); + ret = i2c_hid_core_probe(client, &ihid_elan->ops, + ihid_elan->chip_data->hid_descriptor_address, 0); + if (ret) + goto err_deassert_reset; + + return 0; + +err_deassert_reset: + if (ihid_elan->no_reset_on_power_off) + gpiod_set_value_cansleep(ihid_elan->reset_gpio, 0); + + return ret; } static const struct elan_i2c_hid_chip_data elan_ekth6915_chip_data = { From b472b996a43404a912c5cb4f27050022fdbce10c Mon Sep 17 00:00:00 2001 From: Udit Kumar Date: Fri, 31 May 2024 22:27:25 +0530 Subject: [PATCH 0719/1578] dt-bindings: net: dp8386x: Add MIT license along with GPL-2.0 Modify license to include dual licensing as GPL-2.0-only OR MIT license for TI specific phy header files. This allows for Linux kernel files to be used in other Operating System ecosystems such as Zephyr or FreeBSD. While at this, update the GPL-2.0 to be GPL-2.0-only to be in sync with latest SPDX conventions (GPL-2.0 is deprecated). While at this, update the TI copyright year to sync with current year to indicate license change. Cc: Thomas Gleixner Cc: Trent Piepho Cc: Wadim Egorov Cc: Kip Broadhurst Signed-off-by: Udit Kumar Acked-by: Wadim Egorov Acked-by: Rob Herring Signed-off-by: David S. Miller --- include/dt-bindings/net/ti-dp83867.h | 4 ++-- include/dt-bindings/net/ti-dp83869.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/dt-bindings/net/ti-dp83867.h b/include/dt-bindings/net/ti-dp83867.h index 6fc4b445d3a143..b8a4f3ff4a3b75 100644 --- a/include/dt-bindings/net/ti-dp83867.h +++ b/include/dt-bindings/net/ti-dp83867.h @@ -1,10 +1,10 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +/* SPDX-License-Identifier: GPL-2.0-only OR MIT */ /* * Device Tree constants for the Texas Instruments DP83867 PHY * * Author: Dan Murphy * - * Copyright: (C) 2015 Texas Instruments, Inc. + * Copyright (C) 2015-2024 Texas Instruments Incorporated - https://www.ti.com/ */ #ifndef _DT_BINDINGS_TI_DP83867_H diff --git a/include/dt-bindings/net/ti-dp83869.h b/include/dt-bindings/net/ti-dp83869.h index 218b1a64e9750a..917114aad7d0ea 100644 --- a/include/dt-bindings/net/ti-dp83869.h +++ b/include/dt-bindings/net/ti-dp83869.h @@ -1,10 +1,10 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +/* SPDX-License-Identifier: GPL-2.0-only OR MIT */ /* * Device Tree constants for the Texas Instruments DP83869 PHY * * Author: Dan Murphy * - * Copyright: (C) 2019 Texas Instruments, Inc. + * Copyright (C) 2015-2024 Texas Instruments Incorporated - https://www.ti.com/ */ #ifndef _DT_BINDINGS_TI_DP83869_H From 12cda920212a49fa22d9e8b9492ac4ea013310a4 Mon Sep 17 00:00:00 2001 From: Yonglong Liu Date: Wed, 5 Jun 2024 15:20:57 +0800 Subject: [PATCH 0720/1578] net: hns3: fix kernel crash problem in concurrent scenario When link status change, the nic driver need to notify the roce driver to handle this event, but at this time, the roce driver may uninit, then cause kernel crash. To fix the problem, when link status change, need to check whether the roce registered, and when uninit, need to wait link update finish. Fixes: 45e92b7e4e27 ("net: hns3: add calling roce callback function when link status change") Signed-off-by: Yonglong Liu Signed-off-by: Jijie Shao Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- .../hisilicon/hns3/hns3pf/hclge_main.c | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 43cc6ee4d87db0..82574ce0194fba 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -3086,9 +3086,7 @@ static void hclge_push_link_status(struct hclge_dev *hdev) static void hclge_update_link_status(struct hclge_dev *hdev) { - struct hnae3_handle *rhandle = &hdev->vport[0].roce; struct hnae3_handle *handle = &hdev->vport[0].nic; - struct hnae3_client *rclient = hdev->roce_client; struct hnae3_client *client = hdev->nic_client; int state; int ret; @@ -3112,8 +3110,15 @@ static void hclge_update_link_status(struct hclge_dev *hdev) client->ops->link_status_change(handle, state); hclge_config_mac_tnl_int(hdev, state); - if (rclient && rclient->ops->link_status_change) - rclient->ops->link_status_change(rhandle, state); + + if (test_bit(HCLGE_STATE_ROCE_REGISTERED, &hdev->state)) { + struct hnae3_handle *rhandle = &hdev->vport[0].roce; + struct hnae3_client *rclient = hdev->roce_client; + + if (rclient && rclient->ops->link_status_change) + rclient->ops->link_status_change(rhandle, + state); + } hclge_push_link_status(hdev); } @@ -11319,6 +11324,12 @@ static int hclge_init_client_instance(struct hnae3_client *client, return ret; } +static bool hclge_uninit_need_wait(struct hclge_dev *hdev) +{ + return test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state) || + test_bit(HCLGE_STATE_LINK_UPDATING, &hdev->state); +} + static void hclge_uninit_client_instance(struct hnae3_client *client, struct hnae3_ae_dev *ae_dev) { @@ -11327,7 +11338,7 @@ static void hclge_uninit_client_instance(struct hnae3_client *client, if (hdev->roce_client) { clear_bit(HCLGE_STATE_ROCE_REGISTERED, &hdev->state); - while (test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state)) + while (hclge_uninit_need_wait(hdev)) msleep(HCLGE_WAIT_RESET_DONE); hdev->roce_client->ops->uninit_instance(&vport->roce, 0); From 968fde83841a8c23558dfbd0a0c69d636db52b55 Mon Sep 17 00:00:00 2001 From: Jie Wang Date: Wed, 5 Jun 2024 15:20:58 +0800 Subject: [PATCH 0721/1578] net: hns3: add cond_resched() to hns3 ring buffer init process Currently hns3 ring buffer init process would hold cpu too long with big Tx/Rx ring depth. This could cause soft lockup. So this patch adds cond_resched() to the process. Then cpu can break to run other tasks instead of busy looping. Fixes: a723fb8efe29 ("net: hns3: refine for set ring parameters") Signed-off-by: Jie Wang Signed-off-by: Jijie Shao Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 4 ++++ drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 2 ++ 2 files changed, 6 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index ff71fb1eced949..a5fc0209d628eb 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -3535,6 +3535,9 @@ static int hns3_alloc_ring_buffers(struct hns3_enet_ring *ring) ret = hns3_alloc_and_attach_buffer(ring, i); if (ret) goto out_buffer_fail; + + if (!(i % HNS3_RESCHED_BD_NUM)) + cond_resched(); } return 0; @@ -5107,6 +5110,7 @@ int hns3_init_all_ring(struct hns3_nic_priv *priv) } u64_stats_init(&priv->ring[i].syncp); + cond_resched(); } return 0; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index acd756b0c7c9a4..d36c4ed16d8dd2 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -214,6 +214,8 @@ enum hns3_nic_state { #define HNS3_CQ_MODE_EQE 1U #define HNS3_CQ_MODE_CQE 0U +#define HNS3_RESCHED_BD_NUM 1024 + enum hns3_pkt_l2t_type { HNS3_L2_TYPE_UNICAST, HNS3_L2_TYPE_MULTICAST, From 1af89dedc8a58006d8e385b1e0d2cd24df8a3b69 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 6 Jun 2024 20:27:19 +0200 Subject: [PATCH 0722/1578] thermal: core: Do not fail cdev registration because of invalid initial state It is reported that commit 31a0fa0019b0 ("thermal/debugfs: Pass cooling device state to thermal_debug_cdev_add()") causes the ACPI fan driver to fail probing on some systems which turns out to be due to the _FST control method returning an invalid value until _FSL is first evaluated for the given fan. If this happens, the .get_cur_state() cooling device callback returns an error and __thermal_cooling_device_register() fails as uses that callback after commit 31a0fa0019b0. Arguably, _FST should not return an invalid value even if it is evaluated before _FSL, so this may be regarded as a platform firmware issue, but at the same time it is not a good enough reason for failing the cooling device registration where the initial cooling device state is only needed to initialize a thermal debug facility. Accordingly, modify __thermal_cooling_device_register() to avoid calling thermal_debug_cdev_add() instead of returning an error if the initial .get_cur_state() callback invocation fails. Fixes: 31a0fa0019b0 ("thermal/debugfs: Pass cooling device state to thermal_debug_cdev_add()") Closes: https://lore.kernel.org/linux-acpi/20240530153727.843378-1-laura.nao@collabora.com Reported-by: Laura Nao Signed-off-by: Rafael J. Wysocki Acked-by: Daniel Lezcano Tested-by: Laura Nao --- drivers/thermal/thermal_core.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 30567b4994551b..d70e76dd3c9438 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -999,9 +999,17 @@ __thermal_cooling_device_register(struct device_node *np, if (ret) goto out_cdev_type; + /* + * The cooling device's current state is only needed for debug + * initialization below, so a failure to get it does not cause + * the entire cooling device initialization to fail. However, + * the debug will not work for the device if its initial state + * cannot be determined and drivers are responsible for ensuring + * that this will not happen. + */ ret = cdev->ops->get_cur_state(cdev, ¤t_state); if (ret) - goto out_cdev_type; + current_state = ULONG_MAX; thermal_cooling_device_setup_sysfs(cdev); @@ -1016,7 +1024,8 @@ __thermal_cooling_device_register(struct device_node *np, return ERR_PTR(ret); } - thermal_debug_cdev_add(cdev, current_state); + if (current_state <= cdev->max_state) + thermal_debug_cdev_add(cdev, current_state); /* Add 'this' new cdev to the global cdev list */ mutex_lock(&thermal_list_lock); From 7f18bd49cb6b6a3ab6d860fefccdc94f2a247db0 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 6 Jun 2024 20:27:30 +0200 Subject: [PATCH 0723/1578] thermal: ACPI: Invalidate trip points with temperature of 0 or below It is reported that commit 950210887670 ("thermal: core: Drop trips_disabled bitmask") causes the maximum frequency of CPUs to drop further down with every system sleep-wake cycle on Intel Core i7-4710HQ. This turns out to be due to a trip point whose temperature is equal to 0 degrees Celsius which is acted on every time the system wakes from sleep. Before commit 950210887670 this trip point would be disabled wia the trips_disabled bitmask, but now it is treated as a valid one. Since ACPI thermal control is generally about protection against overheating, trip points with temperature of 0 centigrade or below are not particularly useful there, so initialize them all as invalid which fixes the problem at hand. Fixes: 950210887670 ("thermal: core: Drop trips_disabled bitmask") Closes: https://lore.kernel.org/linux-pm/3f71747b-f852-4ee0-b384-cf46b2aefa3f@gmx.com Reported-by: Tibor Billes Tested-by: Tibor Billes Cc: 6.7+ # 6.7+ Signed-off-by: Rafael J. Wysocki --- drivers/acpi/thermal.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index d67881b50bca28..a0cfc857fb553e 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -168,11 +168,17 @@ static int acpi_thermal_get_polling_frequency(struct acpi_thermal *tz) static int acpi_thermal_temp(struct acpi_thermal *tz, int temp_deci_k) { + int temp; + if (temp_deci_k == THERMAL_TEMP_INVALID) return THERMAL_TEMP_INVALID; - return deci_kelvin_to_millicelsius_with_offset(temp_deci_k, + temp = deci_kelvin_to_millicelsius_with_offset(temp_deci_k, tz->kelvin_offset); + if (temp <= 0) + return THERMAL_TEMP_INVALID; + + return temp; } static bool acpi_thermal_trip_valid(struct acpi_thermal_trip *acpi_trip) From c44711b78608c98a3e6b49ce91678cd0917d5349 Mon Sep 17 00:00:00 2001 From: Aleksandr Mishin Date: Wed, 5 Jun 2024 13:11:35 +0300 Subject: [PATCH 0724/1578] liquidio: Adjust a NULL pointer handling path in lio_vf_rep_copy_packet In lio_vf_rep_copy_packet() pg_info->page is compared to a NULL value, but then it is unconditionally passed to skb_add_rx_frag() which looks strange and could lead to null pointer dereference. lio_vf_rep_copy_packet() call trace looks like: octeon_droq_process_packets octeon_droq_fast_process_packets octeon_droq_dispatch_pkt octeon_create_recv_info ...search in the dispatch_list... ->disp_fn(rdisp->rinfo, ...) lio_vf_rep_pkt_recv(struct octeon_recv_info *recv_info, ...) In this path there is no code which sets pg_info->page to NULL. So this check looks unneeded and doesn't solve potential problem. But I guess the author had reason to add a check and I have no such card and can't do real test. In addition, the code in the function liquidio_push_packet() in liquidio/lio_core.c does exactly the same. Based on this, I consider the most acceptable compromise solution to adjust this issue by moving skb_add_rx_frag() into conditional scope. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 1f233f327913 ("liquidio: switchdev support for LiquidIO NIC") Signed-off-by: Aleksandr Mishin Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c index 96c6ea12279f8b..989b4ddae3426b 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c @@ -272,13 +272,12 @@ lio_vf_rep_copy_packet(struct octeon_device *oct, pg_info->page_offset; memcpy(skb->data, va, MIN_SKB_SIZE); skb_put(skb, MIN_SKB_SIZE); + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, + pg_info->page, + pg_info->page_offset + MIN_SKB_SIZE, + len - MIN_SKB_SIZE, + LIO_RXBUFFER_SZ); } - - skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, - pg_info->page, - pg_info->page_offset + MIN_SKB_SIZE, - len - MIN_SKB_SIZE, - LIO_RXBUFFER_SZ); } else { struct octeon_skb_page_info *pg_info = ((struct octeon_skb_page_info *)(skb->cb)); From 60980cf5b8c8cc9182e5e9dbb62cbfd345c54074 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Fri, 7 Jun 2024 11:34:23 +0100 Subject: [PATCH 0725/1578] spi: cs42l43: Drop cs35l56 SPI speed down to 11MHz Some internals of the cs35l56 can only support SPI speeds of up to 11MHz. Whilst some use-cases could support higher rates, keep things simple by dropping the SPI speed down to this avoid any potential issues. Signed-off-by: Charles Keepax Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240607103423.4159834-1-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- drivers/spi/spi-cs42l43.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-cs42l43.c b/drivers/spi/spi-cs42l43.c index 902a0734cc361e..8b618ef0f7111f 100644 --- a/drivers/spi/spi-cs42l43.c +++ b/drivers/spi/spi-cs42l43.c @@ -54,7 +54,7 @@ static const struct software_node ampr = { static struct spi_board_info ampl_info = { .modalias = "cs35l56", - .max_speed_hz = 20 * HZ_PER_MHZ, + .max_speed_hz = 11 * HZ_PER_MHZ, .chip_select = 0, .mode = SPI_MODE_0, .swnode = &l, @@ -62,7 +62,7 @@ static struct spi_board_info ampl_info = { static struct spi_board_info ampr_info = { .modalias = "cs35l56", - .max_speed_hz = 20 * HZ_PER_MHZ, + .max_speed_hz = 11 * HZ_PER_MHZ, .chip_select = 1, .mode = SPI_MODE_0, .swnode = &r, From 0c76053e3fec801e86aca73f80072b3da4e72849 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Thu, 16 May 2024 11:33:43 +0300 Subject: [PATCH 0726/1578] drm: have config DRM_WERROR depend on !WERROR If WERROR is already enabled, there's no point in enabling DRM_WERROR or asking users about it. Reported-by: Linus Torvalds Closes: https://lore.kernel.org/r/CAHk-=whxT8D_0j=bjtrvj-O=VEOjn6GW8GK4j2V+BiDUntZKAQ@mail.gmail.com Fixes: f89632a9e5fa ("drm: Add CONFIG_DRM_WERROR") Reviewed-by: Javier Martinez Canillas Link: https://patchwork.freedesktop.org/patch/msgid/20240516083343.1375687-1-jani.nikula@intel.com Signed-off-by: Jani Nikula --- drivers/gpu/drm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index 026444eeb5c60f..d0aa277fc3bff8 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -450,6 +450,7 @@ config DRM_PRIVACY_SCREEN config DRM_WERROR bool "Compile the drm subsystem with warnings as errors" depends on DRM && EXPERT + depends on !WERROR default n help A kernel build should not cause any compiler warnings, and this From a3a5a37efba11b7cf1a86abe7bccfbcdb521764e Mon Sep 17 00:00:00 2001 From: Louis Dalibard Date: Fri, 7 Jun 2024 16:53:43 +0200 Subject: [PATCH 0727/1578] HID: Ignore battery for ELAN touchscreens 2F2C and 4116 At least ASUS Zenbook 14 (2023) and ASUS Zenbook 14 Pro (2023) are affected. The touchscreen reports a battery status of 0% and jumps to 1% when a stylus is used. The device ID was added and the battery ignore quirk was enabled for it. [jkosina@suse.com: reformatted changelog a bit] Signed-off-by: Louis Dalibard Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 2 ++ drivers/hid/hid-input.c | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 61d2a21affa264..72d56ee7ce1b98 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -423,6 +423,8 @@ #define I2C_DEVICE_ID_HP_SPECTRE_X360_13_AW0020NG 0x29DF #define I2C_DEVICE_ID_ASUS_TP420IA_TOUCHSCREEN 0x2BC8 #define I2C_DEVICE_ID_ASUS_GV301RA_TOUCHSCREEN 0x2C82 +#define I2C_DEVICE_ID_ASUS_UX3402_TOUCHSCREEN 0x2F2C +#define I2C_DEVICE_ID_ASUS_UX6404_TOUCHSCREEN 0x4116 #define USB_DEVICE_ID_ASUS_UX550VE_TOUCHSCREEN 0x2544 #define USB_DEVICE_ID_ASUS_UX550_TOUCHSCREEN 0x2706 #define I2C_DEVICE_ID_SURFACE_GO_TOUCHSCREEN 0x261A diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index 8bb16e9b94aa52..c9094a4f281e90 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -377,6 +377,10 @@ static const struct hid_device_id hid_battery_quirks[] = { HID_BATTERY_QUIRK_IGNORE }, { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, I2C_DEVICE_ID_ASUS_GV301RA_TOUCHSCREEN), HID_BATTERY_QUIRK_IGNORE }, + { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, I2C_DEVICE_ID_ASUS_UX3402_TOUCHSCREEN), + HID_BATTERY_QUIRK_IGNORE }, + { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, I2C_DEVICE_ID_ASUS_UX6404_TOUCHSCREEN), + HID_BATTERY_QUIRK_IGNORE }, { HID_USB_DEVICE(USB_VENDOR_ID_ELAN, USB_DEVICE_ID_ASUS_UX550_TOUCHSCREEN), HID_BATTERY_QUIRK_IGNORE }, { HID_USB_DEVICE(USB_VENDOR_ID_ELAN, USB_DEVICE_ID_ASUS_UX550VE_TOUCHSCREEN), From 231035f18d6b80e5c28732a20872398116a54ecd Mon Sep 17 00:00:00 2001 From: Wenchao Hao Date: Thu, 6 Jun 2024 16:52:15 +0800 Subject: [PATCH 0728/1578] workqueue: Increase worker desc's length to 32 Commit 31c89007285d ("workqueue.c: Increase workqueue name length") increased WQ_NAME_LEN from 24 to 32, but forget to increase WORKER_DESC_LEN, which would cause truncation when setting kworker's desc from workqueue_struct's name, process_one_work() for example. Fixes: 31c89007285d ("workqueue.c: Increase workqueue name length") Signed-off-by: Wenchao Hao CC: Audra Mitchell Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index fb399389453657..d9968bfc8eacc7 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -95,7 +95,7 @@ enum wq_misc_consts { WORK_BUSY_RUNNING = 1 << 1, /* maximum string length for set_worker_desc() */ - WORKER_DESC_LEN = 24, + WORKER_DESC_LEN = 32, }; /* Convenience constants - of type 'unsigned long', not 'enum'! */ From 462237d2d93fc9e9221d1cf9f773954d27da83c0 Mon Sep 17 00:00:00 2001 From: Louis Chauvet Date: Fri, 7 Jun 2024 10:34:38 +0200 Subject: [PATCH 0729/1578] dmaengine: xilinx: xdma: Fix data synchronisation in xdma_channel_isr() Requests the vchan lock before using xdma->stop_request. Fixes: 6a40fb824596 ("dmaengine: xilinx: xdma: Fix synchronization issue") Cc: stable@vger.kernel.org Signed-off-by: Louis Chauvet Link: https://lore.kernel.org/r/20240607-xdma-fixes-v2-1-0282319ce345@bootlin.com Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xdma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/xilinx/xdma.c b/drivers/dma/xilinx/xdma.c index e143a733081610..718842fdaf98ed 100644 --- a/drivers/dma/xilinx/xdma.c +++ b/drivers/dma/xilinx/xdma.c @@ -885,11 +885,11 @@ static irqreturn_t xdma_channel_isr(int irq, void *dev_id) u32 st; bool repeat_tx; + spin_lock(&xchan->vchan.lock); + if (xchan->stop_requested) complete(&xchan->last_interrupt); - spin_lock(&xchan->vchan.lock); - /* get submitted request */ vd = vchan_next_desc(&xchan->vchan); if (!vd) From e79a10652bbd320649da705ca1ea0c04351af403 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 9 May 2024 13:45:02 -0500 Subject: [PATCH 0730/1578] ACPI: x86: Force StorageD3Enable on more products A Rembrandt-based HP thin client is reported to have problems where the NVME disk isn't present after resume from s2idle. This is because the NVME disk wasn't put into D3 at suspend, and that happened because the StorageD3Enable _DSD was missing in the BIOS. As AMD's architecture requires that the NVME is in D3 for s2idle, adjust the criteria for force_storage_d3 to match *all* Zen SoCs when the FADT advertises low power idle support. This will ensure that any future products with this BIOS deficiency don't need to be added to the allow list of overrides. Cc: All applicable Signed-off-by: Mario Limonciello Acked-by: Hans de Goede Signed-off-by: Rafael J. Wysocki --- drivers/acpi/x86/utils.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/drivers/acpi/x86/utils.c b/drivers/acpi/x86/utils.c index 7dca73417e2bf5..2fe0934dcd6408 100644 --- a/drivers/acpi/x86/utils.c +++ b/drivers/acpi/x86/utils.c @@ -206,16 +206,16 @@ bool acpi_device_override_status(struct acpi_device *adev, unsigned long long *s } /* - * AMD systems from Renoir and Lucienne *require* that the NVME controller + * AMD systems from Renoir onwards *require* that the NVME controller * is put into D3 over a Modern Standby / suspend-to-idle cycle. * * This is "typically" accomplished using the `StorageD3Enable` * property in the _DSD that is checked via the `acpi_storage_d3` function - * but this property was introduced after many of these systems launched - * and most OEM systems don't have it in their BIOS. + * but some OEM systems still don't have it in their BIOS. * * The Microsoft documentation for StorageD3Enable mentioned that Windows has - * a hardcoded allowlist for D3 support, which was used for these platforms. + * a hardcoded allowlist for D3 support as well as a registry key to override + * the BIOS, which has been used for these cases. * * This allows quirking on Linux in a similar fashion. * @@ -228,19 +228,15 @@ bool acpi_device_override_status(struct acpi_device *adev, unsigned long long *s * https://bugzilla.kernel.org/show_bug.cgi?id=216773 * https://bugzilla.kernel.org/show_bug.cgi?id=217003 * 2) On at least one HP system StorageD3Enable is missing on the second NVME - disk in the system. + * disk in the system. + * 3) On at least one HP Rembrandt system StorageD3Enable is missing on the only + * NVME device. */ -static const struct x86_cpu_id storage_d3_cpu_ids[] = { - X86_MATCH_VENDOR_FAM_MODEL(AMD, 23, 24, NULL), /* Picasso */ - X86_MATCH_VENDOR_FAM_MODEL(AMD, 23, 96, NULL), /* Renoir */ - X86_MATCH_VENDOR_FAM_MODEL(AMD, 23, 104, NULL), /* Lucienne */ - X86_MATCH_VENDOR_FAM_MODEL(AMD, 25, 80, NULL), /* Cezanne */ - {} -}; - bool force_storage_d3(void) { - return x86_match_cpu(storage_d3_cpu_ids); + if (!cpu_feature_enabled(X86_FEATURE_ZEN)) + return false; + return acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0; } /* From 9f0fad0382124e7e23b3c730fa78818c22c89c0a Mon Sep 17 00:00:00 2001 From: John Keeping Date: Thu, 23 May 2024 09:56:24 +0100 Subject: [PATCH 0731/1578] Input: ili210x - fix ili251x_read_touch_data() return value The caller of this function treats all non-zero values as an error, so the return value of i2c_master_recv() cannot be returned directly. This fixes touch reporting when there are more than 6 active touches. Fixes: ef536abd3afd1 ("Input: ili210x - define and use chip operations structure") Signed-off-by: John Keeping Link: https://lore.kernel.org/r/20240523085624.2295988-1-jkeeping@inmusicbrands.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/ili210x.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/input/touchscreen/ili210x.c b/drivers/input/touchscreen/ili210x.c index 31ffdc2a93f35a..79bdb2b109496f 100644 --- a/drivers/input/touchscreen/ili210x.c +++ b/drivers/input/touchscreen/ili210x.c @@ -261,8 +261,8 @@ static int ili251x_read_touch_data(struct i2c_client *client, u8 *data) if (!error && data[0] == 2) { error = i2c_master_recv(client, data + ILI251X_DATA_SIZE1, ILI251X_DATA_SIZE2); - if (error >= 0 && error != ILI251X_DATA_SIZE2) - error = -EIO; + if (error >= 0) + error = error == ILI251X_DATA_SIZE2 ? 0 : -EIO; } return error; From cee77149ebe9cd971ba238d87aa10e09bd98f1c9 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Fri, 7 Jun 2024 16:37:48 -0700 Subject: [PATCH 0732/1578] Input: xpad - add support for ASUS ROG RAIKIRI PRO Add the VID/PID for ASUS ROG RAIKIRI PRO to the list of known devices. Signed-off-by: Luke D. Jones Link: https://lore.kernel.org/r/20240607223722.1170776-1-luke@ljones.dev Signed-off-by: Dmitry Torokhov --- drivers/input/joystick/xpad.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index 70f0654c58b6a1..2b8370ecf42a88 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -209,6 +209,7 @@ static const struct xpad_device { { 0x0738, 0xf738, "Super SFIV FightStick TE S", 0, XTYPE_XBOX360 }, { 0x07ff, 0xffff, "Mad Catz GamePad", 0, XTYPE_XBOX360 }, { 0x0b05, 0x1a38, "ASUS ROG RAIKIRI", 0, XTYPE_XBOXONE }, + { 0x0b05, 0x1abb, "ASUS ROG RAIKIRI PRO", 0, XTYPE_XBOXONE }, { 0x0c12, 0x0005, "Intec wireless", 0, XTYPE_XBOX }, { 0x0c12, 0x8801, "Nyko Xbox Controller", 0, XTYPE_XBOX }, { 0x0c12, 0x8802, "Zeroplus Xbox Controller", 0, XTYPE_XBOX }, From 89b898c627a49b978a4c323ea6856eacfc21f6ba Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Mon, 11 Mar 2024 12:28:00 -0400 Subject: [PATCH 0733/1578] iio: xilinx-ams: Don't include ams_ctrl_channels in scan_mask ams_enable_channel_sequence constructs a "scan_mask" for all the PS and PL channels. This works out fine, since scan_index for these channels is less than 64. However, it also includes the ams_ctrl_channels, where scan_index is greater than 64, triggering undefined behavior. Since we don't need these channels anyway, just exclude them. Fixes: d5c70627a794 ("iio: adc: Add Xilinx AMS driver") Signed-off-by: Sean Anderson Link: https://lore.kernel.org/r/20240311162800.11074-1-sean.anderson@linux.dev Signed-off-by: Jonathan Cameron --- drivers/iio/adc/xilinx-ams.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/iio/adc/xilinx-ams.c b/drivers/iio/adc/xilinx-ams.c index f0b71a1220e02e..f52abf759260f2 100644 --- a/drivers/iio/adc/xilinx-ams.c +++ b/drivers/iio/adc/xilinx-ams.c @@ -414,8 +414,12 @@ static void ams_enable_channel_sequence(struct iio_dev *indio_dev) /* Run calibration of PS & PL as part of the sequence */ scan_mask = BIT(0) | BIT(AMS_PS_SEQ_MAX); - for (i = 0; i < indio_dev->num_channels; i++) - scan_mask |= BIT_ULL(indio_dev->channels[i].scan_index); + for (i = 0; i < indio_dev->num_channels; i++) { + const struct iio_chan_spec *chan = &indio_dev->channels[i]; + + if (chan->scan_index < AMS_CTRL_SEQ_BASE) + scan_mask |= BIT_ULL(chan->scan_index); + } if (ams->ps_base) { /* put sysmon in a soft reset to change the sequence */ From a2b86132955268b2a1703082fbc2d4832fc001b8 Mon Sep 17 00:00:00 2001 From: Fernando Yang Date: Mon, 3 Jun 2024 15:07:54 -0300 Subject: [PATCH 0734/1578] iio: adc: ad7266: Fix variable checking bug The ret variable was not checked after iio_device_release_direct_mode(), which could possibly cause errors Fixes: c70df20e3159 ("iio: adc: ad7266: claim direct mode during sensor read") Signed-off-by: Fernando Yang Link: https://lore.kernel.org/r/20240603180757.8560-1-hagisf@usp.br Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ad7266.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iio/adc/ad7266.c b/drivers/iio/adc/ad7266.c index 353a97f9c0860a..13ea8a1073d281 100644 --- a/drivers/iio/adc/ad7266.c +++ b/drivers/iio/adc/ad7266.c @@ -157,6 +157,8 @@ static int ad7266_read_raw(struct iio_dev *indio_dev, ret = ad7266_read_single(st, val, chan->address); iio_device_release_direct_mode(indio_dev); + if (ret < 0) + return ret; *val = (*val >> 2) & 0xfff; if (chan->scan_type.sign == 's') *val = sign_extend32(*val, From a821d7111e3f7c8869961b606714a299bfe20014 Mon Sep 17 00:00:00 2001 From: Alexander Sverdlin Date: Wed, 5 Jun 2024 22:38:06 +0200 Subject: [PATCH 0735/1578] iio: accel: fxls8962af: select IIO_BUFFER & IIO_KFIFO_BUF Provide missing symbols to the module: ERROR: modpost: iio_push_to_buffers [drivers/iio/accel/fxls8962af-core.ko] undefined! ERROR: modpost: devm_iio_kfifo_buffer_setup_ext [drivers/iio/accel/fxls8962af-core.ko] undefined! Cc: stable@vger.kernel.org Fixes: 79e3a5bdd9ef ("iio: accel: fxls8962af: add hw buffered sampling") Signed-off-by: Alexander Sverdlin Reviewed-by: Sean Nyekjaer Link: https://lore.kernel.org/r/20240605203810.2908980-2-alexander.sverdlin@siemens.com Signed-off-by: Jonathan Cameron --- drivers/iio/accel/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iio/accel/Kconfig b/drivers/iio/accel/Kconfig index c2da5066e9a7bd..80b57d3ee3a726 100644 --- a/drivers/iio/accel/Kconfig +++ b/drivers/iio/accel/Kconfig @@ -330,6 +330,8 @@ config DMARD10 config FXLS8962AF tristate depends on I2C || !I2C # cannot be built-in for modular I2C + select IIO_BUFFER + select IIO_KFIFO_BUF config FXLS8962AF_I2C tristate "NXP FXLS8962AF/FXLS8964AF Accelerometer I2C Driver" From f67ac0061c7614c1548963d3ef1ee1606efd8636 Mon Sep 17 00:00:00 2001 From: Honggang LI Date: Thu, 23 May 2024 17:46:17 +0800 Subject: [PATCH 0736/1578] RDMA/rxe: Fix responder length checking for UD request packets According to the IBA specification: If a UD request packet is detected with an invalid length, the request shall be an invalid request and it shall be silently dropped by the responder. The responder then waits for a new request packet. commit 689c5421bfe0 ("RDMA/rxe: Fix incorrect responder length checking") defers responder length check for UD QPs in function `copy_data`. But it introduces a regression issue for UD QPs. When the packet size is too large to fit in the receive buffer. `copy_data` will return error code -EINVAL. Then `send_data_in` will return RESPST_ERR_MALFORMED_WQE. UD QP will transfer into ERROR state. Fixes: 689c5421bfe0 ("RDMA/rxe: Fix incorrect responder length checking") Signed-off-by: Honggang LI Link: https://lore.kernel.org/r/20240523094617.141148-1-honggangli@163.com Reviewed-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_resp.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index c6a7fa3054fada..6596a85723c9a5 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -344,6 +344,19 @@ static enum resp_states rxe_resp_check_length(struct rxe_qp *qp, * receive buffer later. For rmda operations additional * length checks are performed in check_rkey. */ + if ((qp_type(qp) == IB_QPT_GSI) || (qp_type(qp) == IB_QPT_UD)) { + unsigned int payload = payload_size(pkt); + unsigned int recv_buffer_len = 0; + int i; + + for (i = 0; i < qp->resp.wqe->dma.num_sge; i++) + recv_buffer_len += qp->resp.wqe->dma.sge[i].length; + if (payload + 40 > recv_buffer_len) { + rxe_dbg_qp(qp, "The receive buffer is too small for this UD packet.\n"); + return RESPST_ERR_LENGTH; + } + } + if (pkt->mask & RXE_PAYLOAD_MASK && ((qp_type(qp) == IB_QPT_RC) || (qp_type(qp) == IB_QPT_UC))) { unsigned int mtu = qp->mtu; From 75183e461ce033605c3e85518a9f3d4e4ef848a3 Mon Sep 17 00:00:00 2001 From: Ke Sun Date: Sun, 9 Jun 2024 11:08:47 +0800 Subject: [PATCH 0737/1578] iio: dac: fix ad9739a random config compile error WARNING: unmet direct dependencies detected for REGMAP_SPI Depends on [n]: SPI [=n] Selected by [m]: - AD9739A [=m] && IIO [=m] && (SPI [=n] || COMPILE_TEST [=y]) ... ERROR: modpost: "spi_write_then_read" [drivers/base/regmap/regmap-spi.ko] undefined! ERROR: modpost: "spi_async" [drivers/base/regmap/regmap-spi.ko] undefined! ERROR: modpost: "spi_sync" [drivers/base/regmap/regmap-spi.ko] undefined! ERROR: modpost: "__spi_register_driver" [drivers/iio/dac/ad9739a.ko] undefined! Fixes: e77603d5468b ("iio: dac: support the ad9739a RF DAC") Reported-by: k2ci Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202404250156.2PQRWmex-lkp@intel.com/ Signed-off-by: Ke Sun Reviewed-by: Nuno Sa Link: https://lore.kernel.org/r/20240609030847.2869455-1-sunke@kylinos.cn Signed-off-by: Jonathan Cameron --- drivers/iio/dac/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/dac/Kconfig b/drivers/iio/dac/Kconfig index 3c2bf620f00fb4..ee0d9798d8b444 100644 --- a/drivers/iio/dac/Kconfig +++ b/drivers/iio/dac/Kconfig @@ -133,7 +133,7 @@ config AD5624R_SPI config AD9739A tristate "Analog Devices AD9739A RF DAC spi driver" - depends on SPI || COMPILE_TEST + depends on SPI select REGMAP_SPI select IIO_BACKEND help From 9547d6a4c65e975e40e203900322342ef7379c52 Mon Sep 17 00:00:00 2001 From: Dimitri Fedrau Date: Wed, 5 Jun 2024 21:21:35 +0200 Subject: [PATCH 0738/1578] iio: humidity: hdc3020: fix hysteresis representation According to the ABI docs hysteresis values are represented as offsets to threshold values. Current implementation represents hysteresis values as absolute values which is wrong. Nevertheless the device stores them as absolute values and the datasheet refers to them as clear thresholds. Fix the reading and writing of hysteresis values by including thresholds into calculations. Hysteresis values that result in threshold clear values that are out of limits will be truncated. To check that the threshold clear values are correct, registers are read out using i2ctransfer and the corresponding temperature and relative humidity thresholds are calculated using the formulas in the datasheet. Fixes: 3ad0e7e5f0cb ("iio: humidity: hdc3020: add threshold events support") Signed-off-by: Dimitri Fedrau Reviewed-by: Javier Carrasco Link: https://lore.kernel.org/r/20240605192136.38146-1-dima.fedrau@gmail.com Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/humidity/hdc3020.c | 325 +++++++++++++++++++++++++-------- 1 file changed, 249 insertions(+), 76 deletions(-) diff --git a/drivers/iio/humidity/hdc3020.c b/drivers/iio/humidity/hdc3020.c index cdc4789213ba70..a82dcc3da4216f 100644 --- a/drivers/iio/humidity/hdc3020.c +++ b/drivers/iio/humidity/hdc3020.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -66,8 +67,10 @@ #define HDC3020_CRC8_POLYNOMIAL 0x31 -#define HDC3020_MIN_TEMP -40 -#define HDC3020_MAX_TEMP 125 +#define HDC3020_MIN_TEMP_MICRO -39872968 +#define HDC3020_MAX_TEMP_MICRO 124875639 +#define HDC3020_MAX_TEMP_HYST_MICRO 164748607 +#define HDC3020_MAX_HUM_MICRO 99220264 struct hdc3020_data { struct i2c_client *client; @@ -368,6 +371,105 @@ static int hdc3020_write_raw(struct iio_dev *indio_dev, return -EINVAL; } +static int hdc3020_thresh_get_temp(u16 thresh) +{ + int temp; + + /* + * Get the temperature threshold from 9 LSBs, shift them to get + * the truncated temperature threshold representation and + * calculate the threshold according to the formula in the + * datasheet. Result is degree celsius scaled by 65535. + */ + temp = FIELD_GET(HDC3020_THRESH_TEMP_MASK, thresh) << + HDC3020_THRESH_TEMP_TRUNC_SHIFT; + + return -2949075 + (175 * temp); +} + +static int hdc3020_thresh_get_hum(u16 thresh) +{ + int hum; + + /* + * Get the humidity threshold from 7 MSBs, shift them to get the + * truncated humidity threshold representation and calculate the + * threshold according to the formula in the datasheet. Result is + * percent scaled by 65535. + */ + hum = FIELD_GET(HDC3020_THRESH_HUM_MASK, thresh) << + HDC3020_THRESH_HUM_TRUNC_SHIFT; + + return hum * 100; +} + +static u16 hdc3020_thresh_set_temp(int s_temp, u16 curr_thresh) +{ + u64 temp; + u16 thresh; + + /* + * Calculate temperature threshold, shift it down to get the + * truncated threshold representation in the 9LSBs while keeping + * the current humidity threshold in the 7 MSBs. + */ + temp = (u64)(s_temp + 45000000) * 65535ULL; + temp = div_u64(temp, 1000000 * 175) >> HDC3020_THRESH_TEMP_TRUNC_SHIFT; + thresh = FIELD_PREP(HDC3020_THRESH_TEMP_MASK, temp); + thresh |= (FIELD_GET(HDC3020_THRESH_HUM_MASK, curr_thresh) << + HDC3020_THRESH_HUM_TRUNC_SHIFT); + + return thresh; +} + +static u16 hdc3020_thresh_set_hum(int s_hum, u16 curr_thresh) +{ + u64 hum; + u16 thresh; + + /* + * Calculate humidity threshold, shift it down and up to get the + * truncated threshold representation in the 7MSBs while keeping + * the current temperature threshold in the 9 LSBs. + */ + hum = (u64)(s_hum) * 65535ULL; + hum = div_u64(hum, 1000000 * 100) >> HDC3020_THRESH_HUM_TRUNC_SHIFT; + thresh = FIELD_PREP(HDC3020_THRESH_HUM_MASK, hum); + thresh |= FIELD_GET(HDC3020_THRESH_TEMP_MASK, curr_thresh); + + return thresh; +} + +static +int hdc3020_thresh_clr(s64 s_thresh, s64 s_hyst, enum iio_event_direction dir) +{ + s64 s_clr; + + /* + * Include directions when calculation the clear value, + * since hysteresis is unsigned by definition and the + * clear value is an absolute value which is signed. + */ + if (dir == IIO_EV_DIR_RISING) + s_clr = s_thresh - s_hyst; + else + s_clr = s_thresh + s_hyst; + + /* Divide by 65535 to get units of micro */ + return div_s64(s_clr, 65535); +} + +static int _hdc3020_write_thresh(struct hdc3020_data *data, u16 reg, u16 val) +{ + u8 buf[5]; + + put_unaligned_be16(reg, buf); + put_unaligned_be16(val, buf + 2); + buf[4] = crc8(hdc3020_crc8_table, buf + 2, 2, CRC8_INIT_VALUE); + + return hdc3020_write_bytes(data, buf, 5); +} + static int hdc3020_write_thresh(struct iio_dev *indio_dev, const struct iio_chan_spec *chan, enum iio_event_type type, @@ -376,67 +478,126 @@ static int hdc3020_write_thresh(struct iio_dev *indio_dev, int val, int val2) { struct hdc3020_data *data = iio_priv(indio_dev); - u8 buf[5]; - u64 tmp; - u16 reg; - int ret; - - /* Supported temperature range is from –40 to 125 degree celsius */ - if (val < HDC3020_MIN_TEMP || val > HDC3020_MAX_TEMP) - return -EINVAL; - - /* Select threshold register */ - if (info == IIO_EV_INFO_VALUE) { - if (dir == IIO_EV_DIR_RISING) - reg = HDC3020_S_T_RH_THRESH_HIGH; - else - reg = HDC3020_S_T_RH_THRESH_LOW; + u16 reg, reg_val, reg_thresh_rd, reg_clr_rd, reg_thresh_wr, reg_clr_wr; + s64 s_thresh, s_hyst, s_clr; + int s_val, thresh, clr, ret; + + /* Select threshold registers */ + if (dir == IIO_EV_DIR_RISING) { + reg_thresh_rd = HDC3020_R_T_RH_THRESH_HIGH; + reg_thresh_wr = HDC3020_S_T_RH_THRESH_HIGH; + reg_clr_rd = HDC3020_R_T_RH_THRESH_HIGH_CLR; + reg_clr_wr = HDC3020_S_T_RH_THRESH_HIGH_CLR; } else { - if (dir == IIO_EV_DIR_RISING) - reg = HDC3020_S_T_RH_THRESH_HIGH_CLR; - else - reg = HDC3020_S_T_RH_THRESH_LOW_CLR; + reg_thresh_rd = HDC3020_R_T_RH_THRESH_LOW; + reg_thresh_wr = HDC3020_S_T_RH_THRESH_LOW; + reg_clr_rd = HDC3020_R_T_RH_THRESH_LOW_CLR; + reg_clr_wr = HDC3020_S_T_RH_THRESH_LOW_CLR; } guard(mutex)(&data->lock); - ret = hdc3020_read_be16(data, reg); + ret = hdc3020_read_be16(data, reg_thresh_rd); + if (ret < 0) + return ret; + + thresh = ret; + ret = hdc3020_read_be16(data, reg_clr_rd); if (ret < 0) return ret; + clr = ret; + /* Scale value to include decimal part into calculations */ + s_val = (val < 0) ? (val * 1000000 - val2) : (val * 1000000 + val2); switch (chan->type) { case IIO_TEMP: - /* - * Calculate temperature threshold, shift it down to get the - * truncated threshold representation in the 9LSBs while keeping - * the current humidity threshold in the 7 MSBs. - */ - tmp = ((u64)(((val + 45) * MICRO) + val2)) * 65535ULL; - tmp = div_u64(tmp, MICRO * 175); - val = tmp >> HDC3020_THRESH_TEMP_TRUNC_SHIFT; - val = FIELD_PREP(HDC3020_THRESH_TEMP_MASK, val); - val |= (FIELD_GET(HDC3020_THRESH_HUM_MASK, ret) << - HDC3020_THRESH_HUM_TRUNC_SHIFT); + switch (info) { + case IIO_EV_INFO_VALUE: + s_val = max(s_val, HDC3020_MIN_TEMP_MICRO); + s_val = min(s_val, HDC3020_MAX_TEMP_MICRO); + reg = reg_thresh_wr; + reg_val = hdc3020_thresh_set_temp(s_val, thresh); + ret = _hdc3020_write_thresh(data, reg, reg_val); + if (ret < 0) + return ret; + + /* Calculate old hysteresis */ + s_thresh = (s64)hdc3020_thresh_get_temp(thresh) * 1000000; + s_clr = (s64)hdc3020_thresh_get_temp(clr) * 1000000; + s_hyst = div_s64(abs(s_thresh - s_clr), 65535); + /* Set new threshold */ + thresh = reg_val; + /* Set old hysteresis */ + s_val = s_hyst; + fallthrough; + case IIO_EV_INFO_HYSTERESIS: + /* + * Function hdc3020_thresh_get_temp returns temperature + * in degree celsius scaled by 65535. Scale by 1000000 + * to be able to subtract scaled hysteresis value. + */ + s_thresh = (s64)hdc3020_thresh_get_temp(thresh) * 1000000; + /* + * Units of s_val are in micro degree celsius, scale by + * 65535 to get same units as s_thresh. + */ + s_val = min(abs(s_val), HDC3020_MAX_TEMP_HYST_MICRO); + s_hyst = (s64)s_val * 65535; + s_clr = hdc3020_thresh_clr(s_thresh, s_hyst, dir); + s_clr = max(s_clr, HDC3020_MIN_TEMP_MICRO); + s_clr = min(s_clr, HDC3020_MAX_TEMP_MICRO); + reg = reg_clr_wr; + reg_val = hdc3020_thresh_set_temp(s_clr, clr); + break; + default: + return -EOPNOTSUPP; + } break; case IIO_HUMIDITYRELATIVE: - /* - * Calculate humidity threshold, shift it down and up to get the - * truncated threshold representation in the 7MSBs while keeping - * the current temperature threshold in the 9 LSBs. - */ - tmp = ((u64)((val * MICRO) + val2)) * 65535ULL; - tmp = div_u64(tmp, MICRO * 100); - val = tmp >> HDC3020_THRESH_HUM_TRUNC_SHIFT; - val = FIELD_PREP(HDC3020_THRESH_HUM_MASK, val); - val |= FIELD_GET(HDC3020_THRESH_TEMP_MASK, ret); + s_val = (s_val < 0) ? 0 : min(s_val, HDC3020_MAX_HUM_MICRO); + switch (info) { + case IIO_EV_INFO_VALUE: + reg = reg_thresh_wr; + reg_val = hdc3020_thresh_set_hum(s_val, thresh); + ret = _hdc3020_write_thresh(data, reg, reg_val); + if (ret < 0) + return ret; + + /* Calculate old hysteresis */ + s_thresh = (s64)hdc3020_thresh_get_hum(thresh) * 1000000; + s_clr = (s64)hdc3020_thresh_get_hum(clr) * 1000000; + s_hyst = div_s64(abs(s_thresh - s_clr), 65535); + /* Set new threshold */ + thresh = reg_val; + /* Try to set old hysteresis */ + s_val = min(abs(s_hyst), HDC3020_MAX_HUM_MICRO); + fallthrough; + case IIO_EV_INFO_HYSTERESIS: + /* + * Function hdc3020_thresh_get_hum returns relative + * humidity in percent scaled by 65535. Scale by 1000000 + * to be able to subtract scaled hysteresis value. + */ + s_thresh = (s64)hdc3020_thresh_get_hum(thresh) * 1000000; + /* + * Units of s_val are in micro percent, scale by 65535 + * to get same units as s_thresh. + */ + s_hyst = (s64)s_val * 65535; + s_clr = hdc3020_thresh_clr(s_thresh, s_hyst, dir); + s_clr = max(s_clr, 0); + s_clr = min(s_clr, HDC3020_MAX_HUM_MICRO); + reg = reg_clr_wr; + reg_val = hdc3020_thresh_set_hum(s_clr, clr); + break; + default: + return -EOPNOTSUPP; + } break; default: return -EOPNOTSUPP; } - put_unaligned_be16(reg, buf); - put_unaligned_be16(val, buf + 2); - buf[4] = crc8(hdc3020_crc8_table, buf + 2, 2, CRC8_INIT_VALUE); - return hdc3020_write_bytes(data, buf, 5); + return _hdc3020_write_thresh(data, reg, reg_val); } static int hdc3020_read_thresh(struct iio_dev *indio_dev, @@ -447,48 +608,60 @@ static int hdc3020_read_thresh(struct iio_dev *indio_dev, int *val, int *val2) { struct hdc3020_data *data = iio_priv(indio_dev); - u16 reg; - int ret; + u16 reg_thresh, reg_clr; + int thresh, clr, ret; - /* Select threshold register */ - if (info == IIO_EV_INFO_VALUE) { - if (dir == IIO_EV_DIR_RISING) - reg = HDC3020_R_T_RH_THRESH_HIGH; - else - reg = HDC3020_R_T_RH_THRESH_LOW; + /* Select threshold registers */ + if (dir == IIO_EV_DIR_RISING) { + reg_thresh = HDC3020_R_T_RH_THRESH_HIGH; + reg_clr = HDC3020_R_T_RH_THRESH_HIGH_CLR; } else { - if (dir == IIO_EV_DIR_RISING) - reg = HDC3020_R_T_RH_THRESH_HIGH_CLR; - else - reg = HDC3020_R_T_RH_THRESH_LOW_CLR; + reg_thresh = HDC3020_R_T_RH_THRESH_LOW; + reg_clr = HDC3020_R_T_RH_THRESH_LOW_CLR; } guard(mutex)(&data->lock); - ret = hdc3020_read_be16(data, reg); + ret = hdc3020_read_be16(data, reg_thresh); if (ret < 0) return ret; switch (chan->type) { case IIO_TEMP: - /* - * Get the temperature threshold from 9 LSBs, shift them to get - * the truncated temperature threshold representation and - * calculate the threshold according to the formula in the - * datasheet. - */ - *val = FIELD_GET(HDC3020_THRESH_TEMP_MASK, ret); - *val = *val << HDC3020_THRESH_TEMP_TRUNC_SHIFT; - *val = -2949075 + (175 * (*val)); + thresh = hdc3020_thresh_get_temp(ret); + switch (info) { + case IIO_EV_INFO_VALUE: + *val = thresh; + break; + case IIO_EV_INFO_HYSTERESIS: + ret = hdc3020_read_be16(data, reg_clr); + if (ret < 0) + return ret; + + clr = hdc3020_thresh_get_temp(ret); + *val = abs(thresh - clr); + break; + default: + return -EOPNOTSUPP; + } *val2 = 65535; return IIO_VAL_FRACTIONAL; case IIO_HUMIDITYRELATIVE: - /* - * Get the humidity threshold from 7 MSBs, shift them to get the - * truncated humidity threshold representation and calculate the - * threshold according to the formula in the datasheet. - */ - *val = FIELD_GET(HDC3020_THRESH_HUM_MASK, ret); - *val = (*val << HDC3020_THRESH_HUM_TRUNC_SHIFT) * 100; + thresh = hdc3020_thresh_get_hum(ret); + switch (info) { + case IIO_EV_INFO_VALUE: + *val = thresh; + break; + case IIO_EV_INFO_HYSTERESIS: + ret = hdc3020_read_be16(data, reg_clr); + if (ret < 0) + return ret; + + clr = hdc3020_thresh_get_hum(ret); + *val = abs(thresh - clr); + break; + default: + return -EOPNOTSUPP; + } *val2 = 65535; return IIO_VAL_FRACTIONAL; default: From ae1f7b93b52095be6776d0f34957b4f35dda44d9 Mon Sep 17 00:00:00 2001 From: Vasileios Amoiridis Date: Thu, 6 Jun 2024 23:22:53 +0200 Subject: [PATCH 0739/1578] iio: chemical: bme680: Fix pressure value output The IIO standard units are measured in kPa while the driver is using hPa. Apart from checking the userspace value itself, it is mentioned also in the Bosch API [1] that the pressure value is in Pascal. [1]: https://github.com/boschsensortec/BME68x_SensorAPI/blob/v4.4.8/bme68x_defs.h#L742 Fixes: 1b3bd8592780 ("iio: chemical: Add support for Bosch BME680 sensor") Signed-off-by: Vasileios Amoiridis Link: https://lore.kernel.org/r/20240606212313.207550-2-vassilisamir@gmail.com Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/chemical/bme680_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/chemical/bme680_core.c b/drivers/iio/chemical/bme680_core.c index ef5e0e46fd3447..2c40c13fe97a60 100644 --- a/drivers/iio/chemical/bme680_core.c +++ b/drivers/iio/chemical/bme680_core.c @@ -678,7 +678,7 @@ static int bme680_read_press(struct bme680_data *data, } *val = bme680_compensate_press(data, adc_press); - *val2 = 100; + *val2 = 1000; return IIO_VAL_FRACTIONAL; } From b47c0fee73a810c4503c4a94ea34858a1d865bba Mon Sep 17 00:00:00 2001 From: Vasileios Amoiridis Date: Thu, 6 Jun 2024 23:22:54 +0200 Subject: [PATCH 0740/1578] iio: chemical: bme680: Fix calibration data variable According to the BME68x Sensor API [1], the h6 calibration data variable should be an unsigned integer of size 8. [1]: https://github.com/boschsensortec/BME68x_SensorAPI/blob/v4.4.8/bme68x_defs.h#L789 Fixes: 1b3bd8592780 ("iio: chemical: Add support for Bosch BME680 sensor") Signed-off-by: Vasileios Amoiridis Link: https://lore.kernel.org/r/20240606212313.207550-3-vassilisamir@gmail.com Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/chemical/bme680_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/chemical/bme680_core.c b/drivers/iio/chemical/bme680_core.c index 2c40c13fe97a60..812829841733bd 100644 --- a/drivers/iio/chemical/bme680_core.c +++ b/drivers/iio/chemical/bme680_core.c @@ -38,7 +38,7 @@ struct bme680_calib { s8 par_h3; s8 par_h4; s8 par_h5; - s8 par_h6; + u8 par_h6; s8 par_h7; s8 par_gh1; s16 par_gh2; From fdd478c3ae98c3f13628e110dce9b6cfb0d9b3c8 Mon Sep 17 00:00:00 2001 From: Vasileios Amoiridis Date: Thu, 6 Jun 2024 23:22:55 +0200 Subject: [PATCH 0741/1578] iio: chemical: bme680: Fix overflows in compensate() functions There are cases in the compensate functions of the driver that there could be overflows of variables due to bit shifting ops. These implications were initially discussed here [1] and they were mentioned in log message of Commit 1b3bd8592780 ("iio: chemical: Add support for Bosch BME680 sensor"). [1]: https://lore.kernel.org/linux-iio/20180728114028.3c1bbe81@archlinux/ Fixes: 1b3bd8592780 ("iio: chemical: Add support for Bosch BME680 sensor") Signed-off-by: Vasileios Amoiridis Link: https://lore.kernel.org/r/20240606212313.207550-4-vassilisamir@gmail.com Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/chemical/bme680_core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/iio/chemical/bme680_core.c b/drivers/iio/chemical/bme680_core.c index 812829841733bd..5db48f6d646c1d 100644 --- a/drivers/iio/chemical/bme680_core.c +++ b/drivers/iio/chemical/bme680_core.c @@ -342,10 +342,10 @@ static s16 bme680_compensate_temp(struct bme680_data *data, if (!calib->par_t2) bme680_read_calib(data, calib); - var1 = (adc_temp >> 3) - (calib->par_t1 << 1); + var1 = (adc_temp >> 3) - ((s32)calib->par_t1 << 1); var2 = (var1 * calib->par_t2) >> 11; var3 = ((var1 >> 1) * (var1 >> 1)) >> 12; - var3 = (var3 * (calib->par_t3 << 4)) >> 14; + var3 = (var3 * ((s32)calib->par_t3 << 4)) >> 14; data->t_fine = var2 + var3; calc_temp = (data->t_fine * 5 + 128) >> 8; @@ -368,9 +368,9 @@ static u32 bme680_compensate_press(struct bme680_data *data, var1 = (data->t_fine >> 1) - 64000; var2 = ((((var1 >> 2) * (var1 >> 2)) >> 11) * calib->par_p6) >> 2; var2 = var2 + (var1 * calib->par_p5 << 1); - var2 = (var2 >> 2) + (calib->par_p4 << 16); + var2 = (var2 >> 2) + ((s32)calib->par_p4 << 16); var1 = (((((var1 >> 2) * (var1 >> 2)) >> 13) * - (calib->par_p3 << 5)) >> 3) + + ((s32)calib->par_p3 << 5)) >> 3) + ((calib->par_p2 * var1) >> 1); var1 = var1 >> 18; var1 = ((32768 + var1) * calib->par_p1) >> 15; @@ -388,7 +388,7 @@ static u32 bme680_compensate_press(struct bme680_data *data, var3 = ((press_comp >> 8) * (press_comp >> 8) * (press_comp >> 8) * calib->par_p10) >> 17; - press_comp += (var1 + var2 + var3 + (calib->par_p7 << 7)) >> 4; + press_comp += (var1 + var2 + var3 + ((s32)calib->par_p7 << 7)) >> 4; return press_comp; } @@ -414,7 +414,7 @@ static u32 bme680_compensate_humid(struct bme680_data *data, (((temp_scaled * ((temp_scaled * calib->par_h5) / 100)) >> 6) / 100) + (1 << 14))) >> 10; var3 = var1 * var2; - var4 = calib->par_h6 << 7; + var4 = (s32)calib->par_h6 << 7; var4 = (var4 + ((temp_scaled * calib->par_h7) / 100)) >> 4; var5 = ((var3 >> 14) * (var3 >> 14)) >> 10; var6 = (var4 * var5) >> 1; From 4241665e6ea063a9c1d734de790121a71db763fc Mon Sep 17 00:00:00 2001 From: Vasileios Amoiridis Date: Thu, 6 Jun 2024 23:22:56 +0200 Subject: [PATCH 0742/1578] iio: chemical: bme680: Fix sensor data read operation A read operation is happening as follows: a) Set sensor to forced mode b) Sensor measures values and update data registers and sleeps again c) Read data registers In the current implementation the read operation happens immediately after the sensor is set to forced mode so the sensor does not have the time to update properly the registers. This leads to the following 2 problems: 1) The first ever value which is read by the register is always wrong 2) Every read operation, puts the register into forced mode and reads the data that were calculated in the previous conversion. This behaviour was tested in 2 ways: 1) The internal meas_status_0 register was read before and after every read operation in order to verify that the data were ready even before the register was set to forced mode and also to check that after the forced mode was set the new data were not yet ready. 2) Physically changing the temperature and measuring the temperature This commit adds the waiting time in between the set of the forced mode and the read of the data. The function is taken from the Bosch BME68x Sensor API [1]. [1]: https://github.com/boschsensortec/BME68x_SensorAPI/blob/v4.4.8/bme68x.c#L490 Fixes: 1b3bd8592780 ("iio: chemical: Add support for Bosch BME680 sensor") Signed-off-by: Vasileios Amoiridis Link: https://lore.kernel.org/r/20240606212313.207550-5-vassilisamir@gmail.com Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/chemical/bme680.h | 2 ++ drivers/iio/chemical/bme680_core.c | 46 ++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/drivers/iio/chemical/bme680.h b/drivers/iio/chemical/bme680.h index 4edc5d21cb9fa6..f959252a4fe665 100644 --- a/drivers/iio/chemical/bme680.h +++ b/drivers/iio/chemical/bme680.h @@ -54,7 +54,9 @@ #define BME680_NB_CONV_MASK GENMASK(3, 0) #define BME680_REG_MEAS_STAT_0 0x1D +#define BME680_NEW_DATA_BIT BIT(7) #define BME680_GAS_MEAS_BIT BIT(6) +#define BME680_MEAS_BIT BIT(5) /* Calibration Parameters */ #define BME680_T2_LSB_REG 0x8A diff --git a/drivers/iio/chemical/bme680_core.c b/drivers/iio/chemical/bme680_core.c index 5db48f6d646c1d..500f56834b01f6 100644 --- a/drivers/iio/chemical/bme680_core.c +++ b/drivers/iio/chemical/bme680_core.c @@ -10,6 +10,7 @@ */ #include #include +#include #include #include #include @@ -532,6 +533,43 @@ static u8 bme680_oversampling_to_reg(u8 val) return ilog2(val) + 1; } +/* + * Taken from Bosch BME680 API: + * https://github.com/boschsensortec/BME68x_SensorAPI/blob/v4.4.8/bme68x.c#L490 + */ +static int bme680_wait_for_eoc(struct bme680_data *data) +{ + struct device *dev = regmap_get_device(data->regmap); + unsigned int check; + int ret; + /* + * (Sum of oversampling ratios * time per oversampling) + + * TPH measurement + gas measurement + wait transition from forced mode + * + heater duration + */ + int wait_eoc_us = ((data->oversampling_temp + data->oversampling_press + + data->oversampling_humid) * 1936) + (477 * 4) + + (477 * 5) + 1000 + (data->heater_dur * 1000); + + usleep_range(wait_eoc_us, wait_eoc_us + 100); + + ret = regmap_read(data->regmap, BME680_REG_MEAS_STAT_0, &check); + if (ret) { + dev_err(dev, "failed to read measurement status register.\n"); + return ret; + } + if (check & BME680_MEAS_BIT) { + dev_err(dev, "Device measurement cycle incomplete.\n"); + return -EBUSY; + } + if (!(check & BME680_NEW_DATA_BIT)) { + dev_err(dev, "No new data available from the device.\n"); + return -ENODATA; + } + + return 0; +} + static int bme680_chip_config(struct bme680_data *data) { struct device *dev = regmap_get_device(data->regmap); @@ -622,6 +660,10 @@ static int bme680_read_temp(struct bme680_data *data, int *val) if (ret < 0) return ret; + ret = bme680_wait_for_eoc(data); + if (ret) + return ret; + ret = regmap_bulk_read(data->regmap, BME680_REG_TEMP_MSB, &tmp, 3); if (ret < 0) { @@ -738,6 +780,10 @@ static int bme680_read_gas(struct bme680_data *data, if (ret < 0) return ret; + ret = bme680_wait_for_eoc(data); + if (ret) + return ret; + ret = regmap_read(data->regmap, BME680_REG_MEAS_STAT_0, &check); if (check & BME680_GAS_MEAS_BIT) { dev_err(dev, "gas measurement incomplete\n"); From 0579f27249047006a818e463ee66a6c314d04cea Mon Sep 17 00:00:00 2001 From: Sagar Cheluvegowda Date: Wed, 5 Jun 2024 11:57:18 -0700 Subject: [PATCH 0743/1578] net: stmmac: dwmac-qcom-ethqos: Configure host DMA width Commit 070246e4674b ("net: stmmac: Fix for mismatched host/device DMA address width") added support in the stmmac driver for platform drivers to indicate the host DMA width, but left it up to authors of the specific platforms to indicate if their width differed from the addr64 register read from the MAC itself. Qualcomm's EMAC4 integration supports only up to 36 bit width (as opposed to the addr64 register indicating 40 bit width). Let's indicate that in the platform driver to avoid a scenario where the driver will allocate descriptors of size that is supported by the CPU which in our case is 36 bit, but as the addr64 register is still capable of 40 bits the device will use two descriptors as one address. Fixes: 8c4d92e82d50 ("net: stmmac: dwmac-qcom-ethqos: add support for emac4 on sa8775p platforms") Signed-off-by: Sagar Cheluvegowda Reviewed-by: Simon Horman Reviewed-by: Andrew Halaney Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c index e254b21fdb5986..65d7370b47d579 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c @@ -93,6 +93,7 @@ struct ethqos_emac_driver_data { bool has_emac_ge_3; const char *link_clk_name; bool has_integrated_pcs; + u32 dma_addr_width; struct dwmac4_addrs dwmac4_addrs; }; @@ -276,6 +277,7 @@ static const struct ethqos_emac_driver_data emac_v4_0_0_data = { .has_emac_ge_3 = true, .link_clk_name = "phyaux", .has_integrated_pcs = true, + .dma_addr_width = 36, .dwmac4_addrs = { .dma_chan = 0x00008100, .dma_chan_offset = 0x1000, @@ -845,6 +847,8 @@ static int qcom_ethqos_probe(struct platform_device *pdev) plat_dat->flags |= STMMAC_FLAG_RX_CLK_RUNS_IN_LPI; if (data->has_integrated_pcs) plat_dat->flags |= STMMAC_FLAG_HAS_INTEGRATED_PCS; + if (data->dma_addr_width) + plat_dat->host_dma_width = data->dma_addr_width; if (ethqos->serdes_phy) { plat_dat->serdes_powerup = qcom_ethqos_serdes_powerup; From 83a7eefedc9b56fe7bfeff13b6c7356688ffa670 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 9 Jun 2024 14:19:43 -0700 Subject: [PATCH 0744/1578] Linux 6.10-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 7f921ae547f116..925a75b8ba7def 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 10 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Baby Opossum Posse # *DOCUMENTATION* From 799d4b392417ed6889030a5b2335ccb6dcf030ab Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Thu, 25 Apr 2024 11:48:51 +0200 Subject: [PATCH 0745/1578] drm/exynos: hdmi: report safe 640x480 mode as a fallback when no EDID found When reading EDID fails and driver reports no modes available, the DRM core adds an artificial 1024x786 mode to the connector. Unfortunately some variants of the Exynos HDMI (like the one in Exynos4 SoCs) are not able to drive such mode, so report a safe 640x480 mode instead of nothing in case of the EDID reading failure. This fixes the following issue observed on Trats2 board since commit 13d5b040363c ("drm/exynos: do not return negative values from .get_modes()"): [drm] Exynos DRM: using 11c00000.fimd device for DMA mapping operations exynos-drm exynos-drm: bound 11c00000.fimd (ops fimd_component_ops) exynos-drm exynos-drm: bound 12c10000.mixer (ops mixer_component_ops) exynos-dsi 11c80000.dsi: [drm:samsung_dsim_host_attach] Attached s6e8aa0 device (lanes:4 bpp:24 mode-flags:0x10b) exynos-drm exynos-drm: bound 11c80000.dsi (ops exynos_dsi_component_ops) exynos-drm exynos-drm: bound 12d00000.hdmi (ops hdmi_component_ops) [drm] Initialized exynos 1.1.0 20180330 for exynos-drm on minor 1 exynos-hdmi 12d00000.hdmi: [drm:hdmiphy_enable.part.0] *ERROR* PLL could not reach steady state panel-samsung-s6e8aa0 11c80000.dsi.0: ID: 0xa2, 0x20, 0x8c exynos-mixer 12c10000.mixer: timeout waiting for VSYNC ------------[ cut here ]------------ WARNING: CPU: 1 PID: 11 at drivers/gpu/drm/drm_atomic_helper.c:1682 drm_atomic_helper_wait_for_vblanks.part.0+0x2b0/0x2b8 [CRTC:70:crtc-1] vblank wait timed out Modules linked in: CPU: 1 PID: 11 Comm: kworker/u16:0 Not tainted 6.9.0-rc5-next-20240424 #14913 Hardware name: Samsung Exynos (Flattened Device Tree) Workqueue: events_unbound deferred_probe_work_func Call trace: unwind_backtrace from show_stack+0x10/0x14 show_stack from dump_stack_lvl+0x68/0x88 dump_stack_lvl from __warn+0x7c/0x1c4 __warn from warn_slowpath_fmt+0x11c/0x1a8 warn_slowpath_fmt from drm_atomic_helper_wait_for_vblanks.part.0+0x2b0/0x2b8 drm_atomic_helper_wait_for_vblanks.part.0 from drm_atomic_helper_commit_tail_rpm+0x7c/0x8c drm_atomic_helper_commit_tail_rpm from commit_tail+0x9c/0x184 commit_tail from drm_atomic_helper_commit+0x168/0x190 drm_atomic_helper_commit from drm_atomic_commit+0xb4/0xe0 drm_atomic_commit from drm_client_modeset_commit_atomic+0x23c/0x27c drm_client_modeset_commit_atomic from drm_client_modeset_commit_locked+0x60/0x1cc drm_client_modeset_commit_locked from drm_client_modeset_commit+0x24/0x40 drm_client_modeset_commit from __drm_fb_helper_restore_fbdev_mode_unlocked+0x9c/0xc4 __drm_fb_helper_restore_fbdev_mode_unlocked from drm_fb_helper_set_par+0x2c/0x3c drm_fb_helper_set_par from fbcon_init+0x3d8/0x550 fbcon_init from visual_init+0xc0/0x108 visual_init from do_bind_con_driver+0x1b8/0x3a4 do_bind_con_driver from do_take_over_console+0x140/0x1ec do_take_over_console from do_fbcon_takeover+0x70/0xd0 do_fbcon_takeover from fbcon_fb_registered+0x19c/0x1ac fbcon_fb_registered from register_framebuffer+0x190/0x21c register_framebuffer from __drm_fb_helper_initial_config_and_unlock+0x350/0x574 __drm_fb_helper_initial_config_and_unlock from exynos_drm_fbdev_client_hotplug+0x6c/0xb0 exynos_drm_fbdev_client_hotplug from drm_client_register+0x58/0x94 drm_client_register from exynos_drm_bind+0x160/0x190 exynos_drm_bind from try_to_bring_up_aggregate_device+0x200/0x2d8 try_to_bring_up_aggregate_device from __component_add+0xb0/0x170 __component_add from mixer_probe+0x74/0xcc mixer_probe from platform_probe+0x5c/0xb8 platform_probe from really_probe+0xe0/0x3d8 really_probe from __driver_probe_device+0x9c/0x1e4 __driver_probe_device from driver_probe_device+0x30/0xc0 driver_probe_device from __device_attach_driver+0xa8/0x120 __device_attach_driver from bus_for_each_drv+0x80/0xcc bus_for_each_drv from __device_attach+0xac/0x1fc __device_attach from bus_probe_device+0x8c/0x90 bus_probe_device from deferred_probe_work_func+0x98/0xe0 deferred_probe_work_func from process_one_work+0x240/0x6d0 process_one_work from worker_thread+0x1a0/0x3f4 worker_thread from kthread+0x104/0x138 kthread from ret_from_fork+0x14/0x28 Exception stack(0xf0895fb0 to 0xf0895ff8) ... irq event stamp: 82357 hardirqs last enabled at (82363): [] vprintk_emit+0x308/0x33c hardirqs last disabled at (82368): [] vprintk_emit+0x2bc/0x33c softirqs last enabled at (81614): [] __do_softirq+0x320/0x500 softirqs last disabled at (81609): [] __irq_exit_rcu+0x130/0x184 ---[ end trace 0000000000000000 ]--- exynos-drm exynos-drm: [drm] *ERROR* flip_done timed out exynos-drm exynos-drm: [drm] *ERROR* [CRTC:70:crtc-1] commit wait timed out exynos-drm exynos-drm: [drm] *ERROR* flip_done timed out exynos-drm exynos-drm: [drm] *ERROR* [CONNECTOR:74:HDMI-A-1] commit wait timed out exynos-drm exynos-drm: [drm] *ERROR* flip_done timed out exynos-drm exynos-drm: [drm] *ERROR* [PLANE:56:plane-5] commit wait timed out exynos-mixer 12c10000.mixer: timeout waiting for VSYNC Cc: stable@vger.kernel.org Fixes: 13d5b040363c ("drm/exynos: do not return negative values from .get_modes()") Signed-off-by: Marek Szyprowski Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_hdmi.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_hdmi.c b/drivers/gpu/drm/exynos/exynos_hdmi.c index e968824a4c728e..1e26cd4f834798 100644 --- a/drivers/gpu/drm/exynos/exynos_hdmi.c +++ b/drivers/gpu/drm/exynos/exynos_hdmi.c @@ -887,11 +887,11 @@ static int hdmi_get_modes(struct drm_connector *connector) int ret; if (!hdata->ddc_adpt) - return 0; + goto no_edid; edid = drm_get_edid(connector, hdata->ddc_adpt); if (!edid) - return 0; + goto no_edid; hdata->dvi_mode = !connector->display_info.is_hdmi; DRM_DEV_DEBUG_KMS(hdata->dev, "%s : width[%d] x height[%d]\n", @@ -906,6 +906,9 @@ static int hdmi_get_modes(struct drm_connector *connector) kfree(edid); return ret; + +no_edid: + return drm_add_modes_noedid(connector, 640, 480); } static int hdmi_find_phy_conf(struct hdmi_context *hdata, u32 pixel_clock) From 1f3512cdf8299f9edaea9046d53ea324a7730bab Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 4 Jun 2024 15:11:29 +0200 Subject: [PATCH 0746/1578] drm/exynos: dp: drop driver owner initialization Core in platform_driver_register() already sets the .owner, so driver does not need to. Whatever is set here will be anyway overwritten by main driver calling platform_driver_register(). Signed-off-by: Krzysztof Kozlowski Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_dp.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/exynos/exynos_dp.c b/drivers/gpu/drm/exynos/exynos_dp.c index f48c4343f4690f..3e6d4c6aa877ee 100644 --- a/drivers/gpu/drm/exynos/exynos_dp.c +++ b/drivers/gpu/drm/exynos/exynos_dp.c @@ -285,7 +285,6 @@ struct platform_driver dp_driver = { .remove_new = exynos_dp_remove, .driver = { .name = "exynos-dp", - .owner = THIS_MODULE, .pm = pm_ptr(&exynos_dp_pm_ops), .of_match_table = exynos_dp_match, }, From fe8a08973a0dea9757394c5adbdc3c0a03b0b432 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 7 Jun 2024 16:32:59 -0500 Subject: [PATCH 0747/1578] RAS/AMD/ATL: Fix MI300 bank hash Apply the SID bits to the correct offset in the Bank value. Do this in the temporary value so they don't need to be masked off later. Fixes: 87a612375307 ("RAS/AMD/ATL: Add MI300 DRAM to normalized address translation support") Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Cc: Link: https://lore.kernel.org/r/20240607-mi300-dram-xl-fix-v1-1-2f11547a178c@amd.com --- drivers/ras/amd/atl/umc.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c index 59b6169093f774..5cb92330dc6773 100644 --- a/drivers/ras/amd/atl/umc.c +++ b/drivers/ras/amd/atl/umc.c @@ -189,16 +189,11 @@ static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr) /* Calculate hash for PC bit. */ if (addr_hash.pc.xor_enable) { - /* Bits SID[1:0] act as Bank[6:5] for PC hash, so apply them here. */ - bank |= sid << 5; - temp = bitwise_xor_bits(col & addr_hash.pc.col_xor); temp ^= bitwise_xor_bits(row & addr_hash.pc.row_xor); - temp ^= bitwise_xor_bits(bank & addr_hash.bank_xor); + /* Bits SID[1:0] act as Bank[5:4] for PC hash, so apply them here. */ + temp ^= bitwise_xor_bits((bank | sid << NUM_BANK_BITS) & addr_hash.bank_xor); pc ^= temp; - - /* Drop SID bits for the sake of debug printing later. */ - bank &= 0x1F; } /* Reconstruct the normalized address starting with NA[4:0] = 0 */ From 38e3825631b1f314b21e3ade00b5a4d737eb054e Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Thu, 30 May 2024 13:01:51 +0300 Subject: [PATCH 0748/1578] drm/exynos/vidi: fix memory leak in .get_modes() The duplicated EDID is never freed. Fix it. Cc: stable@vger.kernel.org Signed-off-by: Jani Nikula Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_vidi.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_vidi.c b/drivers/gpu/drm/exynos/exynos_drm_vidi.c index fab135308b70fb..11a720fef32b48 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_vidi.c +++ b/drivers/gpu/drm/exynos/exynos_drm_vidi.c @@ -309,6 +309,7 @@ static int vidi_get_modes(struct drm_connector *connector) struct vidi_context *ctx = ctx_from_connector(connector); struct edid *edid; int edid_len; + int count; /* * the edid data comes from user side and it would be set @@ -328,7 +329,11 @@ static int vidi_get_modes(struct drm_connector *connector) drm_connector_update_edid_property(connector, edid); - return drm_add_edid_modes(connector, edid); + count = drm_add_edid_modes(connector, edid); + + kfree(edid); + + return count; } static const struct drm_connector_helper_funcs vidi_connector_helper_funcs = { From 58f880711f2ba53fd5e959875aff5b3bf6d5c32e Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Thu, 6 Jun 2024 11:11:57 -0700 Subject: [PATCH 0749/1578] xfs: make sure sb_fdblocks is non-negative A user with a completely full filesystem experienced an unexpected shutdown when the filesystem tried to write the superblock during runtime. kernel shows the following dmesg: [ 8.176281] XFS (dm-4): Metadata corruption detected at xfs_sb_write_verify+0x60/0x120 [xfs], xfs_sb block 0x0 [ 8.177417] XFS (dm-4): Unmount and run xfs_repair [ 8.178016] XFS (dm-4): First 128 bytes of corrupted metadata buffer: [ 8.178703] 00000000: 58 46 53 42 00 00 10 00 00 00 00 00 01 90 00 00 XFSB............ [ 8.179487] 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ [ 8.180312] 00000020: cf 12 dc 89 ca 26 45 29 92 e6 e3 8d 3b b8 a2 c3 .....&E)....;... [ 8.181150] 00000030: 00 00 00 00 01 00 00 06 00 00 00 00 00 00 00 80 ................ [ 8.182003] 00000040: 00 00 00 00 00 00 00 81 00 00 00 00 00 00 00 82 ................ [ 8.182004] 00000050: 00 00 00 01 00 64 00 00 00 00 00 04 00 00 00 00 .....d.......... [ 8.182004] 00000060: 00 00 64 00 b4 a5 02 00 02 00 00 08 00 00 00 00 ..d............. [ 8.182005] 00000070: 00 00 00 00 00 00 00 00 0c 09 09 03 17 00 00 19 ................ [ 8.182008] XFS (dm-4): Corruption of in-memory data detected. Shutting down filesystem [ 8.182010] XFS (dm-4): Please unmount the filesystem and rectify the problem(s) When xfs_log_sb writes super block to disk, b_fdblocks is fetched from m_fdblocks without any lock. As m_fdblocks can experience a positive -> negative -> positive changing when the FS reaches fullness (see xfs_mod_fdblocks). So there is a chance that sb_fdblocks is negative, and because sb_fdblocks is type of unsigned long long, it reads super big. And sb_fdblocks being bigger than sb_dblocks is a problem during log recovery, xfs_validate_sb_write() complains. Fix: As sb_fdblocks will be re-calculated during mount when lazysbcount is enabled, We just need to make xfs_validate_sb_write() happy -- make sure sb_fdblocks is not nenative. This patch also takes care of other percpu counters in xfs_log_sb. Signed-off-by: Wengang Wang Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_sb.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 09e4bf949bf88c..6b56f0f6d4c1af 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -1038,11 +1038,12 @@ xfs_log_sb( * and hence we don't need have to update it here. */ if (xfs_has_lazysbcount(mp)) { - mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); + mp->m_sb.sb_icount = percpu_counter_sum_positive(&mp->m_icount); mp->m_sb.sb_ifree = min_t(uint64_t, - percpu_counter_sum(&mp->m_ifree), + percpu_counter_sum_positive(&mp->m_ifree), mp->m_sb.sb_icount); - mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); + mp->m_sb.sb_fdblocks = + percpu_counter_sum_positive(&mp->m_fdblocks); } xfs_sb_to_disk(bp->b_addr, &mp->m_sb); From f74fb5df429ebc6a614dc5aa9e44d7194d402e5a Mon Sep 17 00:00:00 2001 From: Tobias Jakobi Date: Sun, 10 Mar 2024 23:04:00 +0100 Subject: [PATCH 0750/1578] drm: panel-orientation-quirks: Add quirk for Aya Neo KUN Similar to the other Aya Neo devices this one features again a portrait screen, here with a native resolution of 1600x2560. Signed-off-by: Tobias Jakobi Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Link: https://patchwork.freedesktop.org/patch/msgid/20240310220401.895591-1-tjakobi@math.uni-bielefeld.de --- drivers/gpu/drm/drm_panel_orientation_quirks.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/drm_panel_orientation_quirks.c b/drivers/gpu/drm/drm_panel_orientation_quirks.c index aa93129c3397e4..2166208a961d68 100644 --- a/drivers/gpu/drm/drm_panel_orientation_quirks.c +++ b/drivers/gpu/drm/drm_panel_orientation_quirks.c @@ -202,6 +202,12 @@ static const struct dmi_system_id orientation_data[] = { DMI_MATCH(DMI_BOARD_NAME, "NEXT"), }, .driver_data = (void *)&lcd800x1280_rightside_up, + }, { /* AYA NEO KUN */ + .matches = { + DMI_EXACT_MATCH(DMI_BOARD_VENDOR, "AYANEO"), + DMI_MATCH(DMI_BOARD_NAME, "KUN"), + }, + .driver_data = (void *)&lcd1600x2560_rightside_up, }, { /* Chuwi HiBook (CWI514) */ .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "Hampoo"), From 5add2f7288468f35a374620dabf126c13baaea9c Mon Sep 17 00:00:00 2001 From: David Wei Date: Thu, 6 Jun 2024 07:59:08 -0700 Subject: [PATCH 0751/1578] netdevsim: fix backwards compatibility in nsim_get_iflink() The default ndo_get_iflink() implementation returns the current ifindex of the netdev. But the overridden nsim_get_iflink() returns 0 if the current nsim is not linked, breaking backwards compatibility for userspace that depend on this behaviour. Fix the problem by returning the current ifindex if not linked to a peer. Fixes: 8debcf5832c3 ("netdevsim: add ndo_get_iflink() implementation") Reported-by: Yu Watanabe Suggested-by: Yu Watanabe Signed-off-by: David Wei Signed-off-by: David S. Miller --- drivers/net/netdevsim/netdev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c index c22897bf5509ca..017a6102be0a22 100644 --- a/drivers/net/netdevsim/netdev.c +++ b/drivers/net/netdevsim/netdev.c @@ -324,7 +324,8 @@ static int nsim_get_iflink(const struct net_device *dev) rcu_read_lock(); peer = rcu_dereference(nsim->peer); - iflink = peer ? READ_ONCE(peer->netdev->ifindex) : 0; + iflink = peer ? READ_ONCE(peer->netdev->ifindex) : + READ_ONCE(dev->ifindex); rcu_read_unlock(); return iflink; From e3cf20e5c68df604315ab30bdbe15dc8a5da556b Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 4 Jun 2024 22:32:34 +0100 Subject: [PATCH 0752/1578] ARM: 9405/1: ftrace: Don't assume stack frames are contiguous in memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The frame pointer unwinder relies on a standard layout of the stack frame, consisting of (in downward order) Calling frame: PC <---------+ LR | SP | FP | .. locals .. | Callee frame: | PC | LR | SP | FP ----------+ where after storing its previous value on the stack, FP is made to point at the location of PC in the callee stack frame, using the canonical prologue: mov ip, sp stmdb sp!, {fp, ip, lr, pc} sub fp, ip, #4 The ftrace code assumes that this activation record is pushed first, and that any stack space for locals is allocated below this. Strict adherence to this would imply that the caller's value of SP at the time of the function call can always be obtained by adding 4 to FP (which points to PC in the callee frame). However, recent versions of GCC appear to deviate from this rule, and so the only reliable way to obtain the caller's value of SP is to read it from the activation record. Since this involves a read from memory rather than simple arithmetic, we need to use the uaccess API here which protects against inadvertent data aborts resulting from attempts to dereference bogus FP values. The plain uaccess API is ftrace instrumented itself, so to avoid unbounded recursion, use the __get_kernel_nofault() primitive directly. Closes: https://lore.kernel.org/all/alp44tukzo6mvcwl4ke4ehhmojrqnv6xfcdeuliybxfjfvgd3e@gpjvwj33cc76 Closes: https://lore.kernel.org/all/d870c149-4363-43de-b0ea-7125dec5608e@broadcom.com/ Reported-by: Uwe Kleine-König Reported-by: Justin Chen Tested-by: Thorsten Scherer Reviewed-by: Linus Walleij Signed-off-by: Ard Biesheuvel Signed-off-by: Russell King (Oracle) --- arch/arm/kernel/ftrace.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c index a0b6d1e3812fdb..e61591f33a6cd1 100644 --- a/arch/arm/kernel/ftrace.c +++ b/arch/arm/kernel/ftrace.c @@ -232,11 +232,24 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, unsigned long old; if (unlikely(atomic_read(¤t->tracing_graph_pause))) +err_out: return; if (IS_ENABLED(CONFIG_UNWINDER_FRAME_POINTER)) { - /* FP points one word below parent's top of stack */ - frame_pointer += 4; + /* + * Usually, the stack frames are contiguous in memory but cases + * have been observed where the next stack frame does not live + * at 'frame_pointer + 4' as this code used to assume. + * + * Instead, dereference the field in the stack frame that + * stores the SP of the calling frame: to avoid unbounded + * recursion, this cannot involve any ftrace instrumented + * functions, so use the __get_kernel_nofault() primitive + * directly. + */ + __get_kernel_nofault(&frame_pointer, + (unsigned long *)(frame_pointer - 8), + unsigned long, err_out); } else { struct stackframe frame = { .fp = frame_pointer, From b880018edd3a577e50366338194dee9b899947e0 Mon Sep 17 00:00:00 2001 From: Amjad Ouled-Ameur Date: Mon, 10 Jun 2024 11:20:56 +0100 Subject: [PATCH 0753/1578] drm/komeda: check for error-valued pointer komeda_pipeline_get_state() may return an error-valued pointer, thus check the pointer for negative or null value before dereferencing. Fixes: 502932a03fce ("drm/komeda: Add the initial scaler support for CORE") Signed-off-by: Amjad Ouled-Ameur Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20240610102056.40406-1-amjad.ouled-ameur@arm.com --- drivers/gpu/drm/arm/display/komeda/komeda_pipeline_state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/arm/display/komeda/komeda_pipeline_state.c b/drivers/gpu/drm/arm/display/komeda/komeda_pipeline_state.c index f3e744172673c3..f4e76b46ca327a 100644 --- a/drivers/gpu/drm/arm/display/komeda/komeda_pipeline_state.c +++ b/drivers/gpu/drm/arm/display/komeda/komeda_pipeline_state.c @@ -259,7 +259,7 @@ komeda_component_get_avail_scaler(struct komeda_component *c, u32 avail_scalers; pipe_st = komeda_pipeline_get_state(c->pipeline, state); - if (!pipe_st) + if (IS_ERR_OR_NULL(pipe_st)) return NULL; avail_scalers = (pipe_st->active_comps & KOMEDA_PIPELINE_SCALERS) ^ From ce62600c4dbee8d43b02277669dd91785a9b81d9 Mon Sep 17 00:00:00 2001 From: Adam Miotk Date: Mon, 10 Jun 2024 11:27:39 +0100 Subject: [PATCH 0754/1578] drm/bridge/panel: Fix runtime warning on panel bridge release Device managed panel bridge wrappers are created by calling to drm_panel_bridge_add_typed() and registering a release handler for clean-up when the device gets unbound. Since the memory for this bridge is also managed and linked to the panel device, the release function should not try to free that memory. Moreover, the call to devm_kfree() inside drm_panel_bridge_remove() will fail in this case and emit a warning because the panel bridge resource is no longer on the device resources list (it has been removed from there before the call to release handlers). Fixes: 67022227ffb1 ("drm/bridge: Add a devm_ allocator for panel bridge.") Signed-off-by: Adam Miotk Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20240610102739.139852-1-adam.miotk@arm.com --- drivers/gpu/drm/bridge/panel.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/bridge/panel.c b/drivers/gpu/drm/bridge/panel.c index 32506524d9a2d1..fe5fb08c9fc42b 100644 --- a/drivers/gpu/drm/bridge/panel.c +++ b/drivers/gpu/drm/bridge/panel.c @@ -360,9 +360,12 @@ EXPORT_SYMBOL(drm_panel_bridge_set_orientation); static void devm_drm_panel_bridge_release(struct device *dev, void *res) { - struct drm_bridge **bridge = res; + struct drm_bridge *bridge = *(struct drm_bridge **)res; - drm_panel_bridge_remove(*bridge); + if (!bridge) + return; + + drm_bridge_remove(bridge); } /** From 97ab304ecd95c0b1703ff8c8c3956dc6e2afe8e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Amadeusz=20S=C5=82awi=C5=84ski?= Date: Mon, 3 Jun 2024 12:28:15 +0200 Subject: [PATCH 0755/1578] ASoC: topology: Fix references to freed memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most users after parsing a topology file, release memory used by it, so having pointer references directly into topology file contents is wrong. Use devm_kmemdup(), to allocate memory as needed. Reported-by: Jason Montleon Link: https://github.com/thesofproject/avs-topology-xml/issues/22#issuecomment-2127892605 Reviewed-by: Cezary Rojewski Signed-off-by: Amadeusz Sławiński Link: https://lore.kernel.org/r/20240603102818.36165-2-amadeuszx.slawinski@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/soc-topology.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c index 90ca37e008b32c..75d9395a18ed49 100644 --- a/sound/soc/soc-topology.c +++ b/sound/soc/soc-topology.c @@ -1060,15 +1060,32 @@ static int soc_tplg_dapm_graph_elems_load(struct soc_tplg *tplg, break; } - route->source = elem->source; - route->sink = elem->sink; + route->source = devm_kmemdup(tplg->dev, elem->source, + min(strlen(elem->source), + SNDRV_CTL_ELEM_ID_NAME_MAXLEN), + GFP_KERNEL); + route->sink = devm_kmemdup(tplg->dev, elem->sink, + min(strlen(elem->sink), SNDRV_CTL_ELEM_ID_NAME_MAXLEN), + GFP_KERNEL); + if (!route->source || !route->sink) { + ret = -ENOMEM; + break; + } /* set to NULL atm for tplg users */ route->connected = NULL; - if (strnlen(elem->control, SNDRV_CTL_ELEM_ID_NAME_MAXLEN) == 0) + if (strnlen(elem->control, SNDRV_CTL_ELEM_ID_NAME_MAXLEN) == 0) { route->control = NULL; - else - route->control = elem->control; + } else { + route->control = devm_kmemdup(tplg->dev, elem->control, + min(strlen(elem->control), + SNDRV_CTL_ELEM_ID_NAME_MAXLEN), + GFP_KERNEL); + if (!route->control) { + ret = -ENOMEM; + break; + } + } /* add route dobj to dobj_list */ route->dobj.type = SND_SOC_DOBJ_GRAPH; From fd660b1bd015e5aa9a558ee04088f2431010548d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Amadeusz=20S=C5=82awi=C5=84ski?= Date: Mon, 3 Jun 2024 12:28:16 +0200 Subject: [PATCH 0756/1578] ASoC: Intel: avs: Fix route override MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of overriding existing memory strings that may be too short, just allocate needed memory and point the route at it. Reported-by: Jason Montleon Link: https://github.com/thesofproject/avs-topology-xml/issues/22#issuecomment-2127892605 Reviewed-by: Cezary Rojewski Signed-off-by: Amadeusz Sławiński Link: https://lore.kernel.org/r/20240603102818.36165-3-amadeuszx.slawinski@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/avs/topology.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/sound/soc/intel/avs/topology.c b/sound/soc/intel/avs/topology.c index 02bae207f6ece3..b6c5d94a155482 100644 --- a/sound/soc/intel/avs/topology.c +++ b/sound/soc/intel/avs/topology.c @@ -1545,8 +1545,8 @@ static int avs_route_load(struct snd_soc_component *comp, int index, { struct snd_soc_acpi_mach *mach = dev_get_platdata(comp->card->dev); size_t len = SNDRV_CTL_ELEM_ID_NAME_MAXLEN; - char buf[SNDRV_CTL_ELEM_ID_NAME_MAXLEN]; int ssp_port, tdm_slot; + char *buf; /* See parse_link_formatted_string() for dynamic naming when(s). */ if (!avs_mach_singular_ssp(mach)) @@ -1557,13 +1557,24 @@ static int avs_route_load(struct snd_soc_component *comp, int index, return 0; tdm_slot = avs_mach_ssp_tdm(mach, ssp_port); + buf = devm_kzalloc(comp->card->dev, len, GFP_KERNEL); + if (!buf) + return -ENOMEM; avs_ssp_sprint(buf, len, route->source, ssp_port, tdm_slot); - strscpy((char *)route->source, buf, len); + route->source = buf; + + buf = devm_kzalloc(comp->card->dev, len, GFP_KERNEL); + if (!buf) + return -ENOMEM; avs_ssp_sprint(buf, len, route->sink, ssp_port, tdm_slot); - strscpy((char *)route->sink, buf, len); + route->sink = buf; + if (route->control) { + buf = devm_kzalloc(comp->card->dev, len, GFP_KERNEL); + if (!buf) + return -ENOMEM; avs_ssp_sprint(buf, len, route->control, ssp_port, tdm_slot); - strscpy((char *)route->control, buf, len); + route->control = buf; } return 0; From daf0b99d4720c9f05bdb81c73b2efdb43fa9def3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Amadeusz=20S=C5=82awi=C5=84ski?= Date: Mon, 3 Jun 2024 12:28:17 +0200 Subject: [PATCH 0757/1578] ASoC: topology: Do not assign fields that are already set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The routes are allocated with kzalloc(), so all fields are zeroed by default, skip unnecessary assignments. Reviewed-by: Cezary Rojewski Signed-off-by: Amadeusz Sławiński Link: https://lore.kernel.org/r/20240603102818.36165-4-amadeuszx.slawinski@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/soc-topology.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c index 75d9395a18ed49..1db540aaad451f 100644 --- a/sound/soc/soc-topology.c +++ b/sound/soc/soc-topology.c @@ -1072,11 +1072,7 @@ static int soc_tplg_dapm_graph_elems_load(struct soc_tplg *tplg, break; } - /* set to NULL atm for tplg users */ - route->connected = NULL; - if (strnlen(elem->control, SNDRV_CTL_ELEM_ID_NAME_MAXLEN) == 0) { - route->control = NULL; - } else { + if (strnlen(elem->control, SNDRV_CTL_ELEM_ID_NAME_MAXLEN) != 0) { route->control = devm_kmemdup(tplg->dev, elem->control, min(strlen(elem->control), SNDRV_CTL_ELEM_ID_NAME_MAXLEN), From e0e7bc2cbee93778c4ad7d9a792d425ffb5af6f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Amadeusz=20S=C5=82awi=C5=84ski?= Date: Mon, 3 Jun 2024 12:28:18 +0200 Subject: [PATCH 0758/1578] ASoC: topology: Clean up route loading MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of using very long macro name, assign it to shorter variable and use it instead. While doing that, we can reduce multiple if checks using this define to one. Reviewed-by: Cezary Rojewski Signed-off-by: Amadeusz Sławiński Link: https://lore.kernel.org/r/20240603102818.36165-5-amadeuszx.slawinski@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/soc-topology.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c index 1db540aaad451f..2ac442644ed4f9 100644 --- a/sound/soc/soc-topology.c +++ b/sound/soc/soc-topology.c @@ -1021,6 +1021,7 @@ static int soc_tplg_dapm_graph_elems_load(struct soc_tplg *tplg, struct snd_soc_tplg_hdr *hdr) { struct snd_soc_dapm_context *dapm = &tplg->comp->dapm; + const size_t maxlen = SNDRV_CTL_ELEM_ID_NAME_MAXLEN; struct snd_soc_tplg_dapm_graph_elem *elem; struct snd_soc_dapm_route *route; int count, i; @@ -1044,38 +1045,27 @@ static int soc_tplg_dapm_graph_elems_load(struct soc_tplg *tplg, tplg->pos += sizeof(struct snd_soc_tplg_dapm_graph_elem); /* validate routes */ - if (strnlen(elem->source, SNDRV_CTL_ELEM_ID_NAME_MAXLEN) == - SNDRV_CTL_ELEM_ID_NAME_MAXLEN) { - ret = -EINVAL; - break; - } - if (strnlen(elem->sink, SNDRV_CTL_ELEM_ID_NAME_MAXLEN) == - SNDRV_CTL_ELEM_ID_NAME_MAXLEN) { - ret = -EINVAL; - break; - } - if (strnlen(elem->control, SNDRV_CTL_ELEM_ID_NAME_MAXLEN) == - SNDRV_CTL_ELEM_ID_NAME_MAXLEN) { + if ((strnlen(elem->source, maxlen) == maxlen) || + (strnlen(elem->sink, maxlen) == maxlen) || + (strnlen(elem->control, maxlen) == maxlen)) { ret = -EINVAL; break; } route->source = devm_kmemdup(tplg->dev, elem->source, - min(strlen(elem->source), - SNDRV_CTL_ELEM_ID_NAME_MAXLEN), + min(strlen(elem->source), maxlen), GFP_KERNEL); route->sink = devm_kmemdup(tplg->dev, elem->sink, - min(strlen(elem->sink), SNDRV_CTL_ELEM_ID_NAME_MAXLEN), + min(strlen(elem->sink), maxlen), GFP_KERNEL); if (!route->source || !route->sink) { ret = -ENOMEM; break; } - if (strnlen(elem->control, SNDRV_CTL_ELEM_ID_NAME_MAXLEN) != 0) { + if (strnlen(elem->control, maxlen) != 0) { route->control = devm_kmemdup(tplg->dev, elem->control, - min(strlen(elem->control), - SNDRV_CTL_ELEM_ID_NAME_MAXLEN), + min(strlen(elem->control), maxlen), GFP_KERNEL); if (!route->control) { ret = -ENOMEM; From e3209a1827646daaab744aa6a5767b1f57fb5385 Mon Sep 17 00:00:00 2001 From: Thomas GENTY Date: Sat, 8 Jun 2024 19:02:51 +0200 Subject: [PATCH 0759/1578] bytcr_rt5640 : inverse jack detect for Archos 101 cesium When headphones are plugged in, they appear absent; when they are removed, they appear present. Add a specific entry in bytcr_rt5640 for this device Signed-off-by: Thomas GENTY Reviewed-by: Hans de Goede Acked-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20240608170251.99936-1-tomlohave@gmail.com Signed-off-by: Mark Brown --- sound/soc/intel/boards/bytcr_rt5640.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sound/soc/intel/boards/bytcr_rt5640.c b/sound/soc/intel/boards/bytcr_rt5640.c index b41a1147f1c345..a64d1989e28a5e 100644 --- a/sound/soc/intel/boards/bytcr_rt5640.c +++ b/sound/soc/intel/boards/bytcr_rt5640.c @@ -610,6 +610,17 @@ static const struct dmi_system_id byt_rt5640_quirk_table[] = { BYT_RT5640_SSP0_AIF1 | BYT_RT5640_MCLK_EN), }, + { + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "ARCHOS"), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "ARCHOS 101 CESIUM"), + }, + .driver_data = (void *)(BYTCR_INPUT_DEFAULTS | + BYT_RT5640_JD_NOT_INV | + BYT_RT5640_DIFF_MIC | + BYT_RT5640_SSP0_AIF1 | + BYT_RT5640_MCLK_EN), + }, { .matches = { DMI_EXACT_MATCH(DMI_SYS_VENDOR, "ARCHOS"), From d37fe4255abe8e7b419b90c5847e8ec2b8debb08 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 6 Jun 2024 15:46:51 +0000 Subject: [PATCH 0760/1578] tcp: fix race in tcp_v6_syn_recv_sock() tcp_v6_syn_recv_sock() calls ip6_dst_store() before inet_sk(newsk)->pinet6 has been set up. This means ip6_dst_store() writes over the parent (listener) np->dst_cookie. This is racy because multiple threads could share the same parent and their final np->dst_cookie could be wrong. Move ip6_dst_store() call after inet_sk(newsk)->pinet6 has been changed and after the copy of parent ipv6_pinfo. Fixes: e994b2f0fb92 ("tcp: do not lock listener to process SYN packets") Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 8c577b651bfcd2..729faf8bd366ad 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1439,7 +1439,6 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * */ newsk->sk_gso_type = SKB_GSO_TCPV6; - ip6_dst_store(newsk, dst, NULL, NULL); inet6_sk_rx_dst_set(newsk, skb); inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk); @@ -1450,6 +1449,8 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + ip6_dst_store(newsk, dst, NULL, NULL); + newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr; newnp->saddr = ireq->ir_v6_loc_addr; newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr; From d029edefed39647c797c2710aedd9d31f84c069e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 6 Jun 2024 19:13:03 +0300 Subject: [PATCH 0761/1578] net dsa: qca8k: fix usages of device_get_named_child_node() The documentation for device_get_named_child_node() mentions this important point: " The caller is responsible for calling fwnode_handle_put() on the returned fwnode pointer. " Add fwnode_handle_put() to avoid leaked references. Fixes: 1e264f9d2918 ("net: dsa: qca8k: add LEDs basic support") Reviewed-by: Simon Horman Signed-off-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/net/dsa/qca/qca8k-leds.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/qca/qca8k-leds.c b/drivers/net/dsa/qca/qca8k-leds.c index 811ebeeff4ed7d..43ac68052baf9f 100644 --- a/drivers/net/dsa/qca/qca8k-leds.c +++ b/drivers/net/dsa/qca/qca8k-leds.c @@ -431,8 +431,11 @@ qca8k_parse_port_leds(struct qca8k_priv *priv, struct fwnode_handle *port, int p init_data.devicename = kasprintf(GFP_KERNEL, "%s:0%d", priv->internal_mdio_bus->id, port_num); - if (!init_data.devicename) + if (!init_data.devicename) { + fwnode_handle_put(led); + fwnode_handle_put(leds); return -ENOMEM; + } ret = devm_led_classdev_register_ext(priv->dev, &port_led->cdev, &init_data); if (ret) @@ -441,6 +444,7 @@ qca8k_parse_port_leds(struct qca8k_priv *priv, struct fwnode_handle *port, int p kfree(init_data.devicename); } + fwnode_handle_put(leds); return 0; } @@ -471,9 +475,13 @@ qca8k_setup_led_ctrl(struct qca8k_priv *priv) * the correct port for LED setup. */ ret = qca8k_parse_port_leds(priv, port, qca8k_port_to_phy(port_num)); - if (ret) + if (ret) { + fwnode_handle_put(port); + fwnode_handle_put(ports); return ret; + } } + fwnode_handle_put(ports); return 0; } From c6ae073f5903f6c6439d0ac855836a4da5c0a701 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Thu, 6 Jun 2024 23:32:48 +0300 Subject: [PATCH 0762/1578] geneve: Fix incorrect inner network header offset when innerprotoinherit is set When innerprotoinherit is set, the tunneled packets do not have an inner Ethernet header. Change 'maclen' to not always assume the header length is ETH_HLEN, as there might not be a MAC header. This resolves issues with drivers (e.g. mlx5, in mlx5e_tx_tunnel_accel()) who rely on the skb inner network header offset to be correct, and use it for TX offloads. Fixes: d8a6213d70ac ("geneve: fix header validation in geneve[6]_xmit_skb") Signed-off-by: Gal Pressman Signed-off-by: Tariq Toukan Reviewed-by: Wojciech Drewek Signed-off-by: David S. Miller --- drivers/net/geneve.c | 10 ++++++---- include/net/ip_tunnels.h | 5 +++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 51495cb4b9be49..838e85ddec6710 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -815,6 +815,7 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, struct geneve_dev *geneve, const struct ip_tunnel_info *info) { + bool inner_proto_inherit = geneve->cfg.inner_proto_inherit; bool xnet = !net_eq(geneve->net, dev_net(geneve->dev)); struct geneve_sock *gs4 = rcu_dereference(geneve->sock4); const struct ip_tunnel_key *key = &info->key; @@ -826,7 +827,7 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, __be16 sport; int err; - if (!skb_vlan_inet_prepare(skb)) + if (!skb_vlan_inet_prepare(skb, inner_proto_inherit)) return -EINVAL; if (!gs4) @@ -908,7 +909,7 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, } err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr), - geneve->cfg.inner_proto_inherit); + inner_proto_inherit); if (unlikely(err)) return err; @@ -925,6 +926,7 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, struct geneve_dev *geneve, const struct ip_tunnel_info *info) { + bool inner_proto_inherit = geneve->cfg.inner_proto_inherit; bool xnet = !net_eq(geneve->net, dev_net(geneve->dev)); struct geneve_sock *gs6 = rcu_dereference(geneve->sock6); const struct ip_tunnel_key *key = &info->key; @@ -935,7 +937,7 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, __be16 sport; int err; - if (!skb_vlan_inet_prepare(skb)) + if (!skb_vlan_inet_prepare(skb, inner_proto_inherit)) return -EINVAL; if (!gs6) @@ -997,7 +999,7 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, ttl = ttl ? : ip6_dst_hoplimit(dst); } err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr), - geneve->cfg.inner_proto_inherit); + inner_proto_inherit); if (unlikely(err)) return err; diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 9a6a08ec77139c..1db2417b8ff52d 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -461,9 +461,10 @@ static inline bool pskb_inet_may_pull(struct sk_buff *skb) /* Variant of pskb_inet_may_pull(). */ -static inline bool skb_vlan_inet_prepare(struct sk_buff *skb) +static inline bool skb_vlan_inet_prepare(struct sk_buff *skb, + bool inner_proto_inherit) { - int nhlen = 0, maclen = ETH_HLEN; + int nhlen = 0, maclen = inner_proto_inherit ? 0 : ETH_HLEN; __be16 type = skb->protocol; /* Essentially this is skb_protocol(skb, true) From 791b4089e326271424b78f2fae778b20e53d071b Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Thu, 6 Jun 2024 23:32:49 +0300 Subject: [PATCH 0763/1578] net/mlx5e: Fix features validation check for tunneled UDP (non-VXLAN) packets Move the vxlan_features_check() call to after we verified the packet is a tunneled VXLAN packet. Without this, tunneled UDP non-VXLAN packets (for ex. GENENVE) might wrongly not get offloaded. In some cases, it worked by chance as GENEVE header is the same size as VXLAN, but it is obviously incorrect. Fixes: e3cfc7e6b7bd ("net/mlx5e: TX, Add geneve tunnel stateless offload support") Signed-off-by: Gal Pressman Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Reviewed-by: Wojciech Drewek Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index c53c99dde55872..a605eae56685df 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4875,7 +4875,7 @@ static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv, /* Verify if UDP port is being offloaded by HW */ if (mlx5_vxlan_lookup_port(priv->mdev->vxlan, port)) - return features; + return vxlan_features_check(skb, features); #if IS_ENABLED(CONFIG_GENEVE) /* Support Geneve offload for default UDP port */ @@ -4901,7 +4901,6 @@ netdev_features_t mlx5e_features_check(struct sk_buff *skb, struct mlx5e_priv *priv = netdev_priv(netdev); features = vlan_features_check(skb, features); - features = vxlan_features_check(skb, features); /* Validate if the tunneled packet is being offloaded by HW */ if (skb->encapsulation && From 86fbd9f63a6b42b8f158361334f5a25762aea358 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 15 May 2024 10:32:01 -0400 Subject: [PATCH 0764/1578] Bluetooth: hci_sync: Fix not using correct handle When setting up an advertisement the code shall always attempt to use the handle set by the instance since it may not be equal to the instance ID. Fixes: e77f43d531af ("Bluetooth: hci_core: Fix not handling hdev->le_num_of_adv_sets=1") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 16daa79b7981f9..a8a7d2b368701b 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -1194,7 +1194,7 @@ int hci_setup_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance) cp.own_addr_type = own_addr_type; cp.channel_map = hdev->le_adv_channel_map; - cp.handle = instance; + cp.handle = adv ? adv->handle : instance; if (flags & MGMT_ADV_FLAG_SEC_2M) { cp.primary_phy = HCI_ADV_PHY_1M; From 806a5198c05987b748b50f3d0c0cfb3d417381a4 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 20 May 2024 16:03:07 -0400 Subject: [PATCH 0765/1578] Bluetooth: L2CAP: Fix rejecting L2CAP_CONN_PARAM_UPDATE_REQ This removes the bogus check for max > hcon->le_conn_max_interval since the later is just the initial maximum conn interval not the maximum the stack could support which is really 3200=4000ms. In order to pass GAP/CONN/CPUP/BV-05-C one shall probably enter values of the following fields in IXIT that would cause hci_check_conn_params to fail: TSPX_conn_update_int_min TSPX_conn_update_int_max TSPX_conn_update_peripheral_latency TSPX_conn_update_supervision_timeout Link: https://github.com/bluez/bluez/issues/847 Fixes: e4b019515f95 ("Bluetooth: Enforce validation on max value of connection interval") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 36 ++++++++++++++++++++++++++++---- net/bluetooth/l2cap_core.c | 8 +------ 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 9231396fe96fd2..c43716edf20566 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -2113,18 +2113,46 @@ static inline int hci_check_conn_params(u16 min, u16 max, u16 latency, { u16 max_latency; - if (min > max || min < 6 || max > 3200) + if (min > max) { + BT_WARN("min %d > max %d", min, max); return -EINVAL; + } + + if (min < 6) { + BT_WARN("min %d < 6", min); + return -EINVAL; + } + + if (max > 3200) { + BT_WARN("max %d > 3200", max); + return -EINVAL; + } + + if (to_multiplier < 10) { + BT_WARN("to_multiplier %d < 10", to_multiplier); + return -EINVAL; + } - if (to_multiplier < 10 || to_multiplier > 3200) + if (to_multiplier > 3200) { + BT_WARN("to_multiplier %d > 3200", to_multiplier); return -EINVAL; + } - if (max >= to_multiplier * 8) + if (max >= to_multiplier * 8) { + BT_WARN("max %d >= to_multiplier %d * 8", max, to_multiplier); return -EINVAL; + } max_latency = (to_multiplier * 4 / max) - 1; - if (latency > 499 || latency > max_latency) + if (latency > 499) { + BT_WARN("latency %d > 499", latency); return -EINVAL; + } + + if (latency > max_latency) { + BT_WARN("latency %d > max_latency %d", latency, max_latency); + return -EINVAL; + } return 0; } diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 5b509b76755723..c49e0d4b3c0dd1 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -4647,13 +4647,7 @@ static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn, memset(&rsp, 0, sizeof(rsp)); - if (max > hcon->le_conn_max_interval) { - BT_DBG("requested connection interval exceeds current bounds."); - err = -EINVAL; - } else { - err = hci_check_conn_params(min, max, latency, to_multiplier); - } - + err = hci_check_conn_params(min, max, latency, to_multiplier); if (err) rsp.result = cpu_to_le16(L2CAP_CONN_PARAM_REJECTED); else From c695439d198d30e10553a3b98360c5efe77b6903 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Sun, 9 Jun 2024 18:06:20 +0300 Subject: [PATCH 0766/1578] Bluetooth: fix connection setup in l2cap_connect The amp_id argument of l2cap_connect() was removed in commit 84a4bb6548a2 ("Bluetooth: HCI: Remove HCI_AMP support") It was always called with amp_id == 0, i.e. AMP_ID_BREDR == 0x00 (ie. non-AMP controller). In the above commit, the code path for amp_id != 0 was preserved, although it should have used the amp_id == 0 one. Restore the previous behavior of the non-AMP code path, to fix problems with L2CAP connections. Fixes: 84a4bb6548a2 ("Bluetooth: HCI: Remove HCI_AMP support") Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index c49e0d4b3c0dd1..aed025734d0479 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -4011,8 +4011,8 @@ static void l2cap_connect(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, status = L2CAP_CS_AUTHOR_PEND; chan->ops->defer(chan); } else { - l2cap_state_change(chan, BT_CONNECT2); - result = L2CAP_CR_PEND; + l2cap_state_change(chan, BT_CONFIG); + result = L2CAP_CR_SUCCESS; status = L2CAP_CS_NO_INFO; } } else { From a8768a134518e406d41799a3594aeb74e0889cf7 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Wed, 8 May 2024 17:20:53 +0800 Subject: [PATCH 0767/1578] md: do not delete safemode_timer in mddev_suspend The deletion of safemode_timer in mddev_suspend() is redundant and potentially harmful now. If timer is about to be woken up but gets deleted, 'in_sync' will remain 0 until the next write, causing array to stay in the 'active' state instead of transitioning to 'clean'. Commit 0d9f4f135eb6 ("MD: Add del_timer_sync to mddev_suspend (fix nasty panic))" introduced this deletion for dm, because if timer fired after dm is destroyed, the resource which the timer depends on might have been freed. However, commit 0dd84b319352 ("md: call __md_stop_writes in md_stop") added __md_stop_writes() to md_stop(), which is called before freeing resource. Timer is deleted in __md_stop_writes(), and the origin issue is resolved. Therefore, delete safemode_timer can be removed safely now. Signed-off-by: Li Nan Reviewed-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240508092053.1447930-1-linan666@huaweicloud.com --- drivers/md/md.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index aff9118ff69750..09c55d9a2c5424 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -479,7 +479,6 @@ int mddev_suspend(struct mddev *mddev, bool interruptible) */ WRITE_ONCE(mddev->suspended, mddev->suspended + 1); - del_timer_sync(&mddev->safemode_timer); /* restrict memory reclaim I/O during raid array is suspend */ mddev->noio_flag = memalloc_noio_save(); From 161f73c2c7d061a78390388811e3a6d11e99ce9d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 5 Jun 2024 11:08:20 -0400 Subject: [PATCH 0768/1578] bcachefs: Split out btree_write_submit_wq Split the workqueues for btree read completions and btree write submissions; we don't want concurrency control on btree read completions, but we do want concurrency control on write submissions, else blocking in submit_bio() will cause a ton of kworkers to be allocated. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 ++- fs/bcachefs/btree_io.c | 8 ++++---- fs/bcachefs/super.c | 10 +++++++--- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 2a538eb2af110c..2992a644d822c0 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -790,7 +790,8 @@ struct bch_fs { /* BTREE CACHE */ struct bio_set btree_bio; - struct workqueue_struct *io_complete_wq; + struct workqueue_struct *btree_read_complete_wq; + struct workqueue_struct *btree_write_submit_wq; struct btree_root btree_roots_known[BTREE_ID_NR]; DARRAY(struct btree_root) btree_roots_extra; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 829c1b91477d79..7bca15c604f539 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1389,7 +1389,7 @@ static void btree_node_read_endio(struct bio *bio) bch2_latency_acct(ca, rb->start_time, READ); } - queue_work(c->io_complete_wq, &rb->work); + queue_work(c->btree_read_complete_wq, &rb->work); } struct btree_node_read_all { @@ -1656,7 +1656,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool btree_node_read_all_replicas_done(&ra->cl.work); } else { continue_at(&ra->cl, btree_node_read_all_replicas_done, - c->io_complete_wq); + c->btree_read_complete_wq); } return 0; @@ -1737,7 +1737,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, if (sync) btree_node_read_work(&rb->work); else - queue_work(c->io_complete_wq, &rb->work); + queue_work(c->btree_read_complete_wq, &rb->work); } } @@ -2229,7 +2229,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes); INIT_WORK(&wbio->work, btree_write_submit); - queue_work(c->io_complete_wq, &wbio->work); + queue_work(c->btree_write_submit_wq, &wbio->work); return; err: set_btree_node_noevict(b); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index df2bea38e83f0f..65e239d329157c 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -582,8 +582,10 @@ static void __bch2_fs_free(struct bch_fs *c) if (c->write_ref_wq) destroy_workqueue(c->write_ref_wq); - if (c->io_complete_wq) - destroy_workqueue(c->io_complete_wq); + if (c->btree_write_submit_wq) + destroy_workqueue(c->btree_write_submit_wq); + if (c->btree_read_complete_wq) + destroy_workqueue(c->btree_read_complete_wq); if (c->copygc_wq) destroy_workqueue(c->copygc_wq); if (c->btree_io_complete_wq) @@ -878,8 +880,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || - !(c->io_complete_wq = alloc_workqueue("bcachefs_io", + !(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) || + !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", WQ_FREEZABLE, 0)) || #ifndef BCH_WRITE_REF_DEBUG From 1c8cc24eef4a0e824f75e38f82766e4baede24ca Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 5 Jun 2024 21:16:29 -0400 Subject: [PATCH 0769/1578] bcachefs: Fix incorrect error handling found_btree_node_is_readable() error handling here is slightly odd, which is why we were accidently calling evict() on an error pointer Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_node_scan.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 45cb8149d374c2..2cb0442f6cc905 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -72,10 +72,11 @@ static bool found_btree_node_is_readable(struct btree_trans *trans, struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false); bool ret = !IS_ERR_OR_NULL(b); - if (ret) { - f->sectors_written = b->written; - six_unlock_read(&b->c.lock); - } + if (!ret) + return ret; + + f->sectors_written = b->written; + six_unlock_read(&b->c.lock); /* * We might update this node's range; if that happens, we need the node From 04f635ede85b2e7457f3029b9179079a8ac42ff4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Jun 2024 14:02:11 -0400 Subject: [PATCH 0770/1578] bcachefs: Delete incorrect BTREE_ID_NR assertion for forwards compat we now explicitly allow mounting and using filesystems with unknown btrees, and we have to walk them for fsck. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index d3bcb4e4e23060..53b63be537e559 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -221,11 +221,8 @@ static void bch2_btree_path_verify(struct btree_trans *trans, struct btree_path *path) { struct bch_fs *c = trans->c; - unsigned i; - - EBUG_ON(path->btree_id >= BTREE_ID_NR); - for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) { + for (unsigned i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) { if (!path->l[i].b) { BUG_ON(!path->cached && bch2_btree_id_root(c, path->btree_id)->b->c.level > i); @@ -251,8 +248,6 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; - BUG_ON(iter->btree_id >= BTREE_ID_NR); - BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached); BUG_ON((iter->flags & BTREE_ITER_is_extents) && From dab1870439a1176969c5bf06247e088ad0a3551d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 5 Jun 2024 21:45:24 -0400 Subject: [PATCH 0771/1578] bcachefs: fix stack frame size in fsck.c fsck.c always runs top of the stack so we're not too concerned here; noinline_for_stack is sufficient Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index fd277bd58ed34a..921bcdb3e5e4ed 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1677,6 +1677,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) trans_was_restarted(trans, restart_count); } +noinline_for_stack static int check_dirent_inode_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c_dirent d, @@ -1773,6 +1774,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans, return ret; } +noinline_for_stack static int check_dirent_target(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c_dirent d, @@ -1847,6 +1849,7 @@ static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *su return ret; } +noinline_for_stack static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c_dirent d) { From 26447d224a7f48f669bf95a98fa29c8f50da4d63 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Mon, 3 Jun 2024 21:23:35 +0800 Subject: [PATCH 0772/1578] bcachefs: fix the display format for show-super There are three keys displayed in non-uniform format. Let's fix them. [Before] ``` Label: testbcachefs Version: 1.9: (unknown version) Version upgrade complete: 0.0: (unknown version) ``` [After] ``` Label: testbcachefs Version: 1.9: (unknown version) Version upgrade complete: 0.0: (unknown version) ``` Fixes: 7423330e30ab ("bcachefs: prt_printf() now respects \r\n\t") Signed-off-by: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index d73a0222f70954..055478d21e9ef8 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1310,15 +1310,15 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, prt_printf(out, "Device index:\t%u\n", sb->dev_idx); - prt_str(out, "Label:\t"); + prt_printf(out, "Label:\t"); prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label); prt_newline(out); - prt_str(out, "Version:\t"); + prt_printf(out, "Version:\t"); bch2_version_to_text(out, le16_to_cpu(sb->version)); prt_newline(out); - prt_str(out, "Version upgrade complete:\t"); + prt_printf(out, "Version upgrade complete:\t"); bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb)); prt_newline(out); From 5ae67abcdfdfa49de84be00320ffe8a669ef674f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Jun 2024 18:56:59 -0400 Subject: [PATCH 0773/1578] bcachefs: Enable automatic shrinking for rhashtables Since the key cache shrinker walks the rhashtable, a mostly empty rhashtable leads to really nasty reclaim performance issues. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 9 +++++---- fs/bcachefs/btree_key_cache.c | 9 +++++---- fs/bcachefs/io_read.c | 7 ++++--- fs/bcachefs/movinggc.c | 7 ++++--- 4 files changed, 18 insertions(+), 14 deletions(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 9e4ed75d367562..4f5e411771ba1f 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -91,10 +91,11 @@ static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, } static const struct rhashtable_params bch_btree_cache_params = { - .head_offset = offsetof(struct btree, hash), - .key_offset = offsetof(struct btree, hash_val), - .key_len = sizeof(u64), - .obj_cmpfn = bch2_btree_cache_cmp_fn, + .head_offset = offsetof(struct btree, hash), + .key_offset = offsetof(struct btree, hash_val), + .key_len = sizeof(u64), + .obj_cmpfn = bch2_btree_cache_cmp_fn, + .automatic_shrinking = true, }; static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 34056aaece009e..fb731d52b1eccb 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -32,10 +32,11 @@ static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, } static const struct rhashtable_params bch2_btree_key_cache_params = { - .head_offset = offsetof(struct bkey_cached, hash), - .key_offset = offsetof(struct bkey_cached, key), - .key_len = sizeof(struct bkey_cached_key), - .obj_cmpfn = bch2_btree_key_cache_cmp_fn, + .head_offset = offsetof(struct bkey_cached, hash), + .key_offset = offsetof(struct bkey_cached, key), + .key_len = sizeof(struct bkey_cached_key), + .obj_cmpfn = bch2_btree_key_cache_cmp_fn, + .automatic_shrinking = true, }; __flatten diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index f57486794484d9..862b79f86b91e2 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -84,9 +84,10 @@ struct promote_op { }; static const struct rhashtable_params bch_promote_params = { - .head_offset = offsetof(struct promote_op, hash), - .key_offset = offsetof(struct promote_op, pos), - .key_len = sizeof(struct bpos), + .head_offset = offsetof(struct promote_op, hash), + .key_offset = offsetof(struct promote_op, pos), + .key_len = sizeof(struct bpos), + .automatic_shrinking = true, }; static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 10bfb31c151b20..eb49dd045eff5c 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -35,9 +35,10 @@ struct buckets_in_flight { }; static const struct rhashtable_params bch_move_bucket_params = { - .head_offset = offsetof(struct move_bucket_in_flight, hash), - .key_offset = offsetof(struct move_bucket_in_flight, bucket.k), - .key_len = sizeof(struct move_bucket_key), + .head_offset = offsetof(struct move_bucket_in_flight, hash), + .key_offset = offsetof(struct move_bucket_in_flight, bucket.k), + .key_len = sizeof(struct move_bucket_key), + .automatic_shrinking = true, }; static struct move_bucket_in_flight * From bc65e98e68dac2c0b588e67ea75ee8674c208fc7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Jun 2024 19:12:11 -0400 Subject: [PATCH 0774/1578] bcachefs: increase key cache shrinker batch size Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index fb731d52b1eccb..eaf012ddca08ca 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -1026,9 +1026,10 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) if (!shrink) return -BCH_ERR_ENOMEM_fs_btree_cache_init; bc->shrink = shrink; - shrink->seeks = 0; shrink->count_objects = bch2_btree_key_cache_count; shrink->scan_objects = bch2_btree_key_cache_scan; + shrink->batch = 1 << 14; + shrink->seeks = 0; shrink->private_data = c; shrinker_register(shrink); return 0; From 9ac3e660cac3e29cfc817b6a23735b70f12bd16a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Jun 2024 19:30:41 -0400 Subject: [PATCH 0775/1578] bcachefs: set sb->s_shrinker->seeks = 0 inodes and dentries are still present in the btree node cache, in much more compact form Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index cd388f1702dc85..f6f1dbc1fe15c5 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1967,6 +1967,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); sb->s_uuid = c->sb.user_uuid; + sb->s_shrink->seeks = 0; c->vfs_sb = sb; strscpy(sb->s_id, c->name, sizeof(sb->s_id)); From 2760bfe38826f65b1806f1cc62744404b5917dea Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Jun 2024 20:01:34 -0400 Subject: [PATCH 0776/1578] bcachefs: Fix reporting of freed objects from key cache shrinker We count objects as freed when we move them to the srcu-pending lists because we're doing the equivalent of a kfree_srcu(); the only difference is managing the pending list ourself means we can allocate from the pending list. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index eaf012ddca08ca..2ad3d27c91e6fe 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -841,7 +841,6 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); - freed++; bc->nr_freed_nonpcpu--; bc->freed++; } @@ -855,7 +854,6 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); - freed++; bc->nr_freed_pcpu--; bc->freed++; } @@ -877,23 +875,22 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { bc->skipped_dirty++; - goto next; } else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) { clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); bc->skipped_accessed++; - goto next; - } else if (bkey_cached_lock_for_evict(ck)) { + } else if (!bkey_cached_lock_for_evict(ck)) { + bc->skipped_lock_fail++; + } else { bkey_cached_evict(bc, ck); bkey_cached_free(bc, ck); bc->moved_to_freelist++; - } else { - bc->skipped_lock_fail++; + freed++; } scanned++; if (scanned >= nr) break; -next: + pos = next; } From bf2b356afdcafa18db1b409f7039059d1fd6f25f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 Jun 2024 14:25:18 -0400 Subject: [PATCH 0777/1578] bcachefs: Leave a buffer in the btree key cache to avoid lock thrashing Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 2ad3d27c91e6fe..2d3c0d45c37f93 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -915,6 +915,14 @@ static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, long nr = atomic_long_read(&bc->nr_keys) - atomic_long_read(&bc->nr_dirty); + /* + * Avoid hammering our shrinker too much if it's nearly empty - the + * shrinker code doesn't take into account how big our cache is, if it's + * mostly empty but the system is under memory pressure it causes nasty + * lock contention: + */ + nr -= 128; + return max(0L, nr); } From f9035b0ce60cfaf8abd7e1cd5c55690c739aaaf6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Jun 2024 21:59:12 -0400 Subject: [PATCH 0778/1578] bcachefs: Fix refcount leak in check_fix_ptrs() fsck_err() does a goto fsck_err on error; factor out check_fix_ptr() so that our error label can drop our device ref. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 249 ++++++++++++++++++++++-------------------- 1 file changed, 133 insertions(+), 116 deletions(-) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index ed97712d0db1ed..75a54ed977d726 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -465,143 +465,161 @@ int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64 return bch2_update_replicas_list(trans, &r.e, sectors); } -int bch2_check_fix_ptrs(struct btree_trans *trans, - enum btree_id btree, unsigned level, struct bkey_s_c k, - enum btree_iter_update_trigger_flags flags) +static int bch2_check_fix_ptr(struct btree_trans *trans, + struct bkey_s_c k, + struct extent_ptr_decoded p, + const union bch_extent_entry *entry, + bool *do_update) { struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry_c; - struct extent_ptr_decoded p = { 0 }; - bool do_update = false; struct printbuf buf = PRINTBUF; int ret = 0; - percpu_down_read(&c->mark_lock); + struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); + if (!ca) { + if (fsck_err(c, ptr_to_invalid_device, + "pointer to missing device %u\n" + "while marking %s", + p.ptr.dev, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + *do_update = true; + return 0; + } - bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) { - struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); - if (!ca) { - if (fsck_err(c, ptr_to_invalid_device, - "pointer to missing device %u\n" - "while marking %s", - p.ptr.dev, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - do_update = true; - continue; - } + struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry); - struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry_c); + if (fsck_err_on(!g->gen_valid, + c, ptr_to_missing_alloc_key, + "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + if (!p.ptr.cached) { + g->gen_valid = true; + g->gen = p.ptr.gen; + } else { + *do_update = true; + } + } - if (fsck_err_on(!g->gen_valid, - c, ptr_to_missing_alloc_key, - "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - if (!p.ptr.cached) { - g->gen_valid = true; - g->gen = p.ptr.gen; - } else { - do_update = true; - } + if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, + c, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + if (!p.ptr.cached && + (g->data_type != BCH_DATA_btree || + data_type == BCH_DATA_btree)) { + g->gen_valid = true; + g->gen = p.ptr.gen; + g->data_type = 0; + g->dirty_sectors = 0; + g->cached_sectors = 0; + } else { + *do_update = true; } + } - if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, - c, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - if (!p.ptr.cached && - (g->data_type != BCH_DATA_btree || - data_type == BCH_DATA_btree)) { - g->gen_valid = true; - g->gen = p.ptr.gen; - g->data_type = 0; - g->dirty_sectors = 0; - g->cached_sectors = 0; - } else { - do_update = true; - } + if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, + c, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, + bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + *do_update = true; + + if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0, + c, stale_dirty_ptr, + "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + *do_update = true; + + if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) + goto out; + + if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type), + c, ptr_bucket_data_type_mismatch, + "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, + bch2_data_type_str(g->data_type), + bch2_data_type_str(data_type), + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + if (data_type == BCH_DATA_btree) { + g->gen_valid = true; + g->gen = p.ptr.gen; + g->data_type = data_type; + g->dirty_sectors = 0; + g->cached_sectors = 0; + } else { + *do_update = true; } + } - if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, - c, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + if (p.has_ec) { + struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); + + if (fsck_err_on(!m || !m->alive, c, + ptr_to_missing_stripe, + "pointer to nonexistent stripe %llu\n" "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, - bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), - p.ptr.gen, + (u64) p.ec.idx, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - do_update = true; + *do_update = true; - if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0, - c, stale_dirty_ptr, - "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" + if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c, + ptr_to_incorrect_stripe, + "pointer does not match stripe %llu\n" "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), - p.ptr.gen, g->gen, + (u64) p.ec.idx, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - do_update = true; + *do_update = true; + } +out: +fsck_err: + bch2_dev_put(ca); + printbuf_exit(&buf); + return ret; +} - if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) - goto next; +int bch2_check_fix_ptrs(struct btree_trans *trans, + enum btree_id btree, unsigned level, struct bkey_s_c k, + enum btree_iter_update_trigger_flags flags) +{ + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry_c; + struct extent_ptr_decoded p = { 0 }; + bool do_update = false; + struct printbuf buf = PRINTBUF; + int ret = 0; - if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type), - c, ptr_bucket_data_type_mismatch, - "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, - bch2_data_type_str(g->data_type), - bch2_data_type_str(data_type), - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - if (data_type == BCH_DATA_btree) { - g->gen_valid = true; - g->gen = p.ptr.gen; - g->data_type = data_type; - g->dirty_sectors = 0; - g->cached_sectors = 0; - } else { - do_update = true; - } - } + percpu_down_read(&c->mark_lock); - if (p.has_ec) { - struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); - - if (fsck_err_on(!m || !m->alive, c, - ptr_to_missing_stripe, - "pointer to nonexistent stripe %llu\n" - "while marking %s", - (u64) p.ec.idx, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - do_update = true; - - if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c, - ptr_to_incorrect_stripe, - "pointer does not match stripe %llu\n" - "while marking %s", - (u64) p.ec.idx, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - do_update = true; - } -next: - bch2_dev_put(ca); + bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) { + ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update); + if (ret) + goto err; } if (do_update) { @@ -716,7 +734,6 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, bch2_btree_node_update_key_early(trans, btree, level - 1, k, new); } err: -fsck_err: percpu_up_read(&c->mark_lock); printbuf_exit(&buf); return ret; From e0cb5722e112811d32d600ef750f9b39e6f684ca Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 Jun 2024 21:02:06 -0400 Subject: [PATCH 0779/1578] bcachefs: Fix snapshot_create_lock lock ordering ====================================================== WARNING: possible circular locking dependency detected 6.10.0-rc2-ktest-00018-gebd1d148b278 #144 Not tainted ------------------------------------------------------ fio/1345 is trying to acquire lock: ffff88813e200ab8 (&c->snapshot_create_lock){++++}-{3:3}, at: bch2_truncate+0x76/0xf0 but task is already holding lock: ffff888105a1fa38 (&sb->s_type->i_mutex_key#13){+.+.}-{3:3}, at: do_truncate+0x7b/0xc0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (&sb->s_type->i_mutex_key#13){+.+.}-{3:3}: down_write+0x3d/0xd0 bch2_write_iter+0x1c0/0x10f0 vfs_write+0x24a/0x560 __x64_sys_pwrite64+0x77/0xb0 x64_sys_call+0x17e5/0x1ab0 do_syscall_64+0x68/0x130 entry_SYSCALL_64_after_hwframe+0x4b/0x53 -> #1 (sb_writers#10){.+.+}-{0:0}: mnt_want_write+0x4a/0x1d0 filename_create+0x69/0x1a0 user_path_create+0x38/0x50 bch2_fs_file_ioctl+0x315/0xbf0 __x64_sys_ioctl+0x297/0xaf0 x64_sys_call+0x10cb/0x1ab0 do_syscall_64+0x68/0x130 entry_SYSCALL_64_after_hwframe+0x4b/0x53 -> #0 (&c->snapshot_create_lock){++++}-{3:3}: __lock_acquire+0x1445/0x25b0 lock_acquire+0xbd/0x2b0 down_read+0x40/0x180 bch2_truncate+0x76/0xf0 bchfs_truncate+0x240/0x3f0 bch2_setattr+0x7b/0xb0 notify_change+0x322/0x4b0 do_truncate+0x8b/0xc0 do_ftruncate+0x110/0x270 __x64_sys_ftruncate+0x43/0x80 x64_sys_call+0x1373/0x1ab0 do_syscall_64+0x68/0x130 entry_SYSCALL_64_after_hwframe+0x4b/0x53 other info that might help us debug this: Chain exists of: &c->snapshot_create_lock --> sb_writers#10 --> &sb->s_type->i_mutex_key#13 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&sb->s_type->i_mutex_key#13); lock(sb_writers#10); lock(&sb->s_type->i_mutex_key#13); rlock(&c->snapshot_create_lock); *** DEADLOCK *** Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-ioctl.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 205a323ffc6ddd..3551a737181b2e 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -308,8 +308,8 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) return ret; } -static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, - struct bch_ioctl_subvolume arg) +static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, + struct bch_ioctl_subvolume arg) { struct inode *dir; struct bch_inode_info *inode; @@ -406,9 +406,12 @@ static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, !arg.src_ptr) snapshot_src.subvol = inode_inum(to_bch_ei(dir)).subvol; + down_write(&c->snapshot_create_lock); inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir), dst_dentry, arg.mode|S_IFDIR, 0, snapshot_src, create_flags); + up_write(&c->snapshot_create_lock); + error = PTR_ERR_OR_ZERO(inode); if (error) goto err3; @@ -429,16 +432,6 @@ static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, return error; } -static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, - struct bch_ioctl_subvolume arg) -{ - down_write(&c->snapshot_create_lock); - long ret = __bch2_ioctl_subvolume_create(c, filp, arg); - up_write(&c->snapshot_create_lock); - - return ret; -} - static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, struct bch_ioctl_subvolume arg) { From 9c4acd19bbff5db4629c193366f82960e38d1c6f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Jun 2024 14:50:06 -0400 Subject: [PATCH 0780/1578] bcachefs: Replace bucket_valid() asserts in bucket lookup with proper checks The bucket_gens array and gc_buckets array known their own size; we should be using those members, and returning an error. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 2 ++ fs/bcachefs/buckets.c | 2 ++ fs/bcachefs/buckets.h | 6 ++++-- fs/bcachefs/buckets_types.h | 2 ++ 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index dc97991bcd6ada..130a0131cd73e4 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -990,6 +990,8 @@ static int bch2_gc_alloc_start(struct bch_fs *c) buckets->first_bucket = ca->mi.first_bucket; buckets->nbuckets = ca->mi.nbuckets; + buckets->nbuckets_minus_first = + buckets->nbuckets - buckets->first_bucket; rcu_assign_pointer(ca->buckets_gc, buckets); } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 75a54ed977d726..99a7824d0de2d8 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1612,6 +1612,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) bucket_gens->first_bucket = ca->mi.first_bucket; bucket_gens->nbuckets = nbuckets; + bucket_gens->nbuckets_minus_first = + bucket_gens->nbuckets - bucket_gens->first_bucket; if (resize) { down_write(&c->gc_lock); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 617ffde2fb7ad9..e1a5e3082bbfb8 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -93,7 +93,8 @@ static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) { struct bucket_array *buckets = gc_bucket_array(ca); - BUG_ON(!bucket_valid(ca, b)); + if (b - buckets->first_bucket >= buckets->nbuckets_minus_first) + return NULL; return buckets->b + b; } @@ -110,7 +111,8 @@ static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) { struct bucket_gens *gens = bucket_gens(ca); - BUG_ON(!bucket_valid(ca, b)); + if (b - gens->first_bucket >= gens->nbuckets_minus_first) + return NULL; return gens->b + b; } diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 6a31740222a713..f636e17c4cafe0 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -22,6 +22,7 @@ struct bucket_array { struct rcu_head rcu; u16 first_bucket; size_t nbuckets; + size_t nbuckets_minus_first; struct bucket b[]; }; @@ -29,6 +30,7 @@ struct bucket_gens { struct rcu_head rcu; u16 first_bucket; size_t nbuckets; + size_t nbuckets_minus_first; u8 b[]; }; From 9432e90df1b8a544f220fd455b2fa39eed8a535d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Jun 2024 15:06:22 -0400 Subject: [PATCH 0781/1578] bcachefs: Check for invalid bucket from bucket_gen(), gc_bucket() Turn more asserts into proper recoverable error paths. Reported-by: syzbot+246b47da27f8e7e7d6fb@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 22 +++++++++++++-- fs/bcachefs/btree_gc.c | 15 ++++++---- fs/bcachefs/buckets.c | 50 ++++++++++++++++++++++++---------- fs/bcachefs/buckets.h | 11 +++++--- fs/bcachefs/ec.c | 26 ++++++++++++++---- fs/bcachefs/extents.c | 9 ++++-- fs/bcachefs/io_read.c | 30 ++++++++++++++------ fs/bcachefs/io_write.c | 19 ++++++++++--- 8 files changed, 135 insertions(+), 47 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 346cd91f91f999..c4b6601f5b7482 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -741,6 +741,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; int ret = 0; struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p); @@ -860,8 +861,14 @@ int bch2_trigger_alloc(struct btree_trans *trans, } percpu_down_read(&c->mark_lock); - if (new_a->gen != old_a->gen) - *bucket_gen(ca, new.k->p.offset) = new_a->gen; + if (new_a->gen != old_a->gen) { + u8 *gen = bucket_gen(ca, new.k->p.offset); + if (unlikely(!gen)) { + percpu_up_read(&c->mark_lock); + goto invalid_bucket; + } + *gen = new_a->gen; + } bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false); percpu_up_read(&c->mark_lock); @@ -895,6 +902,11 @@ int bch2_trigger_alloc(struct btree_trans *trans, percpu_down_read(&c->mark_lock); struct bucket *g = gc_bucket(ca, new.k->p.offset); + if (unlikely(!g)) { + percpu_up_read(&c->mark_lock); + goto invalid_bucket; + } + g->gen_valid = 1; bucket_lock(g); @@ -910,8 +922,14 @@ int bch2_trigger_alloc(struct btree_trans *trans, percpu_up_read(&c->mark_lock); } err: + printbuf_exit(&buf); bch2_dev_put(ca); return ret; +invalid_bucket: + bch2_fs_inconsistent(c, "reference to invalid bucket\n %s", + (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)); + ret = -EIO; + goto err; } /* diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 130a0131cd73e4..0e477a926579a7 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -874,6 +874,9 @@ static int bch2_alloc_write_key(struct btree_trans *trans, const struct bch_alloc_v4 *old; int ret; + if (!bucket_valid(ca, k.k->p.offset)) + return 0; + old = bch2_alloc_to_v4(k, &old_convert); gc = new = *old; @@ -1005,12 +1008,14 @@ static int bch2_gc_alloc_start(struct bch_fs *c) continue; } - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); + if (bucket_valid(ca, k.k->p.offset)) { + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); - struct bucket *g = gc_bucket(ca, k.k->p.offset); - g->gen_valid = 1; - g->gen = a->gen; + struct bucket *g = gc_bucket(ca, k.k->p.offset); + g->gen_valid = 1; + g->gen = a->gen; + } 0; }))); bch2_dev_put(ca); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 99a7824d0de2d8..743d57eba76070 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -488,6 +488,17 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, } struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + if (!g) { + if (fsck_err(c, ptr_to_invalid_device, + "pointer to invalid bucket on device %u\n" + "while marking %s", + p.ptr.dev, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + *do_update = true; + goto out; + } + enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry); if (fsck_err_on(!g->gen_valid, @@ -577,8 +588,8 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, if (p.has_ec) { struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); - if (fsck_err_on(!m || !m->alive, c, - ptr_to_missing_stripe, + if (fsck_err_on(!m || !m->alive, + c, ptr_to_missing_stripe, "pointer to nonexistent stripe %llu\n" "while marking %s", (u64) p.ec.idx, @@ -586,8 +597,8 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, bch2_bkey_val_to_text(&buf, c, k), buf.buf))) *do_update = true; - if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c, - ptr_to_incorrect_stripe, + if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), + c, ptr_to_incorrect_stripe, "pointer does not match stripe %llu\n" "while marking %s", (u64) p.ec.idx, @@ -1004,6 +1015,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, enum btree_iter_update_trigger_flags flags) { bool insert = !(flags & BTREE_TRIGGER_overwrite); + struct printbuf buf = PRINTBUF; int ret = 0; struct bch_fs *c = trans->c; @@ -1036,6 +1048,13 @@ static int bch2_trigger_pointer(struct btree_trans *trans, if (flags & BTREE_TRIGGER_gc) { percpu_down_read(&c->mark_lock); struct bucket *g = gc_bucket(ca, bucket.offset); + if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", + p.ptr.dev, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = -EIO; + goto err_unlock; + } + bucket_lock(g); struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; ret = __mark_pointer(trans, ca, k, &p.ptr, *sectors, bp.data_type, &new); @@ -1044,10 +1063,12 @@ static int bch2_trigger_pointer(struct btree_trans *trans, bch2_dev_usage_update(c, ca, &old, &new, 0, true); } bucket_unlock(g); +err_unlock: percpu_up_read(&c->mark_lock); } err: bch2_dev_put(ca); + printbuf_exit(&buf); return ret; } @@ -1335,10 +1356,11 @@ static int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b, enum bch_data_type data_type, unsigned sectors, enum btree_iter_update_trigger_flags flags) { - int ret = 0; - percpu_down_read(&c->mark_lock); struct bucket *g = gc_bucket(ca, b); + if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s", + ca->dev_idx, bch2_data_type_str(data_type))) + goto err_unlock; bucket_lock(g); struct bch_alloc_v4 old = bucket_m_to_alloc(*g); @@ -1347,29 +1369,27 @@ static int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, g->data_type != data_type, c, "different types of data in same bucket: %s, %s", bch2_data_type_str(g->data_type), - bch2_data_type_str(data_type))) { - ret = -EIO; + bch2_data_type_str(data_type))) goto err; - } if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, "bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size", ca->dev_idx, b, g->gen, bch2_data_type_str(g->data_type ?: data_type), - g->dirty_sectors, sectors)) { - ret = -EIO; + g->dirty_sectors, sectors)) goto err; - } g->data_type = data_type; g->dirty_sectors += sectors; struct bch_alloc_v4 new = bucket_m_to_alloc(*g); + bch2_dev_usage_update(c, ca, &old, &new, 0, true); + percpu_up_read(&c->mark_lock); + return 0; err: bucket_unlock(g); - if (!ret) - bch2_dev_usage_update(c, ca, &old, &new, 0, true); +err_unlock: percpu_up_read(&c->mark_lock); - return ret; + return -EIO; } int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index e1a5e3082bbfb8..80ee0be9793e60 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -172,19 +172,22 @@ static inline int gen_after(u8 a, u8 b) return r > 0 ? r : 0; } -static inline u8 dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr) +static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { - return gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen); + u8 *gen = bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)); + if (!gen) + return -1; + return gen_after(*gen, ptr->gen); } /** * dev_ptr_stale() - check if a pointer points into a bucket that has been * invalidated. */ -static inline u8 dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr) +static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { rcu_read_lock(); - u8 ret = dev_ptr_stale_rcu(ca, ptr); + int ret = dev_ptr_stale_rcu(ca, ptr); rcu_read_unlock(); return ret; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index d8b9beca377627..83e279d41829d0 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -268,6 +268,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, { struct bch_fs *c = trans->c; const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx; + struct printbuf buf = PRINTBUF; int ret = 0; struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev); @@ -289,6 +290,13 @@ static int mark_stripe_bucket(struct btree_trans *trans, if (flags & BTREE_TRIGGER_gc) { percpu_down_read(&c->mark_lock); struct bucket *g = gc_bucket(ca, bucket.offset); + if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", + ptr->dev, + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { + ret = -EIO; + goto err_unlock; + } + bucket_lock(g); struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags); @@ -297,10 +305,12 @@ static int mark_stripe_bucket(struct btree_trans *trans, bch2_dev_usage_update(c, ca, &old, &new, 0, true); } bucket_unlock(g); +err_unlock: percpu_up_read(&c->mark_lock); } err: bch2_dev_put(ca); + printbuf_exit(&buf); return ret; } @@ -714,10 +724,12 @@ static void ec_block_endio(struct bio *bio) bch2_blk_status_to_str(bio->bi_status))) clear_bit(ec_bio->idx, ec_bio->buf->valid); - if (dev_ptr_stale(ca, ptr)) { + int stale = dev_ptr_stale(ca, ptr); + if (stale) { bch_err_ratelimited(ca->fs, - "error %s stripe: stale pointer after io", - bio_data_dir(bio) == READ ? "reading from" : "writing to"); + "error %s stripe: stale/invalid pointer (%i) after io", + bio_data_dir(bio) == READ ? "reading from" : "writing to", + stale); clear_bit(ec_bio->idx, ec_bio->buf->valid); } @@ -743,10 +755,12 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, return; } - if (dev_ptr_stale(ca, ptr)) { + int stale = dev_ptr_stale(ca, ptr); + if (stale) { bch_err_ratelimited(c, - "error %s stripe: stale pointer", - rw == READ ? "reading from" : "writing to"); + "error %s stripe: stale pointer (%i)", + rw == READ ? "reading from" : "writing to", + stale); clear_bit(idx, buf->valid); return; } diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 469037929685bc..410b8bd81b5a6e 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -137,7 +137,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); - if (p.ptr.cached && (!ca || dev_ptr_stale(ca, &p.ptr))) + if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr))) continue; f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; @@ -999,7 +999,7 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) bch2_bkey_drop_ptrs(k, ptr, ptr->cached && (ca = bch2_dev_rcu(c, ptr->dev)) && - dev_ptr_stale_rcu(ca, ptr)); + dev_ptr_stale_rcu(ca, ptr) > 0); rcu_read_unlock(); return bkey_deleted(k.k); @@ -1024,8 +1024,11 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc prt_str(out, " cached"); if (ptr->unwritten) prt_str(out, " unwritten"); - if (bucket_valid(ca, b) && dev_ptr_stale_rcu(ca, ptr)) + int stale = dev_ptr_stale_rcu(ca, ptr); + if (stale > 0) prt_printf(out, " stale"); + else if (stale) + prt_printf(out, " invalid"); } rcu_read_unlock(); --out->atomic; diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 862b79f86b91e2..c97fa7002b06e0 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -777,18 +777,32 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, PTR_BUCKET_POS(ca, &ptr), BTREE_ITER_cached); - prt_printf(&buf, "Attempting to read from stale dirty pointer:\n"); - printbuf_indent_add(&buf, 2); + u8 *gen = bucket_gen(ca, iter.pos.offset); + if (gen) { - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); + prt_printf(&buf, "Attempting to read from stale dirty pointer:\n"); + printbuf_indent_add(&buf, 2); - prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); - - ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); - if (!ret) { + bch2_bkey_val_to_text(&buf, c, k); prt_newline(&buf); + + prt_printf(&buf, "memory gen: %u", *gen); + + ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + if (!ret) { + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, k); + } + } else { + prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n", + iter.pos.inode, iter.pos.offset); + printbuf_indent_add(&buf, 2); + + prt_printf(&buf, "first bucket %u nbuckets %llu\n", + ca->mi.first_bucket, ca->mi.nbuckets); + bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); } bch2_fs_inconsistent(c, "%s", buf.buf); diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 9401d13e31bb63..05e0cbef420bc7 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -1220,7 +1220,7 @@ static void bch2_nocow_write(struct bch_write_op *op) DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets; u32 snapshot; struct bucket_to_lock *stale_at; - int ret; + int stale, ret; if (op->flags & BCH_WRITE_MOVE) return; @@ -1299,7 +1299,8 @@ static void bch2_nocow_write(struct bch_write_op *op) BUCKET_NOCOW_LOCK_UPDATE); rcu_read_lock(); - bool stale = gen_after(*bucket_gen(ca, i->b.offset), i->gen); + u8 *gen = bucket_gen(ca, i->b.offset); + stale = !gen ? -1 : gen_after(*gen, i->gen); rcu_read_unlock(); if (unlikely(stale)) { @@ -1380,8 +1381,18 @@ static void bch2_nocow_write(struct bch_write_op *op) break; } - /* We can retry this: */ - ret = -BCH_ERR_transaction_restart; + struct printbuf buf = PRINTBUF; + if (bch2_fs_inconsistent_on(stale < 0, c, + "pointer to invalid bucket in nocow path on device %llu\n %s", + stale_at->b.inode, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = -EIO; + } else { + /* We can retry this: */ + ret = -BCH_ERR_transaction_restart; + } + printbuf_exit(&buf); + goto err_get_ioref; } From b79922009214e6ab23c07db32a5606a45710f86e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Jun 2024 17:36:24 -0400 Subject: [PATCH 0782/1578] bcachefs: Add missing synchronize_srcu_expedited() call when shutting down We use the polling interface to srcu for tracking pending frees; when shutting down we don't need to wait for an srcu barrier to free them, but SRCU still gets confused if we shutdown with an outstanding grace period. Reported-by: syzbot+6a038377f0a594d7d44e@syzkaller.appspotmail.com Reported-by: syzbot+0ece6edfd05ed20e32d9@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 53b63be537e559..3694c600a3add8 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3401,8 +3401,10 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) bch2_time_stats_exit(&s->lock_hold_times); } - if (c->btree_trans_barrier_initialized) + if (c->btree_trans_barrier_initialized) { + synchronize_srcu_expedited(&c->btree_trans_barrier); cleanup_srcu_struct(&c->btree_trans_barrier); + } mempool_exit(&c->btree_trans_mem_pool); mempool_exit(&c->btree_trans_pool); } From 31849bf07e0fb3e7d050c086b77ebdb6cec89167 Mon Sep 17 00:00:00 2001 From: Arunpravin Paneer Selvam Date: Mon, 10 Jun 2024 23:34:01 +0530 Subject: [PATCH 0783/1578] drm/amdgpu: Fix the BO release clear memory warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This happens when the amdgpu_bo_release_notify running before amdgpu_ttm_set_buffer_funcs_status set the buffer funcs to enabled. check the buffer funcs enablement before calling the fill buffer memory. v2:(Christian) - Apply it only for GEM buffers and since GEM buffers are only allocated/freed while the driver is loaded we never run into the issue to clear with buffer funcs disabled. v3:(Mario) - drop the stable tag as this will presumably go into a -fixes PR for 6.10 Log snip: *ERROR* Trying to clear memory with ring turned off. RIP: 0010:amdgpu_bo_release_notify+0x201/0x220 [amdgpu] Fixes: a68c7eaa7a8f ("drm/amdgpu: Enable clear page functionality") Signed-off-by: Arunpravin Paneer Selvam Reviewed-by: Christian König Tested-by: Mikhail Gavrilov Tested-by: Richard Gong Suggested-by: Christian König Link: https://patchwork.freedesktop.org/patch/msgid/20240610180401.9540-1-Arunpravin.PaneerSelvam@amd.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index 67c234bcf89ffd..3adaa46701036d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -108,6 +108,7 @@ int amdgpu_gem_object_create(struct amdgpu_device *adev, unsigned long size, memset(&bp, 0, sizeof(bp)); *obj = NULL; + flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; bp.size = size; bp.byte_align = alignment; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 8d8c39be612953..c556c8b653fa44 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -604,8 +604,6 @@ int amdgpu_bo_create(struct amdgpu_device *adev, if (!amdgpu_bo_support_uswc(bo->flags)) bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC; - bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; - bo->tbo.bdev = &adev->mman.bdev; if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA | AMDGPU_GEM_DOMAIN_GDS)) From 07c54cc5988f19c9642fd463c2dbdac7fc52f777 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 May 2024 14:20:19 +0200 Subject: [PATCH 0784/1578] tick/nohz_full: Don't abuse smp_call_function_single() in tick_setup_device() After the recent commit 5097cbcb38e6 ("sched/isolation: Prevent boot crash when the boot CPU is nohz_full") the kernel no longer crashes, but there is another problem. In this case tick_setup_device() calls tick_take_do_timer_from_boot() to update tick_do_timer_cpu and this triggers the WARN_ON_ONCE(irqs_disabled) in smp_call_function_single(). Kill tick_take_do_timer_from_boot() and just use WRITE_ONCE(), the new comment explains why this is safe (thanks Thomas!). Fixes: 08ae95f4fd3b ("nohz_full: Allow the boot CPU to be nohz_full") Signed-off-by: Oleg Nesterov Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240528122019.GA28794@redhat.com Link: https://lore.kernel.org/all/20240522151742.GA10400@redhat.com --- kernel/time/tick-common.c | 42 +++++++++++++-------------------------- 1 file changed, 14 insertions(+), 28 deletions(-) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index d88b13076b7944..a47bcf71defcf5 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -178,26 +178,6 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) } } -#ifdef CONFIG_NO_HZ_FULL -static void giveup_do_timer(void *info) -{ - int cpu = *(unsigned int *)info; - - WARN_ON(tick_do_timer_cpu != smp_processor_id()); - - tick_do_timer_cpu = cpu; -} - -static void tick_take_do_timer_from_boot(void) -{ - int cpu = smp_processor_id(); - int from = tick_do_timer_boot_cpu; - - if (from >= 0 && from != cpu) - smp_call_function_single(from, giveup_do_timer, &cpu, 1); -} -#endif - /* * Setup the tick device */ @@ -221,19 +201,25 @@ static void tick_setup_device(struct tick_device *td, tick_next_period = ktime_get(); #ifdef CONFIG_NO_HZ_FULL /* - * The boot CPU may be nohz_full, in which case set - * tick_do_timer_boot_cpu so the first housekeeping - * secondary that comes up will take do_timer from - * us. + * The boot CPU may be nohz_full, in which case the + * first housekeeping secondary will take do_timer() + * from it. */ if (tick_nohz_full_cpu(cpu)) tick_do_timer_boot_cpu = cpu; - } else if (tick_do_timer_boot_cpu != -1 && - !tick_nohz_full_cpu(cpu)) { - tick_take_do_timer_from_boot(); + } else if (tick_do_timer_boot_cpu != -1 && !tick_nohz_full_cpu(cpu)) { tick_do_timer_boot_cpu = -1; - WARN_ON(READ_ONCE(tick_do_timer_cpu) != cpu); + /* + * The boot CPU will stay in periodic (NOHZ disabled) + * mode until clocksource_done_booting() called after + * smp_init() selects a high resolution clocksource and + * timekeeping_notify() kicks the NOHZ stuff alive. + * + * So this WRITE_ONCE can only race with the READ_ONCE + * check in tick_periodic() but this race is harmless. + */ + WRITE_ONCE(tick_do_timer_cpu, cpu); #endif } From 9dd5134c61580ba4c219296c37e08ff64c109a74 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 10 Jun 2024 11:23:05 -0700 Subject: [PATCH 0785/1578] kunit/overflow: Adjust for __counted_by with DEFINE_RAW_FLEX() When a flexible array structure has a __counted_by annotation, its use with DEFINE_RAW_FLEX() will result in the count being zero-initialized. This is expected since one doesn't want to use RAW with a counted_by struct. Adjust the tests to check for the condition and for compiler support. Reported-by: Christian Schrefl Closes: https://lore.kernel.org/all/0bfc6b38-8bc5-4971-b6fb-dc642a73fbfe@gmail.com/ Suggested-by: Nathan Chancellor Reviewed-by: Nathan Chancellor Link: https://lore.kernel.org/r/20240610182301.work.272-kees@kernel.org Tested-by: Christian Schrefl Reviewed-by: Christian Schrefl Signed-off-by: Kees Cook --- lib/overflow_kunit.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/lib/overflow_kunit.c b/lib/overflow_kunit.c index 4ef31b0bb74d68..d305b0c054bb73 100644 --- a/lib/overflow_kunit.c +++ b/lib/overflow_kunit.c @@ -1178,14 +1178,28 @@ struct foo { s16 array[] __counted_by(counter); }; +struct bar { + int a; + u32 counter; + s16 array[]; +}; + static void DEFINE_FLEX_test(struct kunit *test) { - DEFINE_RAW_FLEX(struct foo, two, array, 2); + /* Using _RAW_ on a __counted_by struct will initialize "counter" to zero */ + DEFINE_RAW_FLEX(struct foo, two_but_zero, array, 2); +#if __has_attribute(__counted_by__) + int expected_raw_size = sizeof(struct foo); +#else + int expected_raw_size = sizeof(struct foo) + 2 * sizeof(s16); +#endif + /* Without annotation, it will always be on-stack size. */ + DEFINE_RAW_FLEX(struct bar, two, array, 2); DEFINE_FLEX(struct foo, eight, array, counter, 8); DEFINE_FLEX(struct foo, empty, array, counter, 0); - KUNIT_EXPECT_EQ(test, __struct_size(two), - sizeof(struct foo) + sizeof(s16) + sizeof(s16)); + KUNIT_EXPECT_EQ(test, __struct_size(two_but_zero), expected_raw_size); + KUNIT_EXPECT_EQ(test, __struct_size(two), sizeof(struct bar) + 2 * sizeof(s16)); KUNIT_EXPECT_EQ(test, __struct_size(eight), 24); KUNIT_EXPECT_EQ(test, __struct_size(empty), sizeof(struct foo)); } From 03e792eaf18ec2e93e2c623f9f1a4bdb97fe4126 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Sun, 26 May 2024 02:52:56 +0800 Subject: [PATCH 0786/1578] md: change the return value type of md_write_start to void Commit cc27b0c78c79 ("md: fix deadlock between mddev_suspend() and md_write_start()") aborted md_write_start() with false when mddev is suspended, which fixed a deadlock if calling mddev_suspend() with holding reconfig_mutex(). Since mddev_suspend() now includes lockdep_assert_not_held(), it no longer holds the reconfig_mutex. This makes previous abort unnecessary. Now, remove unnecessary abort and change function return value to void. Signed-off-by: Li Nan Reviewed-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240525185257.3896201-2-linan666@huaweicloud.com --- drivers/md/md.c | 14 ++++---------- drivers/md/md.h | 2 +- drivers/md/raid1.c | 3 +-- drivers/md/raid10.c | 3 +-- drivers/md/raid5.c | 3 +-- 5 files changed, 8 insertions(+), 17 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 09c55d9a2c5424..6bac20e82ff02f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8640,12 +8640,12 @@ EXPORT_SYMBOL(md_done_sync); * A return value of 'false' means that the write wasn't recorded * and cannot proceed as the array is being suspend. */ -bool md_write_start(struct mddev *mddev, struct bio *bi) +void md_write_start(struct mddev *mddev, struct bio *bi) { int did_change = 0; if (bio_data_dir(bi) != WRITE) - return true; + return; BUG_ON(mddev->ro == MD_RDONLY); if (mddev->ro == MD_AUTO_READ) { @@ -8678,15 +8678,9 @@ bool md_write_start(struct mddev *mddev, struct bio *bi) if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); if (!mddev->has_superblocks) - return true; + return; wait_event(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) || - is_md_suspended(mddev)); - if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { - percpu_ref_put(&mddev->writes_pending); - return false; - } - return true; + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); } EXPORT_SYMBOL(md_write_start); diff --git a/drivers/md/md.h b/drivers/md/md.h index ca085ecad50449..487582058f7417 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -785,7 +785,7 @@ extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **t extern void md_wakeup_thread(struct md_thread __rcu *thread); extern void md_check_recovery(struct mddev *mddev); extern void md_reap_sync_thread(struct mddev *mddev); -extern bool md_write_start(struct mddev *mddev, struct bio *bi); +extern void md_write_start(struct mddev *mddev, struct bio *bi); extern void md_write_inc(struct mddev *mddev, struct bio *bi); extern void md_write_end(struct mddev *mddev); extern void md_done_sync(struct mddev *mddev, int blocks, int ok); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 7b8a71ca66dde0..0d80ff471c73dd 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1687,8 +1687,7 @@ static bool raid1_make_request(struct mddev *mddev, struct bio *bio) if (bio_data_dir(bio) == READ) raid1_read_request(mddev, bio, sectors, NULL); else { - if (!md_write_start(mddev,bio)) - return false; + md_write_start(mddev,bio); raid1_write_request(mddev, bio, sectors); } return true; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index a4556d2e46bf95..f8d7c02c6ed561 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1836,8 +1836,7 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio) && md_flush_request(mddev, bio)) return true; - if (!md_write_start(mddev, bio)) - return false; + md_write_start(mddev, bio); if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) if (!raid10_handle_discard(mddev, bio)) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2bd1ce9b39226a..a84389311dd1ea 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6078,8 +6078,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) ctx.do_flush = bi->bi_opf & REQ_PREFLUSH; } - if (!md_write_start(mddev, bi)) - return false; + md_write_start(mddev, bi); /* * If array is degraded, better not do chunk aligned read because * later we might have to read it again in order to reconstruct From 611d5cbc0b35a752e657a83eebadf40d814d006b Mon Sep 17 00:00:00 2001 From: Li Nan Date: Sun, 26 May 2024 02:52:57 +0800 Subject: [PATCH 0787/1578] md: fix deadlock between mddev_suspend and flush bio Deadlock occurs when mddev is being suspended while some flush bio is in progress. It is a complex issue. T1. the first flush is at the ending stage, it clears 'mddev->flush_bio' and tries to submit data, but is blocked because mddev is suspended by T4. T2. the second flush sets 'mddev->flush_bio', and attempts to queue md_submit_flush_data(), which is already running (T1) and won't execute again if on the same CPU as T1. T3. the third flush inc active_io and tries to flush, but is blocked because 'mddev->flush_bio' is not NULL (set by T2). T4. mddev_suspend() is called and waits for active_io dec to 0 which is inc by T3. T1 T2 T3 T4 (flush 1) (flush 2) (third 3) (suspend) md_submit_flush_data mddev->flush_bio = NULL; . . md_flush_request . mddev->flush_bio = bio . queue submit_flushes . . . . md_handle_request . . active_io + 1 . . md_flush_request . . wait !mddev->flush_bio . . . . mddev_suspend . . wait !active_io . . . submit_flushes . queue_work md_submit_flush_data . //md_submit_flush_data is already running (T1) . md_handle_request wait resume The root issue is non-atomic inc/dec of active_io during flush process. active_io is dec before md_submit_flush_data is queued, and inc soon after md_submit_flush_data() run. md_flush_request active_io + 1 submit_flushes active_io - 1 md_submit_flush_data md_handle_request active_io + 1 make_request active_io - 1 If active_io is dec after md_handle_request() instead of within submit_flushes(), make_request() can be called directly intead of md_handle_request() in md_submit_flush_data(), and active_io will only inc and dec once in the whole flush process. Deadlock will be fixed. Additionally, the only difference between fixing the issue and before is that there is no return error handling of make_request(). But after previous patch cleaned md_write_start(), make_requst() only return error in raid5_make_request() by dm-raid, see commit 41425f96d7aa ("dm-raid456, md/raid456: fix a deadlock for dm-raid456 while io concurrent with reshape)". Since dm always splits data and flush operation into two separate io, io size of flush submitted by dm always is 0, make_request() will not be called in md_submit_flush_data(). To prevent future modifications from introducing issues, add WARN_ON to ensure make_request() no error is returned in this context. Fixes: fa2bbff7b0b4 ("md: synchronize flush io with array reconfiguration") Signed-off-by: Li Nan Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240525185257.3896201-3-linan666@huaweicloud.com --- drivers/md/md.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 6bac20e82ff02f..e204e36e317081 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -549,13 +549,9 @@ static void md_end_flush(struct bio *bio) rdev_dec_pending(rdev, mddev); - if (atomic_dec_and_test(&mddev->flush_pending)) { - /* The pair is percpu_ref_get() from md_flush_request() */ - percpu_ref_put(&mddev->active_io); - + if (atomic_dec_and_test(&mddev->flush_pending)) /* The pre-request flush has finished */ queue_work(md_wq, &mddev->flush_work); - } } static void md_submit_flush_data(struct work_struct *ws); @@ -586,12 +582,8 @@ static void submit_flushes(struct work_struct *ws) rcu_read_lock(); } rcu_read_unlock(); - if (atomic_dec_and_test(&mddev->flush_pending)) { - /* The pair is percpu_ref_get() from md_flush_request() */ - percpu_ref_put(&mddev->active_io); - + if (atomic_dec_and_test(&mddev->flush_pending)) queue_work(md_wq, &mddev->flush_work); - } } static void md_submit_flush_data(struct work_struct *ws) @@ -616,8 +608,20 @@ static void md_submit_flush_data(struct work_struct *ws) bio_endio(bio); } else { bio->bi_opf &= ~REQ_PREFLUSH; - md_handle_request(mddev, bio); + + /* + * make_requst() will never return error here, it only + * returns error in raid5_make_request() by dm-raid. + * Since dm always splits data and flush operation into + * two separate io, io size of flush submitted by dm + * always is 0, make_request() will not be called here. + */ + if (WARN_ON_ONCE(!mddev->pers->make_request(mddev, bio))) + bio_io_error(bio);; } + + /* The pair is percpu_ref_get() from md_flush_request() */ + percpu_ref_put(&mddev->active_io); } /* From acc6680af28696a037ede62867e731841d4454c2 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Wed, 29 May 2024 04:31:49 +0800 Subject: [PATCH 0788/1578] md: make md_flush_request() more readable Setting bio to NULL and checking 'if(!bio)' is redundant and looks strange, just consolidate them into one condition. There are no functional changes. Suggested-by: Christoph Hellwig Signed-off-by: Li Nan Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240528203149.2383260-1-linan666@huaweicloud.com --- drivers/md/md.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index e204e36e317081..b9b15aa79496fb 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -657,24 +657,22 @@ bool md_flush_request(struct mddev *mddev, struct bio *bio) WARN_ON(percpu_ref_is_zero(&mddev->active_io)); percpu_ref_get(&mddev->active_io); mddev->flush_bio = bio; - bio = NULL; - } - spin_unlock_irq(&mddev->lock); - - if (!bio) { + spin_unlock_irq(&mddev->lock); INIT_WORK(&mddev->flush_work, submit_flushes); queue_work(md_wq, &mddev->flush_work); - } else { - /* flush was performed for some other bio while we waited. */ - if (bio->bi_iter.bi_size == 0) - /* an empty barrier - all done */ - bio_endio(bio); - else { - bio->bi_opf &= ~REQ_PREFLUSH; - return false; - } + return true; } - return true; + + /* flush was performed for some other bio while we waited. */ + spin_unlock_irq(&mddev->lock); + if (bio->bi_iter.bi_size == 0) { + /* pure flush without data - all done */ + bio_endio(bio); + return true; + } + + bio->bi_opf &= ~REQ_PREFLUSH; + return false; } EXPORT_SYMBOL(md_flush_request); From 35f20acaa3585f25f8356da0ee6bc143e0256522 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 4 Jun 2024 19:25:28 +0200 Subject: [PATCH 0789/1578] md/raid0: don't free conf on raid0_run failure The core md code calls the ->free method which already frees conf. Fixes: 0c031fd37f69 ("md: Move alloc/free acct bioset in to personality") Signed-off-by: Christoph Hellwig Reviewed-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240604172607.3185916-2-hch@lst.de --- drivers/md/raid0.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c5d4aeb68404c9..81c01347cd24e6 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -365,18 +365,13 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks return array_sectors; } -static void free_conf(struct mddev *mddev, struct r0conf *conf) -{ - kfree(conf->strip_zone); - kfree(conf->devlist); - kfree(conf); -} - static void raid0_free(struct mddev *mddev, void *priv) { struct r0conf *conf = priv; - free_conf(mddev, conf); + kfree(conf->strip_zone); + kfree(conf->devlist); + kfree(conf); } static int raid0_set_limits(struct mddev *mddev) @@ -415,7 +410,7 @@ static int raid0_run(struct mddev *mddev) if (!mddev_is_dm(mddev)) { ret = raid0_set_limits(mddev); if (ret) - goto out_free_conf; + return ret; } /* calculate array device size */ @@ -427,13 +422,7 @@ static int raid0_run(struct mddev *mddev) dump_zones(mddev); - ret = md_integrity_register(mddev); - if (ret) - goto out_free_conf; - return 0; -out_free_conf: - free_conf(mddev, conf); - return ret; + return md_integrity_register(mddev); } /* From 17f91ac0843b50462a9c9c8f18df962338bd3db2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 4 Jun 2024 19:25:29 +0200 Subject: [PATCH 0790/1578] md/raid1: don't free conf on raid0_run failure The core md code calls the ->free method which already frees conf. Fixes: 07f1a6850c5d ("md/raid1: fail run raid1 array when active disk less than one") Signed-off-by: Christoph Hellwig Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240604172607.3185916-3-hch@lst.de --- drivers/md/raid1.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 0d80ff471c73dd..3d54f30112a0e8 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -3203,7 +3203,6 @@ static int raid1_set_limits(struct mddev *mddev) return queue_limits_set(mddev->gendisk->queue, &lim); } -static void raid1_free(struct mddev *mddev, void *priv); static int raid1_run(struct mddev *mddev) { struct r1conf *conf; @@ -3237,7 +3236,7 @@ static int raid1_run(struct mddev *mddev) if (!mddev_is_dm(mddev)) { ret = raid1_set_limits(mddev); if (ret) - goto abort; + return ret; } mddev->degraded = 0; @@ -3251,8 +3250,7 @@ static int raid1_run(struct mddev *mddev) */ if (conf->raid_disks - mddev->degraded < 1) { md_unregister_thread(mddev, &conf->thread); - ret = -EINVAL; - goto abort; + return -EINVAL; } if (conf->raid_disks - mddev->degraded == 1) @@ -3276,14 +3274,8 @@ static int raid1_run(struct mddev *mddev) md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); ret = md_integrity_register(mddev); - if (ret) { + if (ret) md_unregister_thread(mddev, &mddev->thread); - goto abort; - } - return 0; - -abort: - raid1_free(mddev, conf); return ret; } From 3f60497c658d2072714d097a177612d34b34aa3d Mon Sep 17 00:00:00 2001 From: Biju Das Date: Mon, 10 Jun 2024 20:55:32 +0100 Subject: [PATCH 0791/1578] regulator: core: Fix modpost error "regulator_get_regmap" undefined Fix the modpost error "regulator_get_regmap" undefined by adding export symbol. Fixes: 04eca28cde52 ("regulator: Add helpers for low-level register access") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202406110117.mk5UR3VZ-lkp@intel.com Signed-off-by: Biju Das Link: https://lore.kernel.org/r/20240610195532.175942-1-biju.das.jz@bp.renesas.com Signed-off-by: Mark Brown --- drivers/regulator/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 5794f4e9dd529f..844e9587a880fa 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -3347,6 +3347,7 @@ struct regmap *regulator_get_regmap(struct regulator *regulator) return map ? map : ERR_PTR(-EOPNOTSUPP); } +EXPORT_SYMBOL_GPL(regulator_get_regmap); /** * regulator_get_hardware_vsel_register - get the HW voltage selector register From 7124a8982b621e1a8af81c17f44b90587cdd161c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Jun 2024 20:48:41 -0400 Subject: [PATCH 0792/1578] bcachefs: Add missing bch_inode_info.ei_flags init Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index f6f1dbc1fe15c5..77126992dba8c2 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -227,7 +227,9 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c) mutex_init(&inode->ei_update_lock); two_state_lock_init(&inode->ei_pagecache_lock); INIT_LIST_HEAD(&inode->ei_vfs_inode_list); + inode->ei_flags = 0; mutex_init(&inode->ei_quota_lock); + memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); inode->v.i_state = 0; if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) { From 44180feaccf266d9b0b28cc4ceaac019817deb5c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 7 Jun 2024 17:53:32 +0200 Subject: [PATCH 0793/1578] net/sched: initialize noop_qdisc owner When the noop_qdisc owner isn't initialized, then it will be 0, so packets will erroneously be regarded as having been subject to recursion as long as only CPU 0 queues them. For non-SMP, that's all packets, of course. This causes a change in what's reported to userspace, normally noop_qdisc would drop packets silently, but with this change the syscall returns -ENOBUFS if RECVERR is also set on the socket. Fix this by initializing the owner field to -1, just like it would be for dynamically allocated qdiscs by qdisc_alloc(). Fixes: 0f022d32c3ec ("net/sched: Fix mirred deadlock on device recursion") Signed-off-by: Johannes Berg Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240607175340.786bfb938803.I493bf8422e36be4454c08880a8d3703cea8e421a@changeid Signed-off-by: Jakub Kicinski --- net/sched/sch_generic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 2a637a17061b91..e22ff003d52ee8 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -676,6 +676,7 @@ struct Qdisc noop_qdisc = { .qlen = 0, .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock), }, + .owner = -1, }; EXPORT_SYMBOL(noop_qdisc); From 8031b58c3a9b1db3ef68b3bd749fbee2e1e1aaa3 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 7 Jun 2024 17:01:48 +0200 Subject: [PATCH 0794/1578] mptcp: ensure snd_una is properly initialized on connect This is strictly related to commit fb7a0d334894 ("mptcp: ensure snd_nxt is properly initialized on connect"). It turns out that syzkaller can trigger the retransmit after fallback and before processing any other incoming packet - so that snd_una is still left uninitialized. Address the issue explicitly initializing snd_una together with snd_nxt and write_seq. Suggested-by: Mat Martineau Fixes: 8fd738049ac3 ("mptcp: fallback in case of simultaneous connect") Cc: stable@vger.kernel.org Reported-by: Christoph Paasch Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/485 Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240607-upstream-net-20240607-misc-fixes-v1-1-1ab9ddfa3d00@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 96b113854bd3ce..bb7dca8aa2d9c1 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3740,6 +3740,7 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) WRITE_ONCE(msk->write_seq, subflow->idsn); WRITE_ONCE(msk->snd_nxt, subflow->idsn); + WRITE_ONCE(msk->snd_una, subflow->idsn); if (likely(!__mptcp_check_fallback(msk))) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVE); From 6a09788c1a66e3d8b04b3b3e7618cc817bb60ae9 Mon Sep 17 00:00:00 2001 From: YonglongLi Date: Fri, 7 Jun 2024 17:01:49 +0200 Subject: [PATCH 0795/1578] mptcp: pm: inc RmAddr MIB counter once per RM_ADDR ID The RmAddr MIB counter is supposed to be incremented once when a valid RM_ADDR has been received. Before this patch, it could have been incremented as many times as the number of subflows connected to the linked address ID, so it could have been 0, 1 or more than 1. The "RmSubflow" is incremented after a local operation. In this case, it is normal to tied it with the number of subflows that have been actually removed. The "remove invalid addresses" MP Join subtest has been modified to validate this case. A broadcast IP address is now used instead: the client will not be able to create a subflow to this address. The consequence is that when receiving the RM_ADDR with the ID attached to this broadcast IP address, no subflow linked to this ID will be found. Fixes: 7a7e52e38a40 ("mptcp: add RM_ADDR related mibs") Cc: stable@vger.kernel.org Co-developed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: YonglongLi Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240607-upstream-net-20240607-misc-fixes-v1-2-1ab9ddfa3d00@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 5 ++++- tools/testing/selftests/net/mptcp/mptcp_join.sh | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 7f53e022e27ece..766a8409fa6779 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -814,10 +814,13 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, spin_lock_bh(&msk->pm.lock); removed = true; - __MPTCP_INC_STATS(sock_net(sk), rm_type); + if (rm_type == MPTCP_MIB_RMSUBFLOW) + __MPTCP_INC_STATS(sock_net(sk), rm_type); } if (rm_type == MPTCP_MIB_RMSUBFLOW) __set_bit(rm_id ? rm_id : msk->mpc_endpoint_id, msk->pm.id_avail_bitmap); + else if (rm_type == MPTCP_MIB_RMADDR) + __MPTCP_INC_STATS(sock_net(sk), rm_type); if (!removed) continue; diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 2b66c5fa71ebff..aea314d140c95f 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -2250,7 +2250,8 @@ remove_tests() pm_nl_set_limits $ns1 3 3 pm_nl_add_endpoint $ns1 10.0.12.1 flags signal pm_nl_add_endpoint $ns1 10.0.3.1 flags signal - pm_nl_add_endpoint $ns1 10.0.14.1 flags signal + # broadcast IP: no packet for this address will be received on ns1 + pm_nl_add_endpoint $ns1 224.0.0.1 flags signal pm_nl_set_limits $ns2 3 3 addr_nr_ns1=-3 speed=10 \ run_tests $ns1 $ns2 10.0.1.1 From 40eec1795cc27b076d49236649a29507c7ed8c2d Mon Sep 17 00:00:00 2001 From: YonglongLi Date: Fri, 7 Jun 2024 17:01:50 +0200 Subject: [PATCH 0796/1578] mptcp: pm: update add_addr counters after connect The creation of new subflows can fail for different reasons. If no subflow have been created using the received ADD_ADDR, the related counters should not be updated, otherwise they will never be decremented for events related to this ID later on. For the moment, the number of accepted ADD_ADDR is only decremented upon the reception of a related RM_ADDR, and only if the remote address ID is currently being used by at least one subflow. In other words, if no subflow can be created with the received address, the counter will not be decremented. In this case, it is then important not to increment pm.add_addr_accepted counter, and not to modify pm.accept_addr bit. Note that this patch does not modify the behaviour in case of failures later on, e.g. if the MP Join is dropped or rejected. The "remove invalid addresses" MP Join subtest has been modified to validate this case. The broadcast IP address is added before the "valid" address that will be used to successfully create a subflow, and the limit is decreased by one: without this patch, it was not possible to create the last subflow, because: - the broadcast address would have been accepted even if it was not usable: the creation of a subflow to this address results in an error, - the limit of 2 accepted ADD_ADDR would have then been reached. Fixes: 01cacb00b35c ("mptcp: add netlink-based PM") Cc: stable@vger.kernel.org Co-developed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: YonglongLi Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240607-upstream-net-20240607-misc-fixes-v1-3-1ab9ddfa3d00@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 16 ++++++++++------ tools/testing/selftests/net/mptcp/mptcp_join.sh | 4 ++-- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 766a8409fa6779..ea9e5817b9e9d3 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -677,6 +677,7 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) unsigned int add_addr_accept_max; struct mptcp_addr_info remote; unsigned int subflows_max; + bool sf_created = false; int i, nr; add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); @@ -704,15 +705,18 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) if (nr == 0) return; - msk->pm.add_addr_accepted++; - if (msk->pm.add_addr_accepted >= add_addr_accept_max || - msk->pm.subflows >= subflows_max) - WRITE_ONCE(msk->pm.accept_addr, false); - spin_unlock_bh(&msk->pm.lock); for (i = 0; i < nr; i++) - __mptcp_subflow_connect(sk, &addrs[i], &remote); + if (__mptcp_subflow_connect(sk, &addrs[i], &remote) == 0) + sf_created = true; spin_lock_bh(&msk->pm.lock); + + if (sf_created) { + msk->pm.add_addr_accepted++; + if (msk->pm.add_addr_accepted >= add_addr_accept_max || + msk->pm.subflows >= subflows_max) + WRITE_ONCE(msk->pm.accept_addr, false); + } } void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index aea314d140c95f..108aeeb84ef10a 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -2249,10 +2249,10 @@ remove_tests() if reset "remove invalid addresses"; then pm_nl_set_limits $ns1 3 3 pm_nl_add_endpoint $ns1 10.0.12.1 flags signal - pm_nl_add_endpoint $ns1 10.0.3.1 flags signal # broadcast IP: no packet for this address will be received on ns1 pm_nl_add_endpoint $ns1 224.0.0.1 flags signal - pm_nl_set_limits $ns2 3 3 + pm_nl_add_endpoint $ns1 10.0.3.1 flags signal + pm_nl_set_limits $ns2 2 2 addr_nr_ns1=-3 speed=10 \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 1 1 1 From 74acb250e103f42be372177628f9272b6e888c49 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 7 Jun 2024 17:01:51 +0200 Subject: [PATCH 0797/1578] mailmap: map Geliang's new email address Just like my other email addresses, map my new one to kernel.org account too. My new email address uses "last name, first name" format, which is different from my other email addresses. This mailmap is also used to indicate that it is actually the same person. Suggested-by: Mat Martineau Suggested-by: Matthieu Baerts Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240607-upstream-net-20240607-misc-fixes-v1-4-1ab9ddfa3d00@kernel.org Signed-off-by: Jakub Kicinski --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index efd9fa867a8ecc..9af91bd3584bdd 100644 --- a/.mailmap +++ b/.mailmap @@ -217,6 +217,7 @@ Geliang Tang Geliang Tang Geliang Tang Geliang Tang +Geliang Tang Georgi Djakov Gerald Schaefer Gerald Schaefer From 36534d3c54537bf098224a32dc31397793d4594d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 7 Jun 2024 12:56:52 +0000 Subject: [PATCH 0798/1578] tcp: use signed arithmetic in tcp_rtx_probe0_timed_out() Due to timer wheel implementation, a timer will usually fire after its schedule. For instance, for HZ=1000, a timeout between 512ms and 4s has a granularity of 64ms. For this range of values, the extra delay could be up to 63ms. For TCP, this means that tp->rcv_tstamp may be after inet_csk(sk)->icsk_timeout whenever the timer interrupt finally triggers, if one packet came during the extra delay. We need to make sure tcp_rtx_probe0_timed_out() handles this case. Fixes: e89688e3e978 ("net: tcp: fix unexcepted socket die when snd_wnd is 0") Signed-off-by: Eric Dumazet Cc: Menglong Dong Acked-by: Neal Cardwell Reviewed-by: Jason Xing Link: https://lore.kernel.org/r/20240607125652.1472540-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_timer.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 83fe7f62f7f10a..5bfd76a31af6da 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -485,8 +485,12 @@ static bool tcp_rtx_probe0_timed_out(const struct sock *sk, { const struct tcp_sock *tp = tcp_sk(sk); const int timeout = TCP_RTO_MAX * 2; - u32 rcv_delta; + s32 rcv_delta; + /* Note: timer interrupt might have been delayed by at least one jiffy, + * and tp->rcv_tstamp might very well have been written recently. + * rcv_delta can thus be negative. + */ rcv_delta = inet_csk(sk)->icsk_timeout - tp->rcv_tstamp; if (rcv_delta <= timeout) return false; From b96a225377b6602299a03d2ce3c289b68cd41bb7 Mon Sep 17 00:00:00 2001 From: Vasily Khoruzhick Date: Fri, 7 Jun 2024 15:09:32 -0700 Subject: [PATCH 0799/1578] drm/nouveau: don't attempt to schedule hpd_work on headless cards If the card doesn't have display hardware, hpd_work and hpd_lock are left uninitialized which causes BUG when attempting to schedule hpd_work on runtime PM resume. Fix it by adding headless flag to DRM and skip any hpd if it's set. Fixes: ae1aadb1eb8d ("nouveau: don't fail driver load if no display hw present.") Link: https://gitlab.freedesktop.org/drm/nouveau/-/issues/337 Signed-off-by: Vasily Khoruzhick Reviewed-by: Ben Skeggs Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240607221032.25918-1-anarsoul@gmail.com --- drivers/gpu/drm/nouveau/dispnv04/disp.c | 2 +- drivers/gpu/drm/nouveau/dispnv50/disp.c | 2 +- drivers/gpu/drm/nouveau/nouveau_display.c | 6 +++++- drivers/gpu/drm/nouveau/nouveau_drv.h | 1 + 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/nouveau/dispnv04/disp.c b/drivers/gpu/drm/nouveau/dispnv04/disp.c index 13705c5f149732..4b7497a8755cdb 100644 --- a/drivers/gpu/drm/nouveau/dispnv04/disp.c +++ b/drivers/gpu/drm/nouveau/dispnv04/disp.c @@ -68,7 +68,7 @@ nv04_display_fini(struct drm_device *dev, bool runtime, bool suspend) if (nv_two_heads(dev)) NVWriteCRTC(dev, 1, NV_PCRTC_INTR_EN_0, 0); - if (!runtime) + if (!runtime && !drm->headless) cancel_work_sync(&drm->hpd_work); if (!suspend) diff --git a/drivers/gpu/drm/nouveau/dispnv50/disp.c b/drivers/gpu/drm/nouveau/dispnv50/disp.c index 88728a0b2c2595..674dc567e1798a 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/disp.c +++ b/drivers/gpu/drm/nouveau/dispnv50/disp.c @@ -2680,7 +2680,7 @@ nv50_display_fini(struct drm_device *dev, bool runtime, bool suspend) nv50_mstm_fini(nouveau_encoder(encoder)); } - if (!runtime) + if (!runtime && !drm->headless) cancel_work_sync(&drm->hpd_work); } diff --git a/drivers/gpu/drm/nouveau/nouveau_display.c b/drivers/gpu/drm/nouveau/nouveau_display.c index aed5d5b51b43d6..d4725a968827e9 100644 --- a/drivers/gpu/drm/nouveau/nouveau_display.c +++ b/drivers/gpu/drm/nouveau/nouveau_display.c @@ -450,6 +450,9 @@ nouveau_display_hpd_resume(struct drm_device *dev) { struct nouveau_drm *drm = nouveau_drm(dev); + if (drm->headless) + return; + spin_lock_irq(&drm->hpd_lock); drm->hpd_pending = ~0; spin_unlock_irq(&drm->hpd_lock); @@ -635,7 +638,7 @@ nouveau_display_fini(struct drm_device *dev, bool suspend, bool runtime) } drm_connector_list_iter_end(&conn_iter); - if (!runtime) + if (!runtime && !drm->headless) cancel_work_sync(&drm->hpd_work); drm_kms_helper_poll_disable(dev); @@ -729,6 +732,7 @@ nouveau_display_create(struct drm_device *dev) /* no display hw */ if (ret == -ENODEV) { ret = 0; + drm->headless = true; goto disp_create_err; } diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h index e239c6bf4afa4f..25fca98a20bcd9 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drv.h +++ b/drivers/gpu/drm/nouveau/nouveau_drv.h @@ -276,6 +276,7 @@ struct nouveau_drm { /* modesetting */ struct nvbios vbios; struct nouveau_display *display; + bool headless; struct work_struct hpd_work; spinlock_t hpd_lock; u32 hpd_pending; From ae9daffd9028f2500c9ac1517e46d4f2b57efb80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Wed, 8 May 2024 15:07:00 +0300 Subject: [PATCH 0800/1578] MIPS: Routerboard 532: Fix vendor retry check code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit read_config_dword() contains strange condition checking ret for a number of values. The ret variable, however, is always zero because config_access() never returns anything else. Thus, the retry is always taken until number of tries is exceeded. The code looks like it wants to check *val instead of ret to see if the read gave an error response. Fixes: 73b4390fb234 ("[MIPS] Routerboard 532: Support for base system") Signed-off-by: Ilpo Järvinen Signed-off-by: Thomas Bogendoerfer --- arch/mips/pci/ops-rc32434.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/pci/ops-rc32434.c b/arch/mips/pci/ops-rc32434.c index 874ed6df97683a..34b9323bdabb0e 100644 --- a/arch/mips/pci/ops-rc32434.c +++ b/arch/mips/pci/ops-rc32434.c @@ -112,8 +112,8 @@ static int read_config_dword(struct pci_bus *bus, unsigned int devfn, * gives them time to settle */ if (where == PCI_VENDOR_ID) { - if (ret == 0xffffffff || ret == 0x00000000 || - ret == 0x0000ffff || ret == 0xffff0000) { + if (*val == 0xffffffff || *val == 0x00000000 || + *val == 0x0000ffff || *val == 0xffff0000) { if (delay > 4) return 0; delay *= 2; From 277a0363120276645ae598d8d5fea7265e076ae9 Mon Sep 17 00:00:00 2001 From: Martin Schiller Date: Fri, 7 Jun 2024 11:04:00 +0200 Subject: [PATCH 0801/1578] MIPS: pci: lantiq: restore reset gpio polarity Commit 90c2d2eb7ab5 ("MIPS: pci: lantiq: switch to using gpiod API") not only switched to the gpiod API, but also inverted / changed the polarity of the GPIO. According to the PCI specification, the RST# pin is an active-low signal. However, most of the device trees that have been widely used for a long time (mainly in the openWrt project) define this GPIO as active-high and the old driver code inverted the signal internally. Apparently there are actually boards where the reset gpio must be operated inverted. For this reason, we cannot use the GPIOD_OUT_LOW/HIGH flag for initialization. Instead, we must explicitly set the gpio to value 1 in order to take into account any "GPIO_ACTIVE_LOW" flag that may have been set. In order to remain compatible with all these existing device trees, we should therefore keep the logic as it was before the commit. Fixes: 90c2d2eb7ab5 ("MIPS: pci: lantiq: switch to using gpiod API") Cc: stable@vger.kernel.org Signed-off-by: Martin Schiller Signed-off-by: Thomas Bogendoerfer --- arch/mips/pci/pci-lantiq.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/mips/pci/pci-lantiq.c b/arch/mips/pci/pci-lantiq.c index 68a8cefed420bf..0844db34022e45 100644 --- a/arch/mips/pci/pci-lantiq.c +++ b/arch/mips/pci/pci-lantiq.c @@ -124,14 +124,14 @@ static int ltq_pci_startup(struct platform_device *pdev) clk_disable(clk_external); /* setup reset gpio used by pci */ - reset_gpio = devm_gpiod_get_optional(&pdev->dev, "reset", - GPIOD_OUT_LOW); + reset_gpio = devm_gpiod_get_optional(&pdev->dev, "reset", GPIOD_ASIS); error = PTR_ERR_OR_ZERO(reset_gpio); if (error) { dev_err(&pdev->dev, "failed to request gpio: %d\n", error); return error; } gpiod_set_consumer_name(reset_gpio, "pci_reset"); + gpiod_direction_output(reset_gpio, 1); /* enable auto-switching between PCI and EBU */ ltq_pci_w32(0xa, PCI_CR_CLK_CTRL); @@ -194,10 +194,10 @@ static int ltq_pci_startup(struct platform_device *pdev) /* toggle reset pin */ if (reset_gpio) { - gpiod_set_value_cansleep(reset_gpio, 1); + gpiod_set_value_cansleep(reset_gpio, 0); wmb(); mdelay(1); - gpiod_set_value_cansleep(reset_gpio, 0); + gpiod_set_value_cansleep(reset_gpio, 1); } return 0; } From ce5cdd3b05216b704a704f466fb4c2dff3778caf Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Tue, 11 Jun 2024 13:35:33 +0200 Subject: [PATCH 0802/1578] mips: bmips: BCM6358: make sure CBR is correctly set It was discovered that some device have CBR address set to 0 causing kernel panic when arch_sync_dma_for_cpu_all is called. This was notice in situation where the system is booted from TP1 and BMIPS_GET_CBR() returns 0 instead of a valid address and !!(read_c0_brcm_cmt_local() & (1 << 31)); not failing. The current check whether RAC flush should be disabled or not are not enough hence lets check if CBR is a valid address or not. Fixes: ab327f8acdf8 ("mips: bmips: BCM6358: disable RAC flush for TP1") Signed-off-by: Christian Marangi Acked-by: Florian Fainelli Signed-off-by: Thomas Bogendoerfer --- arch/mips/bmips/setup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/mips/bmips/setup.c b/arch/mips/bmips/setup.c index ec180ab92eaa83..66a8ba19c28722 100644 --- a/arch/mips/bmips/setup.c +++ b/arch/mips/bmips/setup.c @@ -110,7 +110,8 @@ static void bcm6358_quirks(void) * RAC flush causes kernel panics on BCM6358 when booting from TP1 * because the bootloader is not initializing it properly. */ - bmips_rac_flush_disable = !!(read_c0_brcm_cmt_local() & (1 << 31)); + bmips_rac_flush_disable = !!(read_c0_brcm_cmt_local() & (1 << 31)) || + !!BMIPS_GET_CBR(); } static void bcm6368_quirks(void) From b01b8151efe47a432f3f73623a6c1438727e7880 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sun, 9 Jun 2024 10:21:39 +0200 Subject: [PATCH 0803/1578] s390: Update defconfigs Signed-off-by: Heiko Carstens Acked-by: Vasily Gorbik Signed-off-by: Vasily Gorbik --- arch/s390/configs/debug_defconfig | 43 ++++++++++++++++++++++------ arch/s390/configs/defconfig | 40 ++++++++++++++++++++++---- arch/s390/configs/zfcpdump_defconfig | 5 +--- 3 files changed, 69 insertions(+), 19 deletions(-) diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 145342e46ea835..8c4adece891122 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -43,7 +43,6 @@ CONFIG_PROFILING=y CONFIG_KEXEC=y CONFIG_KEXEC_FILE=y CONFIG_KEXEC_SIG=y -CONFIG_CRASH_DUMP=y CONFIG_LIVEPATCH=y CONFIG_MARCH_Z13=y CONFIG_NR_CPUS=512 @@ -51,6 +50,7 @@ CONFIG_NUMA=y CONFIG_HZ_100=y CONFIG_CERT_STORE=y CONFIG_EXPOLINE=y +# CONFIG_EXPOLINE_EXTERN is not set CONFIG_EXPOLINE_AUTO=y CONFIG_CHSC_SCH=y CONFIG_VFIO_CCW=m @@ -76,6 +76,7 @@ CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_MODULE_UNLOAD_TAINT_TRACKING=y CONFIG_MODVERSIONS=y CONFIG_MODULE_SRCVERSION_ALL=y +CONFIG_MODULE_SIG_SHA256=y CONFIG_BLK_DEV_THROTTLING=y CONFIG_BLK_WBT=y CONFIG_BLK_CGROUP_IOLATENCY=y @@ -100,7 +101,6 @@ CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y -CONFIG_CMA_DEBUG=y CONFIG_CMA_DEBUGFS=y CONFIG_CMA_SYSFS=y CONFIG_CMA_AREAS=7 @@ -119,6 +119,7 @@ CONFIG_UNIX_DIAG=m CONFIG_XFRM_USER=m CONFIG_NET_KEY=m CONFIG_SMC_DIAG=m +CONFIG_SMC_LO=y CONFIG_INET=y CONFIG_IP_MULTICAST=y CONFIG_IP_ADVANCED_ROUTER=y @@ -133,7 +134,6 @@ CONFIG_IP_MROUTE=y CONFIG_IP_MROUTE_MULTIPLE_TABLES=y CONFIG_IP_PIMSM_V1=y CONFIG_IP_PIMSM_V2=y -CONFIG_SYN_COOKIES=y CONFIG_NET_IPVTI=m CONFIG_INET_AH=m CONFIG_INET_ESP=m @@ -167,6 +167,7 @@ CONFIG_BRIDGE_NETFILTER=m CONFIG_NETFILTER_NETLINK_HOOK=m CONFIG_NF_CONNTRACK=m CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_ZONES=y CONFIG_NF_CONNTRACK_PROCFS=y CONFIG_NF_CONNTRACK_EVENTS=y CONFIG_NF_CONNTRACK_TIMEOUT=y @@ -183,17 +184,39 @@ CONFIG_NF_CONNTRACK_SIP=m CONFIG_NF_CONNTRACK_TFTP=m CONFIG_NF_CT_NETLINK=m CONFIG_NF_CT_NETLINK_TIMEOUT=m +CONFIG_NF_CT_NETLINK_HELPER=m +CONFIG_NETFILTER_NETLINK_GLUE_CT=y CONFIG_NF_TABLES=m CONFIG_NF_TABLES_INET=y +CONFIG_NF_TABLES_NETDEV=y +CONFIG_NFT_NUMGEN=m CONFIG_NFT_CT=m +CONFIG_NFT_FLOW_OFFLOAD=m +CONFIG_NFT_CONNLIMIT=m CONFIG_NFT_LOG=m CONFIG_NFT_LIMIT=m +CONFIG_NFT_MASQ=m +CONFIG_NFT_REDIR=m CONFIG_NFT_NAT=m +CONFIG_NFT_TUNNEL=m +CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m CONFIG_NFT_REJECT=m CONFIG_NFT_COMPAT=m CONFIG_NFT_HASH=m CONFIG_NFT_FIB_INET=m -CONFIG_NETFILTER_XTABLES_COMPAT=y +CONFIG_NFT_XFRM=m +CONFIG_NFT_SOCKET=m +CONFIG_NFT_OSF=m +CONFIG_NFT_TPROXY=m +CONFIG_NFT_SYNPROXY=m +CONFIG_NFT_DUP_NETDEV=m +CONFIG_NFT_FWD_NETDEV=m +CONFIG_NFT_FIB_NETDEV=m +CONFIG_NFT_REJECT_NETDEV=m +CONFIG_NF_FLOW_TABLE_INET=m +CONFIG_NF_FLOW_TABLE=m +CONFIG_NF_FLOW_TABLE_PROCFS=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_AUDIT=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m @@ -206,8 +229,10 @@ CONFIG_NETFILTER_XT_TARGET_HMARK=m CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m CONFIG_NETFILTER_XT_TARGET_LOG=m CONFIG_NETFILTER_XT_TARGET_MARK=m +CONFIG_NETFILTER_XT_TARGET_NETMAP=m CONFIG_NETFILTER_XT_TARGET_NFLOG=m CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m +CONFIG_NETFILTER_XT_TARGET_REDIRECT=m CONFIG_NETFILTER_XT_TARGET_TEE=m CONFIG_NETFILTER_XT_TARGET_TPROXY=m CONFIG_NETFILTER_XT_TARGET_TRACE=m @@ -216,6 +241,7 @@ CONFIG_NETFILTER_XT_TARGET_TCPMSS=m CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m CONFIG_NETFILTER_XT_MATCH_BPF=m +CONFIG_NETFILTER_XT_MATCH_CGROUP=m CONFIG_NETFILTER_XT_MATCH_CLUSTER=m CONFIG_NETFILTER_XT_MATCH_COMMENT=m CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m @@ -230,6 +256,7 @@ CONFIG_NETFILTER_XT_MATCH_DSCP=m CONFIG_NETFILTER_XT_MATCH_ESP=m CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m CONFIG_NETFILTER_XT_MATCH_HELPER=m +CONFIG_NETFILTER_XT_MATCH_IPCOMP=m CONFIG_NETFILTER_XT_MATCH_IPRANGE=m CONFIG_NETFILTER_XT_MATCH_IPVS=m CONFIG_NETFILTER_XT_MATCH_LENGTH=m @@ -247,6 +274,7 @@ CONFIG_NETFILTER_XT_MATCH_QUOTA=m CONFIG_NETFILTER_XT_MATCH_RATEEST=m CONFIG_NETFILTER_XT_MATCH_REALM=m CONFIG_NETFILTER_XT_MATCH_RECENT=m +CONFIG_NETFILTER_XT_MATCH_SOCKET=m CONFIG_NETFILTER_XT_MATCH_STATE=m CONFIG_NETFILTER_XT_MATCH_STATISTIC=m CONFIG_NETFILTER_XT_MATCH_STRING=m @@ -302,7 +330,6 @@ CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m CONFIG_IP_NF_SECURITY=m -CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_FIB_IPV6=m @@ -373,7 +400,6 @@ CONFIG_NET_ACT_POLICE=m CONFIG_NET_ACT_GACT=m CONFIG_GACT_PROB=y CONFIG_NET_ACT_MIRRED=m -CONFIG_NET_ACT_IPT=m CONFIG_NET_ACT_NAT=m CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m @@ -462,6 +488,7 @@ CONFIG_DM_VERITY=m CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y CONFIG_DM_SWITCH=m CONFIG_DM_INTEGRITY=m +CONFIG_DM_VDO=m CONFIG_NETDEVICES=y CONFIG_BONDING=m CONFIG_DUMMY=m @@ -574,7 +601,6 @@ CONFIG_WATCHDOG=y CONFIG_WATCHDOG_NOWAYOUT=y CONFIG_SOFT_WATCHDOG=m CONFIG_DIAG288_WATCHDOG=m -# CONFIG_DRM_DEBUG_MODESET_LOCK is not set CONFIG_FB=y # CONFIG_FB_DEVICE is not set CONFIG_FRAMEBUFFER_CONSOLE=y @@ -645,7 +671,6 @@ CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_EXFAT_FS=m CONFIG_NTFS_FS=m -CONFIG_NTFS_RW=y CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y @@ -663,6 +688,7 @@ CONFIG_SQUASHFS_XZ=y CONFIG_SQUASHFS_ZSTD=y CONFIG_ROMFS_FS=m CONFIG_NFS_FS=m +CONFIG_NFS_V2=m CONFIG_NFS_V3_ACL=y CONFIG_NFS_V4=m CONFIG_NFS_SWAP=y @@ -879,6 +905,5 @@ CONFIG_RBTREE_TEST=y CONFIG_INTERVAL_TREE_TEST=m CONFIG_PERCPU_TEST=m CONFIG_ATOMIC64_SELFTEST=y -CONFIG_STRING_SELFTEST=y CONFIG_TEST_BITOPS=m CONFIG_TEST_BPF=m diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index dc237896f99d30..6dd11d3b6aaa62 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -41,7 +41,6 @@ CONFIG_PROFILING=y CONFIG_KEXEC=y CONFIG_KEXEC_FILE=y CONFIG_KEXEC_SIG=y -CONFIG_CRASH_DUMP=y CONFIG_LIVEPATCH=y CONFIG_MARCH_Z13=y CONFIG_NR_CPUS=512 @@ -49,6 +48,7 @@ CONFIG_NUMA=y CONFIG_HZ_100=y CONFIG_CERT_STORE=y CONFIG_EXPOLINE=y +# CONFIG_EXPOLINE_EXTERN is not set CONFIG_EXPOLINE_AUTO=y CONFIG_CHSC_SCH=y CONFIG_VFIO_CCW=m @@ -71,6 +71,7 @@ CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_MODULE_UNLOAD_TAINT_TRACKING=y CONFIG_MODVERSIONS=y CONFIG_MODULE_SRCVERSION_ALL=y +CONFIG_MODULE_SIG_SHA256=y CONFIG_BLK_DEV_THROTTLING=y CONFIG_BLK_WBT=y CONFIG_BLK_CGROUP_IOLATENCY=y @@ -110,6 +111,7 @@ CONFIG_UNIX_DIAG=m CONFIG_XFRM_USER=m CONFIG_NET_KEY=m CONFIG_SMC_DIAG=m +CONFIG_SMC_LO=y CONFIG_INET=y CONFIG_IP_MULTICAST=y CONFIG_IP_ADVANCED_ROUTER=y @@ -124,7 +126,6 @@ CONFIG_IP_MROUTE=y CONFIG_IP_MROUTE_MULTIPLE_TABLES=y CONFIG_IP_PIMSM_V1=y CONFIG_IP_PIMSM_V2=y -CONFIG_SYN_COOKIES=y CONFIG_NET_IPVTI=m CONFIG_INET_AH=m CONFIG_INET_ESP=m @@ -158,6 +159,7 @@ CONFIG_BRIDGE_NETFILTER=m CONFIG_NETFILTER_NETLINK_HOOK=m CONFIG_NF_CONNTRACK=m CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_ZONES=y CONFIG_NF_CONNTRACK_PROCFS=y CONFIG_NF_CONNTRACK_EVENTS=y CONFIG_NF_CONNTRACK_TIMEOUT=y @@ -174,17 +176,39 @@ CONFIG_NF_CONNTRACK_SIP=m CONFIG_NF_CONNTRACK_TFTP=m CONFIG_NF_CT_NETLINK=m CONFIG_NF_CT_NETLINK_TIMEOUT=m +CONFIG_NF_CT_NETLINK_HELPER=m +CONFIG_NETFILTER_NETLINK_GLUE_CT=y CONFIG_NF_TABLES=m CONFIG_NF_TABLES_INET=y +CONFIG_NF_TABLES_NETDEV=y +CONFIG_NFT_NUMGEN=m CONFIG_NFT_CT=m +CONFIG_NFT_FLOW_OFFLOAD=m +CONFIG_NFT_CONNLIMIT=m CONFIG_NFT_LOG=m CONFIG_NFT_LIMIT=m +CONFIG_NFT_MASQ=m +CONFIG_NFT_REDIR=m CONFIG_NFT_NAT=m +CONFIG_NFT_TUNNEL=m +CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m CONFIG_NFT_REJECT=m CONFIG_NFT_COMPAT=m CONFIG_NFT_HASH=m CONFIG_NFT_FIB_INET=m -CONFIG_NETFILTER_XTABLES_COMPAT=y +CONFIG_NFT_XFRM=m +CONFIG_NFT_SOCKET=m +CONFIG_NFT_OSF=m +CONFIG_NFT_TPROXY=m +CONFIG_NFT_SYNPROXY=m +CONFIG_NFT_DUP_NETDEV=m +CONFIG_NFT_FWD_NETDEV=m +CONFIG_NFT_FIB_NETDEV=m +CONFIG_NFT_REJECT_NETDEV=m +CONFIG_NF_FLOW_TABLE_INET=m +CONFIG_NF_FLOW_TABLE=m +CONFIG_NF_FLOW_TABLE_PROCFS=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_AUDIT=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m @@ -197,8 +221,10 @@ CONFIG_NETFILTER_XT_TARGET_HMARK=m CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m CONFIG_NETFILTER_XT_TARGET_LOG=m CONFIG_NETFILTER_XT_TARGET_MARK=m +CONFIG_NETFILTER_XT_TARGET_NETMAP=m CONFIG_NETFILTER_XT_TARGET_NFLOG=m CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m +CONFIG_NETFILTER_XT_TARGET_REDIRECT=m CONFIG_NETFILTER_XT_TARGET_TEE=m CONFIG_NETFILTER_XT_TARGET_TPROXY=m CONFIG_NETFILTER_XT_TARGET_TRACE=m @@ -207,6 +233,7 @@ CONFIG_NETFILTER_XT_TARGET_TCPMSS=m CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m CONFIG_NETFILTER_XT_MATCH_BPF=m +CONFIG_NETFILTER_XT_MATCH_CGROUP=m CONFIG_NETFILTER_XT_MATCH_CLUSTER=m CONFIG_NETFILTER_XT_MATCH_COMMENT=m CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m @@ -221,6 +248,7 @@ CONFIG_NETFILTER_XT_MATCH_DSCP=m CONFIG_NETFILTER_XT_MATCH_ESP=m CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m CONFIG_NETFILTER_XT_MATCH_HELPER=m +CONFIG_NETFILTER_XT_MATCH_IPCOMP=m CONFIG_NETFILTER_XT_MATCH_IPRANGE=m CONFIG_NETFILTER_XT_MATCH_IPVS=m CONFIG_NETFILTER_XT_MATCH_LENGTH=m @@ -238,6 +266,7 @@ CONFIG_NETFILTER_XT_MATCH_QUOTA=m CONFIG_NETFILTER_XT_MATCH_RATEEST=m CONFIG_NETFILTER_XT_MATCH_REALM=m CONFIG_NETFILTER_XT_MATCH_RECENT=m +CONFIG_NETFILTER_XT_MATCH_SOCKET=m CONFIG_NETFILTER_XT_MATCH_STATE=m CONFIG_NETFILTER_XT_MATCH_STATISTIC=m CONFIG_NETFILTER_XT_MATCH_STRING=m @@ -293,7 +322,6 @@ CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m CONFIG_IP_NF_SECURITY=m -CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_FIB_IPV6=m @@ -363,7 +391,6 @@ CONFIG_NET_ACT_POLICE=m CONFIG_NET_ACT_GACT=m CONFIG_GACT_PROB=y CONFIG_NET_ACT_MIRRED=m -CONFIG_NET_ACT_IPT=m CONFIG_NET_ACT_NAT=m CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m @@ -452,6 +479,7 @@ CONFIG_DM_VERITY=m CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y CONFIG_DM_SWITCH=m CONFIG_DM_INTEGRITY=m +CONFIG_DM_VDO=m CONFIG_NETDEVICES=y CONFIG_BONDING=m CONFIG_DUMMY=m @@ -630,7 +658,6 @@ CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_EXFAT_FS=m CONFIG_NTFS_FS=m -CONFIG_NTFS_RW=y CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y @@ -649,6 +676,7 @@ CONFIG_SQUASHFS_XZ=y CONFIG_SQUASHFS_ZSTD=y CONFIG_ROMFS_FS=m CONFIG_NFS_FS=m +CONFIG_NFS_V2=m CONFIG_NFS_V3_ACL=y CONFIG_NFS_V4=m CONFIG_NFS_SWAP=y diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index c51f3ec4eb28ab..8c2b61363bab91 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -9,25 +9,22 @@ CONFIG_BPF_SYSCALL=y CONFIG_BLK_DEV_INITRD=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_KEXEC=y -CONFIG_CRASH_DUMP=y CONFIG_MARCH_Z13=y CONFIG_NR_CPUS=2 CONFIG_HZ_100=y # CONFIG_CHSC_SCH is not set # CONFIG_SCM_BUS is not set +# CONFIG_AP is not set # CONFIG_PFAULT is not set # CONFIG_S390_HYPFS is not set # CONFIG_VIRTUALIZATION is not set # CONFIG_S390_GUEST is not set # CONFIG_SECCOMP is not set -# CONFIG_GCC_PLUGINS is not set # CONFIG_BLOCK_LEGACY_AUTOLOAD is not set CONFIG_PARTITION_ADVANCED=y # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set # CONFIG_SWAP is not set # CONFIG_COMPAT_BRK is not set -# CONFIG_COMPACTION is not set -# CONFIG_MIGRATION is not set CONFIG_NET=y # CONFIG_IUCV is not set # CONFIG_PCPU_DEV_REFCNT is not set From d8073dc6bc04a061660b31e49a990478a73f1883 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 7 Jun 2024 14:19:48 +0200 Subject: [PATCH 0804/1578] s390/mm: Allow large pages only for aligned physical addresses Do not allow creation of large pages against physical addresses, which itself are not aligned on the correct boundary. Failure to do so might lead to referencing wrong memory as result of the way DAT works. Fixes: c98d2ecae08f ("s390/mm: Uncouple physical vs virtual address spaces") Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/boot/vmem.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c index 96d48b7112d40b..1f7dbb4b66b701 100644 --- a/arch/s390/boot/vmem.c +++ b/arch/s390/boot/vmem.c @@ -267,15 +267,21 @@ static bool large_allowed(enum populate_mode mode) static bool can_large_pud(pud_t *pu_dir, unsigned long addr, unsigned long end, enum populate_mode mode) { + unsigned long size = end - addr; + return machine.has_edat2 && large_allowed(mode) && - IS_ALIGNED(addr, PUD_SIZE) && (end - addr) >= PUD_SIZE; + IS_ALIGNED(addr, PUD_SIZE) && (size >= PUD_SIZE) && + IS_ALIGNED(_pa(addr, size, mode), PUD_SIZE); } static bool can_large_pmd(pmd_t *pm_dir, unsigned long addr, unsigned long end, enum populate_mode mode) { + unsigned long size = end - addr; + return machine.has_edat1 && large_allowed(mode) && - IS_ALIGNED(addr, PMD_SIZE) && (end - addr) >= PMD_SIZE; + IS_ALIGNED(addr, PMD_SIZE) && (size >= PMD_SIZE) && + IS_ALIGNED(_pa(addr, size, mode), PMD_SIZE); } static void pgtable_pte_populate(pmd_t *pmd, unsigned long addr, unsigned long end, From 693d41f7c938f92d881e6a51525e6c132a186afd Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 7 Jun 2024 14:19:50 +0200 Subject: [PATCH 0805/1578] s390/mm: Restore mapping of kernel image using large pages Since physical and virtual kernel address spaces are uncoupled the kernel image is not mapped using large segment pages anymore, which is a regression. Put the kernel image at the same large segment page offset in physical memory as in virtual memory. Such approach preserves the existing number of bits of entropy used for randomization of the kernel location in virtual memory when KASLR is on. As result, the kernel is mapped using large segment pages. Fixes: c98d2ecae08f ("s390/mm: Uncouple physical vs virtual address spaces") Reported-by: Heiko Carstens Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/boot/startup.c | 27 ++++++++++++++++++++++++--- arch/s390/boot/vmem.c | 2 +- arch/s390/boot/vmlinux.lds.S | 1 + 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 182aac6a0f7795..48ef5fe5c08aaa 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -384,7 +384,7 @@ static void fixup_vmlinux_info(void) void startup_kernel(void) { unsigned long kernel_size = vmlinux.image_size + vmlinux.bss_size; - unsigned long nokaslr_offset_phys = mem_safe_offset(); + unsigned long nokaslr_offset_phys, kaslr_large_page_offset; unsigned long amode31_lma = 0; unsigned long max_physmem_end; unsigned long asce_limit; @@ -393,6 +393,12 @@ void startup_kernel(void) fixup_vmlinux_info(); setup_lpp(); + + /* + * Non-randomized kernel physical start address must be _SEGMENT_SIZE + * aligned (see blow). + */ + nokaslr_offset_phys = ALIGN(mem_safe_offset(), _SEGMENT_SIZE); safe_addr = PAGE_ALIGN(nokaslr_offset_phys + kernel_size); /* @@ -425,10 +431,25 @@ void startup_kernel(void) save_ipl_cert_comp_list(); rescue_initrd(safe_addr, ident_map_size); - if (kaslr_enabled()) - __kaslr_offset_phys = randomize_within_range(kernel_size, THREAD_SIZE, 0, ident_map_size); + /* + * __kaslr_offset_phys must be _SEGMENT_SIZE aligned, so the lower + * 20 bits (the offset within a large page) are zero. Copy the last + * 20 bits of __kaslr_offset, which is THREAD_SIZE aligned, to + * __kaslr_offset_phys. + * + * With this the last 20 bits of __kaslr_offset_phys and __kaslr_offset + * are identical, which is required to allow for large mappings of the + * kernel image. + */ + kaslr_large_page_offset = __kaslr_offset & ~_SEGMENT_MASK; + if (kaslr_enabled()) { + unsigned long end = ident_map_size - kaslr_large_page_offset; + + __kaslr_offset_phys = randomize_within_range(kernel_size, _SEGMENT_SIZE, 0, end); + } if (!__kaslr_offset_phys) __kaslr_offset_phys = nokaslr_offset_phys; + __kaslr_offset_phys |= kaslr_large_page_offset; kaslr_adjust_vmlinux_info(__kaslr_offset_phys); physmem_reserve(RR_VMLINUX, __kaslr_offset_phys, kernel_size); deploy_kernel((void *)__kaslr_offset_phys); diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c index 1f7dbb4b66b701..40cfce2687c437 100644 --- a/arch/s390/boot/vmem.c +++ b/arch/s390/boot/vmem.c @@ -261,7 +261,7 @@ static unsigned long _pa(unsigned long addr, unsigned long size, enum populate_m static bool large_allowed(enum populate_mode mode) { - return (mode == POPULATE_DIRECT) || (mode == POPULATE_IDENTITY); + return (mode == POPULATE_DIRECT) || (mode == POPULATE_IDENTITY) || (mode == POPULATE_KERNEL); } static bool can_large_pud(pud_t *pu_dir, unsigned long addr, unsigned long end, diff --git a/arch/s390/boot/vmlinux.lds.S b/arch/s390/boot/vmlinux.lds.S index 1fe5a1d3ff60d6..a750711d44c863 100644 --- a/arch/s390/boot/vmlinux.lds.S +++ b/arch/s390/boot/vmlinux.lds.S @@ -109,6 +109,7 @@ SECTIONS #ifdef CONFIG_KERNEL_UNCOMPRESSED . = ALIGN(PAGE_SIZE); . += AMODE31_SIZE; /* .amode31 section */ + . = ALIGN(1 << 20); /* _SEGMENT_SIZE */ #else . = ALIGN(8); #endif From 2049aad5d3a6921f80121029afe6fbcfb2727861 Mon Sep 17 00:00:00 2001 From: Amer Al Shanawany Date: Wed, 17 Apr 2024 20:49:13 +0200 Subject: [PATCH 0806/1578] selftests: filesystems: fix warn_unused_result build warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following warnings by adding return check and error messages. statmount_test.c: In function ‘cleanup_namespace’: statmount_test.c:128:9: warning: ignoring return value of ‘fchdir’ declared with attribute ‘warn_unused_result’ [-Wunused-result] 128 | fchdir(orig_root); | ^~~~~~~~~~~~~~~~~ statmount_test.c:129:9: warning: ignoring return value of ‘chroot’ declared with attribute ‘warn_unused_result’ [-Wunused-result] 129 | chroot("."); | ^~~~~~~~~~~ Signed-off-by: Amer Al Shanawany Reviewed-by: Muhammad Usama Anjum Signed-off-by: Shuah Khan --- .../selftests/filesystems/statmount/statmount_test.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test.c b/tools/testing/selftests/filesystems/statmount/statmount_test.c index e6d7c4f1c85b50..e8c019d72cbf3d 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount_test.c +++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c @@ -125,8 +125,16 @@ static uint32_t old_root_id, old_parent_id; static void cleanup_namespace(void) { - fchdir(orig_root); - chroot("."); + int ret; + + ret = fchdir(orig_root); + if (ret == -1) + ksft_perror("fchdir to original root"); + + ret = chroot("."); + if (ret == -1) + ksft_perror("chroot to original root"); + umount2(root_mntpoint, MNT_DETACH); rmdir(root_mntpoint); } From 04e1f99afe8bec27ad2d2726897ac8185bf0532c Mon Sep 17 00:00:00 2001 From: Amer Al Shanawany Date: Tue, 11 Jun 2024 17:16:08 +0200 Subject: [PATCH 0807/1578] selftests: seccomp: fix format-zero-length warnings fix the following errors by using string format specifier and an empty parameter: seccomp_benchmark.c:197:24: warning: zero-length gnu_printf format string [-Wformat-zero-length] 197 | ksft_print_msg(""); | ^~ seccomp_benchmark.c:202:24: warning: zero-length gnu_printf format string [-Wformat-zero-length] 202 | ksft_print_msg(""); | ^~ seccomp_benchmark.c:204:24: warning: zero-length gnu_printf format string [-Wformat-zero-length] 204 | ksft_print_msg(""); | ^~ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202312260235.Uj5ug8K9-lkp@intel.com/ Suggested-by: Kees Cook Signed-off-by: Amer Al Shanawany Signed-off-by: Shuah Khan --- tools/testing/selftests/seccomp/seccomp_benchmark.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/seccomp/seccomp_benchmark.c b/tools/testing/selftests/seccomp/seccomp_benchmark.c index b83099160fbcaa..94886c82ae609f 100644 --- a/tools/testing/selftests/seccomp/seccomp_benchmark.c +++ b/tools/testing/selftests/seccomp/seccomp_benchmark.c @@ -194,14 +194,14 @@ int main(int argc, char *argv[]) ksft_set_plan(7); ksft_print_msg("Running on:\n"); - ksft_print_msg(""); + ksft_print_msg("%s", ""); system("uname -a"); ksft_print_msg("Current BPF sysctl settings:\n"); /* Avoid using "sysctl" which may not be installed. */ - ksft_print_msg(""); + ksft_print_msg("%s", ""); system("grep -H . /proc/sys/net/core/bpf_jit_enable"); - ksft_print_msg(""); + ksft_print_msg("%s", ""); system("grep -H . /proc/sys/net/core/bpf_jit_harden"); affinity(); From e3215deca4520773cd2b155bed164c12365149a7 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Mon, 3 Jun 2024 09:24:44 +0800 Subject: [PATCH 0808/1578] dmaengine: idxd: Fix possible Use-After-Free in irq_process_work_list Use list_for_each_entry_safe() to allow iterating through the list and deleting the entry in the iteration process. The descriptor is freed via idxd_desc_complete() and there's a slight chance may cause issue for the list iterator when the descriptor is reused by another thread without it being deleted from the list. Fixes: 16e19e11228b ("dmaengine: idxd: Fix list corruption in description completion") Signed-off-by: Li RongQing Reviewed-by: Dave Jiang Reviewed-by: Fenghua Yu Link: https://lore.kernel.org/r/20240603012444.11902-1-lirongqing@baidu.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/irq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 8dc029c8655151..fc049c9c9892e6 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -611,11 +611,13 @@ static void irq_process_work_list(struct idxd_irq_entry *irq_entry) spin_unlock(&irq_entry->list_lock); - list_for_each_entry(desc, &flist, list) { + list_for_each_entry_safe(desc, n, &flist, list) { /* * Check against the original status as ABORT is software defined * and 0xff, which DSA_COMP_STATUS_MASK can mask out. */ + list_del(&desc->list); + if (unlikely(desc->completion->status == IDXD_COMP_DESC_ABORT)) { idxd_desc_complete(desc, IDXD_COMPLETE_ABORT, true); continue; From e8343410ddf08fc36a9b9cc7c51a4e53a262d4c6 Mon Sep 17 00:00:00 2001 From: Jai Luthra Date: Tue, 11 Jun 2024 18:02:55 +0530 Subject: [PATCH 0809/1578] ALSA: dmaengine: Synchronize dma channel after drop() Sometimes the stream may be stopped due to XRUN events, in which case the userspace can call snd_pcm_drop() and snd_pcm_prepare() to stop and start the stream again. In these cases, we must wait for the DMA channel to synchronize before marking the stream as prepared for playback, as the DMA channel gets stopped by drop() without any synchronization. Make sure the ALSA core synchronizes the DMA channel by adding a sync_stop() hook. Reviewed-by: Peter Ujfalusi Signed-off-by: Jai Luthra Link: https://lore.kernel.org/r/20240611-asoc_next-v3-1-fcfd84b12164@ti.com Signed-off-by: Mark Brown --- include/sound/dmaengine_pcm.h | 1 + sound/core/pcm_dmaengine.c | 10 ++++++++++ sound/soc/soc-generic-dmaengine-pcm.c | 8 ++++++++ 3 files changed, 19 insertions(+) diff --git a/include/sound/dmaengine_pcm.h b/include/sound/dmaengine_pcm.h index c11aaf8079fbbe..f6baa9a0186815 100644 --- a/include/sound/dmaengine_pcm.h +++ b/include/sound/dmaengine_pcm.h @@ -36,6 +36,7 @@ snd_pcm_uframes_t snd_dmaengine_pcm_pointer_no_residue(struct snd_pcm_substream int snd_dmaengine_pcm_open(struct snd_pcm_substream *substream, struct dma_chan *chan); int snd_dmaengine_pcm_close(struct snd_pcm_substream *substream); +int snd_dmaengine_pcm_sync_stop(struct snd_pcm_substream *substream); int snd_dmaengine_pcm_open_request_chan(struct snd_pcm_substream *substream, dma_filter_fn filter_fn, void *filter_data); diff --git a/sound/core/pcm_dmaengine.c b/sound/core/pcm_dmaengine.c index 12aa1cef11a133..ed07fa5693d2dd 100644 --- a/sound/core/pcm_dmaengine.c +++ b/sound/core/pcm_dmaengine.c @@ -349,6 +349,16 @@ int snd_dmaengine_pcm_open_request_chan(struct snd_pcm_substream *substream, } EXPORT_SYMBOL_GPL(snd_dmaengine_pcm_open_request_chan); +int snd_dmaengine_pcm_sync_stop(struct snd_pcm_substream *substream) +{ + struct dmaengine_pcm_runtime_data *prtd = substream_to_prtd(substream); + + dmaengine_synchronize(prtd->dma_chan); + + return 0; +} +EXPORT_SYMBOL_GPL(snd_dmaengine_pcm_sync_stop); + /** * snd_dmaengine_pcm_close - Close a dmaengine based PCM substream * @substream: PCM substream diff --git a/sound/soc/soc-generic-dmaengine-pcm.c b/sound/soc/soc-generic-dmaengine-pcm.c index ea3bc9318412a3..a63e942fdc0b7d 100644 --- a/sound/soc/soc-generic-dmaengine-pcm.c +++ b/sound/soc/soc-generic-dmaengine-pcm.c @@ -318,6 +318,12 @@ static int dmaengine_copy(struct snd_soc_component *component, return 0; } +static int dmaengine_pcm_sync_stop(struct snd_soc_component *component, + struct snd_pcm_substream *substream) +{ + return snd_dmaengine_pcm_sync_stop(substream); +} + static const struct snd_soc_component_driver dmaengine_pcm_component = { .name = SND_DMAENGINE_PCM_DRV_NAME, .probe_order = SND_SOC_COMP_ORDER_LATE, @@ -327,6 +333,7 @@ static const struct snd_soc_component_driver dmaengine_pcm_component = { .trigger = dmaengine_pcm_trigger, .pointer = dmaengine_pcm_pointer, .pcm_construct = dmaengine_pcm_new, + .sync_stop = dmaengine_pcm_sync_stop, }; static const struct snd_soc_component_driver dmaengine_pcm_component_process = { @@ -339,6 +346,7 @@ static const struct snd_soc_component_driver dmaengine_pcm_component_process = { .pointer = dmaengine_pcm_pointer, .copy = dmaengine_copy, .pcm_construct = dmaengine_pcm_new, + .sync_stop = dmaengine_pcm_sync_stop, }; static const char * const dmaengine_pcm_dma_channel_names[] = { From c5dcf8ab10606e76c1d8a0ec77f27d84a392e874 Mon Sep 17 00:00:00 2001 From: Jai Luthra Date: Tue, 11 Jun 2024 18:02:56 +0530 Subject: [PATCH 0810/1578] ASoC: ti: davinci-mcasp: Set min period size using FIFO config The minimum period size was enforced to 64 as older devices integrating McASP with EDMA used an internal FIFO of 64 samples. With UDMA based platforms this internal McASP FIFO is optional, as the DMA engine internally does some buffering which is already accounted for when registering the platform. So we should read the actual FIFO configuration (txnumevt/rxnumevt) instead of hardcoding frames.min to 64. Acked-by: Peter Ujfalusi Signed-off-by: Jai Luthra Link: https://lore.kernel.org/r/20240611-asoc_next-v3-2-fcfd84b12164@ti.com Signed-off-by: Mark Brown --- sound/soc/ti/davinci-mcasp.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/sound/soc/ti/davinci-mcasp.c b/sound/soc/ti/davinci-mcasp.c index 1e760c3155213d..2b1ed91a736c9a 100644 --- a/sound/soc/ti/davinci-mcasp.c +++ b/sound/soc/ti/davinci-mcasp.c @@ -1472,10 +1472,11 @@ static int davinci_mcasp_hw_rule_min_periodsize( { struct snd_interval *period_size = hw_param_interval(params, SNDRV_PCM_HW_PARAM_PERIOD_SIZE); + u8 numevt = *((u8 *)rule->private); struct snd_interval frames; snd_interval_any(&frames); - frames.min = 64; + frames.min = numevt; frames.integer = 1; return snd_interval_refine(period_size, &frames); @@ -1490,6 +1491,7 @@ static int davinci_mcasp_startup(struct snd_pcm_substream *substream, u32 max_channels = 0; int i, dir, ret; int tdm_slots = mcasp->tdm_slots; + u8 *numevt; /* Do not allow more then one stream per direction */ if (mcasp->substreams[substream->stream]) @@ -1589,9 +1591,12 @@ static int davinci_mcasp_startup(struct snd_pcm_substream *substream, return ret; } + numevt = (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) ? + &mcasp->txnumevt : + &mcasp->rxnumevt; snd_pcm_hw_rule_add(substream->runtime, 0, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, - davinci_mcasp_hw_rule_min_periodsize, NULL, + davinci_mcasp_hw_rule_min_periodsize, numevt, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, -1); return 0; From 524d3f126362b6033e92cbe107ae2158d7fbff94 Mon Sep 17 00:00:00 2001 From: Primoz Fiser Date: Mon, 10 Jun 2024 14:58:47 +0200 Subject: [PATCH 0811/1578] ASoC: ti: omap-hdmi: Fix too long driver name Set driver name to "HDMI". This simplifies the code and gets rid of the following error messages: ASoC: driver name too long 'HDMI 58040000.encoder' -> 'HDMI_58040000_e' Signed-off-by: Primoz Fiser Acked-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20240610125847.773394-1-primoz.fiser@norik.com Signed-off-by: Mark Brown --- sound/soc/ti/omap-hdmi.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/sound/soc/ti/omap-hdmi.c b/sound/soc/ti/omap-hdmi.c index 639bc83f426394..cf43ac19c4a6d0 100644 --- a/sound/soc/ti/omap-hdmi.c +++ b/sound/soc/ti/omap-hdmi.c @@ -354,11 +354,7 @@ static int omap_hdmi_audio_probe(struct platform_device *pdev) if (!card) return -ENOMEM; - card->name = devm_kasprintf(dev, GFP_KERNEL, - "HDMI %s", dev_name(ad->dssdev)); - if (!card->name) - return -ENOMEM; - + card->name = "HDMI"; card->owner = THIS_MODULE; card->dai_link = devm_kzalloc(dev, sizeof(*(card->dai_link)), GFP_KERNEL); From c4ab9da85b9df3692f861512fe6c9812f38b7471 Mon Sep 17 00:00:00 2001 From: Davide Ornaghi Date: Wed, 5 Jun 2024 13:03:45 +0200 Subject: [PATCH 0812/1578] netfilter: nft_inner: validate mandatory meta and payload Check for mandatory netlink attributes in payload and meta expression when used embedded from the inner expression, otherwise NULL pointer dereference is possible from userspace. Fixes: a150d122b6bd ("netfilter: nft_meta: add inner match support") Fixes: 3a07327d10a0 ("netfilter: nft_inner: support for inner tunnel header matching") Signed-off-by: Davide Ornaghi Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_meta.c | 3 +++ net/netfilter/nft_payload.c | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index ba0d3683a45d32..9139ce38ea7b9a 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -839,6 +839,9 @@ static int nft_meta_inner_init(const struct nft_ctx *ctx, struct nft_meta *priv = nft_expr_priv(expr); unsigned int len; + if (!tb[NFTA_META_KEY] || !tb[NFTA_META_DREG]) + return -EINVAL; + priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); switch (priv->key) { case NFT_META_PROTOCOL: diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 0c43d748e23ae1..50429cbd42da42 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -650,6 +650,10 @@ static int nft_payload_inner_init(const struct nft_ctx *ctx, struct nft_payload *priv = nft_expr_priv(expr); u32 base; + if (!tb[NFTA_PAYLOAD_BASE] || !tb[NFTA_PAYLOAD_OFFSET] || + !tb[NFTA_PAYLOAD_LEN] || !tb[NFTA_PAYLOAD_DREG]) + return -EINVAL; + base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); switch (base) { case NFT_PAYLOAD_TUN_HEADER: From 4e7aaa6b82d63e8ddcbfb56b4fd3d014ca586f10 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Tue, 4 Jun 2024 15:58:03 +0200 Subject: [PATCH 0813/1578] netfilter: ipset: Fix race between namespace cleanup and gc in the list:set type Lion Ackermann reported that there is a race condition between namespace cleanup in ipset and the garbage collection of the list:set type. The namespace cleanup can destroy the list:set type of sets while the gc of the set type is waiting to run in rcu cleanup. The latter uses data from the destroyed set which thus leads use after free. The patch contains the following parts: - When destroying all sets, first remove the garbage collectors, then wait if needed and then destroy the sets. - Fix the badly ordered "wait then remove gc" for the destroy a single set case. - Fix the missing rcu locking in the list:set type in the userspace test case. - Use proper RCU list handlings in the list:set type. The patch depends on c1193d9bbbd3 (netfilter: ipset: Add list flush to cancel_gc). Fixes: 97f7cf1cd80e (netfilter: ipset: fix performance regression in swap operation) Reported-by: Lion Ackermann Tested-by: Lion Ackermann Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 81 +++++++++++++++------------ net/netfilter/ipset/ip_set_list_set.c | 30 +++++----- 2 files changed, 60 insertions(+), 51 deletions(-) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 3184cc6be4c9d3..c7ae4d9bf3d24c 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1172,23 +1172,50 @@ ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = { .len = IPSET_MAXNAMELEN - 1 }, }; +/* In order to return quickly when destroying a single set, it is split + * into two stages: + * - Cancel garbage collector + * - Destroy the set itself via call_rcu() + */ + static void -ip_set_destroy_set(struct ip_set *set) +ip_set_destroy_set_rcu(struct rcu_head *head) { - pr_debug("set: %s\n", set->name); + struct ip_set *set = container_of(head, struct ip_set, rcu); - /* Must call it without holding any lock */ set->variant->destroy(set); module_put(set->type->me); kfree(set); } static void -ip_set_destroy_set_rcu(struct rcu_head *head) +_destroy_all_sets(struct ip_set_net *inst) { - struct ip_set *set = container_of(head, struct ip_set, rcu); + struct ip_set *set; + ip_set_id_t i; + bool need_wait = false; - ip_set_destroy_set(set); + /* First cancel gc's: set:list sets are flushed as well */ + for (i = 0; i < inst->ip_set_max; i++) { + set = ip_set(inst, i); + if (set) { + set->variant->cancel_gc(set); + if (set->type->features & IPSET_TYPE_NAME) + need_wait = true; + } + } + /* Must wait for flush to be really finished */ + if (need_wait) + rcu_barrier(); + for (i = 0; i < inst->ip_set_max; i++) { + set = ip_set(inst, i); + if (set) { + ip_set(inst, i) = NULL; + set->variant->destroy(set); + module_put(set->type->me); + kfree(set); + } + } } static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, @@ -1202,11 +1229,10 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, if (unlikely(protocol_min_failed(attr))) return -IPSET_ERR_PROTOCOL; - /* Commands are serialized and references are * protected by the ip_set_ref_lock. * External systems (i.e. xt_set) must call - * ip_set_put|get_nfnl_* functions, that way we + * ip_set_nfnl_get_* functions, that way we * can safely check references here. * * list:set timer can only decrement the reference @@ -1214,8 +1240,6 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, * without holding the lock. */ if (!attr[IPSET_ATTR_SETNAME]) { - /* Must wait for flush to be really finished in list:set */ - rcu_barrier(); read_lock_bh(&ip_set_ref_lock); for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); @@ -1226,15 +1250,7 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, } inst->is_destroyed = true; read_unlock_bh(&ip_set_ref_lock); - for (i = 0; i < inst->ip_set_max; i++) { - s = ip_set(inst, i); - if (s) { - ip_set(inst, i) = NULL; - /* Must cancel garbage collectors */ - s->variant->cancel_gc(s); - ip_set_destroy_set(s); - } - } + _destroy_all_sets(inst); /* Modified by ip_set_destroy() only, which is serialized */ inst->is_destroyed = false; } else { @@ -1255,12 +1271,12 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, features = s->type->features; ip_set(inst, i) = NULL; read_unlock_bh(&ip_set_ref_lock); + /* Must cancel garbage collectors */ + s->variant->cancel_gc(s); if (features & IPSET_TYPE_NAME) { /* Must wait for flush to be really finished */ rcu_barrier(); } - /* Must cancel garbage collectors */ - s->variant->cancel_gc(s); call_rcu(&s->rcu, ip_set_destroy_set_rcu); } return 0; @@ -2365,30 +2381,25 @@ ip_set_net_init(struct net *net) } static void __net_exit -ip_set_net_exit(struct net *net) +ip_set_net_pre_exit(struct net *net) { struct ip_set_net *inst = ip_set_pernet(net); - struct ip_set *set = NULL; - ip_set_id_t i; - inst->is_deleted = true; /* flag for ip_set_nfnl_put */ +} - nfnl_lock(NFNL_SUBSYS_IPSET); - for (i = 0; i < inst->ip_set_max; i++) { - set = ip_set(inst, i); - if (set) { - ip_set(inst, i) = NULL; - set->variant->cancel_gc(set); - ip_set_destroy_set(set); - } - } - nfnl_unlock(NFNL_SUBSYS_IPSET); +static void __net_exit +ip_set_net_exit(struct net *net) +{ + struct ip_set_net *inst = ip_set_pernet(net); + + _destroy_all_sets(inst); kvfree(rcu_dereference_protected(inst->ip_set_list, 1)); } static struct pernet_operations ip_set_net_ops = { .init = ip_set_net_init, + .pre_exit = ip_set_net_pre_exit, .exit = ip_set_net_exit, .id = &ip_set_net_id, .size = sizeof(struct ip_set_net), diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 54e2a1dd7f5f51..bfae7066936bb9 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -79,7 +79,7 @@ list_set_kadd(struct ip_set *set, const struct sk_buff *skb, struct set_elem *e; int ret; - list_for_each_entry(e, &map->members, list) { + list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -99,7 +99,7 @@ list_set_kdel(struct ip_set *set, const struct sk_buff *skb, struct set_elem *e; int ret; - list_for_each_entry(e, &map->members, list) { + list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -188,9 +188,10 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct list_set *map = set->data; struct set_adt_elem *d = value; struct set_elem *e, *next, *prev = NULL; - int ret; + int ret = 0; - list_for_each_entry(e, &map->members, list) { + rcu_read_lock(); + list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -201,6 +202,7 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (d->before == 0) { ret = 1; + goto out; } else if (d->before > 0) { next = list_next_entry(e, list); ret = !list_is_last(&e->list, &map->members) && @@ -208,9 +210,11 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, } else { ret = prev && prev->id == d->refid; } - return ret; + goto out; } - return 0; +out: + rcu_read_unlock(); + return ret; } static void @@ -239,7 +243,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, /* Find where to add the new entry */ n = prev = next = NULL; - list_for_each_entry(e, &map->members, list) { + list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -316,9 +320,9 @@ list_set_udel(struct ip_set *set, void *value, const struct ip_set_ext *ext, { struct list_set *map = set->data; struct set_adt_elem *d = value; - struct set_elem *e, *next, *prev = NULL; + struct set_elem *e, *n, *next, *prev = NULL; - list_for_each_entry(e, &map->members, list) { + list_for_each_entry_safe(e, n, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -424,14 +428,8 @@ static void list_set_destroy(struct ip_set *set) { struct list_set *map = set->data; - struct set_elem *e, *n; - list_for_each_entry_safe(e, n, &map->members, list) { - list_del(&e->list); - ip_set_put_byindex(map->net, e->id); - ip_set_ext_destroy(set, e); - kfree(e); - } + WARN_ON_ONCE(!list_empty(&map->members)); kfree(map); set->data = NULL; From 6f8f132cc7bac2ac76911e47d5baa378aafda4cb Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 6 Jun 2024 12:23:31 +0200 Subject: [PATCH 0814/1578] netfilter: Use flowlabel flow key when re-routing mangled packets 'ip6 dscp set $v' in an nftables outpute route chain has no effect. While nftables does detect the dscp change and calls the reroute hook. But ip6_route_me_harder never sets the dscp/flowlabel: flowlabel/dsfield routing rules are ignored and no reroute takes place. Thanks to Yi Chen for an excellent reproducer script that I used to validate this change. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Yi Chen Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 53d255838e6ab5..5d989d803009f5 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -36,6 +36,7 @@ int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff .flowi6_uid = sock_net_uid(net, sk), .daddr = iph->daddr, .saddr = iph->saddr, + .flowlabel = ip6_flowinfo(iph), }; int err; From ba27e9d2207784da748b19170a2e56bd7770bd81 Mon Sep 17 00:00:00 2001 From: Siddharth Vadapalli Date: Sun, 2 Jun 2024 07:03:19 +0530 Subject: [PATCH 0815/1578] dmaengine: ti: k3-udma-glue: Fix of_k3_udma_glue_parse_chn_by_id() The of_k3_udma_glue_parse_chn_by_id() helper function erroneously invokes "of_node_put()" on the "udmax_np" device-node passed to it, without having incremented its reference count at any point. Fix it. Fixes: 81a1f90f20af ("dmaengine: ti: k3-udma-glue: Add function to parse channel by ID") Signed-off-by: Siddharth Vadapalli Acked-by: Peter Ujfalusi Acked-by: Peter Ujfalusi@gmail.com Link: https://lore.kernel.org/r/20240602013319.2975894-1-s-vadapalli@ti.com Signed-off-by: Vinod Koul --- drivers/dma/ti/k3-udma-glue.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/dma/ti/k3-udma-glue.c b/drivers/dma/ti/k3-udma-glue.c index c9b93055dc9d3d..f0a399cf45b2a5 100644 --- a/drivers/dma/ti/k3-udma-glue.c +++ b/drivers/dma/ti/k3-udma-glue.c @@ -200,12 +200,9 @@ of_k3_udma_glue_parse_chn_by_id(struct device_node *udmax_np, struct k3_udma_glu ret = of_k3_udma_glue_parse(udmax_np, common); if (ret) - goto out_put_spec; + return ret; ret = of_k3_udma_glue_parse_chn_common(common, thread_id, tx_chn); - -out_put_spec: - of_node_put(udmax_np); return ret; } From 1b11b4ef6bd68591dcaf8423c7d05e794e6aec6f Mon Sep 17 00:00:00 2001 From: Nikita Shubin Date: Tue, 28 May 2024 09:09:23 +0300 Subject: [PATCH 0816/1578] dmaengine: ioatdma: Fix leaking on version mismatch Fix leaking ioatdma_device if I/OAT version is less than IOAT_VER_3_0. Fixes: bf453a0a18b2 ("dmaengine: ioat: Support in-use unbind") Signed-off-by: Nikita Shubin Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20240528-ioatdma-fixes-v2-1-a9f2fbe26ab1@yadro.com Signed-off-by: Vinod Koul --- drivers/dma/ioat/init.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/dma/ioat/init.c b/drivers/dma/ioat/init.c index 9c364e92cb828d..e76e507ae898ca 100644 --- a/drivers/dma/ioat/init.c +++ b/drivers/dma/ioat/init.c @@ -1350,6 +1350,7 @@ static int ioat_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) void __iomem * const *iomap; struct device *dev = &pdev->dev; struct ioatdma_device *device; + u8 version; int err; err = pcim_enable_device(pdev); @@ -1363,6 +1364,10 @@ static int ioat_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (!iomap) return -ENOMEM; + version = readb(iomap[IOAT_MMIO_BAR] + IOAT_VER_OFFSET); + if (version < IOAT_VER_3_0) + return -ENODEV; + err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); if (err) return err; @@ -1373,16 +1378,14 @@ static int ioat_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) pci_set_master(pdev); pci_set_drvdata(pdev, device); - device->version = readb(device->reg_base + IOAT_VER_OFFSET); + device->version = version; if (device->version >= IOAT_VER_3_4) ioat_dca_enabled = 0; - if (device->version >= IOAT_VER_3_0) { - if (is_skx_ioat(pdev)) - device->version = IOAT_VER_3_2; - err = ioat3_dma_probe(device, ioat_dca_enabled); - } else - return -ENODEV; + if (is_skx_ioat(pdev)) + device->version = IOAT_VER_3_2; + + err = ioat3_dma_probe(device, ioat_dca_enabled); if (err) { dev_err(dev, "Intel(R) I/OAT DMA Engine init failed\n"); return -ENODEV; From f0dc9fda2e0ee9e01496c2f5aca3a831131fad79 Mon Sep 17 00:00:00 2001 From: Nikita Shubin Date: Tue, 28 May 2024 09:09:24 +0300 Subject: [PATCH 0817/1578] dmaengine: ioatdma: Fix error path in ioat3_dma_probe() Make sure we are disabling interrupts and destroying DMA pool if pcie_capability_read/write_word() call failed. Fixes: 511deae0261c ("dmaengine: ioatdma: disable relaxed ordering for ioatdma") Signed-off-by: Nikita Shubin Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20240528-ioatdma-fixes-v2-2-a9f2fbe26ab1@yadro.com Signed-off-by: Vinod Koul --- drivers/dma/ioat/init.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/drivers/dma/ioat/init.c b/drivers/dma/ioat/init.c index e76e507ae898ca..26964b7c8cf141 100644 --- a/drivers/dma/ioat/init.c +++ b/drivers/dma/ioat/init.c @@ -534,18 +534,6 @@ static int ioat_probe(struct ioatdma_device *ioat_dma) return err; } -static int ioat_register(struct ioatdma_device *ioat_dma) -{ - int err = dma_async_device_register(&ioat_dma->dma_dev); - - if (err) { - ioat_disable_interrupts(ioat_dma); - dma_pool_destroy(ioat_dma->completion_pool); - } - - return err; -} - static void ioat_dma_remove(struct ioatdma_device *ioat_dma) { struct dma_device *dma = &ioat_dma->dma_dev; @@ -1181,9 +1169,9 @@ static int ioat3_dma_probe(struct ioatdma_device *ioat_dma, int dca) ioat_chan->reg_base + IOAT_DCACTRL_OFFSET); } - err = ioat_register(ioat_dma); + err = dma_async_device_register(&ioat_dma->dma_dev); if (err) - return err; + goto err_disable_interrupts; ioat_kobject_add(ioat_dma, &ioat_ktype); @@ -1192,20 +1180,29 @@ static int ioat3_dma_probe(struct ioatdma_device *ioat_dma, int dca) /* disable relaxed ordering */ err = pcie_capability_read_word(pdev, PCI_EXP_DEVCTL, &val16); - if (err) - return pcibios_err_to_errno(err); + if (err) { + err = pcibios_err_to_errno(err); + goto err_disable_interrupts; + } /* clear relaxed ordering enable */ val16 &= ~PCI_EXP_DEVCTL_RELAX_EN; err = pcie_capability_write_word(pdev, PCI_EXP_DEVCTL, val16); - if (err) - return pcibios_err_to_errno(err); + if (err) { + err = pcibios_err_to_errno(err); + goto err_disable_interrupts; + } if (ioat_dma->cap & IOAT_CAP_DPS) writeb(ioat_pending_level + 1, ioat_dma->reg_base + IOAT_PREFETCH_LIMIT_OFFSET); return 0; + +err_disable_interrupts: + ioat_disable_interrupts(ioat_dma); + dma_pool_destroy(ioat_dma->completion_pool); + return err; } static void ioat_shutdown(struct pci_dev *pdev) From 29b7cd255f3628e0d65be33a939d8b5bba10aa62 Mon Sep 17 00:00:00 2001 From: Nikita Shubin Date: Tue, 28 May 2024 09:09:25 +0300 Subject: [PATCH 0818/1578] dmaengine: ioatdma: Fix kmemleak in ioat_pci_probe() If probing fails we end up with leaking ioatdma_device and each allocated channel. Following kmemleak easy to reproduce by injecting an error in ioat_alloc_chan_resources() when doing ioat_dma_self_test(). unreferenced object 0xffff888014ad5800 (size 1024): [..] [] kmemleak_alloc+0x4a/0x80 [] kmalloc_trace+0x270/0x2f0 [] ioat_pci_probe+0xc1/0x1c0 [ioatdma] [..] repeated for each ioatdma channel: unreferenced object 0xffff8880148e5c00 (size 512): [..] [] kmemleak_alloc+0x4a/0x80 [] kmalloc_trace+0x270/0x2f0 [] ioat_enumerate_channels+0x101/0x2d0 [ioatdma] [] ioat3_dma_probe+0x4d6/0x970 [ioatdma] [] ioat_pci_probe+0x181/0x1c0 [ioatdma] [..] Fixes: bf453a0a18b2 ("dmaengine: ioat: Support in-use unbind") Signed-off-by: Nikita Shubin Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20240528-ioatdma-fixes-v2-3-a9f2fbe26ab1@yadro.com Signed-off-by: Vinod Koul --- drivers/dma/ioat/init.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/dma/ioat/init.c b/drivers/dma/ioat/init.c index 26964b7c8cf141..cf688b0c8444cb 100644 --- a/drivers/dma/ioat/init.c +++ b/drivers/dma/ioat/init.c @@ -1347,6 +1347,7 @@ static int ioat_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) void __iomem * const *iomap; struct device *dev = &pdev->dev; struct ioatdma_device *device; + unsigned int i; u8 version; int err; @@ -1384,6 +1385,9 @@ static int ioat_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) err = ioat3_dma_probe(device, ioat_dca_enabled); if (err) { + for (i = 0; i < IOAT_MAX_CHANS; i++) + kfree(device->idx[i]); + kfree(device); dev_err(dev, "Intel(R) I/OAT DMA Engine init failed\n"); return -ENODEV; } From fa555b5026d0bf1ba7c9e645ff75e2725a982631 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 28 May 2024 13:54:22 +0200 Subject: [PATCH 0819/1578] dmaengine: fsl-edma: avoid linking both modules Kbuild does not support having a source file compiled multiple times and linked into distinct modules, or built-in and modular at the same time. For fs-edma, there are two common components that are linked into the fsl-edma.ko for Arm and PowerPC, plus the mcf-edma.ko module on Coldfire. This violates the rule for compile-testing: scripts/Makefile.build:236: drivers/dma/Makefile: fsl-edma-common.o is added to multiple modules: fsl-edma mcf-edma scripts/Makefile.build:236: drivers/dma/Makefile: fsl-edma-trace.o is added to multiple modules: fsl-edma mcf-edma I tried splitting out the common parts into a separate modules, but that adds back the complexity that a cleanup patch removed, and it gets harder with the addition of the tracepoints. As a minimal workaround, address it at the Kconfig level, by disallowing the broken configurations. Link: https://lore.kernel.org/lkml/20240110232255.1099757-1-arnd@kernel.org/ Fixes: 66aac8ea0a6c ("dmaengine: fsl-edma: clean up EXPORT_SYMBOL_GPL in fsl-edma-common.c") Signed-off-by: Arnd Bergmann Acked-by: Peng Fan Link: https://lore.kernel.org/r/20240528115440.2965975-1-arnd@kernel.org Signed-off-by: Vinod Koul --- drivers/dma/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 002a5ec806207c..9fc99cfbef08c5 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -394,7 +394,7 @@ config LS2X_APB_DMA config MCF_EDMA tristate "Freescale eDMA engine support, ColdFire mcf5441x SoCs" - depends on M5441x || COMPILE_TEST + depends on M5441x || (COMPILE_TEST && FSL_EDMA=n) select DMA_ENGINE select DMA_VIRTUAL_CHANNELS help From 1345a13f18370ad9e5bc98995959a27f9bd71464 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 21 May 2024 10:30:02 +0200 Subject: [PATCH 0820/1578] dt-bindings: dma: fsl-edma: fix dma-channels constraints dma-channels is a number, not a list. Apply proper constraints on the actual number. Fixes: 6eb439dff645 ("dt-bindings: fsl-dma: fsl-edma: add edma3 compatible string") Cc: stable@vger.kernel.org Signed-off-by: Krzysztof Kozlowski Reviewed-by: Peng Fan Acked-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20240521083002.23262-1-krzysztof.kozlowski@linaro.org Signed-off-by: Vinod Koul --- Documentation/devicetree/bindings/dma/fsl,edma.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/dma/fsl,edma.yaml b/Documentation/devicetree/bindings/dma/fsl,edma.yaml index acfb4b2ee7a960..d54140f18d3405 100644 --- a/Documentation/devicetree/bindings/dma/fsl,edma.yaml +++ b/Documentation/devicetree/bindings/dma/fsl,edma.yaml @@ -59,8 +59,8 @@ properties: - 3 dma-channels: - minItems: 1 - maxItems: 64 + minimum: 1 + maximum: 64 clocks: minItems: 1 From 5422145d0b749ad554ada772133b9b20f9fb0ec8 Mon Sep 17 00:00:00 2001 From: Nikita Shubin Date: Tue, 14 May 2024 13:52:31 +0300 Subject: [PATCH 0821/1578] dmaengine: ioatdma: Fix missing kmem_cache_destroy() Fix missing kmem_cache_destroy() for ioat_sed_cache in ioat_exit_module(). Noticed via: ``` modprobe ioatdma rmmod ioatdma modprobe ioatdma debugfs: Directory 'ioat_sed_ent' with parent 'slab' already present! ``` Fixes: c0f28ce66ecf ("dmaengine: ioatdma: move all the init routines") Signed-off-by: Nikita Shubin Acked-by: Dave Jiang Link: https://lore.kernel.org/r/20240514-ioatdma_fixes-v1-1-2776a0913254@yadro.com Signed-off-by: Vinod Koul --- drivers/dma/ioat/init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dma/ioat/init.c b/drivers/dma/ioat/init.c index cf688b0c8444cb..e8f45a7fded435 100644 --- a/drivers/dma/ioat/init.c +++ b/drivers/dma/ioat/init.c @@ -1449,6 +1449,7 @@ module_init(ioat_init_module); static void __exit ioat_exit_module(void) { pci_unregister_driver(&ioat_pci_driver); + kmem_cache_destroy(ioat_sed_cache); kmem_cache_destroy(ioat_cache); } module_exit(ioat_exit_module); From d66e50beb91114f387bd798a371384b2a245e8cc Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Tue, 11 Jun 2024 18:53:17 +0100 Subject: [PATCH 0822/1578] KVM: arm64: FFA: Release hyp rx buffer According to the FF-A spec (Buffer states and ownership), after a producer has written into a buffer, it is "full" and now owned by the consumer. The producer won't be able to use that buffer, until the consumer hands it over with an invocation such as RX_RELEASE. It is clear in the following paragraph (Transfer of buffer ownership), that MEM_RETRIEVE_RESP is transferring the ownership from producer (in our case SPM) to consumer (hypervisor). RX_RELEASE is therefore mandatory here. It is less clear though what is happening with MEM_FRAG_TX. But this invocation, as a response to MEM_FRAG_RX writes into the same hypervisor RX buffer (see paragraph "Transmission of transaction descriptor in fragments"). Also this is matching the TF-A implementation where the RX buffer is marked "full" during a MEM_FRAG_RX. Release the RX hypervisor buffer in those two cases. This will unblock later invocations using this buffer which would otherwise fail. (RETRIEVE_REQ, MEM_FRAG_RX and PARTITION_INFO_GET). Signed-off-by: Vincent Donnefort Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/20240611175317.1220842-1-vdonnefort@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/ffa.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index 02746f9d0980f9..efb053af331cc9 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -177,6 +177,14 @@ static void ffa_retrieve_req(struct arm_smccc_res *res, u32 len) res); } +static void ffa_rx_release(struct arm_smccc_res *res) +{ + arm_smccc_1_1_smc(FFA_RX_RELEASE, + 0, 0, + 0, 0, 0, 0, 0, + res); +} + static void do_ffa_rxtx_map(struct arm_smccc_res *res, struct kvm_cpu_context *ctxt) { @@ -543,16 +551,19 @@ static void do_ffa_mem_reclaim(struct arm_smccc_res *res, if (WARN_ON(offset > len || fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE)) { ret = FFA_RET_ABORTED; + ffa_rx_release(res); goto out_unlock; } if (len > ffa_desc_buf.len) { ret = FFA_RET_NO_MEMORY; + ffa_rx_release(res); goto out_unlock; } buf = ffa_desc_buf.buf; memcpy(buf, hyp_buffers.rx, fraglen); + ffa_rx_release(res); for (fragoff = fraglen; fragoff < len; fragoff += fraglen) { ffa_mem_frag_rx(res, handle_lo, handle_hi, fragoff); @@ -563,6 +574,7 @@ static void do_ffa_mem_reclaim(struct arm_smccc_res *res, fraglen = res->a3; memcpy((void *)buf + fragoff, hyp_buffers.rx, fraglen); + ffa_rx_release(res); } ffa_mem_reclaim(res, handle_lo, handle_hi, flags); From b6846826982b9f2f2ad0e79540521b517469ee92 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 11 Jun 2024 16:51:23 +0200 Subject: [PATCH 0823/1578] thermal: gov_step_wise: Restore passive polling management Consider a thermal zone with one passive trip point, a cooling device with 3 states (0, 1, 2) bound to it, passive polling enabled (nonzero passive_delay_jiffies) and no regular polling (polling_delay_jiffies equal to 0) that is managed by the Step-Wise governor. Suppose that the initial state of the cooling device is 0 and the zone temperature is below the trip point to start with. When the trip point is crossed, tz->passive is incremented by the thermal core and the governor's .manage() callback is invoked. It sets 'throttle' to 'true' for the trip in question and get_target_state() returns 1 for the instance corresponding to the cooling device (say that 'upper' and 'lower' are set to 2 and 0 for it, respectively), so its state changes to 1. Passive polling is still active for the zone, so next time the temperature is updated, the governor's .manage() callback will be invoked again. If the temperature is still rising, it will change the state of the cooling device to 2. Now suppose that next time the zone temperature is updated, it falls below the trip point, so tz->passive is decremented for the zone (say it becomes 0 then) and the governor's .manage() callbacks runs. It finds that the temperature trend for the zone is 'falling' and 'throttle' will be set to 'false' for the trip in question, so the cooling device's state will be changed to 1. However, because tz->polling is 0 for the zone, the governor's .manage() callback may not be invoked again for a long time and the cooling device's state will not be reset back to 0. This can happen because commit 042a3d80f118 ("thermal: core: Move passive polling management to the core") removed passive polling management from the Step-Wise governor. Before that change, thermal_zone_trip_update() would bump up tz->passive when changing the target state for a thermal instance from "no target" to a specific value and it would drop tz->passive when changing it back to "no target" which would cause passive polling to be active for the zone until the governor has reset the states of all cooling devices. In particular, in the example above tz->passive would be incremented when changing the state of the cooling device from 0 to 1 and then it would be still nonzero when the state of the cooling device was changed from 2 to 1. To prevent this problem from occurring, restore the passive polling management in the Step-Wise governor by partially reverting the commit in question and update the comment in the restored code to explain its role more clearly. Fixes: 042a3d80f118 ("thermal: core: Move passive polling management to the core") Closes: https://lore.kernel.org/linux-pm/ZmVfcEOxmjUHZTSX@hovoldconsulting.com Reported-by: Johan Hovold Tested-by: Johan Hovold Signed-off-by: Rafael J. Wysocki --- drivers/thermal/gov_step_wise.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/thermal/gov_step_wise.c b/drivers/thermal/gov_step_wise.c index e0fdc497bfcce1..65974fe8be0db5 100644 --- a/drivers/thermal/gov_step_wise.c +++ b/drivers/thermal/gov_step_wise.c @@ -93,6 +93,23 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz, if (instance->initialized && old_target == instance->target) continue; + if (trip->type == THERMAL_TRIP_PASSIVE) { + /* + * If the target state for this thermal instance + * changes from THERMAL_NO_TARGET to something else, + * ensure that the zone temperature will be updated + * (assuming enabled passive cooling) until it becomes + * THERMAL_NO_TARGET again, or the cooling device may + * not be reset to its initial state. + */ + if (old_target == THERMAL_NO_TARGET && + instance->target != THERMAL_NO_TARGET) + tz->passive++; + else if (old_target != THERMAL_NO_TARGET && + instance->target == THERMAL_NO_TARGET) + tz->passive--; + } + instance->initialized = true; mutex_lock(&instance->cdev->lock); From 442b15a2d7a3f01534cb80585b84d7b60e4e2219 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Fri, 31 May 2024 18:45:33 -0700 Subject: [PATCH 0824/1578] selftests/openat2: fix clang build failures: -static-libasan, LOCAL_HDRS When building with clang via: make LLVM=1 -C tools/testing/selftests two distinct failures occur: 1) gcc requires -static-libasan in order to ensure that Address Sanitizer's library is the first one loaded. However, this leads to build failures on clang, when building via: make LLVM=1 -C tools/testing/selftests However, clang already does the right thing by default: it statically links the Address Sanitizer if -fsanitize is specified. Therefore, fix this by simply omitting -static-libasan for clang builds. And leave behind a comment, because the whole reason for static linking might not be obvious. 2) clang won't accept invocations of this form, but gcc will: $(CC) file1.c header2.h Fix this by using selftests/lib.mk facilities for tracking local header file dependencies: add them to LOCAL_HDRS, leaving only the .c files to be passed to the compiler. Reviewed-by: Ryan Roberts Signed-off-by: John Hubbard Reviewed-by: Nathan Chancellor Reviewed-by: Nathan Chancellor Signed-off-by: Shuah Khan --- tools/testing/selftests/openat2/Makefile | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/openat2/Makefile b/tools/testing/selftests/openat2/Makefile index 254d676a268983..185dc76ebb5fc4 100644 --- a/tools/testing/selftests/openat2/Makefile +++ b/tools/testing/selftests/openat2/Makefile @@ -1,8 +1,18 @@ # SPDX-License-Identifier: GPL-2.0-or-later -CFLAGS += -Wall -O2 -g -fsanitize=address -fsanitize=undefined -static-libasan +CFLAGS += -Wall -O2 -g -fsanitize=address -fsanitize=undefined TEST_GEN_PROGS := openat2_test resolve_test rename_attack_test +# gcc requires -static-libasan in order to ensure that Address Sanitizer's +# library is the first one loaded. However, clang already statically links the +# Address Sanitizer if -fsanitize is specified. Therefore, simply omit +# -static-libasan for clang builds. +ifeq ($(LLVM),) + CFLAGS += -static-libasan +endif + +LOCAL_HDRS += helpers.h + include ../lib.mk -$(TEST_GEN_PROGS): helpers.c helpers.h +$(TEST_GEN_PROGS): helpers.c From ed3994ac847e0d6605f248e7f6776b1d4f445f4b Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Fri, 31 May 2024 18:45:34 -0700 Subject: [PATCH 0825/1578] selftests/fchmodat2: fix clang build failure due to -static-libasan gcc requires -static-libasan in order to ensure that Address Sanitizer's library is the first one loaded. However, this leads to build failures on clang, when building via: make LLVM=1 -C tools/testing/selftests However, clang already does the right thing by default: it statically links the Address Sanitizer if -fsanitize is specified. Therefore, simply omit -static-libasan for clang builds. And leave behind a comment, because the whole reason for static linking might not be obvious. Cc: Ryan Roberts Signed-off-by: John Hubbard Reviewed-by: Nathan Chancellor Signed-off-by: Shuah Khan --- tools/testing/selftests/fchmodat2/Makefile | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/fchmodat2/Makefile b/tools/testing/selftests/fchmodat2/Makefile index 71ec34bf1501e2..4373cea79b7941 100644 --- a/tools/testing/selftests/fchmodat2/Makefile +++ b/tools/testing/selftests/fchmodat2/Makefile @@ -1,6 +1,15 @@ # SPDX-License-Identifier: GPL-2.0-or-later -CFLAGS += -Wall -O2 -g -fsanitize=address -fsanitize=undefined -static-libasan $(KHDR_INCLUDES) +CFLAGS += -Wall -O2 -g -fsanitize=address -fsanitize=undefined $(KHDR_INCLUDES) + +# gcc requires -static-libasan in order to ensure that Address Sanitizer's +# library is the first one loaded. However, clang already statically links the +# Address Sanitizer if -fsanitize is specified. Therefore, simply omit +# -static-libasan for clang builds. +ifeq ($(LLVM),) + CFLAGS += -static-libasan +endif + TEST_GEN_PROGS := fchmodat2_test include ../lib.mk From 017ed5e70c88bf2f1e0952bab6f53a53a028e561 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 18 May 2024 00:26:17 +0100 Subject: [PATCH 0826/1578] drm/nouveau: remove unused struct 'init_exec' 'init_exec' is unused since commit cb75d97e9c77 ("drm/nouveau: implement devinit subdev, and new init table parser") Remove it. Signed-off-by: Dr. David Alan Gilbert Acked-by: Danilo Krummrich Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240517232617.230767-1-linux@treblig.org --- drivers/gpu/drm/nouveau/nouveau_bios.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_bios.c b/drivers/gpu/drm/nouveau/nouveau_bios.c index 79cfab53f80e25..8c3c1f1e01c585 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bios.c +++ b/drivers/gpu/drm/nouveau/nouveau_bios.c @@ -43,11 +43,6 @@ #define BIOSLOG(sip, fmt, arg...) NV_DEBUG(sip->dev, fmt, ##arg) #define LOG_OLD_VALUE(x) -struct init_exec { - bool execute; - bool repeat; -}; - static bool nv_cksum(const uint8_t *data, unsigned int length) { /* From ab99a87542f194f28e2364a42afbf9fb48b1c724 Mon Sep 17 00:00:00 2001 From: Ofir Gal Date: Fri, 7 Jun 2024 10:27:44 +0300 Subject: [PATCH 0827/1578] md/md-bitmap: fix writing non bitmap pages __write_sb_page() rounds up the io size to the optimal io size if it doesn't exceed the data offset, but it doesn't check the final size exceeds the bitmap length. For example: page count - 1 page size - 4K data offset - 1M optimal io size - 256K The final io size would be 256K (64 pages) but md_bitmap_storage_alloc() allocated 1 page, the IO would write 1 valid page and 63 pages that happens to be allocated afterwards. This leaks memory to the raid device superblock. This issue caused a data transfer failure in nvme-tcp. The network drivers checks the first page of an IO with sendpage_ok(), it returns true if the page isn't a slabpage and refcount >= 1. If the page !sendpage_ok() the network driver disables MSG_SPLICE_PAGES. As of now the network layer assumes all the pages of the IO are sendpage_ok() when MSG_SPLICE_PAGES is on. The bitmap pages aren't slab pages, the first page of the IO is sendpage_ok(), but the additional pages that happens to be allocated after the bitmap pages might be !sendpage_ok(). That cause skb_splice_from_iter() to stop the data transfer, in the case below it hangs 'mdadm --create'. The bug is reproducible, in order to reproduce we need nvme-over-tcp controllers with optimal IO size bigger than PAGE_SIZE. Creating a raid with bitmap over those devices reproduces the bug. In order to simulate large optimal IO size you can use dm-stripe with a single device. Script to reproduce the issue on top of brd devices using dm-stripe is attached below (will be added to blktest). I have added some logs to test the theory: ... md: created bitmap (1 pages) for device md127 __write_sb_page before md_super_write offset: 16, size: 262144. pfn: 0x53ee === __write_sb_page before md_super_write. logging pages === pfn: 0x53ee, slab: 0 <-- the only page that allocated for the bitmap pfn: 0x53ef, slab: 1 pfn: 0x53f0, slab: 0 pfn: 0x53f1, slab: 0 pfn: 0x53f2, slab: 0 pfn: 0x53f3, slab: 1 ... nvme_tcp: sendpage_ok - pfn: 0x53ee, len: 262144, offset: 0 skbuff: before sendpage_ok() - pfn: 0x53ee skbuff: before sendpage_ok() - pfn: 0x53ef WARNING at net/core/skbuff.c:6848 skb_splice_from_iter+0x142/0x450 skbuff: !sendpage_ok - pfn: 0x53ef. is_slab: 1, page_count: 1 ... Cc: stable@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Ofir Gal Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240607072748.3182199-1-ofir.gal@volumez.com --- drivers/md/md-bitmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 0a2d37eb38ef92..08232d8dc815ee 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -227,6 +227,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, struct block_device *bdev; struct mddev *mddev = bitmap->mddev; struct bitmap_storage *store = &bitmap->storage; + unsigned int bitmap_limit = (bitmap->storage.file_pages - pg_index) << + PAGE_SHIFT; loff_t sboff, offset = mddev->bitmap_info.offset; sector_t ps = pg_index * PAGE_SIZE / SECTOR_SIZE; unsigned int size = PAGE_SIZE; @@ -269,11 +271,9 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, if (size == 0) /* bitmap runs in to data */ return -EINVAL; - } else { - /* DATA METADATA BITMAP - no problems */ } - md_super_write(mddev, rdev, sboff + ps, (int) size, page); + md_super_write(mddev, rdev, sboff + ps, (int)min(size, bitmap_limit), page); return 0; } From a126eca844353360ebafa9088d22865cb8e022e3 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sun, 19 May 2024 23:07:35 +0200 Subject: [PATCH 0828/1578] rust: avoid unused import warning in `rusttest` When compiling for the `rusttest` target, the `core::ptr` import is unused since its only use happens in the `reserve()` method which is not compiled in that target: warning: unused import: `core::ptr` --> rust/kernel/alloc/vec_ext.rs:7:5 | 7 | use core::ptr; | ^^^^^^^^^ | = note: `#[warn(unused_imports)]` on by default Thus clean it. Fixes: 97ab3e8eec0c ("rust: alloc: fix dangling pointer in VecExt::reserve()") Reviewed-by: Alice Ryhl Reviewed-by: Danilo Krummrich Link: https://lore.kernel.org/r/20240519210735.587323-1-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- rust/kernel/alloc/vec_ext.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/rust/kernel/alloc/vec_ext.rs b/rust/kernel/alloc/vec_ext.rs index e9a81052728a4e..1297a4be32e8c4 100644 --- a/rust/kernel/alloc/vec_ext.rs +++ b/rust/kernel/alloc/vec_ext.rs @@ -4,7 +4,6 @@ use super::{AllocError, Flags}; use alloc::vec::Vec; -use core::ptr; /// Extensions to [`Vec`]. pub trait VecExt: Sized { @@ -141,7 +140,11 @@ impl VecExt for Vec { // `krealloc_aligned`. A `Vec`'s `ptr` value is not guaranteed to be NULL and might be // dangling after being created with `Vec::new`. Instead, we can rely on `Vec`'s capacity // to be zero if no memory has been allocated yet. - let ptr = if cap == 0 { ptr::null_mut() } else { old_ptr }; + let ptr = if cap == 0 { + core::ptr::null_mut() + } else { + old_ptr + }; // SAFETY: `ptr` is valid because it's either NULL or comes from a previous call to // `krealloc_aligned`. We also verified that the type is not a ZST. From f2736b9c791a126ecb9cfc1aef1c7b4152b66e2d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 11 Jun 2024 18:59:08 -0400 Subject: [PATCH 0829/1578] bcachefs: Fix rcu_read_lock() leak in drop_extra_replicas Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 0d807c2ce9c67a..1a0072eef109b5 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -202,9 +202,8 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i)); /* Now, drop excess replicas: */ -restart_drop_extra_replicas: - rcu_read_lock(); +restart_drop_extra_replicas: bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) { unsigned ptr_durability = bch2_extent_ptr_durability(c, &p); From 8c860ed825cb85f6672cd7b10a8f33e3498a7c81 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 10 Jun 2024 14:02:27 -0700 Subject: [PATCH 0830/1578] x86/uaccess: Fix missed zeroing of ia32 u64 get_user() range checking When reworking the range checking for get_user(), the get_user_8() case on 32-bit wasn't zeroing the high register. (The jump to bad_get_user_8 was accidentally dropped.) Restore the correct error handling destination (and rename the jump to using the expected ".L" prefix). While here, switch to using a named argument ("size") for the call template ("%c4" to "%c[size]") as already used in the other call templates in this file. Found after moving the usercopy selftests to KUnit: # usercopy_test_invalid: EXPECTATION FAILED at lib/usercopy_kunit.c:278 Expected val_u64 == 0, but val_u64 == -60129542144 (0xfffffff200000000) Closes: https://lore.kernel.org/all/CABVgOSn=tb=Lj9SxHuT4_9MTjjKVxsq-ikdXC4kGHO4CfKVmGQ@mail.gmail.com Fixes: b19b74bc99b1 ("x86/mm: Rework address range check in get_user() and put_user()") Reported-by: David Gow Signed-off-by: Kees Cook Signed-off-by: Dave Hansen Reviewed-by: Kirill A. Shutemov Reviewed-by: Qiuxu Zhuo Tested-by: David Gow Link: https://lore.kernel.org/all/20240610210213.work.143-kees%40kernel.org --- arch/x86/include/asm/uaccess.h | 4 ++-- arch/x86/lib/getuser.S | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 0f9bab92a43d76..3a7755c1a44102 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -78,10 +78,10 @@ extern int __get_user_bad(void); int __ret_gu; \ register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \ __chk_user_ptr(ptr); \ - asm volatile("call __" #fn "_%c4" \ + asm volatile("call __" #fn "_%c[size]" \ : "=a" (__ret_gu), "=r" (__val_gu), \ ASM_CALL_CONSTRAINT \ - : "0" (ptr), "i" (sizeof(*(ptr)))); \ + : "0" (ptr), [size] "i" (sizeof(*(ptr)))); \ instrument_get_user(__val_gu); \ (x) = (__force __typeof__(*(ptr))) __val_gu; \ __builtin_expect(__ret_gu, 0); \ diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index 10d5ed8b5990f4..a1cb3a4e6742df 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -44,7 +44,11 @@ or %rdx, %rax .else cmp $TASK_SIZE_MAX-\size+1, %eax +.if \size != 8 jae .Lbad_get_user +.else + jae .Lbad_get_user_8 +.endif sbb %edx, %edx /* array_index_mask_nospec() */ and %edx, %eax .endif @@ -154,7 +158,7 @@ SYM_CODE_END(__get_user_handle_exception) #ifdef CONFIG_X86_32 SYM_CODE_START_LOCAL(__get_user_8_handle_exception) ASM_CLAC -bad_get_user_8: +.Lbad_get_user_8: xor %edx,%edx xor %ecx,%ecx mov $(-EFAULT),%_ASM_AX From 3572bd5689b0812b161b40279e39ca5b66d73e88 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 11 Jun 2024 22:30:37 +0900 Subject: [PATCH 0831/1578] tracing: Build event generation tests only as modules The kprobes and synth event generation test modules add events and lock (get a reference) those event file reference in module init function, and unlock and delete it in module exit function. This is because those are designed for playing as modules. If we make those modules as built-in, those events are left locked in the kernel, and never be removed. This causes kprobe event self-test failure as below. [ 97.349708] ------------[ cut here ]------------ [ 97.353453] WARNING: CPU: 3 PID: 1 at kernel/trace/trace_kprobe.c:2133 kprobe_trace_self_tests_init+0x3f1/0x480 [ 97.357106] Modules linked in: [ 97.358488] CPU: 3 PID: 1 Comm: swapper/0 Not tainted 6.9.0-g699646734ab5-dirty #14 [ 97.361556] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 [ 97.363880] RIP: 0010:kprobe_trace_self_tests_init+0x3f1/0x480 [ 97.365538] Code: a8 24 08 82 e9 ae fd ff ff 90 0f 0b 90 48 c7 c7 e5 aa 0b 82 e9 ee fc ff ff 90 0f 0b 90 48 c7 c7 2d 61 06 82 e9 8e fd ff ff 90 <0f> 0b 90 48 c7 c7 33 0b 0c 82 89 c6 e8 6e 03 1f ff 41 ff c7 e9 90 [ 97.370429] RSP: 0000:ffffc90000013b50 EFLAGS: 00010286 [ 97.371852] RAX: 00000000fffffff0 RBX: ffff888005919c00 RCX: 0000000000000000 [ 97.373829] RDX: ffff888003f40000 RSI: ffffffff8236a598 RDI: ffff888003f40a68 [ 97.375715] RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000 [ 97.377675] R10: ffffffff811c9ae5 R11: ffffffff8120c4e0 R12: 0000000000000000 [ 97.379591] R13: 0000000000000001 R14: 0000000000000015 R15: 0000000000000000 [ 97.381536] FS: 0000000000000000(0000) GS:ffff88807dcc0000(0000) knlGS:0000000000000000 [ 97.383813] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 97.385449] CR2: 0000000000000000 CR3: 0000000002244000 CR4: 00000000000006b0 [ 97.387347] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 97.389277] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 97.391196] Call Trace: [ 97.391967] [ 97.392647] ? __warn+0xcc/0x180 [ 97.393640] ? kprobe_trace_self_tests_init+0x3f1/0x480 [ 97.395181] ? report_bug+0xbd/0x150 [ 97.396234] ? handle_bug+0x3e/0x60 [ 97.397311] ? exc_invalid_op+0x1a/0x50 [ 97.398434] ? asm_exc_invalid_op+0x1a/0x20 [ 97.399652] ? trace_kprobe_is_busy+0x20/0x20 [ 97.400904] ? tracing_reset_all_online_cpus+0x15/0x90 [ 97.402304] ? kprobe_trace_self_tests_init+0x3f1/0x480 [ 97.403773] ? init_kprobe_trace+0x50/0x50 [ 97.404972] do_one_initcall+0x112/0x240 [ 97.406113] do_initcall_level+0x95/0xb0 [ 97.407286] ? kernel_init+0x1a/0x1a0 [ 97.408401] do_initcalls+0x3f/0x70 [ 97.409452] kernel_init_freeable+0x16f/0x1e0 [ 97.410662] ? rest_init+0x1f0/0x1f0 [ 97.411738] kernel_init+0x1a/0x1a0 [ 97.412788] ret_from_fork+0x39/0x50 [ 97.413817] ? rest_init+0x1f0/0x1f0 [ 97.414844] ret_from_fork_asm+0x11/0x20 [ 97.416285] [ 97.417134] irq event stamp: 13437323 [ 97.418376] hardirqs last enabled at (13437337): [] console_unlock+0x11c/0x150 [ 97.421285] hardirqs last disabled at (13437370): [] console_unlock+0x101/0x150 [ 97.423838] softirqs last enabled at (13437366): [] handle_softirqs+0x23f/0x2a0 [ 97.426450] softirqs last disabled at (13437393): [] __irq_exit_rcu+0x66/0xd0 [ 97.428850] ---[ end trace 0000000000000000 ]--- And also, since we can not cleanup dynamic_event file, ftracetest are failed too. To avoid these issues, build these tests only as modules. Link: https://lore.kernel.org/all/171811263754.85078.5877446624311852525.stgit@devnote2/ Fixes: 9fe41efaca08 ("tracing: Add synth event generation test module") Fixes: 64836248dda2 ("tracing: Add kprobe event command generation test module") Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) --- kernel/trace/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 166ad5444eeae6..721c3b221048a9 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1136,7 +1136,7 @@ config PREEMPTIRQ_DELAY_TEST config SYNTH_EVENT_GEN_TEST tristate "Test module for in-kernel synthetic event generation" - depends on SYNTH_EVENTS + depends on SYNTH_EVENTS && m help This option creates a test module to check the base functionality of in-kernel synthetic event definition and @@ -1149,7 +1149,7 @@ config SYNTH_EVENT_GEN_TEST config KPROBE_EVENT_GEN_TEST tristate "Test module for in-kernel kprobe event generation" - depends on KPROBE_EVENTS + depends on KPROBE_EVENTS && m help This option creates a test module to check the base functionality of in-kernel kprobe event definition. From 72d95924ee35c8cd16ef52f912483ee938a34d49 Mon Sep 17 00:00:00 2001 From: John David Anglin Date: Mon, 10 Jun 2024 18:47:07 +0000 Subject: [PATCH 0832/1578] parisc: Try to fix random segmentation faults in package builds PA-RISC systems with PA8800 and PA8900 processors have had problems with random segmentation faults for many years. Systems with earlier processors are much more stable. Systems with PA8800 and PA8900 processors have a large L2 cache which needs per page flushing for decent performance when a large range is flushed. The combined cache in these systems is also more sensitive to non-equivalent aliases than the caches in earlier systems. The majority of random segmentation faults that I have looked at appear to be memory corruption in memory allocated using mmap and malloc. My first attempt at fixing the random faults didn't work. On reviewing the cache code, I realized that there were two issues which the existing code didn't handle correctly. Both relate to cache move-in. Another issue is that the present bit in PTEs is racy. 1) PA-RISC caches have a mind of their own and they can speculatively load data and instructions for a page as long as there is a entry in the TLB for the page which allows move-in. TLBs are local to each CPU. Thus, the TLB entry for a page must be purged before flushing the page. This is particularly important on SMP systems. In some of the flush routines, the flush routine would be called and then the TLB entry would be purged. This was because the flush routine needed the TLB entry to do the flush. 2) My initial approach to trying the fix the random faults was to try and use flush_cache_page_if_present for all flush operations. This actually made things worse and led to a couple of hardware lockups. It finally dawned on me that some lines weren't being flushed because the pte check code was racy. This resulted in random inequivalent mappings to physical pages. The __flush_cache_page tmpalias flush sets up its own TLB entry and it doesn't need the existing TLB entry. As long as we can find the pte pointer for the vm page, we can get the pfn and physical address of the page. We can also purge the TLB entry for the page before doing the flush. Further, __flush_cache_page uses a special TLB entry that inhibits cache move-in. When switching page mappings, we need to ensure that lines are removed from the cache. It is not sufficient to just flush the lines to memory as they may come back. This made it clear that we needed to implement all the required flush operations using tmpalias routines. This includes flushes for user and kernel pages. After modifying the code to use tmpalias flushes, it became clear that the random segmentation faults were not fully resolved. The frequency of faults was worse on systems with a 64 MB L2 (PA8900) and systems with more CPUs (rp4440). The warning that I added to flush_cache_page_if_present to detect pages that couldn't be flushed triggered frequently on some systems. Helge and I looked at the pages that couldn't be flushed and found that the PTE was either cleared or for a swap page. Ignoring pages that were swapped out seemed okay but pages with cleared PTEs seemed problematic. I looked at routines related to pte_clear and noticed ptep_clear_flush. The default implementation just flushes the TLB entry. However, it was obvious that on parisc we need to flush the cache page as well. If we don't flush the cache page, stale lines will be left in the cache and cause random corruption. Once a PTE is cleared, there is no way to find the physical address associated with the PTE and flush the associated page at a later time. I implemented an updated change with a parisc specific version of ptep_clear_flush. It fixed the random data corruption on Helge's rp4440 and rp3440, as well as on my c8000. At this point, I realized that I could restore the code where we only flush in flush_cache_page_if_present if the page has been accessed. However, for this, we also need to flush the cache when the accessed bit is cleared in ptep_clear_flush_young to keep things synchronized. The default implementation only flushes the TLB entry. Other changes in this version are: 1) Implement parisc specific version of ptep_get. It's identical to default but needed in arch/parisc/include/asm/pgtable.h. 2) Revise parisc implementation of ptep_test_and_clear_young to use ptep_get (READ_ONCE). 3) Drop parisc implementation of ptep_get_and_clear. We can use default. 4) Revise flush_kernel_vmap_range and invalidate_kernel_vmap_range to use full data cache flush. 5) Move flush_cache_vmap and flush_cache_vunmap to cache.c. Handle VM_IOREMAP case in flush_cache_vmap. At this time, I don't know whether it is better to always flush when the PTE present bit is set or when both the accessed and present bits are set. The later saves flushing pages that haven't been accessed, but we need to flush in ptep_clear_flush_young. It also needs a page table lookup to find the PTE pointer. The lpa instruction only needs a page table lookup when the PTE entry isn't in the TLB. We don't atomically handle setting and clearing the _PAGE_ACCESSED bit. If we miss an update, we may miss a flush and the cache may get corrupted. Whether the current code is effectively atomic depends on process control. When CONFIG_FLUSH_PAGE_ACCESSED is set to zero, the page will eventually be flushed when the PTE is cleared or in flush_cache_page_if_present. The _PAGE_ACCESSED bit is not used, so the problem is avoided. The flush method can be selected using the CONFIG_FLUSH_PAGE_ACCESSED define in cache.c. The default is 0. I didn't see a large difference in performance. Signed-off-by: John David Anglin Cc: # v6.6+ Signed-off-by: Helge Deller --- arch/parisc/include/asm/cacheflush.h | 15 +- arch/parisc/include/asm/pgtable.h | 27 +- arch/parisc/kernel/cache.c | 413 +++++++++++++++++---------- 3 files changed, 275 insertions(+), 180 deletions(-) diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index ba4c05bc24d690..8394718870e1a2 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -31,18 +31,17 @@ void flush_cache_all_local(void); void flush_cache_all(void); void flush_cache_mm(struct mm_struct *mm); -void flush_kernel_dcache_page_addr(const void *addr); - #define flush_kernel_dcache_range(start,size) \ flush_kernel_dcache_range_asm((start), (start)+(size)); +/* The only way to flush a vmap range is to flush whole cache */ #define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 void flush_kernel_vmap_range(void *vaddr, int size); void invalidate_kernel_vmap_range(void *vaddr, int size); -#define flush_cache_vmap(start, end) flush_cache_all() +void flush_cache_vmap(unsigned long start, unsigned long end); #define flush_cache_vmap_early(start, end) do { } while (0) -#define flush_cache_vunmap(start, end) flush_cache_all() +void flush_cache_vunmap(unsigned long start, unsigned long end); void flush_dcache_folio(struct folio *folio); #define flush_dcache_folio flush_dcache_folio @@ -77,17 +76,11 @@ void flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); -/* defined in pacache.S exported in cache.c used by flush_anon_page */ -void flush_dcache_page_asm(unsigned long phys_addr, unsigned long vaddr); - #define ARCH_HAS_FLUSH_ANON_PAGE void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr); #define ARCH_HAS_FLUSH_ON_KUNMAP -static inline void kunmap_flush_on_unmap(const void *addr) -{ - flush_kernel_dcache_page_addr(addr); -} +void kunmap_flush_on_unmap(const void *addr); #endif /* _PARISC_CACHEFLUSH_H */ diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 974accac05cd34..babf65751e8180 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -448,14 +448,17 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) return pte; } +static inline pte_t ptep_get(pte_t *ptep) +{ + return READ_ONCE(*ptep); +} +#define ptep_get ptep_get + static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte; - if (!pte_young(*ptep)) - return 0; - - pte = *ptep; + pte = ptep_get(ptep); if (!pte_young(pte)) { return 0; } @@ -463,17 +466,10 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned return 1; } -struct mm_struct; -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) -{ - pte_t old_pte; - - old_pte = *ptep; - set_pte(ptep, __pte(0)); - - return old_pte; -} +int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); +pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); +struct mm_struct; static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { set_pte(ptep, pte_wrprotect(*ptep)); @@ -511,7 +507,8 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH #define __HAVE_ARCH_PTEP_SET_WRPROTECT #define __HAVE_ARCH_PTE_SAME diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index 422f3e1e6d9cad..483bfafd930cde 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -31,20 +32,31 @@ #include #include +#define PTR_PAGE_ALIGN_DOWN(addr) PTR_ALIGN_DOWN(addr, PAGE_SIZE) + +/* + * When nonzero, use _PAGE_ACCESSED bit to try to reduce the number + * of page flushes done flush_cache_page_if_present. There are some + * pros and cons in using this option. It may increase the risk of + * random segmentation faults. + */ +#define CONFIG_FLUSH_PAGE_ACCESSED 0 + int split_tlb __ro_after_init; int dcache_stride __ro_after_init; int icache_stride __ro_after_init; EXPORT_SYMBOL(dcache_stride); +/* Internal implementation in arch/parisc/kernel/pacache.S */ void flush_dcache_page_asm(unsigned long phys_addr, unsigned long vaddr); EXPORT_SYMBOL(flush_dcache_page_asm); void purge_dcache_page_asm(unsigned long phys_addr, unsigned long vaddr); void flush_icache_page_asm(unsigned long phys_addr, unsigned long vaddr); - -/* Internal implementation in arch/parisc/kernel/pacache.S */ void flush_data_cache_local(void *); /* flushes local data-cache only */ void flush_instruction_cache_local(void); /* flushes local code-cache only */ +static void flush_kernel_dcache_page_addr(const void *addr); + /* On some machines (i.e., ones with the Merced bus), there can be * only a single PxTLB broadcast at a time; this must be guaranteed * by software. We need a spinlock around all TLB flushes to ensure @@ -321,6 +333,18 @@ __flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, { if (!static_branch_likely(&parisc_has_cache)) return; + + /* + * The TLB is the engine of coherence on parisc. The CPU is + * entitled to speculate any page with a TLB mapping, so here + * we kill the mapping then flush the page along a special flush + * only alias mapping. This guarantees that the page is no-longer + * in the cache for any process and nor may it be speculatively + * read in (until the user or kernel specifically accesses it, + * of course). + */ + flush_tlb_page(vma, vmaddr); + preempt_disable(); flush_dcache_page_asm(physaddr, vmaddr); if (vma->vm_flags & VM_EXEC) @@ -328,46 +352,44 @@ __flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, preempt_enable(); } -static void flush_user_cache_page(struct vm_area_struct *vma, unsigned long vmaddr) +static void flush_kernel_dcache_page_addr(const void *addr) { - unsigned long flags, space, pgd, prot; -#ifdef CONFIG_TLB_PTLOCK - unsigned long pgd_lock; -#endif + unsigned long vaddr = (unsigned long)addr; + unsigned long flags; - vmaddr &= PAGE_MASK; + /* Purge TLB entry to remove translation on all CPUs */ + purge_tlb_start(flags); + pdtlb(SR_KERNEL, addr); + purge_tlb_end(flags); + /* Use tmpalias flush to prevent data cache move-in */ preempt_disable(); + flush_dcache_page_asm(__pa(vaddr), vaddr); + preempt_enable(); +} - /* Set context for flush */ - local_irq_save(flags); - prot = mfctl(8); - space = mfsp(SR_USER); - pgd = mfctl(25); -#ifdef CONFIG_TLB_PTLOCK - pgd_lock = mfctl(28); -#endif - switch_mm_irqs_off(NULL, vma->vm_mm, NULL); - local_irq_restore(flags); - - flush_user_dcache_range_asm(vmaddr, vmaddr + PAGE_SIZE); - if (vma->vm_flags & VM_EXEC) - flush_user_icache_range_asm(vmaddr, vmaddr + PAGE_SIZE); - flush_tlb_page(vma, vmaddr); +static void flush_kernel_icache_page_addr(const void *addr) +{ + unsigned long vaddr = (unsigned long)addr; + unsigned long flags; - /* Restore previous context */ - local_irq_save(flags); -#ifdef CONFIG_TLB_PTLOCK - mtctl(pgd_lock, 28); -#endif - mtctl(pgd, 25); - mtsp(space, SR_USER); - mtctl(prot, 8); - local_irq_restore(flags); + /* Purge TLB entry to remove translation on all CPUs */ + purge_tlb_start(flags); + pdtlb(SR_KERNEL, addr); + purge_tlb_end(flags); + /* Use tmpalias flush to prevent instruction cache move-in */ + preempt_disable(); + flush_icache_page_asm(__pa(vaddr), vaddr); preempt_enable(); } +void kunmap_flush_on_unmap(const void *addr) +{ + flush_kernel_dcache_page_addr(addr); +} +EXPORT_SYMBOL(kunmap_flush_on_unmap); + void flush_icache_pages(struct vm_area_struct *vma, struct page *page, unsigned int nr) { @@ -375,13 +397,16 @@ void flush_icache_pages(struct vm_area_struct *vma, struct page *page, for (;;) { flush_kernel_dcache_page_addr(kaddr); - flush_kernel_icache_page(kaddr); + flush_kernel_icache_page_addr(kaddr); if (--nr == 0) break; kaddr += PAGE_SIZE; } } +/* + * Walk page directory for MM to find PTEP pointer for address ADDR. + */ static inline pte_t *get_ptep(struct mm_struct *mm, unsigned long addr) { pte_t *ptep = NULL; @@ -410,6 +435,41 @@ static inline bool pte_needs_flush(pte_t pte) == (_PAGE_PRESENT | _PAGE_ACCESSED); } +/* + * Return user physical address. Returns 0 if page is not present. + */ +static inline unsigned long get_upa(struct mm_struct *mm, unsigned long addr) +{ + unsigned long flags, space, pgd, prot, pa; +#ifdef CONFIG_TLB_PTLOCK + unsigned long pgd_lock; +#endif + + /* Save context */ + local_irq_save(flags); + prot = mfctl(8); + space = mfsp(SR_USER); + pgd = mfctl(25); +#ifdef CONFIG_TLB_PTLOCK + pgd_lock = mfctl(28); +#endif + + /* Set context for lpa_user */ + switch_mm_irqs_off(NULL, mm, NULL); + pa = lpa_user(addr); + + /* Restore previous context */ +#ifdef CONFIG_TLB_PTLOCK + mtctl(pgd_lock, 28); +#endif + mtctl(pgd, 25); + mtsp(space, SR_USER); + mtctl(prot, 8); + local_irq_restore(flags); + + return pa; +} + void flush_dcache_folio(struct folio *folio) { struct address_space *mapping = folio_flush_mapping(folio); @@ -458,50 +518,23 @@ void flush_dcache_folio(struct folio *folio) if (addr + nr * PAGE_SIZE > vma->vm_end) nr = (vma->vm_end - addr) / PAGE_SIZE; - if (parisc_requires_coherency()) { - for (i = 0; i < nr; i++) { - pte_t *ptep = get_ptep(vma->vm_mm, - addr + i * PAGE_SIZE); - if (!ptep) - continue; - if (pte_needs_flush(*ptep)) - flush_user_cache_page(vma, - addr + i * PAGE_SIZE); - /* Optimise accesses to the same table? */ - pte_unmap(ptep); - } - } else { + if (old_addr == 0 || (old_addr & (SHM_COLOUR - 1)) + != (addr & (SHM_COLOUR - 1))) { + for (i = 0; i < nr; i++) + __flush_cache_page(vma, + addr + i * PAGE_SIZE, + (pfn + i) * PAGE_SIZE); /* - * The TLB is the engine of coherence on parisc: - * The CPU is entitled to speculate any page - * with a TLB mapping, so here we kill the - * mapping then flush the page along a special - * flush only alias mapping. This guarantees that - * the page is no-longer in the cache for any - * process and nor may it be speculatively read - * in (until the user or kernel specifically - * accesses it, of course) + * Software is allowed to have any number + * of private mappings to a page. */ - for (i = 0; i < nr; i++) - flush_tlb_page(vma, addr + i * PAGE_SIZE); - if (old_addr == 0 || (old_addr & (SHM_COLOUR - 1)) - != (addr & (SHM_COLOUR - 1))) { - for (i = 0; i < nr; i++) - __flush_cache_page(vma, - addr + i * PAGE_SIZE, - (pfn + i) * PAGE_SIZE); - /* - * Software is allowed to have any number - * of private mappings to a page. - */ - if (!(vma->vm_flags & VM_SHARED)) - continue; - if (old_addr) - pr_err("INEQUIVALENT ALIASES 0x%lx and 0x%lx in file %pD\n", - old_addr, addr, vma->vm_file); - if (nr == folio_nr_pages(folio)) - old_addr = addr; - } + if (!(vma->vm_flags & VM_SHARED)) + continue; + if (old_addr) + pr_err("INEQUIVALENT ALIASES 0x%lx and 0x%lx in file %pD\n", + old_addr, addr, vma->vm_file); + if (nr == folio_nr_pages(folio)) + old_addr = addr; } WARN_ON(++count == 4096); } @@ -591,35 +624,28 @@ extern void purge_kernel_dcache_page_asm(unsigned long); extern void clear_user_page_asm(void *, unsigned long); extern void copy_user_page_asm(void *, void *, unsigned long); -void flush_kernel_dcache_page_addr(const void *addr) -{ - unsigned long flags; - - flush_kernel_dcache_page_asm(addr); - purge_tlb_start(flags); - pdtlb(SR_KERNEL, addr); - purge_tlb_end(flags); -} -EXPORT_SYMBOL(flush_kernel_dcache_page_addr); - static void flush_cache_page_if_present(struct vm_area_struct *vma, - unsigned long vmaddr, unsigned long pfn) + unsigned long vmaddr) { +#if CONFIG_FLUSH_PAGE_ACCESSED bool needs_flush = false; - pte_t *ptep; + pte_t *ptep, pte; - /* - * The pte check is racy and sometimes the flush will trigger - * a non-access TLB miss. Hopefully, the page has already been - * flushed. - */ ptep = get_ptep(vma->vm_mm, vmaddr); if (ptep) { - needs_flush = pte_needs_flush(*ptep); + pte = ptep_get(ptep); + needs_flush = pte_needs_flush(pte); pte_unmap(ptep); } if (needs_flush) - flush_cache_page(vma, vmaddr, pfn); + __flush_cache_page(vma, vmaddr, PFN_PHYS(pte_pfn(pte))); +#else + struct mm_struct *mm = vma->vm_mm; + unsigned long physaddr = get_upa(mm, vmaddr); + + if (physaddr) + __flush_cache_page(vma, vmaddr, PAGE_ALIGN_DOWN(physaddr)); +#endif } void copy_user_highpage(struct page *to, struct page *from, @@ -629,7 +655,7 @@ void copy_user_highpage(struct page *to, struct page *from, kfrom = kmap_local_page(from); kto = kmap_local_page(to); - flush_cache_page_if_present(vma, vaddr, page_to_pfn(from)); + __flush_cache_page(vma, vaddr, PFN_PHYS(page_to_pfn(from))); copy_page_asm(kto, kfrom); kunmap_local(kto); kunmap_local(kfrom); @@ -638,16 +664,17 @@ void copy_user_highpage(struct page *to, struct page *from, void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long user_vaddr, void *dst, void *src, int len) { - flush_cache_page_if_present(vma, user_vaddr, page_to_pfn(page)); + __flush_cache_page(vma, user_vaddr, PFN_PHYS(page_to_pfn(page))); memcpy(dst, src, len); - flush_kernel_dcache_range_asm((unsigned long)dst, (unsigned long)dst + len); + flush_kernel_dcache_page_addr(PTR_PAGE_ALIGN_DOWN(dst)); } void copy_from_user_page(struct vm_area_struct *vma, struct page *page, unsigned long user_vaddr, void *dst, void *src, int len) { - flush_cache_page_if_present(vma, user_vaddr, page_to_pfn(page)); + __flush_cache_page(vma, user_vaddr, PFN_PHYS(page_to_pfn(page))); memcpy(dst, src, len); + flush_kernel_dcache_page_addr(PTR_PAGE_ALIGN_DOWN(src)); } /* __flush_tlb_range() @@ -681,32 +708,10 @@ int __flush_tlb_range(unsigned long sid, unsigned long start, static void flush_cache_pages(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - unsigned long addr, pfn; - pte_t *ptep; - - for (addr = start; addr < end; addr += PAGE_SIZE) { - bool needs_flush = false; - /* - * The vma can contain pages that aren't present. Although - * the pte search is expensive, we need the pte to find the - * page pfn and to check whether the page should be flushed. - */ - ptep = get_ptep(vma->vm_mm, addr); - if (ptep) { - needs_flush = pte_needs_flush(*ptep); - pfn = pte_pfn(*ptep); - pte_unmap(ptep); - } - if (needs_flush) { - if (parisc_requires_coherency()) { - flush_user_cache_page(vma, addr); - } else { - if (WARN_ON(!pfn_valid(pfn))) - return; - __flush_cache_page(vma, addr, PFN_PHYS(pfn)); - } - } - } + unsigned long addr; + + for (addr = start; addr < end; addr += PAGE_SIZE) + flush_cache_page_if_present(vma, addr); } static inline unsigned long mm_total_size(struct mm_struct *mm) @@ -757,21 +762,19 @@ void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned if (WARN_ON(IS_ENABLED(CONFIG_SMP) && arch_irqs_disabled())) return; flush_tlb_range(vma, start, end); - flush_cache_all(); + if (vma->vm_flags & VM_EXEC) + flush_cache_all(); + else + flush_data_cache(); return; } - flush_cache_pages(vma, start, end); + flush_cache_pages(vma, start & PAGE_MASK, end); } void flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, unsigned long pfn) { - if (WARN_ON(!pfn_valid(pfn))) - return; - if (parisc_requires_coherency()) - flush_user_cache_page(vma, vmaddr); - else - __flush_cache_page(vma, vmaddr, PFN_PHYS(pfn)); + __flush_cache_page(vma, vmaddr, PFN_PHYS(pfn)); } void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr) @@ -779,34 +782,133 @@ void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned lon if (!PageAnon(page)) return; - if (parisc_requires_coherency()) { - if (vma->vm_flags & VM_SHARED) - flush_data_cache(); - else - flush_user_cache_page(vma, vmaddr); + __flush_cache_page(vma, vmaddr, PFN_PHYS(page_to_pfn(page))); +} + +int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep) +{ + pte_t pte = ptep_get(ptep); + + if (!pte_young(pte)) + return 0; + set_pte(ptep, pte_mkold(pte)); +#if CONFIG_FLUSH_PAGE_ACCESSED + __flush_cache_page(vma, addr, PFN_PHYS(pte_pfn(pte))); +#endif + return 1; +} + +/* + * After a PTE is cleared, we have no way to flush the cache for + * the physical page. On PA8800 and PA8900 processors, these lines + * can cause random cache corruption. Thus, we must flush the cache + * as well as the TLB when clearing a PTE that's valid. + */ +pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep) +{ + struct mm_struct *mm = (vma)->vm_mm; + pte_t pte = ptep_get_and_clear(mm, addr, ptep); + unsigned long pfn = pte_pfn(pte); + + if (pfn_valid(pfn)) + __flush_cache_page(vma, addr, PFN_PHYS(pfn)); + else if (pte_accessible(mm, pte)) + flush_tlb_page(vma, addr); + + return pte; +} + +/* + * The physical address for pages in the ioremap case can be obtained + * from the vm_struct struct. I wasn't able to successfully handle the + * vmalloc and vmap cases. We have an array of struct page pointers in + * the uninitialized vmalloc case but the flush failed using page_to_pfn. + */ +void flush_cache_vmap(unsigned long start, unsigned long end) +{ + unsigned long addr, physaddr; + struct vm_struct *vm; + + /* Prevent cache move-in */ + flush_tlb_kernel_range(start, end); + + if (end - start >= parisc_cache_flush_threshold) { + flush_cache_all(); return; } - flush_tlb_page(vma, vmaddr); - preempt_disable(); - flush_dcache_page_asm(page_to_phys(page), vmaddr); - preempt_enable(); + if (WARN_ON_ONCE(!is_vmalloc_addr((void *)start))) { + flush_cache_all(); + return; + } + + vm = find_vm_area((void *)start); + if (WARN_ON_ONCE(!vm)) { + flush_cache_all(); + return; + } + + /* The physical addresses of IOREMAP regions are contiguous */ + if (vm->flags & VM_IOREMAP) { + physaddr = vm->phys_addr; + for (addr = start; addr < end; addr += PAGE_SIZE) { + preempt_disable(); + flush_dcache_page_asm(physaddr, start); + flush_icache_page_asm(physaddr, start); + preempt_enable(); + physaddr += PAGE_SIZE; + } + return; + } + + flush_cache_all(); } +EXPORT_SYMBOL(flush_cache_vmap); +/* + * The vm_struct has been retired and the page table is set up. The + * last page in the range is a guard page. Its physical address can't + * be determined using lpa, so there is no way to flush the range + * using flush_dcache_page_asm. + */ +void flush_cache_vunmap(unsigned long start, unsigned long end) +{ + /* Prevent cache move-in */ + flush_tlb_kernel_range(start, end); + flush_data_cache(); +} +EXPORT_SYMBOL(flush_cache_vunmap); + +/* + * On systems with PA8800/PA8900 processors, there is no way to flush + * a vmap range other than using the architected loop to flush the + * entire cache. The page directory is not set up, so we can't use + * fdc, etc. FDCE/FICE don't work to flush a portion of the cache. + * L2 is physically indexed but FDCE/FICE instructions in virtual + * mode output their virtual address on the core bus, not their + * real address. As a result, the L2 cache index formed from the + * virtual address will most likely not be the same as the L2 index + * formed from the real address. + */ void flush_kernel_vmap_range(void *vaddr, int size) { unsigned long start = (unsigned long)vaddr; unsigned long end = start + size; - if ((!IS_ENABLED(CONFIG_SMP) || !arch_irqs_disabled()) && - (unsigned long)size >= parisc_cache_flush_threshold) { - flush_tlb_kernel_range(start, end); - flush_data_cache(); + flush_tlb_kernel_range(start, end); + + if (!static_branch_likely(&parisc_has_dcache)) + return; + + /* If interrupts are disabled, we can only do local flush */ + if (WARN_ON(IS_ENABLED(CONFIG_SMP) && arch_irqs_disabled())) { + flush_data_cache_local(NULL); return; } - flush_kernel_dcache_range_asm(start, end); - flush_tlb_kernel_range(start, end); + flush_data_cache(); } EXPORT_SYMBOL(flush_kernel_vmap_range); @@ -818,15 +920,18 @@ void invalidate_kernel_vmap_range(void *vaddr, int size) /* Ensure DMA is complete */ asm_syncdma(); - if ((!IS_ENABLED(CONFIG_SMP) || !arch_irqs_disabled()) && - (unsigned long)size >= parisc_cache_flush_threshold) { - flush_tlb_kernel_range(start, end); - flush_data_cache(); + flush_tlb_kernel_range(start, end); + + if (!static_branch_likely(&parisc_has_dcache)) + return; + + /* If interrupts are disabled, we can only do local flush */ + if (WARN_ON(IS_ENABLED(CONFIG_SMP) && arch_irqs_disabled())) { + flush_data_cache_local(NULL); return; } - purge_kernel_dcache_range_asm(start, end); - flush_tlb_kernel_range(start, end); + flush_data_cache(); } EXPORT_SYMBOL(invalidate_kernel_vmap_range); From 52912ca87e2b810e5acdcdc452593d30c9187d8f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 7 Jun 2024 10:25:07 +0900 Subject: [PATCH 0833/1578] scsi: core: Disable CDL by default For SCSI devices supporting the Command Duration Limits feature set, the user can enable/disable this feature use through the sysfs device attribute "cdl_enable". This attribute modification triggers a call to scsi_cdl_enable() to enable and disable the feature for ATA devices and set the scsi device cdl_enable field to the user provided bool value. For SCSI devices supporting CDL, the feature set is always enabled and scsi_cdl_enable() is reduced to setting the cdl_enable field. However, for ATA devices, a drive may spin-up with the CDL feature enabled by default. But the SCSI device cdl_enable field is always initialized to false (CDL disabled), regardless of the actual device CDL feature state. For ATA devices managed by libata (or libsas), libata-core always disables the CDL feature set when the device is attached, thus syncing the state of the CDL feature on the device and of the SCSI device cdl_enable field. However, for ATA devices connected to a SAS HBA, the CDL feature is not disabled on scan for ATA devices that have this feature enabled by default, leading to an inconsistent state of the feature on the device with the SCSI device cdl_enable field. Avoid this inconsistency by adding a call to scsi_cdl_enable() in scsi_cdl_check() to make sure that the device-side state of the CDL feature set always matches the scsi device cdl_enable field state. This implies that CDL will always be disabled for ATA devices connected to SAS HBAs, which is consistent with libata/libsas initialization of the device. Reported-by: Scott McCoy Fixes: 1b22cfb14142 ("scsi: core: Allow enabling and disabling command duration limits") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20240607012507.111488-1-dlemoal@kernel.org Reviewed-by: Niklas Cassel Reviewed-by: Igor Pylypiv Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index f0464db3f9de99..ee69bd35889d17 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -673,6 +673,13 @@ void scsi_cdl_check(struct scsi_device *sdev) sdev->use_10_for_rw = 0; sdev->cdl_supported = 1; + + /* + * If the device supports CDL, make sure that the current drive + * feature status is consistent with the user controlled + * cdl_enable state. + */ + scsi_cdl_enable(sdev, sdev->cdl_enable); } else { sdev->cdl_supported = 0; } From 77691af484e28af7a692e511b9ed5ca63012ec6e Mon Sep 17 00:00:00 2001 From: Ziqi Chen Date: Fri, 7 Jun 2024 18:06:23 +0800 Subject: [PATCH 0834/1578] scsi: ufs: core: Quiesce request queues before checking pending cmds In ufshcd_clock_scaling_prepare(), after SCSI layer is blocked, ufshcd_pending_cmds() is called to check whether there are pending transactions or not. And only if there are no pending transactions can we proceed to kickstart the clock scaling sequence. ufshcd_pending_cmds() traverses over all SCSI devices and calls sbitmap_weight() on their budget_map. sbitmap_weight() can be broken down to three steps: 1. Calculate the nr outstanding bits set in the 'word' bitmap. 2. Calculate the nr outstanding bits set in the 'cleared' bitmap. 3. Subtract the result from step 1 by the result from step 2. This can lead to a race condition as outlined below: Assume there is one pending transaction in the request queue of one SCSI device, say sda, and the budget token of this request is 0, the 'word' is 0x1 and the 'cleared' is 0x0. 1. When step 1 executes, it gets the result as 1. 2. Before step 2 executes, block layer tries to dispatch a new request to sda. Since the SCSI layer is blocked, the request cannot pass through SCSI but the block layer would do budget_get() and budget_put() to sda's budget map regardless, so the 'word' has become 0x3 and 'cleared' has become 0x2 (assume the new request got budget token 1). 3. When step 2 executes, it gets the result as 1. 4. When step 3 executes, it gets the result as 0, meaning there is no pending transactions, which is wrong. Thread A Thread B ufshcd_pending_cmds() __blk_mq_sched_dispatch_requests() | | sbitmap_weight(word) | | scsi_mq_get_budget() | | | scsi_mq_put_budget() | | sbitmap_weight(cleared) ... When this race condition happens, the clock scaling sequence is started with transactions still in flight, leading to subsequent hibernate enter failure, broken link, task abort and back to back error recovery. Fix this race condition by quiescing the request queues before calling ufshcd_pending_cmds() so that block layer won't touch the budget map when ufshcd_pending_cmds() is working on it. In addition, remove the SCSI layer blocking/unblocking to reduce redundancies and latencies. Fixes: 8d077ede48c1 ("scsi: ufs: Optimize the command queueing code") Co-developed-by: Can Guo Signed-off-by: Can Guo Signed-off-by: Ziqi Chen Link: https://lore.kernel.org/r/1717754818-39863-1-git-send-email-quic_ziqichen@quicinc.com Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 0cf07194bbe89d..e5e9da61f15d0b 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -1366,7 +1366,7 @@ static int ufshcd_clock_scaling_prepare(struct ufs_hba *hba, u64 timeout_us) * make sure that there are no outstanding requests when * clock scaling is in progress */ - ufshcd_scsi_block_requests(hba); + blk_mq_quiesce_tagset(&hba->host->tag_set); mutex_lock(&hba->wb_mutex); down_write(&hba->clk_scaling_lock); @@ -1375,7 +1375,7 @@ static int ufshcd_clock_scaling_prepare(struct ufs_hba *hba, u64 timeout_us) ret = -EBUSY; up_write(&hba->clk_scaling_lock); mutex_unlock(&hba->wb_mutex); - ufshcd_scsi_unblock_requests(hba); + blk_mq_unquiesce_tagset(&hba->host->tag_set); goto out; } @@ -1396,7 +1396,7 @@ static void ufshcd_clock_scaling_unprepare(struct ufs_hba *hba, int err, bool sc mutex_unlock(&hba->wb_mutex); - ufshcd_scsi_unblock_requests(hba); + blk_mq_unquiesce_tagset(&hba->host->tag_set); ufshcd_release(hba); } From 90e6f08915ec6efe46570420412a65050ec826b2 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 11 Jun 2024 17:34:35 +0900 Subject: [PATCH 0835/1578] scsi: mpi3mr: Fix ATA NCQ priority support The function mpi3mr_qcmd() of the mpi3mr driver is able to indicate to the HBA if a read or write command directed at an ATA device should be translated to an NCQ read/write command with the high prioiryt bit set when the request uses the RT priority class and the user has enabled NCQ priority through sysfs. However, unlike the mpt3sas driver, the mpi3mr driver does not define the sas_ncq_prio_supported and sas_ncq_prio_enable sysfs attributes, so the ncq_prio_enable field of struct mpi3mr_sdev_priv_data is never actually set and NCQ Priority cannot ever be used. Fix this by defining these missing atributes to allow a user to check if an ATA device supports NCQ priority and to enable/disable the use of NCQ priority. To do this, lift the function scsih_ncq_prio_supp() out of the mpt3sas driver and make it the generic SCSI SAS transport function sas_ata_ncq_prio_supported(). Nothing in that function is hardware specific, so this function can be used in both the mpt3sas driver and the mpi3mr driver. Reported-by: Scott McCoy Fixes: 023ab2a9b4ed ("scsi: mpi3mr: Add support for queue command processing") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20240611083435.92961-1-dlemoal@kernel.org Reviewed-by: Niklas Cassel Signed-off-by: Martin K. Petersen --- drivers/scsi/mpi3mr/mpi3mr_app.c | 62 ++++++++++++++++++++++++++++ drivers/scsi/mpt3sas/mpt3sas_base.h | 3 -- drivers/scsi/mpt3sas/mpt3sas_ctl.c | 4 +- drivers/scsi/mpt3sas/mpt3sas_scsih.c | 23 ----------- drivers/scsi/scsi_transport_sas.c | 23 +++++++++++ include/scsi/scsi_transport_sas.h | 2 + 6 files changed, 89 insertions(+), 28 deletions(-) diff --git a/drivers/scsi/mpi3mr/mpi3mr_app.c b/drivers/scsi/mpi3mr/mpi3mr_app.c index 1638109a68a0b4..cd261b48eb46d7 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_app.c +++ b/drivers/scsi/mpi3mr/mpi3mr_app.c @@ -2163,10 +2163,72 @@ persistent_id_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RO(persistent_id); +/** + * sas_ncq_prio_supported_show - Indicate if device supports NCQ priority + * @dev: pointer to embedded device + * @attr: sas_ncq_prio_supported attribute descriptor + * @buf: the buffer returned + * + * A sysfs 'read-only' sdev attribute, only works with SATA devices + */ +static ssize_t +sas_ncq_prio_supported_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct scsi_device *sdev = to_scsi_device(dev); + + return sysfs_emit(buf, "%d\n", sas_ata_ncq_prio_supported(sdev)); +} +static DEVICE_ATTR_RO(sas_ncq_prio_supported); + +/** + * sas_ncq_prio_enable_show - send prioritized io commands to device + * @dev: pointer to embedded device + * @attr: sas_ncq_prio_enable attribute descriptor + * @buf: the buffer returned + * + * A sysfs 'read/write' sdev attribute, only works with SATA devices + */ +static ssize_t +sas_ncq_prio_enable_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct scsi_device *sdev = to_scsi_device(dev); + struct mpi3mr_sdev_priv_data *sdev_priv_data = sdev->hostdata; + + if (!sdev_priv_data) + return 0; + + return sysfs_emit(buf, "%d\n", sdev_priv_data->ncq_prio_enable); +} + +static ssize_t +sas_ncq_prio_enable_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct scsi_device *sdev = to_scsi_device(dev); + struct mpi3mr_sdev_priv_data *sdev_priv_data = sdev->hostdata; + bool ncq_prio_enable = 0; + + if (kstrtobool(buf, &ncq_prio_enable)) + return -EINVAL; + + if (!sas_ata_ncq_prio_supported(sdev)) + return -EINVAL; + + sdev_priv_data->ncq_prio_enable = ncq_prio_enable; + + return strlen(buf); +} +static DEVICE_ATTR_RW(sas_ncq_prio_enable); + static struct attribute *mpi3mr_dev_attrs[] = { &dev_attr_sas_address.attr, &dev_attr_device_handle.attr, &dev_attr_persistent_id.attr, + &dev_attr_sas_ncq_prio_supported.attr, + &dev_attr_sas_ncq_prio_enable.attr, NULL, }; diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.h b/drivers/scsi/mpt3sas/mpt3sas_base.h index bf100a4ebfc362..fe1e96fda284bc 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.h +++ b/drivers/scsi/mpt3sas/mpt3sas_base.h @@ -2048,9 +2048,6 @@ void mpt3sas_setup_direct_io(struct MPT3SAS_ADAPTER *ioc, struct scsi_cmnd *scmd, struct _raid_device *raid_device, Mpi25SCSIIORequest_t *mpi_request); -/* NCQ Prio Handling Check */ -bool scsih_ncq_prio_supp(struct scsi_device *sdev); - void mpt3sas_setup_debugfs(struct MPT3SAS_ADAPTER *ioc); void mpt3sas_destroy_debugfs(struct MPT3SAS_ADAPTER *ioc); void mpt3sas_init_debugfs(void); diff --git a/drivers/scsi/mpt3sas/mpt3sas_ctl.c b/drivers/scsi/mpt3sas/mpt3sas_ctl.c index 1c9fd26195b812..87784c96249a7f 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_ctl.c +++ b/drivers/scsi/mpt3sas/mpt3sas_ctl.c @@ -4088,7 +4088,7 @@ sas_ncq_prio_supported_show(struct device *dev, { struct scsi_device *sdev = to_scsi_device(dev); - return sysfs_emit(buf, "%d\n", scsih_ncq_prio_supp(sdev)); + return sysfs_emit(buf, "%d\n", sas_ata_ncq_prio_supported(sdev)); } static DEVICE_ATTR_RO(sas_ncq_prio_supported); @@ -4123,7 +4123,7 @@ sas_ncq_prio_enable_store(struct device *dev, if (kstrtobool(buf, &ncq_prio_enable)) return -EINVAL; - if (!scsih_ncq_prio_supp(sdev)) + if (!sas_ata_ncq_prio_supported(sdev)) return -EINVAL; sas_device_priv_data->ncq_prio_enable = ncq_prio_enable; diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c index 12d08d8ba5382d..870ec2cb4af4ba 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -12571,29 +12571,6 @@ scsih_pci_mmio_enabled(struct pci_dev *pdev) return PCI_ERS_RESULT_RECOVERED; } -/** - * scsih_ncq_prio_supp - Check for NCQ command priority support - * @sdev: scsi device struct - * - * This is called when a user indicates they would like to enable - * ncq command priorities. This works only on SATA devices. - */ -bool scsih_ncq_prio_supp(struct scsi_device *sdev) -{ - struct scsi_vpd *vpd; - bool ncq_prio_supp = false; - - rcu_read_lock(); - vpd = rcu_dereference(sdev->vpd_pg89); - if (!vpd || vpd->len < 214) - goto out; - - ncq_prio_supp = (vpd->data[213] >> 4) & 1; -out: - rcu_read_unlock(); - - return ncq_prio_supp; -} /* * The pci device ids are defined in mpi/mpi2_cnfg.h. */ diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c index 424a89513814b4..4e33f1661e4c1d 100644 --- a/drivers/scsi/scsi_transport_sas.c +++ b/drivers/scsi/scsi_transport_sas.c @@ -416,6 +416,29 @@ unsigned int sas_is_tlr_enabled(struct scsi_device *sdev) } EXPORT_SYMBOL_GPL(sas_is_tlr_enabled); +/** + * sas_ata_ncq_prio_supported - Check for ATA NCQ command priority support + * @sdev: SCSI device + * + * Check if an ATA device supports NCQ priority using VPD page 89h (ATA + * Information). Since this VPD page is implemented only for ATA devices, + * this function always returns false for SCSI devices. + */ +bool sas_ata_ncq_prio_supported(struct scsi_device *sdev) +{ + struct scsi_vpd *vpd; + bool ncq_prio_supported = false; + + rcu_read_lock(); + vpd = rcu_dereference(sdev->vpd_pg89); + if (vpd && vpd->len >= 214) + ncq_prio_supported = (vpd->data[213] >> 4) & 1; + rcu_read_unlock(); + + return ncq_prio_supported; +} +EXPORT_SYMBOL_GPL(sas_ata_ncq_prio_supported); + /* * SAS Phy attributes */ diff --git a/include/scsi/scsi_transport_sas.h b/include/scsi/scsi_transport_sas.h index 0e75b9277c8c6a..e3b6ce3cbf8835 100644 --- a/include/scsi/scsi_transport_sas.h +++ b/include/scsi/scsi_transport_sas.h @@ -200,6 +200,8 @@ unsigned int sas_is_tlr_enabled(struct scsi_device *); void sas_disable_tlr(struct scsi_device *); void sas_enable_tlr(struct scsi_device *); +bool sas_ata_ncq_prio_supported(struct scsi_device *sdev); + extern struct sas_rphy *sas_end_device_alloc(struct sas_port *); extern struct sas_rphy *sas_expander_alloc(struct sas_port *, enum sas_device_type); void sas_rphy_free(struct sas_rphy *); From 144ba8580bcb82b2686c3d1a043299d844b9a682 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Mon, 10 Jun 2024 10:34:26 +0200 Subject: [PATCH 0836/1578] net: pse-pd: Use EOPNOTSUPP error code instead of ENOTSUPP ENOTSUPP is not a SUSV4 error code, prefer EOPNOTSUPP as reported by checkpatch script. Fixes: 18ff0bcda6d1 ("ethtool: add interface to interact with Ethernet Power Equipment") Reviewed-by: Andrew Lunn Acked-by: Oleksij Rempel Signed-off-by: Kory Maincent Link: https://lore.kernel.org/r/20240610083426.740660-1-kory.maincent@bootlin.com Signed-off-by: Jakub Kicinski --- include/linux/pse-pd/pse.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h index 6d07c95dabb9a2..6eec24ffa8669c 100644 --- a/include/linux/pse-pd/pse.h +++ b/include/linux/pse-pd/pse.h @@ -167,14 +167,14 @@ static inline int pse_ethtool_get_status(struct pse_control *psec, struct netlink_ext_ack *extack, struct pse_control_status *status) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline int pse_ethtool_set_config(struct pse_control *psec, struct netlink_ext_ack *extack, const struct pse_control_config *config) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline bool pse_has_podl(struct pse_control *psec) From 1b9f756344416e02b41439bf2324b26aa25e141c Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Mon, 10 Jun 2024 15:57:18 -0700 Subject: [PATCH 0837/1578] gve: ignore nonrelevant GSO type bits when processing TSO headers TSO currently fails when the skb's gso_type field has more than one bit set. TSO packets can be passed from userspace using PF_PACKET, TUNTAP and a few others, using virtio_net_hdr (e.g., PACKET_VNET_HDR). This includes virtualization, such as QEMU, a real use-case. The gso_type and gso_size fields as passed from userspace in virtio_net_hdr are not trusted blindly by the kernel. It adds gso_type |= SKB_GSO_DODGY to force the packet to enter the software GSO stack for verification. This issue might similarly come up when the CWR bit is set in the TCP header for congestion control, causing the SKB_GSO_TCP_ECN gso_type bit to be set. Fixes: a57e5de476be ("gve: DQO: Add TX path") Signed-off-by: Joshua Washington Reviewed-by: Praveen Kaligineedi Reviewed-by: Harshitha Ramamurthy Reviewed-by: Willem de Bruijn Suggested-by: Eric Dumazet Acked-by: Andrei Vagin v2 - Remove unnecessary comments, remove line break between fixes tag and signoffs. v3 - Add back unrelated empty line removal. Link: https://lore.kernel.org/r/20240610225729.2985343-1-joshwash@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve_tx_dqo.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve_tx_dqo.c b/drivers/net/ethernet/google/gve/gve_tx_dqo.c index fe1b26a4d73660..0b3cca3fc79215 100644 --- a/drivers/net/ethernet/google/gve/gve_tx_dqo.c +++ b/drivers/net/ethernet/google/gve/gve_tx_dqo.c @@ -555,28 +555,18 @@ static int gve_prep_tso(struct sk_buff *skb) if (unlikely(skb_shinfo(skb)->gso_size < GVE_TX_MIN_TSO_MSS_DQO)) return -1; + if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) + return -EINVAL; + /* Needed because we will modify header. */ err = skb_cow_head(skb, 0); if (err < 0) return err; tcp = tcp_hdr(skb); - - /* Remove payload length from checksum. */ paylen = skb->len - skb_transport_offset(skb); - - switch (skb_shinfo(skb)->gso_type) { - case SKB_GSO_TCPV4: - case SKB_GSO_TCPV6: - csum_replace_by_diff(&tcp->check, - (__force __wsum)htonl(paylen)); - - /* Compute length of segmentation header. */ - header_len = skb_tcp_all_headers(skb); - break; - default: - return -EINVAL; - } + csum_replace_by_diff(&tcp->check, (__force __wsum)htonl(paylen)); + header_len = skb_tcp_all_headers(skb); if (unlikely(header_len > GVE_TX_MAX_HDR_SIZE_DQO)) return -EINVAL; From be27b896529787e23a35ae4befb6337ce73fcca0 Mon Sep 17 00:00:00 2001 From: Xiaolei Wang Date: Sat, 8 Jun 2024 22:35:24 +0800 Subject: [PATCH 0838/1578] net: stmmac: replace priv->speed with the portTransmitRate from the tc-cbs parameters The current cbs parameter depends on speed after uplinking, which is not needed and will report a configuration error if the port is not initially connected. The UAPI exposed by tc-cbs requires userspace to recalculate the send slope anyway, because the formula depends on port_transmit_rate (see man tc-cbs), which is not an invariant from tc's perspective. Therefore, we use offload->sendslope and offload->idleslope to derive the original port_transmit_rate from the CBS formula. Fixes: 1f705bc61aee ("net: stmmac: Add support for CBS QDISC") Signed-off-by: Xiaolei Wang Reviewed-by: Wojciech Drewek Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/20240608143524.2065736-1-xiaolei.wang@windriver.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/stmicro/stmmac/stmmac_tc.c | 25 ++++++++----------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c index 222540b5548073..1562fbdd0a0400 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c @@ -343,10 +343,11 @@ static int tc_setup_cbs(struct stmmac_priv *priv, struct tc_cbs_qopt_offload *qopt) { u32 tx_queues_count = priv->plat->tx_queues_to_use; + s64 port_transmit_rate_kbps; u32 queue = qopt->queue; - u32 ptr, speed_div; u32 mode_to_use; u64 value; + u32 ptr; int ret; /* Queue 0 is not AVB capable */ @@ -355,30 +356,26 @@ static int tc_setup_cbs(struct stmmac_priv *priv, if (!priv->dma_cap.av) return -EOPNOTSUPP; + port_transmit_rate_kbps = qopt->idleslope - qopt->sendslope; + /* Port Transmit Rate and Speed Divider */ - switch (priv->speed) { + switch (div_s64(port_transmit_rate_kbps, 1000)) { case SPEED_10000: - ptr = 32; - speed_div = 10000000; - break; case SPEED_5000: ptr = 32; - speed_div = 5000000; break; case SPEED_2500: - ptr = 8; - speed_div = 2500000; - break; case SPEED_1000: ptr = 8; - speed_div = 1000000; break; case SPEED_100: ptr = 4; - speed_div = 100000; break; default: - return -EOPNOTSUPP; + netdev_err(priv->dev, + "Invalid portTransmitRate %lld (idleSlope - sendSlope)\n", + port_transmit_rate_kbps); + return -EINVAL; } mode_to_use = priv->plat->tx_queues_cfg[queue].mode_to_use; @@ -398,10 +395,10 @@ static int tc_setup_cbs(struct stmmac_priv *priv, } /* Final adjustments for HW */ - value = div_s64(qopt->idleslope * 1024ll * ptr, speed_div); + value = div_s64(qopt->idleslope * 1024ll * ptr, port_transmit_rate_kbps); priv->plat->tx_queues_cfg[queue].idle_slope = value & GENMASK(31, 0); - value = div_s64(-qopt->sendslope * 1024ll * ptr, speed_div); + value = div_s64(-qopt->sendslope * 1024ll * ptr, port_transmit_rate_kbps); priv->plat->tx_queues_cfg[queue].send_slope = value & GENMASK(31, 0); value = qopt->hicredit * 1024ll * 8; From 1cdeca6a7264021e20157de0baf7880ff0ced822 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Mon, 10 Jun 2024 23:06:19 +0900 Subject: [PATCH 0839/1578] ksmbd: move leading slash check to smb2_get_name() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the directory name in the root of the share starts with character like 镜(0x955c) or Ṝ(0x1e5c), it (and anything inside) cannot be accessed. The leading slash check must be checked after converting unicode to nls string. Cc: stable@vger.kernel.org Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index b6c5a8ea388791..f79d06d2d655cf 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -630,6 +630,12 @@ smb2_get_name(const char *src, const int maxlen, struct nls_table *local_nls) return name; } + if (*name == '\\') { + pr_err("not allow directory name included leading slash\n"); + kfree(name); + return ERR_PTR(-EINVAL); + } + ksmbd_conv_path_to_unix(name); ksmbd_strip_last_slash(name); return name; @@ -2842,20 +2848,11 @@ int smb2_open(struct ksmbd_work *work) } if (req->NameLength) { - if ((req->CreateOptions & FILE_DIRECTORY_FILE_LE) && - *(char *)req->Buffer == '\\') { - pr_err("not allow directory name included leading slash\n"); - rc = -EINVAL; - goto err_out2; - } - name = smb2_get_name((char *)req + le16_to_cpu(req->NameOffset), le16_to_cpu(req->NameLength), work->conn->local_nls); if (IS_ERR(name)) { rc = PTR_ERR(name); - if (rc != -ENOMEM) - rc = -ENOENT; name = NULL; goto err_out2; } From 2bfc4214c69c62da13a9da8e3c3db5539da2ccd3 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Tue, 11 Jun 2024 23:27:27 +0900 Subject: [PATCH 0840/1578] ksmbd: fix missing use of get_write in in smb2_set_ea() Fix an issue where get_write is not used in smb2_set_ea(). Fixes: 6fc0a265e1b9 ("ksmbd: fix potential circular locking issue in smb2_set_ea()") Cc: stable@vger.kernel.org Reported-by: Wang Zhaolong Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 7 ++++--- fs/smb/server/vfs.c | 17 +++++++++++------ fs/smb/server/vfs.h | 3 ++- fs/smb/server/vfs_cache.c | 3 ++- 4 files changed, 19 insertions(+), 11 deletions(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index f79d06d2d655cf..e7e07891781b36 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -2367,7 +2367,8 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, if (rc > 0) { rc = ksmbd_vfs_remove_xattr(idmap, path, - attr_name); + attr_name, + get_write); if (rc < 0) { ksmbd_debug(SMB, @@ -2382,7 +2383,7 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, } else { rc = ksmbd_vfs_setxattr(idmap, path, attr_name, value, le16_to_cpu(eabuf->EaValueLength), - 0, true); + 0, get_write); if (rc < 0) { ksmbd_debug(SMB, "ksmbd_vfs_setxattr is failed(%d)\n", @@ -2474,7 +2475,7 @@ static int smb2_remove_smb_xattrs(const struct path *path) !strncmp(&name[XATTR_USER_PREFIX_LEN], STREAM_PREFIX, STREAM_PREFIX_LEN)) { err = ksmbd_vfs_remove_xattr(idmap, path, - name); + name, true); if (err) ksmbd_debug(SMB, "remove xattr failed : %s\n", name); diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 51b1b0bed616ee..9e859ba010cf12 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -1058,16 +1058,21 @@ int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, } int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, - const struct path *path, char *attr_name) + const struct path *path, char *attr_name, + bool get_write) { int err; - err = mnt_want_write(path->mnt); - if (err) - return err; + if (get_write == true) { + err = mnt_want_write(path->mnt); + if (err) + return err; + } err = vfs_removexattr(idmap, path->dentry, attr_name); - mnt_drop_write(path->mnt); + + if (get_write == true) + mnt_drop_write(path->mnt); return err; } @@ -1380,7 +1385,7 @@ int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, const struct path *path) ksmbd_debug(SMB, "%s, len %zd\n", name, strlen(name)); if (!strncmp(name, XATTR_NAME_SD, XATTR_NAME_SD_LEN)) { - err = ksmbd_vfs_remove_xattr(idmap, path, name); + err = ksmbd_vfs_remove_xattr(idmap, path, name, true); if (err) ksmbd_debug(SMB, "remove xattr failed : %s\n", name); } diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h index cfe1c8092f2302..cb76f4b5bafe8c 100644 --- a/fs/smb/server/vfs.h +++ b/fs/smb/server/vfs.h @@ -114,7 +114,8 @@ int ksmbd_vfs_setxattr(struct mnt_idmap *idmap, int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name, size_t *xattr_stream_name_size, int s_type); int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, - const struct path *path, char *attr_name); + const struct path *path, char *attr_name, + bool get_write); int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, unsigned int flags, struct path *parent_path, struct path *path, bool caseless); diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c index 6cb599cd287ee4..8b2e37c8716ed7 100644 --- a/fs/smb/server/vfs_cache.c +++ b/fs/smb/server/vfs_cache.c @@ -254,7 +254,8 @@ static void __ksmbd_inode_close(struct ksmbd_file *fp) ci->m_flags &= ~S_DEL_ON_CLS_STREAM; err = ksmbd_vfs_remove_xattr(file_mnt_idmap(filp), &filp->f_path, - fp->stream.name); + fp->stream.name, + true); if (err) pr_err("remove xattr failed : %s\n", fp->stream.name); From f0260589b439e2637ad54a2b25f00a516ef28a57 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Tue, 11 Jun 2024 15:06:07 +0300 Subject: [PATCH 0841/1578] xhci: Set correct transferred length for cancelled bulk transfers The transferred length is set incorrectly for cancelled bulk transfer TDs in case the bulk transfer ring stops on the last transfer block with a 'Stop - Length Invalid' completion code. length essentially ends up being set to the requested length: urb->actual_length = urb->transfer_buffer_length Length for 'Stop - Length Invalid' cases should be the sum of all TRB transfer block lengths up to the one the ring stopped on, _excluding_ the one stopped on. Fix this by always summing up TRB lengths for 'Stop - Length Invalid' bulk cases. This issue was discovered by Alan Stern while debugging https://bugzilla.kernel.org/show_bug.cgi?id=218890, but does not solve that bug. Issue is older than 4.10 kernel but fix won't apply to those due to major reworks in that area. Tested-by: Pierre Tomon Cc: stable@vger.kernel.org # v4.10+ Cc: Alan Stern Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20240611120610.3264502-2-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-ring.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index 9e90d295276004..1db61bb2b9b566 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -2524,9 +2524,8 @@ static int process_bulk_intr_td(struct xhci_hcd *xhci, struct xhci_virt_ep *ep, goto finish_td; case COMP_STOPPED_LENGTH_INVALID: /* stopped on ep trb with invalid length, exclude it */ - ep_trb_len = 0; - remaining = 0; - break; + td->urb->actual_length = sum_trb_lengths(xhci, ep_ring, ep_trb); + goto finish_td; case COMP_USB_TRANSACTION_ERROR: if (xhci->quirks & XHCI_NO_SOFT_RETRY || (ep->err_count++ > MAX_SOFT_RETRY) || From 17bd54555c2aaecfdb38e2734149f684a73fa584 Mon Sep 17 00:00:00 2001 From: Kuangyi Chiang Date: Tue, 11 Jun 2024 15:06:08 +0300 Subject: [PATCH 0842/1578] xhci: Apply reset resume quirk to Etron EJ188 xHCI host As described in commit c877b3b2ad5c ("xhci: Add reset on resume quirk for asrock p67 host"), EJ188 have the same issue as EJ168, where completely dies on resume. So apply XHCI_RESET_ON_RESUME quirk to EJ188 as well. Cc: stable@vger.kernel.org Signed-off-by: Kuangyi Chiang Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20240611120610.3264502-3-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-pci.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index c040d816e62661..dd8256e4e02af0 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -36,6 +36,7 @@ #define PCI_VENDOR_ID_ETRON 0x1b6f #define PCI_DEVICE_ID_EJ168 0x7023 +#define PCI_DEVICE_ID_EJ188 0x7052 #define PCI_DEVICE_ID_INTEL_LYNXPOINT_XHCI 0x8c31 #define PCI_DEVICE_ID_INTEL_LYNXPOINT_LP_XHCI 0x9c31 @@ -395,6 +396,10 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) xhci->quirks |= XHCI_RESET_ON_RESUME; xhci->quirks |= XHCI_BROKEN_STREAMS; } + if (pdev->vendor == PCI_VENDOR_ID_ETRON && + pdev->device == PCI_DEVICE_ID_EJ188) + xhci->quirks |= XHCI_RESET_ON_RESUME; + if (pdev->vendor == PCI_VENDOR_ID_RENESAS && pdev->device == 0x0014) { xhci->quirks |= XHCI_ZERO_64B_REGS; From 91f7a1524a92c70ffe264db8bdfa075f15bbbeb9 Mon Sep 17 00:00:00 2001 From: Kuangyi Chiang Date: Tue, 11 Jun 2024 15:06:09 +0300 Subject: [PATCH 0843/1578] xhci: Apply broken streams quirk to Etron EJ188 xHCI host As described in commit 8f873c1ff4ca ("xhci: Blacklist using streams on the Etron EJ168 controller"), EJ188 have the same issue as EJ168, where Streams do not work reliable on EJ188. So apply XHCI_BROKEN_STREAMS quirk to EJ188 as well. Cc: stable@vger.kernel.org Signed-off-by: Kuangyi Chiang Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20240611120610.3264502-4-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-pci.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index dd8256e4e02af0..05881153883ec0 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -397,8 +397,10 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) xhci->quirks |= XHCI_BROKEN_STREAMS; } if (pdev->vendor == PCI_VENDOR_ID_ETRON && - pdev->device == PCI_DEVICE_ID_EJ188) + pdev->device == PCI_DEVICE_ID_EJ188) { xhci->quirks |= XHCI_RESET_ON_RESUME; + xhci->quirks |= XHCI_BROKEN_STREAMS; + } if (pdev->vendor == PCI_VENDOR_ID_RENESAS && pdev->device == 0x0014) { From 5ceac4402f5d975e5a01c806438eb4e554771577 Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Tue, 11 Jun 2024 15:06:10 +0300 Subject: [PATCH 0844/1578] xhci: Handle TD clearing for multiple streams case When multiple streams are in use, multiple TDs might be in flight when an endpoint is stopped. We need to issue a Set TR Dequeue Pointer for each, to ensure everything is reset properly and the caches cleared. Change the logic so that any N>1 TDs found active for different streams are deferred until after the first one is processed, calling xhci_invalidate_cancelled_tds() again from xhci_handle_cmd_set_deq() to queue another command until we are done with all of them. Also change the error/"should never happen" paths to ensure we at least clear any affected TDs, even if we can't issue a command to clear the hardware cache, and complain loudly with an xhci_warn() if this ever happens. This problem case dates back to commit e9df17eb1408 ("USB: xhci: Correct assumptions about number of rings per endpoint.") early on in the XHCI driver's life, when stream support was first added. It was then identified but not fixed nor made into a warning in commit 674f8438c121 ("xhci: split handling halted endpoints into two steps"), which added a FIXME comment for the problem case (without materially changing the behavior as far as I can tell, though the new logic made the problem more obvious). Then later, in commit 94f339147fc3 ("xhci: Fix failure to give back some cached cancelled URBs."), it was acknowledged again. [Mathias: commit 94f339147fc3 ("xhci: Fix failure to give back some cached cancelled URBs.") was a targeted regression fix to the previously mentioned patch. Users reported issues with usb stuck after unmounting/disconnecting UAS devices. This rolled back the TD clearing of multiple streams to its original state.] Apparently the commit author was aware of the problem (yet still chose to submit it): It was still mentioned as a FIXME, an xhci_dbg() was added to log the problem condition, and the remaining issue was mentioned in the commit description. The choice of making the log type xhci_dbg() for what is, at this point, a completely unhandled and known broken condition is puzzling and unfortunate, as it guarantees that no actual users would see the log in production, thereby making it nigh undebuggable (indeed, even if you turn on DEBUG, the message doesn't really hint at there being a problem at all). It took me *months* of random xHC crashes to finally find a reliable repro and be able to do a deep dive debug session, which could all have been avoided had this unhandled, broken condition been actually reported with a warning, as it should have been as a bug intentionally left in unfixed (never mind that it shouldn't have been left in at all). > Another fix to solve clearing the caches of all stream rings with > cancelled TDs is needed, but not as urgent. 3 years after that statement and 14 years after the original bug was introduced, I think it's finally time to fix it. And maybe next time let's not leave bugs unfixed (that are actually worse than the original bug), and let's actually get people to review kernel commits please. Fixes xHC crashes and IOMMU faults with UAS devices when handling errors/faults. Easiest repro is to use `hdparm` to mark an early sector (e.g. 1024) on a disk as bad, then `cat /dev/sdX > /dev/null` in a loop. At least in the case of JMicron controllers, the read errors end up having to cancel two TDs (for two queued requests to different streams) and the one that didn't get cleared properly ends up faulting the xHC entirely when it tries to access DMA pages that have since been unmapped, referred to by the stale TDs. This normally happens quickly (after two or three loops). After this fix, I left the `cat` in a loop running overnight and experienced no xHC failures, with all read errors recovered properly. Repro'd and tested on an Apple M1 Mac Mini (dwc3 host). On systems without an IOMMU, this bug would instead silently corrupt freed memory, making this a security bug (even on systems with IOMMUs this could silently corrupt memory belonging to other USB devices on the same controller, so it's still a security bug). Given that the kernel autoprobes partition tables, I'm pretty sure a malicious USB device pretending to be a UAS device and reporting an error with the right timing could deliberately trigger a UAF and write to freed memory, with no user action. [Mathias: Commit message and code comment edit, original at:] https://lore.kernel.org/linux-usb/20240524-xhci-streams-v1-1-6b1f13819bea@marcan.st/ Fixes: e9df17eb1408 ("USB: xhci: Correct assumptions about number of rings per endpoint.") Fixes: 94f339147fc3 ("xhci: Fix failure to give back some cached cancelled URBs.") Fixes: 674f8438c121 ("xhci: split handling halted endpoints into two steps") Cc: stable@vger.kernel.org Cc: security@kernel.org Reviewed-by: Neal Gompa Signed-off-by: Hector Martin Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20240611120610.3264502-5-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-ring.c | 54 ++++++++++++++++++++++++++++-------- drivers/usb/host/xhci.h | 1 + 2 files changed, 44 insertions(+), 11 deletions(-) diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index 1db61bb2b9b566..fd0cde3d1569c7 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -1031,13 +1031,27 @@ static int xhci_invalidate_cancelled_tds(struct xhci_virt_ep *ep) break; case TD_DIRTY: /* TD is cached, clear it */ case TD_HALTED: + case TD_CLEARING_CACHE_DEFERRED: + if (cached_td) { + if (cached_td->urb->stream_id != td->urb->stream_id) { + /* Multiple streams case, defer move dq */ + xhci_dbg(xhci, + "Move dq deferred: stream %u URB %p\n", + td->urb->stream_id, td->urb); + td->cancel_status = TD_CLEARING_CACHE_DEFERRED; + break; + } + + /* Should never happen, but clear the TD if it does */ + xhci_warn(xhci, + "Found multiple active URBs %p and %p in stream %u?\n", + td->urb, cached_td->urb, + td->urb->stream_id); + td_to_noop(xhci, ring, cached_td, false); + cached_td->cancel_status = TD_CLEARED; + } + td->cancel_status = TD_CLEARING_CACHE; - if (cached_td) - /* FIXME stream case, several stopped rings */ - xhci_dbg(xhci, - "Move dq past stream %u URB %p instead of stream %u URB %p\n", - td->urb->stream_id, td->urb, - cached_td->urb->stream_id, cached_td->urb); cached_td = td; break; } @@ -1057,10 +1071,16 @@ static int xhci_invalidate_cancelled_tds(struct xhci_virt_ep *ep) if (err) { /* Failed to move past cached td, just set cached TDs to no-op */ list_for_each_entry_safe(td, tmp_td, &ep->cancelled_td_list, cancelled_td_list) { - if (td->cancel_status != TD_CLEARING_CACHE) + /* + * Deferred TDs need to have the deq pointer set after the above command + * completes, so if that failed we just give up on all of them (and + * complain loudly since this could cause issues due to caching). + */ + if (td->cancel_status != TD_CLEARING_CACHE && + td->cancel_status != TD_CLEARING_CACHE_DEFERRED) continue; - xhci_dbg(xhci, "Failed to clear cancelled cached URB %p, mark clear anyway\n", - td->urb); + xhci_warn(xhci, "Failed to clear cancelled cached URB %p, mark clear anyway\n", + td->urb); td_to_noop(xhci, ring, td, false); td->cancel_status = TD_CLEARED; } @@ -1346,6 +1366,7 @@ static void xhci_handle_cmd_set_deq(struct xhci_hcd *xhci, int slot_id, struct xhci_ep_ctx *ep_ctx; struct xhci_slot_ctx *slot_ctx; struct xhci_td *td, *tmp_td; + bool deferred = false; ep_index = TRB_TO_EP_INDEX(le32_to_cpu(trb->generic.field[3])); stream_id = TRB_TO_STREAM_ID(le32_to_cpu(trb->generic.field[2])); @@ -1432,6 +1453,8 @@ static void xhci_handle_cmd_set_deq(struct xhci_hcd *xhci, int slot_id, xhci_dbg(ep->xhci, "%s: Giveback cancelled URB %p TD\n", __func__, td->urb); xhci_td_cleanup(ep->xhci, td, ep_ring, td->status); + } else if (td->cancel_status == TD_CLEARING_CACHE_DEFERRED) { + deferred = true; } else { xhci_dbg(ep->xhci, "%s: Keep cancelled URB %p TD as cancel_status is %d\n", __func__, td->urb, td->cancel_status); @@ -1441,8 +1464,17 @@ static void xhci_handle_cmd_set_deq(struct xhci_hcd *xhci, int slot_id, ep->ep_state &= ~SET_DEQ_PENDING; ep->queued_deq_seg = NULL; ep->queued_deq_ptr = NULL; - /* Restart any rings with pending URBs */ - ring_doorbell_for_active_rings(xhci, slot_id, ep_index); + + if (deferred) { + /* We have more streams to clear */ + xhci_dbg(ep->xhci, "%s: Pending TDs to clear, continuing with invalidation\n", + __func__); + xhci_invalidate_cancelled_tds(ep); + } else { + /* Restart any rings with pending URBs */ + xhci_dbg(ep->xhci, "%s: All TDs cleared, ring doorbell\n", __func__); + ring_doorbell_for_active_rings(xhci, slot_id, ep_index); + } } static void xhci_handle_cmd_reset_ep(struct xhci_hcd *xhci, int slot_id, diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index 30415158ed3cb5..78d014c4d884a2 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -1276,6 +1276,7 @@ enum xhci_cancelled_td_status { TD_DIRTY = 0, TD_HALTED, TD_CLEARING_CACHE, + TD_CLEARING_CACHE_DEFERRED, TD_CLEARED, }; From 0320ca14c6fb68ad19aa72e55a1a21c061b2946b Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 12 Jun 2024 09:23:13 +0200 Subject: [PATCH 0845/1578] drm: renesas: shmobile: Call drm_atomic_helper_shutdown() at shutdown time Based on grepping through the source code, this driver appears to be missing a call to drm_atomic_helper_shutdown() at system shutdown time. This is important because drm_atomic_helper_shutdown() will cause panels to get disabled cleanly which may be important for their power sequencing. Future changes will remove any custom powering off in individual panel drivers so the DRM drivers need to start getting this right. The fact that we should call drm_atomic_helper_shutdown() in the case of OS shutdown comes straight out of the kernel doc "driver instance overview" in drm_drv.c. [geert: shmob_drm_remove() already calls drm_atomic_helper_shutdown] Suggested-by: Maxime Ripard Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20230901164111.RFT.15.Iaf638a1d4c8b3c307a6192efabb4cbb06b195f15@changeid [geert: s/drm_helper_force_disable_all/drm_atomic_helper_shutdown/] Signed-off-by: Geert Uytterhoeven Reviewed-by: Laurent Pinchart Reviewed-by: Sui Jingfeng Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/17c6a5a668e5975f871b77fb1fca6711a0799d9e.1718176895.git.geert+renesas@glider.be --- drivers/gpu/drm/renesas/shmobile/shmob_drm_drv.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/renesas/shmobile/shmob_drm_drv.c b/drivers/gpu/drm/renesas/shmobile/shmob_drm_drv.c index e83c3e52251ded..0250d5f00bf102 100644 --- a/drivers/gpu/drm/renesas/shmobile/shmob_drm_drv.c +++ b/drivers/gpu/drm/renesas/shmobile/shmob_drm_drv.c @@ -171,6 +171,13 @@ static void shmob_drm_remove(struct platform_device *pdev) drm_kms_helper_poll_fini(ddev); } +static void shmob_drm_shutdown(struct platform_device *pdev) +{ + struct shmob_drm_device *sdev = platform_get_drvdata(pdev); + + drm_atomic_helper_shutdown(&sdev->ddev); +} + static int shmob_drm_probe(struct platform_device *pdev) { struct shmob_drm_platform_data *pdata = pdev->dev.platform_data; @@ -273,6 +280,7 @@ static const struct of_device_id shmob_drm_of_table[] __maybe_unused = { static struct platform_driver shmob_drm_platform_driver = { .probe = shmob_drm_probe, .remove_new = shmob_drm_remove, + .shutdown = shmob_drm_shutdown, .driver = { .name = "shmob-drm", .of_match_table = of_match_ptr(shmob_drm_of_table), From c38896ca6318c2df20bbe6c8e3f633e071fda910 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 11 Jun 2024 10:27:44 -0700 Subject: [PATCH 0846/1578] drm/mediatek: Call drm_atomic_helper_shutdown() at shutdown time Based on grepping through the source code this driver appears to be missing a call to drm_atomic_helper_shutdown() at system shutdown time. Among other things, this means that if a panel is in use that it won't be cleanly powered off at system shutdown time. The fact that we should call drm_atomic_helper_shutdown() in the case of OS shutdown/restart comes straight out of the kernel doc "driver instance overview" in drm_drv.c. This driver users the component model and shutdown happens in the base driver. The "drvdata" for this driver will always be valid if shutdown() is called and as of commit 2a073968289d ("drm/atomic-helper: drm_atomic_helper_shutdown(NULL) should be a noop") we don't need to confirm that "drm" is non-NULL. Suggested-by: Maxime Ripard Reviewed-by: Maxime Ripard Reviewed-by: Fei Shao Tested-by: Fei Shao Signed-off-by: Douglas Anderson Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20240611102744.v2.1.I2b014f90afc4729b6ecc7b5ddd1f6dedcea4625b@changeid --- drivers/gpu/drm/mediatek/mtk_drm_drv.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/mediatek/mtk_drm_drv.c b/drivers/gpu/drm/mediatek/mtk_drm_drv.c index b5f605751b0a13..de811e2265da75 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_drv.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_drv.c @@ -952,6 +952,13 @@ static void mtk_drm_remove(struct platform_device *pdev) of_node_put(private->comp_node[i]); } +static void mtk_drm_shutdown(struct platform_device *pdev) +{ + struct mtk_drm_private *private = platform_get_drvdata(pdev); + + drm_atomic_helper_shutdown(private->drm); +} + static int mtk_drm_sys_prepare(struct device *dev) { struct mtk_drm_private *private = dev_get_drvdata(dev); @@ -983,6 +990,7 @@ static const struct dev_pm_ops mtk_drm_pm_ops = { static struct platform_driver mtk_drm_platform_driver = { .probe = mtk_drm_probe, .remove_new = mtk_drm_remove, + .shutdown = mtk_drm_shutdown, .driver = { .name = "mediatek-drm", .pm = &mtk_drm_pm_ops, From 0941772342d59e48733131ac3a202fa1a4d832e9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 11 Jun 2024 18:58:16 +0200 Subject: [PATCH 0847/1578] wifi: cfg80211: wext: set ssids=NULL for passive scans In nl80211, we always set the ssids of a scan request to NULL when n_ssids==0 (passive scan). Drivers have relied on this behaviour in the past, so we fixed it in 6 GHz scan requests as well, and added a warning so we'd have assurance the API would always be called that way. syzbot found that wext doesn't ensure that, so we reach the check and trigger the warning. Fix the wext code to set the ssids pointer to NULL when there are none. Reported-by: syzbot+cd6135193ba6bb9ad158@syzkaller.appspotmail.com Fixes: f7a8b10bfd61 ("wifi: cfg80211: fix 6 GHz scan request building") Signed-off-by: Johannes Berg --- net/wireless/scan.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 2f2a3163968a7c..d7485e26f4fc2b 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -3493,8 +3493,10 @@ int cfg80211_wext_siwscan(struct net_device *dev, memcpy(creq->ssids[0].ssid, wreq->essid, wreq->essid_len); creq->ssids[0].ssid_len = wreq->essid_len; } - if (wreq->scan_type == IW_SCAN_TYPE_PASSIVE) + if (wreq->scan_type == IW_SCAN_TYPE_PASSIVE) { + creq->ssids = NULL; creq->n_ssids = 0; + } } for (i = 0; i < NUM_NL80211_BANDS; i++) From 6ef09cdc5ba0f93826c09d810c141a8d103a80fc Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Fri, 31 May 2024 06:20:10 +0300 Subject: [PATCH 0848/1578] wifi: cfg80211: wext: add extra SIOCSIWSCAN data check In 'cfg80211_wext_siwscan()', add extra check whether number of channels passed via 'ioctl(sock, SIOCSIWSCAN, ...)' doesn't exceed IW_MAX_FREQUENCIES and reject invalid request with -EINVAL otherwise. Reported-by: syzbot+253cd2d2491df77c93ac@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=253cd2d2491df77c93ac Signed-off-by: Dmitry Antipov Link: https://msgid.link/20240531032010.451295-1-dmantipov@yandex.ru Signed-off-by: Johannes Berg --- net/wireless/scan.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index d7485e26f4fc2b..0222ede0feb60b 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -3416,10 +3416,14 @@ int cfg80211_wext_siwscan(struct net_device *dev, wiphy = &rdev->wiphy; /* Determine number of channels, needed to allocate creq */ - if (wreq && wreq->num_channels) + if (wreq && wreq->num_channels) { + /* Passed from userspace so should be checked */ + if (unlikely(wreq->num_channels > IW_MAX_FREQUENCIES)) + return -EINVAL; n_channels = wreq->num_channels; - else + } else { n_channels = ieee80211_get_num_supported_channels(wiphy); + } creq = kzalloc(sizeof(*creq) + sizeof(struct cfg80211_ssid) + n_channels * sizeof(void *), From d792011b6c282bfb787eb2893538e5e336d5e982 Mon Sep 17 00:00:00 2001 From: Shaul Triebitz Date: Wed, 5 Jun 2024 14:05:04 +0300 Subject: [PATCH 0849/1578] wifi: iwlwifi: mvm: unlock mvm mutex Unlock the mvm mutex before returning from a function with the mutex locked. Fixes: a1efeb823084 ("wifi: iwlwifi: mvm: Block EMLSR when a p2p/softAP vif is active") Signed-off-by: Shaul Triebitz Signed-off-by: Miri Korenblit Link: https://msgid.link/20240605140327.96cb956db4af.Ib468cbad38959910977b5581f6111ab0afae9880@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/time-event.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c index 8ee4498f424555..31bc80cdcb7d55 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c @@ -1238,6 +1238,7 @@ void iwl_mvm_stop_roc(struct iwl_mvm *mvm, struct ieee80211_vif *vif) if (te_data->id >= SESSION_PROTECT_CONF_MAX_ID) { IWL_DEBUG_TE(mvm, "No remain on channel event\n"); + mutex_unlock(&mvm->mutex); return; } @@ -1253,6 +1254,7 @@ void iwl_mvm_stop_roc(struct iwl_mvm *mvm, struct ieee80211_vif *vif) te_data = iwl_mvm_get_roc_te(mvm); if (!te_data) { IWL_WARN(mvm, "No remain on channel event\n"); + mutex_unlock(&mvm->mutex); return; } From 4c2bed6042fb6aca1d1d4f291f85461b1d5ac08c Mon Sep 17 00:00:00 2001 From: Shaul Triebitz Date: Wed, 5 Jun 2024 14:05:05 +0300 Subject: [PATCH 0850/1578] wifi: iwlwifi: mvm: fix ROC version check For using the ROC command, check that the ROC version is *greater or equal* to 3, rather than *equal* to 3. The ROC version was added to the TLV starting from version 3. Fixes: 67ac248e4db0 ("wifi: iwlwifi: mvm: implement ROC version 3") Signed-off-by: Shaul Triebitz Signed-off-by: Miri Korenblit Link: https://msgid.link/20240605140327.93d86cd188ad.Iceadef5a2f3cfa4a127e94a0405eba8342ec89c1@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index de9f0b44654562..18ce060df9b5b3 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -4795,7 +4795,7 @@ static int iwl_mvm_roc_station(struct iwl_mvm *mvm, if (fw_ver == IWL_FW_CMD_VER_UNKNOWN) { ret = iwl_mvm_send_aux_roc_cmd(mvm, channel, vif, duration); - } else if (fw_ver == 3) { + } else if (fw_ver >= 3) { ret = iwl_mvm_roc_add_cmd(mvm, channel, vif, duration, ROC_ACTIVITY_HOTSPOT); } else { From fcc356020a0171106c9ba524ba05a6792668451e Mon Sep 17 00:00:00 2001 From: Ayala Beker Date: Wed, 5 Jun 2024 14:07:38 +0300 Subject: [PATCH 0851/1578] wifi: iwlwifi: scan: correctly check if PSC listen period is needed The flags variable is incorrectly checked while it is still cleared and has not been assigned any value yet. Fix it. Fixes: a615323f7f90 ("wifi: iwlwifi: mvm: always apply 6 GHz probe limitations") Signed-off-by: Ayala Beker Reviewed-by: Benjamin Berg Signed-off-by: Miri Korenblit Link: https://msgid.link/20240605140556.291c33f9a283.Id651fe69828aebce177b49b2316c5780906f1b37@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/scan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c index b5f664ae5a17d0..e975f5ff17b5db 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c @@ -1830,7 +1830,7 @@ iwl_mvm_umac_scan_cfg_channels_v7_6g(struct iwl_mvm *mvm, */ if (!iwl_mvm_is_scan_fragmented(params->type)) { if (!cfg80211_channel_is_psc(params->channels[i]) || - flags & IWL_UHB_CHAN_CFG_FLAG_PSC_CHAN_NO_LISTEN) { + psc_no_listen) { if (unsolicited_probe_on_chan) { max_s_ssids = 2; max_bssids = 6; From 7d09e17c0415fe6d946044c7e70bce31cda952ec Mon Sep 17 00:00:00 2001 From: Remi Pommarel Date: Sat, 18 May 2024 18:07:33 +0200 Subject: [PATCH 0852/1578] wifi: mac80211: Recalc offload when monitor stop When a monitor interface is started, ieee80211_recalc_offload() is called and 802.11 encapsulation offloading support get disabled so monitor interface could get native wifi frames directly. But when this interface is stopped there is no need to keep the 802.11 encpasulation offloading off. This call ieee80211_recalc_offload() when monitor interface is stopped so 802.11 encapsulation offloading gets re-activated if possible. Fixes: 6aea26ce5a4c ("mac80211: rework tx encapsulation offload API") Signed-off-by: Remi Pommarel Link: https://msgid.link/840baab454f83718e6e16fd836ac597d924e85b9.1716048326.git.repk@triplefau.lt Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index dc42902e269359..0c54554bf761bf 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -686,6 +686,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do ieee80211_del_virtual_monitor(local); ieee80211_recalc_idle(local); + ieee80211_recalc_offload(local); if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE)) break; From 350cbb5d2f676bff22c49e5e81764c3b8da342a9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 11 Jun 2024 16:53:06 +0200 Subject: [PATCH 0853/1578] cpufreq: intel_pstate: Check turbo_is_disabled() in store_no_turbo() After recent changes in intel_pstate, global.turbo_disabled is only set at the initialization time and never changed. However, it turns out that on some systems the "turbo disabled" bit in MSR_IA32_MISC_ENABLE, the initial state of which is reflected by global.turbo_disabled, can be flipped later and there should be a way to take that into account (other than checking that MSR every time the driver runs which is costly and useless overhead on the vast majority of systems). For this purpose, notice that before the changes in question, store_no_turbo() contained a turbo_is_disabled() check that was used for updating global.turbo_disabled if the "turbo disabled" bit in MSR_IA32_MISC_ENABLE had been flipped and that functionality can be restored. Then, users will be able to reset global.turbo_disabled by writing 0 to no_turbo which used to work before on systems with flipping "turbo disabled" bit. This guarantees the driver state to remain in sync, but READ_ONCE() annotations need to be added in two places where global.turbo_disabled is accessed locklessly, so modify the driver to make that happen. Fixes: 0940f1a8011f ("cpufreq: intel_pstate: Do not update global.turbo_disabled after initialization") Closes: https://lore.kernel.org/linux-pm/bf3ebf1571a4788e97daf861eb493c12d42639a3.camel@xry111.site Suggested-by: Srinivas Pandruvada Reported-by: Xi Ruoyao Tested-by: Xi Ruoyao Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 65d3f79104bd50..15de5e3d96fd4d 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1302,12 +1302,17 @@ static ssize_t store_no_turbo(struct kobject *a, struct kobj_attribute *b, no_turbo = !!clamp_t(int, input, 0, 1); - if (no_turbo == global.no_turbo) - goto unlock_driver; - - if (global.turbo_disabled) { - pr_notice_once("Turbo disabled by BIOS or unavailable on processor\n"); + WRITE_ONCE(global.turbo_disabled, turbo_is_disabled()); + if (global.turbo_disabled && !no_turbo) { + pr_notice("Turbo disabled by BIOS or unavailable on processor\n"); count = -EPERM; + if (global.no_turbo) + goto unlock_driver; + else + no_turbo = 1; + } + + if (no_turbo == global.no_turbo) { goto unlock_driver; } @@ -1762,7 +1767,7 @@ static u64 atom_get_val(struct cpudata *cpudata, int pstate) u32 vid; val = (u64)pstate << 8; - if (READ_ONCE(global.no_turbo) && !global.turbo_disabled) + if (READ_ONCE(global.no_turbo) && !READ_ONCE(global.turbo_disabled)) val |= (u64)1 << 32; vid_fp = cpudata->vid.min + mul_fp( @@ -1927,7 +1932,7 @@ static u64 core_get_val(struct cpudata *cpudata, int pstate) u64 val; val = (u64)pstate << 8; - if (READ_ONCE(global.no_turbo) && !global.turbo_disabled) + if (READ_ONCE(global.no_turbo) && !READ_ONCE(global.turbo_disabled)) val |= (u64)1 << 32; return val; From 4cac29b846f38d5f0654cdfff5c5bfc37305081c Mon Sep 17 00:00:00 2001 From: Kalle Niemi Date: Wed, 12 Jun 2024 14:42:34 +0300 Subject: [PATCH 0854/1578] regulator: bd71815: fix ramp values Ramp values are inverted. This caused wrong values written to register when ramp values were defined in device tree. Invert values in table to fix this. Signed-off-by: Kalle Niemi Fixes: 1aad39001e85 ("regulator: Support ROHM BD71815 regulators") Reviewed-by: Matti Vaittinen Link: https://lore.kernel.org/r/ZmmJXtuVJU6RgQAH@latitude5580 Signed-off-by: Mark Brown --- drivers/regulator/bd71815-regulator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/regulator/bd71815-regulator.c b/drivers/regulator/bd71815-regulator.c index 26192d55a6858e..79fbb45297f6bc 100644 --- a/drivers/regulator/bd71815-regulator.c +++ b/drivers/regulator/bd71815-regulator.c @@ -256,7 +256,7 @@ static int buck12_set_hw_dvs_levels(struct device_node *np, * 10: 2.50mV/usec 10mV 4uS * 11: 1.25mV/usec 10mV 8uS */ -static const unsigned int bd7181x_ramp_table[] = { 1250, 2500, 5000, 10000 }; +static const unsigned int bd7181x_ramp_table[] = { 10000, 5000, 2500, 1250 }; static int bd7181x_led_set_current_limit(struct regulator_dev *rdev, int min_uA, int max_uA) From d6d5645e5fc1233a7ba950de4a72981c394a2557 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Fri, 31 May 2024 11:19:14 +0200 Subject: [PATCH 0855/1578] i2c: at91: Fix the functionality flags of the slave-only interface When an I2C adapter acts only as a slave, it should not claim to support I2C master capabilities. Fixes: 9d3ca54b550c ("i2c: at91: added slave mode support") Signed-off-by: Jean Delvare Cc: Juergen Fitschen Cc: Ludovic Desroches Cc: Codrin Ciubotariu Cc: Andi Shyti Cc: Nicolas Ferre Cc: Alexandre Belloni Cc: Claudiu Beznea Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-at91-slave.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-at91-slave.c b/drivers/i2c/busses/i2c-at91-slave.c index d6eeea5166c04f..131a67d9d4a689 100644 --- a/drivers/i2c/busses/i2c-at91-slave.c +++ b/drivers/i2c/busses/i2c-at91-slave.c @@ -106,8 +106,7 @@ static int at91_unreg_slave(struct i2c_client *slave) static u32 at91_twi_func(struct i2c_adapter *adapter) { - return I2C_FUNC_SLAVE | I2C_FUNC_I2C | I2C_FUNC_SMBUS_EMUL - | I2C_FUNC_SMBUS_READ_BLOCK_DATA; + return I2C_FUNC_SLAVE; } static const struct i2c_algorithm at91_twi_algorithm_slave = { From cbf3fb5b29e99e3689d63a88c3cddbffa1b8de99 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Fri, 31 May 2024 11:17:48 +0200 Subject: [PATCH 0856/1578] i2c: designware: Fix the functionality flags of the slave-only interface When an I2C adapter acts only as a slave, it should not claim to support I2C master capabilities. Fixes: 5b6d721b266a ("i2c: designware: enable SLAVE in platform module") Signed-off-by: Jean Delvare Cc: Luis Oliveira Cc: Jarkko Nikula Cc: Andy Shevchenko Cc: Mika Westerberg Cc: Jan Dabros Cc: Andi Shyti Reviewed-by: Andy Shevchenko Acked-by: Jarkko Nikula Tested-by: Jarkko Nikula Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-designware-slave.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-designware-slave.c b/drivers/i2c/busses/i2c-designware-slave.c index 2e079cf20bb5b4..78e2c47e3d7da7 100644 --- a/drivers/i2c/busses/i2c-designware-slave.c +++ b/drivers/i2c/busses/i2c-designware-slave.c @@ -220,7 +220,7 @@ static const struct i2c_algorithm i2c_dw_algo = { void i2c_dw_configure_slave(struct dw_i2c_dev *dev) { - dev->functionality = I2C_FUNC_SLAVE | DW_IC_DEFAULT_FUNCTIONALITY; + dev->functionality = I2C_FUNC_SLAVE; dev->slave_cfg = DW_IC_CON_RX_FIFO_FULL_HLD_CTRL | DW_IC_CON_RESTART_EN | DW_IC_CON_STOP_DET_IFADDRESSED; From 0476d09c36a845757dd9fd1c80fbbf45b0faeb3c Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 11 Jun 2024 21:22:40 +0800 Subject: [PATCH 0857/1578] md: rearrange recovery_flags Currently there are lots of flags with the same confusing prefix "MD_REOCVERY_", and there are two main types of flags, sync thread runnng status, I prefer prefix "SYNC_THREAD_", and sync thread action, I perfer prefix "SYNC_ACTION_". For now, rearrange and update comment to improve code readability, there are no functional changes. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240611132251.1967786-2-yukuai1@huaweicloud.com --- drivers/md/md.h | 52 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/drivers/md/md.h b/drivers/md/md.h index 487582058f7417..1ee129c6f98ff5 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -551,22 +551,46 @@ struct mddev { }; enum recovery_flags { + /* flags for sync thread running status */ + + /* + * set when one of sync action is set and new sync thread need to be + * registered, or just add/remove spares from conf. + */ + MD_RECOVERY_NEEDED, + /* sync thread is running, or about to be started */ + MD_RECOVERY_RUNNING, + /* sync thread needs to be aborted for some reason */ + MD_RECOVERY_INTR, + /* sync thread is done and is waiting to be unregistered */ + MD_RECOVERY_DONE, + /* running sync thread must abort immediately, and not restart */ + MD_RECOVERY_FROZEN, + /* waiting for pers->start() to finish */ + MD_RECOVERY_WAIT, + /* interrupted because io-error */ + MD_RECOVERY_ERROR, + + /* flags determines sync action */ + + /* if just this flag is set, action is resync. */ + MD_RECOVERY_SYNC, + /* + * paired with MD_RECOVERY_SYNC, if MD_RECOVERY_CHECK is not set, + * action is repair, means user requested resync. + */ + MD_RECOVERY_REQUESTED, /* - * If neither SYNC or RESHAPE are set, then it is a recovery. + * paired with MD_RECOVERY_SYNC and MD_RECOVERY_REQUESTED, action is + * check. */ - MD_RECOVERY_RUNNING, /* a thread is running, or about to be started */ - MD_RECOVERY_SYNC, /* actually doing a resync, not a recovery */ - MD_RECOVERY_RECOVER, /* doing recovery, or need to try it. */ - MD_RECOVERY_INTR, /* resync needs to be aborted for some reason */ - MD_RECOVERY_DONE, /* thread is done and is waiting to be reaped */ - MD_RECOVERY_NEEDED, /* we might need to start a resync/recover */ - MD_RECOVERY_REQUESTED, /* user-space has requested a sync (used with SYNC) */ - MD_RECOVERY_CHECK, /* user-space request for check-only, no repair */ - MD_RECOVERY_RESHAPE, /* A reshape is happening */ - MD_RECOVERY_FROZEN, /* User request to abort, and not restart, any action */ - MD_RECOVERY_ERROR, /* sync-action interrupted because io-error */ - MD_RECOVERY_WAIT, /* waiting for pers->start() to finish */ - MD_RESYNCING_REMOTE, /* remote node is running resync thread */ + MD_RECOVERY_CHECK, + /* recovery, or need to try it */ + MD_RECOVERY_RECOVER, + /* reshape */ + MD_RECOVERY_RESHAPE, + /* remote node is running resync thread */ + MD_RESYNCING_REMOTE, }; enum md_ro_state { From a85aa09da2f2773c685310666ef09f935ff68a45 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 11 Jun 2024 21:22:41 +0800 Subject: [PATCH 0858/1578] md: add a new enum type sync_action In order to make code related to sync_thread cleaner in following patches, also add detail comment about each sync action. And also prepare to remove the related recovery_flags in the fulture. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240611132251.1967786-3-yukuai1@huaweicloud.com --- drivers/md/md.h | 57 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/drivers/md/md.h b/drivers/md/md.h index 1ee129c6f98ff5..e5001d39c82d4b 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -34,6 +34,61 @@ */ #define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT) +/* Status of sync thread. */ +enum sync_action { + /* + * Represent by MD_RECOVERY_SYNC, start when: + * 1) after assemble, sync data from first rdev to other copies, this + * must be done first before other sync actions and will only execute + * once; + * 2) resize the array(notice that this is not reshape), sync data for + * the new range; + */ + ACTION_RESYNC, + /* + * Represent by MD_RECOVERY_RECOVER, start when: + * 1) for new replacement, sync data based on the replace rdev or + * available copies from other rdev; + * 2) for new member disk while the array is degraded, sync data from + * other rdev; + * 3) reassemble after power failure or re-add a hot removed rdev, sync + * data from first rdev to other copies based on bitmap; + */ + ACTION_RECOVER, + /* + * Represent by MD_RECOVERY_SYNC | MD_RECOVERY_REQUESTED | + * MD_RECOVERY_CHECK, start when user echo "check" to sysfs api + * sync_action, used to check if data copies from differenct rdev are + * the same. The number of mismatch sectors will be exported to user + * by sysfs api mismatch_cnt; + */ + ACTION_CHECK, + /* + * Represent by MD_RECOVERY_SYNC | MD_RECOVERY_REQUESTED, start when + * user echo "repair" to sysfs api sync_action, usually paired with + * ACTION_CHECK, used to force syncing data once user found that there + * are inconsistent data, + */ + ACTION_REPAIR, + /* + * Represent by MD_RECOVERY_RESHAPE, start when new member disk is added + * to the conf, notice that this is different from spares or + * replacement; + */ + ACTION_RESHAPE, + /* + * Represent by MD_RECOVERY_FROZEN, can be set by sysfs api sync_action + * or internal usage like setting the array read-only, will forbid above + * actions. + */ + ACTION_FROZEN, + /* + * All above actions don't match. + */ + ACTION_IDLE, + NR_SYNC_ACTIONS, +}; + /* * The struct embedded in rdev is used to serialize IO. */ @@ -571,7 +626,7 @@ enum recovery_flags { /* interrupted because io-error */ MD_RECOVERY_ERROR, - /* flags determines sync action */ + /* flags determines sync action, see details in enum sync_action */ /* if just this flag is set, action is resync. */ MD_RECOVERY_SYNC, From e792a4c2156a392d8126bf0496f74407a21a8824 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 11 Jun 2024 21:22:42 +0800 Subject: [PATCH 0859/1578] md: add new helpers for sync_action The new helpers will get current sync_action of the array, will be used in later patches to make code cleaner. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240611132251.1967786-4-yukuai1@huaweicloud.com --- drivers/md/md.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++ drivers/md/md.h | 3 ++ 2 files changed, 82 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index b9b15aa79496fb..4ce8d164cde97d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -69,6 +69,16 @@ #include "md-bitmap.h" #include "md-cluster.h" +static const char *action_name[NR_SYNC_ACTIONS] = { + [ACTION_RESYNC] = "resync", + [ACTION_RECOVER] = "recover", + [ACTION_CHECK] = "check", + [ACTION_REPAIR] = "repair", + [ACTION_RESHAPE] = "reshape", + [ACTION_FROZEN] = "frozen", + [ACTION_IDLE] = "idle", +}; + /* pers_list is a list of registered personalities protected by pers_lock. */ static LIST_HEAD(pers_list); static DEFINE_SPINLOCK(pers_lock); @@ -4868,6 +4878,75 @@ metadata_store(struct mddev *mddev, const char *buf, size_t len) static struct md_sysfs_entry md_metadata = __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); +enum sync_action md_sync_action(struct mddev *mddev) +{ + unsigned long recovery = mddev->recovery; + + /* + * frozen has the highest priority, means running sync_thread will be + * stopped immediately, and no new sync_thread can start. + */ + if (test_bit(MD_RECOVERY_FROZEN, &recovery)) + return ACTION_FROZEN; + + /* + * read-only array can't register sync_thread, and it can only + * add/remove spares. + */ + if (!md_is_rdwr(mddev)) + return ACTION_IDLE; + + /* + * idle means no sync_thread is running, and no new sync_thread is + * requested. + */ + if (!test_bit(MD_RECOVERY_RUNNING, &recovery) && + !test_bit(MD_RECOVERY_NEEDED, &recovery)) + return ACTION_IDLE; + + if (test_bit(MD_RECOVERY_RESHAPE, &recovery) || + mddev->reshape_position != MaxSector) + return ACTION_RESHAPE; + + if (test_bit(MD_RECOVERY_RECOVER, &recovery)) + return ACTION_RECOVER; + + if (test_bit(MD_RECOVERY_SYNC, &recovery)) { + /* + * MD_RECOVERY_CHECK must be paired with + * MD_RECOVERY_REQUESTED. + */ + if (test_bit(MD_RECOVERY_CHECK, &recovery)) + return ACTION_CHECK; + if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) + return ACTION_REPAIR; + return ACTION_RESYNC; + } + + /* + * MD_RECOVERY_NEEDED or MD_RECOVERY_RUNNING is set, however, no + * sync_action is specified. + */ + return ACTION_IDLE; +} + +enum sync_action md_sync_action_by_name(const char *page) +{ + enum sync_action action; + + for (action = 0; action < NR_SYNC_ACTIONS; ++action) { + if (cmd_match(page, action_name[action])) + return action; + } + + return NR_SYNC_ACTIONS; +} + +const char *md_sync_action_name(enum sync_action action) +{ + return action_name[action]; +} + static ssize_t action_show(struct mddev *mddev, char *page) { diff --git a/drivers/md/md.h b/drivers/md/md.h index e5001d39c82d4b..88add162b08ea0 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -864,6 +864,9 @@ extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **t extern void md_wakeup_thread(struct md_thread __rcu *thread); extern void md_check_recovery(struct mddev *mddev); extern void md_reap_sync_thread(struct mddev *mddev); +extern enum sync_action md_sync_action(struct mddev *mddev); +extern enum sync_action md_sync_action_by_name(const char *page); +extern const char *md_sync_action_name(enum sync_action action); extern void md_write_start(struct mddev *mddev, struct bio *bi); extern void md_write_inc(struct mddev *mddev, struct bio *bi); extern void md_write_end(struct mddev *mddev); From 207c5656c33d56b3759d0876b68fa56cb56e5c51 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 11 Jun 2024 21:22:43 +0800 Subject: [PATCH 0860/1578] md: factor out helper to start reshape from action_store() There are no functional changes, just to make code cleaner and prepare for following refactor. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240611132251.1967786-5-yukuai1@huaweicloud.com --- drivers/md/md.c | 65 +++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 4ce8d164cde97d..b34ae9fbd24671 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5071,6 +5071,45 @@ static void frozen_sync_thread(struct mddev *mddev) mutex_unlock(&mddev->sync_mutex); } +static int mddev_start_reshape(struct mddev *mddev) +{ + int ret; + + if (mddev->pers->start_reshape == NULL) + return -EINVAL; + + ret = mddev_lock(mddev); + if (ret) + return ret; + + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { + mddev_unlock(mddev); + return -EBUSY; + } + + if (mddev->reshape_position == MaxSector || + mddev->pers->check_reshape == NULL || + mddev->pers->check_reshape(mddev)) { + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + ret = mddev->pers->start_reshape(mddev); + if (ret) { + mddev_unlock(mddev); + return ret; + } + } else { + /* + * If reshape is still in progress, and md_check_recovery() can + * continue to reshape, don't restart reshape because data can + * be corrupted for raid456. + */ + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + } + + mddev_unlock(mddev); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); + return 0; +} + static ssize_t action_store(struct mddev *mddev, const char *page, size_t len) { @@ -5090,32 +5129,10 @@ action_store(struct mddev *mddev, const char *page, size_t len) clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); } else if (cmd_match(page, "reshape")) { - int err; - if (mddev->pers->start_reshape == NULL) - return -EINVAL; - err = mddev_lock(mddev); - if (!err) { - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { - err = -EBUSY; - } else if (mddev->reshape_position == MaxSector || - mddev->pers->check_reshape == NULL || - mddev->pers->check_reshape(mddev)) { - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - err = mddev->pers->start_reshape(mddev); - } else { - /* - * If reshape is still in progress, and - * md_check_recovery() can continue to reshape, - * don't restart reshape because data can be - * corrupted for raid456. - */ - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - } - mddev_unlock(mddev); - } + int err = mddev_start_reshape(mddev); + if (err) return err; - sysfs_notify_dirent_safe(mddev->sysfs_degraded); } else { if (cmd_match(page, "check")) set_bit(MD_RECOVERY_CHECK, &mddev->recovery); From c8ecfe680c371db2a6d125de5d6bc2398950e9cf Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 11 Jun 2024 21:22:44 +0800 Subject: [PATCH 0861/1578] md: replace sysfs api sync_action with new helpers To get rid of extrem long if else if usage, and make code cleaner. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240611132251.1967786-6-yukuai1@huaweicloud.com --- drivers/md/md.c | 94 +++++++++++++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 42 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index b34ae9fbd24671..d035cd52e49a2d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4950,27 +4950,9 @@ const char *md_sync_action_name(enum sync_action action) static ssize_t action_show(struct mddev *mddev, char *page) { - char *type = "idle"; - unsigned long recovery = mddev->recovery; - if (test_bit(MD_RECOVERY_FROZEN, &recovery)) - type = "frozen"; - else if (test_bit(MD_RECOVERY_RUNNING, &recovery) || - (md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery))) { - if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) - type = "reshape"; - else if (test_bit(MD_RECOVERY_SYNC, &recovery)) { - if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) - type = "resync"; - else if (test_bit(MD_RECOVERY_CHECK, &recovery)) - type = "check"; - else - type = "repair"; - } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) - type = "recover"; - else if (mddev->reshape_position != MaxSector) - type = "reshape"; - } - return sprintf(page, "%s\n", type); + enum sync_action action = md_sync_action(mddev); + + return sprintf(page, "%s\n", md_sync_action_name(action)); } /** @@ -5113,35 +5095,63 @@ static int mddev_start_reshape(struct mddev *mddev) static ssize_t action_store(struct mddev *mddev, const char *page, size_t len) { + int ret; + enum sync_action action; + if (!mddev->pers || !mddev->pers->sync_request) return -EINVAL; + action = md_sync_action_by_name(page); - if (cmd_match(page, "idle")) - idle_sync_thread(mddev); - else if (cmd_match(page, "frozen")) - frozen_sync_thread(mddev); - else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - return -EBUSY; - else if (cmd_match(page, "resync")) - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - else if (cmd_match(page, "recover")) { - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - } else if (cmd_match(page, "reshape")) { - int err = mddev_start_reshape(mddev); - - if (err) - return err; + /* TODO: mdadm rely on "idle" to start sync_thread. */ + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { + switch (action) { + case ACTION_FROZEN: + frozen_sync_thread(mddev); + return len; + case ACTION_IDLE: + idle_sync_thread(mddev); + break; + case ACTION_RESHAPE: + case ACTION_RECOVER: + case ACTION_CHECK: + case ACTION_REPAIR: + case ACTION_RESYNC: + return -EBUSY; + default: + return -EINVAL; + } } else { - if (cmd_match(page, "check")) + switch (action) { + case ACTION_FROZEN: + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + return len; + case ACTION_RESHAPE: + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + ret = mddev_start_reshape(mddev); + if (ret) + return ret; + break; + case ACTION_RECOVER: + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + break; + case ACTION_CHECK: set_bit(MD_RECOVERY_CHECK, &mddev->recovery); - else if (!cmd_match(page, "repair")) + fallthrough; + case ACTION_REPAIR: + set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + fallthrough; + case ACTION_RESYNC: + case ACTION_IDLE: + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + break; + default: return -EINVAL; - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + } } + if (mddev->ro == MD_AUTO_READ) { /* A write to sync_action is enough to justify * canceling read-auto mode From df79234bdc3f441bec99dfc8199b6f2c673203ed Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 11 Jun 2024 21:22:45 +0800 Subject: [PATCH 0862/1578] md: remove parameter check_seq for stop_sync_thread() Caller will always set MD_RECOVERY_FROZEN if check_seq is true, and always clear MD_RECOVERY_FROZEN if check_seq is false, hence replace the parameter with test_bit() to make code cleaner. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240611132251.1967786-7-yukuai1@huaweicloud.com --- drivers/md/md.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index d035cd52e49a2d..44cb18ec1c52d9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4961,15 +4961,10 @@ action_show(struct mddev *mddev, char *page) * @locked: if set, reconfig_mutex will still be held after this function * return; if not set, reconfig_mutex will be released after this * function return. - * @check_seq: if set, only wait for curent running sync_thread to stop, noted - * that new sync_thread can still start. */ -static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq) +static void stop_sync_thread(struct mddev *mddev, bool locked) { - int sync_seq; - - if (check_seq) - sync_seq = atomic_read(&mddev->sync_seq); + int sync_seq = atomic_read(&mddev->sync_seq); if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { if (!locked) @@ -4990,7 +4985,8 @@ static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq) wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || - (check_seq && sync_seq != atomic_read(&mddev->sync_seq))); + (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery) && + sync_seq != atomic_read(&mddev->sync_seq))); if (locked) mddev_lock_nointr(mddev); @@ -5001,7 +4997,7 @@ void md_idle_sync_thread(struct mddev *mddev) lockdep_assert_held(&mddev->reconfig_mutex); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - stop_sync_thread(mddev, true, true); + stop_sync_thread(mddev, true); } EXPORT_SYMBOL_GPL(md_idle_sync_thread); @@ -5010,7 +5006,7 @@ void md_frozen_sync_thread(struct mddev *mddev) lockdep_assert_held(&mddev->reconfig_mutex); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - stop_sync_thread(mddev, true, false); + stop_sync_thread(mddev, true); } EXPORT_SYMBOL_GPL(md_frozen_sync_thread); @@ -5035,7 +5031,7 @@ static void idle_sync_thread(struct mddev *mddev) return; } - stop_sync_thread(mddev, false, true); + stop_sync_thread(mddev, false); mutex_unlock(&mddev->sync_mutex); } @@ -5049,7 +5045,7 @@ static void frozen_sync_thread(struct mddev *mddev) return; } - stop_sync_thread(mddev, false, false); + stop_sync_thread(mddev, false); mutex_unlock(&mddev->sync_mutex); } @@ -6544,7 +6540,7 @@ void md_stop_writes(struct mddev *mddev) { mddev_lock_nointr(mddev); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - stop_sync_thread(mddev, true, false); + stop_sync_thread(mddev, true); __md_stop_writes(mddev); mddev_unlock(mddev); } @@ -6612,7 +6608,7 @@ static int md_set_readonly(struct mddev *mddev) set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } - stop_sync_thread(mddev, false, false); + stop_sync_thread(mddev, false); wait_event(mddev->sb_wait, !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); mddev_lock_nointr(mddev); @@ -6658,7 +6654,7 @@ static int do_md_stop(struct mddev *mddev, int mode) set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } - stop_sync_thread(mddev, true, false); + stop_sync_thread(mddev, true); if (mddev->sysfs_active || test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { From 5ce10a38590c77f20d0dc706944f79e7d56a7400 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 11 Jun 2024 21:22:46 +0800 Subject: [PATCH 0863/1578] md: don't fail action_store() if sync_thread is not registered MD_RECOVERY_RUNNING will always be set when trying to register a new sync_thread, however, if md_start_sync() turns out to do nothing, MD_RECOVERY_RUNNING will be cleared in this case. And during the race window, action_store() will return -EBUSY, which will cause some mdadm tests to fail. For example: The test 07reshape5intr will add a new disk to array, then start reshape: mdadm /dev/md0 --add /dev/xxx mdadm --grow /dev/md0 -n 3 And add_bound_rdev() from mdadm --add will set MD_RECOVERY_NEEDED, then during the race windown, mdadm --grow will fail. Fix the problem by waiting in action_store() during the race window, fail only if sync_thread is registered. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240611132251.1967786-8-yukuai1@huaweicloud.com --- drivers/md/md.c | 85 +++++++++++++++++++------------------------------ drivers/md/md.h | 2 -- 2 files changed, 33 insertions(+), 54 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 44cb18ec1c52d9..86abd0fe0681ac 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -753,7 +753,6 @@ int mddev_init(struct mddev *mddev) mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); - mutex_init(&mddev->sync_mutex); mutex_init(&mddev->suspend_mutex); mutex_init(&mddev->bitmap_info.mutex); INIT_LIST_HEAD(&mddev->disks); @@ -5021,34 +5020,6 @@ void md_unfrozen_sync_thread(struct mddev *mddev) } EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread); -static void idle_sync_thread(struct mddev *mddev) -{ - mutex_lock(&mddev->sync_mutex); - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - - if (mddev_lock(mddev)) { - mutex_unlock(&mddev->sync_mutex); - return; - } - - stop_sync_thread(mddev, false); - mutex_unlock(&mddev->sync_mutex); -} - -static void frozen_sync_thread(struct mddev *mddev) -{ - mutex_lock(&mddev->sync_mutex); - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - - if (mddev_lock(mddev)) { - mutex_unlock(&mddev->sync_mutex); - return; - } - - stop_sync_thread(mddev, false); - mutex_unlock(&mddev->sync_mutex); -} - static int mddev_start_reshape(struct mddev *mddev) { int ret; @@ -5056,24 +5027,13 @@ static int mddev_start_reshape(struct mddev *mddev) if (mddev->pers->start_reshape == NULL) return -EINVAL; - ret = mddev_lock(mddev); - if (ret) - return ret; - - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { - mddev_unlock(mddev); - return -EBUSY; - } - if (mddev->reshape_position == MaxSector || mddev->pers->check_reshape == NULL || mddev->pers->check_reshape(mddev)) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); ret = mddev->pers->start_reshape(mddev); - if (ret) { - mddev_unlock(mddev); + if (ret) return ret; - } } else { /* * If reshape is still in progress, and md_check_recovery() can @@ -5083,7 +5043,6 @@ static int mddev_start_reshape(struct mddev *mddev) clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } - mddev_unlock(mddev); sysfs_notify_dirent_safe(mddev->sysfs_degraded); return 0; } @@ -5097,36 +5056,53 @@ action_store(struct mddev *mddev, const char *page, size_t len) if (!mddev->pers || !mddev->pers->sync_request) return -EINVAL; +retry: + if (work_busy(&mddev->sync_work)) + flush_work(&mddev->sync_work); + + ret = mddev_lock(mddev); + if (ret) + return ret; + + if (work_busy(&mddev->sync_work)) { + mddev_unlock(mddev); + goto retry; + } + action = md_sync_action_by_name(page); /* TODO: mdadm rely on "idle" to start sync_thread. */ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { switch (action) { case ACTION_FROZEN: - frozen_sync_thread(mddev); - return len; + md_frozen_sync_thread(mddev); + ret = len; + goto out; case ACTION_IDLE: - idle_sync_thread(mddev); + md_idle_sync_thread(mddev); break; case ACTION_RESHAPE: case ACTION_RECOVER: case ACTION_CHECK: case ACTION_REPAIR: case ACTION_RESYNC: - return -EBUSY; + ret = -EBUSY; + goto out; default: - return -EINVAL; + ret = -EINVAL; + goto out; } } else { switch (action) { case ACTION_FROZEN: set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - return len; + ret = len; + goto out; case ACTION_RESHAPE: clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); ret = mddev_start_reshape(mddev); if (ret) - return ret; + goto out; break; case ACTION_RECOVER: clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); @@ -5144,7 +5120,8 @@ action_store(struct mddev *mddev, const char *page, size_t len) clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); break; default: - return -EINVAL; + ret = -EINVAL; + goto out; } } @@ -5152,14 +5129,18 @@ action_store(struct mddev *mddev, const char *page, size_t len) /* A write to sync_action is enough to justify * canceling read-auto mode */ - flush_work(&mddev->sync_work); mddev->ro = MD_RDWR; md_wakeup_thread(mddev->sync_thread); } + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); sysfs_notify_dirent_safe(mddev->sysfs_action); - return len; + ret = len; + +out: + mddev_unlock(mddev); + return ret; } static struct md_sysfs_entry md_scan_mode = diff --git a/drivers/md/md.h b/drivers/md/md.h index 88add162b08ea0..732053b905b229 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -595,8 +595,6 @@ struct mddev { */ struct list_head deleting; - /* Used to synchronize idle and frozen for action_store() */ - struct mutex sync_mutex; /* The sequence number for sync thread */ atomic_t sync_seq; From cea2a26553ace13ee36b56dc09ad548b5e6907df Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 11 Jun 2024 17:58:57 +0300 Subject: [PATCH 0864/1578] mailmap: Add my outdated addresses to the map file There is a couple of outdated addresses that are still visible in the Git history, add them to .mailmap. While at it, replace one in the comment. Signed-off-by: Andy Shevchenko Signed-off-by: Linus Torvalds --- .mailmap | 2 ++ drivers/media/pci/saa7134/saa7134-cards.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index efd9fa867a8ecc..b6e9fe0916cec3 100644 --- a/.mailmap +++ b/.mailmap @@ -72,6 +72,8 @@ Andrey Ryabinin Andrzej Hajda André Almeida Andy Adamson +Andy Shevchenko +Andy Shevchenko Anilkumar Kolli Anirudh Ghayal Antoine Tenart diff --git a/drivers/media/pci/saa7134/saa7134-cards.c b/drivers/media/pci/saa7134/saa7134-cards.c index 1280696f65f26b..e80fb4ebfda61b 100644 --- a/drivers/media/pci/saa7134/saa7134-cards.c +++ b/drivers/media/pci/saa7134/saa7134-cards.c @@ -5152,7 +5152,7 @@ struct saa7134_board saa7134_boards[] = { }, }, [SAA7134_BOARD_AVERMEDIA_STUDIO_507UA] = { - /* Andy Shevchenko */ + /* Andy Shevchenko */ .name = "Avermedia AVerTV Studio 507UA", .audio_clock = 0x00187de7, .tuner_type = TUNER_PHILIPS_FM1216ME_MK3, /* Should be MK5 */ From 7d9f107a4e946bb52b7502eed9ed8f316700397e Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 11 Jun 2024 21:22:47 +0800 Subject: [PATCH 0865/1578] md: use new helpers in md_do_sync() Make code cleaner. and also use the action_name directly in kernel log: - "check" instead of "data-check" - "repair" instead of "requested-resync" Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240611132251.1967786-9-yukuai1@huaweicloud.com --- drivers/md/md.c | 21 +++++---------------- drivers/md/md.h | 2 +- 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 86abd0fe0681ac..5fa7b5f4bc6de5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8929,7 +8929,8 @@ void md_do_sync(struct md_thread *thread) sector_t last_check; int skipped = 0; struct md_rdev *rdev; - char *desc, *action = NULL; + enum sync_action action; + const char *desc; struct blk_plug plug; int ret; @@ -8960,21 +8961,9 @@ void md_do_sync(struct md_thread *thread) goto skip; } - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { - desc = "data-check"; - action = "check"; - } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { - desc = "requested-resync"; - action = "repair"; - } else - desc = "resync"; - } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - desc = "reshape"; - else - desc = "recovery"; - - mddev->last_sync_action = action ?: desc; + action = md_sync_action(mddev); + desc = md_sync_action_name(action); + mddev->last_sync_action = desc; /* * Before starting a resync we must have set curr_resync to diff --git a/drivers/md/md.h b/drivers/md/md.h index 732053b905b229..ee06cb076f8c7d 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -432,7 +432,7 @@ struct mddev { * when the sync thread is "frozen" (interrupted) or "idle" (stopped * or finished). It is overwritten when a new sync operation is begun. */ - char *last_sync_action; + const char *last_sync_action; sector_t curr_resync; /* last block scheduled */ /* As resync requests can complete out of order, we cannot easily track * how much resync has been completed. So we occasionally pause until From d249e541887a966df37544f7c4d301cdee0f0e27 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 11 Jun 2024 21:22:48 +0800 Subject: [PATCH 0866/1578] md: replace last_sync_action with new enum type The only difference is that "none" is removed and initial last_sync_action will be idle. On the one hand, this value is introduced by commit c4a395514516 ("MD: Remember the last sync operation that was performed"), and the usage described in commit message is not affected. On the other hand, last_sync_action is not used in mdadm or mdmon, and none of the tests that I can find. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240611132251.1967786-10-yukuai1@huaweicloud.com --- drivers/md/dm-raid.c | 2 +- drivers/md/md.c | 7 ++++--- drivers/md/md.h | 9 ++++----- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index abe88d1e673582..052c00c1eb1542 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3542,7 +3542,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, recovery = rs->md.recovery; state = decipher_sync_action(mddev, recovery); progress = rs_get_progress(rs, recovery, state, resync_max_sectors); - resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ? + resync_mismatches = mddev->last_sync_action == ACTION_CHECK ? atomic64_read(&mddev->resync_mismatches) : 0; /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */ diff --git a/drivers/md/md.c b/drivers/md/md.c index 5fa7b5f4bc6de5..ab492e88586729 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -768,7 +768,7 @@ int mddev_init(struct mddev *mddev) init_waitqueue_head(&mddev->recovery_wait); mddev->reshape_position = MaxSector; mddev->reshape_backwards = 0; - mddev->last_sync_action = "none"; + mddev->last_sync_action = ACTION_IDLE; mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->level = LEVEL_NONE; @@ -5149,7 +5149,8 @@ __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); static ssize_t last_sync_action_show(struct mddev *mddev, char *page) { - return sprintf(page, "%s\n", mddev->last_sync_action); + return sprintf(page, "%s\n", + md_sync_action_name(mddev->last_sync_action)); } static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); @@ -8963,7 +8964,7 @@ void md_do_sync(struct md_thread *thread) action = md_sync_action(mddev); desc = md_sync_action_name(action); - mddev->last_sync_action = desc; + mddev->last_sync_action = action; /* * Before starting a resync we must have set curr_resync to diff --git a/drivers/md/md.h b/drivers/md/md.h index ee06cb076f8c7d..41781e41d8ffc2 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -426,13 +426,12 @@ struct mddev { struct md_thread __rcu *thread; /* management thread */ struct md_thread __rcu *sync_thread; /* doing resync or reconstruct */ - /* 'last_sync_action' is initialized to "none". It is set when a - * sync operation (i.e "data-check", "requested-resync", "resync", - * "recovery", or "reshape") is started. It holds this value even + /* + * Set when a sync operation is started. It holds this value even * when the sync thread is "frozen" (interrupted) or "idle" (stopped - * or finished). It is overwritten when a new sync operation is begun. + * or finished). It is overwritten when a new sync operation is begun. */ - const char *last_sync_action; + enum sync_action last_sync_action; sector_t curr_resync; /* last block scheduled */ /* As resync requests can complete out of order, we cannot easily track * how much resync has been completed. So we occasionally pause until From bbf2076277b137f03624259da0a0369af88f3a68 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 11 Jun 2024 21:22:49 +0800 Subject: [PATCH 0867/1578] md: factor out helpers for different sync_action in md_do_sync() Make code cleaner by replacing if else if with switch, and it's more obvious now what is doing for each sync_action. There are no functional changes. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240611132251.1967786-11-yukuai1@huaweicloud.com --- drivers/md/md.c | 123 ++++++++++++++++++++++++++++-------------------- 1 file changed, 73 insertions(+), 50 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index ab492e88586729..ec2ef4dd42cf56 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8914,6 +8914,77 @@ void md_allow_write(struct mddev *mddev) } EXPORT_SYMBOL_GPL(md_allow_write); +static sector_t md_sync_max_sectors(struct mddev *mddev, + enum sync_action action) +{ + switch (action) { + case ACTION_RESYNC: + case ACTION_CHECK: + case ACTION_REPAIR: + atomic64_set(&mddev->resync_mismatches, 0); + fallthrough; + case ACTION_RESHAPE: + return mddev->resync_max_sectors; + case ACTION_RECOVER: + return mddev->dev_sectors; + default: + return 0; + } +} + +static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) +{ + sector_t start = 0; + struct md_rdev *rdev; + + switch (action) { + case ACTION_CHECK: + case ACTION_REPAIR: + return mddev->resync_min; + case ACTION_RESYNC: + if (!mddev->bitmap) + return mddev->recovery_cp; + return 0; + case ACTION_RESHAPE: + /* + * If the original node aborts reshaping then we continue the + * reshaping, so set again to avoid restart reshape from the + * first beginning + */ + if (mddev_is_clustered(mddev) && + mddev->reshape_position != MaxSector) + return mddev->reshape_position; + return 0; + case ACTION_RECOVER: + start = MaxSector; + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < start) + start = rdev->recovery_offset; + rcu_read_unlock(); + + /* If there is a bitmap, we need to make sure all + * writes that started before we added a spare + * complete before we start doing a recovery. + * Otherwise the write might complete and (via + * bitmap_endwrite) set a bit in the bitmap after the + * recovery has checked that bit and skipped that + * region. + */ + if (mddev->bitmap) { + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } + return start; + default: + return MaxSector; + } +} + #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) #define UPDATE_FREQUENCY (5*60*HZ) @@ -9032,56 +9103,8 @@ void md_do_sync(struct md_thread *thread) spin_unlock(&all_mddevs_lock); } while (mddev->curr_resync < MD_RESYNC_DELAYED); - j = 0; - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - /* resync follows the size requested by the personality, - * which defaults to physical size, but can be virtual size - */ - max_sectors = mddev->resync_max_sectors; - atomic64_set(&mddev->resync_mismatches, 0); - /* we don't use the checkpoint if there's a bitmap */ - if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) - j = mddev->resync_min; - else if (!mddev->bitmap) - j = mddev->recovery_cp; - - } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { - max_sectors = mddev->resync_max_sectors; - /* - * If the original node aborts reshaping then we continue the - * reshaping, so set j again to avoid restart reshape from the - * first beginning - */ - if (mddev_is_clustered(mddev) && - mddev->reshape_position != MaxSector) - j = mddev->reshape_position; - } else { - /* recovery follows the physical size of devices */ - max_sectors = mddev->dev_sectors; - j = MaxSector; - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (rdev->raid_disk >= 0 && - !test_bit(Journal, &rdev->flags) && - !test_bit(Faulty, &rdev->flags) && - !test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset < j) - j = rdev->recovery_offset; - rcu_read_unlock(); - - /* If there is a bitmap, we need to make sure all - * writes that started before we added a spare - * complete before we start doing a recovery. - * Otherwise the write might complete and (via - * bitmap_endwrite) set a bit in the bitmap after the - * recovery has checked that bit and skipped that - * region. - */ - if (mddev->bitmap) { - mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); - } - } + max_sectors = md_sync_max_sectors(mddev, action); + j = md_sync_position(mddev, action); pr_info("md: %s of RAID array %s\n", desc, mdname(mddev)); pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); From bc49694a9e8fd1b36bca47d9a54ec8da8e39012f Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 11 Jun 2024 21:22:50 +0800 Subject: [PATCH 0868/1578] md: pass in max_sectors for pers->sync_request() For different sync_action, sync_thread will use different max_sectors, see details in md_sync_max_sectors(), currently both md_do_sync() and pers->sync_request() in eatch iteration have to get the same max_sectors. Hence pass in max_sectors for pers->sync_request() to prevent redundant code. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240611132251.1967786-12-yukuai1@huaweicloud.com --- drivers/md/md.c | 5 +++-- drivers/md/md.h | 3 ++- drivers/md/raid1.c | 5 ++--- drivers/md/raid10.c | 8 ++------ drivers/md/raid5.c | 3 +-- 5 files changed, 10 insertions(+), 14 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index ec2ef4dd42cf56..c0426a6d2fd197 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9186,7 +9186,8 @@ void md_do_sync(struct md_thread *thread) if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break; - sectors = mddev->pers->sync_request(mddev, j, &skipped); + sectors = mddev->pers->sync_request(mddev, j, max_sectors, + &skipped); if (sectors == 0) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); break; @@ -9276,7 +9277,7 @@ void md_do_sync(struct md_thread *thread) mddev->curr_resync_completed = mddev->curr_resync; sysfs_notify_dirent_safe(mddev->sysfs_completed); } - mddev->pers->sync_request(mddev, max_sectors, &skipped); + mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped); if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && mddev->curr_resync > MD_RESYNC_ACTIVE) { diff --git a/drivers/md/md.h b/drivers/md/md.h index 41781e41d8ffc2..2dc52edec3fe63 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -729,7 +729,8 @@ struct md_personality int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev); int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev); int (*spare_active) (struct mddev *mddev); - sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped); + sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, + sector_t max_sector, int *skipped); int (*resize) (struct mddev *mddev, sector_t sectors); sector_t (*size) (struct mddev *mddev, sector_t sectors, int raid_disks); int (*check_reshape) (struct mddev *mddev); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 3d54f30112a0e8..2bbfb4e682b2ff 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2756,12 +2756,12 @@ static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf) */ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, - int *skipped) + sector_t max_sector, int *skipped) { struct r1conf *conf = mddev->private; struct r1bio *r1_bio; struct bio *bio; - sector_t max_sector, nr_sectors; + sector_t nr_sectors; int disk = -1; int i; int wonly = -1; @@ -2777,7 +2777,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, if (init_resync(conf)) return 0; - max_sector = mddev->dev_sectors; if (sector_nr >= max_sector) { /* If we aborted, we need to abort the * sync on the 'current' bitmap chunk (there will diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f8d7c02c6ed561..4e804602d1e53a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3139,12 +3139,12 @@ static void raid10_set_cluster_sync_high(struct r10conf *conf) */ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, - int *skipped) + sector_t max_sector, int *skipped) { struct r10conf *conf = mddev->private; struct r10bio *r10_bio; struct bio *biolist = NULL, *bio; - sector_t max_sector, nr_sectors; + sector_t nr_sectors; int i; int max_sync; sector_t sync_blocks; @@ -3174,10 +3174,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, return 0; skipped: - max_sector = mddev->dev_sectors; - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || - test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - max_sector = mddev->resync_max_sectors; if (sector_nr >= max_sector) { conf->cluster_sync_low = 0; conf->cluster_sync_high = 0; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a84389311dd1ea..013adc5ba0e124 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6457,11 +6457,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk } static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr, - int *skipped) + sector_t max_sector, int *skipped) { struct r5conf *conf = mddev->private; struct stripe_head *sh; - sector_t max_sector = mddev->dev_sectors; sector_t sync_blocks; int still_degraded = 0; int i; From 305a5170dc5cf3d395bb4c4e9239bca6d0b54b49 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 11 Jun 2024 21:22:51 +0800 Subject: [PATCH 0869/1578] md/raid5: avoid BUG_ON() while continue reshape after reassembling Currently, mdadm support --revert-reshape to abort the reshape while reassembling, as the test 07revert-grow. However, following BUG_ON() can be triggerred by the test: kernel BUG at drivers/md/raid5.c:6278! invalid opcode: 0000 [#1] PREEMPT SMP PTI irq event stamp: 158985 CPU: 6 PID: 891 Comm: md0_reshape Not tainted 6.9.0-03335-g7592a0b0049a #94 RIP: 0010:reshape_request+0x3f1/0xe60 Call Trace: raid5_sync_request+0x43d/0x550 md_do_sync+0xb7a/0x2110 md_thread+0x294/0x2b0 kthread+0x147/0x1c0 ret_from_fork+0x59/0x70 ret_from_fork_asm+0x1a/0x30 Root cause is that --revert-reshape update the raid_disks from 5 to 4, while reshape position is still set, and after reassembling the array, reshape position will be read from super block, then during reshape the checking of 'writepos' that is caculated by old reshape position will fail. Fix this panic the easy way first, by converting the BUG_ON() to WARN_ON(), and stop the reshape if checkings fail. Noted that mdadm must fix --revert-shape as well, and probably md/raid should enhance metadata validation as well, however this means reassemble will fail and there must be user tools to fix the wrong metadata. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240611132251.1967786-13-yukuai1@huaweicloud.com --- drivers/md/raid5.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 013adc5ba0e124..547fd15115cdc5 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6254,7 +6254,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk safepos = conf->reshape_safe; sector_div(safepos, data_disks); if (mddev->reshape_backwards) { - BUG_ON(writepos < reshape_sectors); + if (WARN_ON(writepos < reshape_sectors)) + return MaxSector; + writepos -= reshape_sectors; readpos += reshape_sectors; safepos += reshape_sectors; @@ -6272,14 +6274,18 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk * to set 'stripe_addr' which is where we will write to. */ if (mddev->reshape_backwards) { - BUG_ON(conf->reshape_progress == 0); + if (WARN_ON(conf->reshape_progress == 0)) + return MaxSector; + stripe_addr = writepos; - BUG_ON((mddev->dev_sectors & - ~((sector_t)reshape_sectors - 1)) - - reshape_sectors - stripe_addr - != sector_nr); + if (WARN_ON((mddev->dev_sectors & + ~((sector_t)reshape_sectors - 1)) - + reshape_sectors - stripe_addr != sector_nr)) + return MaxSector; } else { - BUG_ON(writepos != sector_nr + reshape_sectors); + if (WARN_ON(writepos != sector_nr + reshape_sectors)) + return MaxSector; + stripe_addr = sector_nr; } From 9b1ebce6a1fded90d4a1c6c57dc6262dac4c4c14 Mon Sep 17 00:00:00 2001 From: Su Hui Date: Tue, 11 Jun 2024 15:37:00 +0800 Subject: [PATCH 0870/1578] block: sed-opal: avoid possible wrong address reference in read_sed_opal_key() Clang static checker (scan-build) warning: block/sed-opal.c:line 317, column 3 Value stored to 'ret' is never read. Fix this problem by returning the error code when keyring_search() failed. Otherwise, 'key' will have a wrong value when 'kerf' stores the error code. Fixes: 3bfeb6125664 ("block: sed-opal: keyring support for SED keys") Signed-off-by: Su Hui Link: https://lore.kernel.org/r/20240611073659.429582-1-suhui@nfschina.com Signed-off-by: Jens Axboe --- block/sed-opal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index 14fe0fef811cfc..598fd3e7fcc8e2 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -314,7 +314,7 @@ static int read_sed_opal_key(const char *key_name, u_char *buffer, int buflen) &key_type_user, key_name, true); if (IS_ERR(kref)) - ret = PTR_ERR(kref); + return PTR_ERR(kref); key = key_ref_to_ptr(kref); down_read(&key->sem); From 1933192a91be0a570663a3c6310c46e4ce3b2baa Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 7 Jun 2024 09:21:26 +0900 Subject: [PATCH 0871/1578] block: Optimize disk zone resource cleanup For zoned block devices using zone write plugging, an rcu_barrier() call is needed in disk_free_zone_resources() to synchronize freeing of zone write plugs and the destrution of the mempool used to allocate the plugs. The barrier call does slow down a little teardown of zoned block devices but should not affect teardown of regular block devices or zoned block devices that do not use zone write plugging (e.g. zoned DM devices that do not require zone append emulation). Modify disk_free_zone_resources() to return early if we do not have a mempool to start with, that is, if the device does not use zone write plugging. This avoids the costly rcu_barrier() and speeds up disk teardown. Reported-by: Mikulas Patocka Fixes: dd291d77cc90 ("block: Introduce zone write plugging") Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Tested-by: Mikulas Patocka Reviewed-by: Niklas Cassel Link: https://lore.kernel.org/r/20240607002126.104227-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 52abebf56027a8..08d7dfe8bd938b 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -1552,6 +1552,9 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) void disk_free_zone_resources(struct gendisk *disk) { + if (!disk->zone_wplugs_pool) + return; + cancel_work_sync(&disk->zone_wplugs_work); if (disk->zone_wplugs_wq) { From d0321c812d89c5910d8da8e4b10c891c6b96ff70 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sat, 8 Jun 2024 22:31:15 +0800 Subject: [PATCH 0872/1578] block: fix request.queuelist usage in flush Friedrich Weber reported a kernel crash problem and bisected to commit 81ada09cc25e ("blk-flush: reuse rq queuelist in flush state machine"). The root cause is that we use "list_move_tail(&rq->queuelist, pending)" in the PREFLUSH/POSTFLUSH sequences. But rq->queuelist.next == xxx since it's popped out from plug->cached_rq in __blk_mq_alloc_requests_batch(). We don't initialize its queuelist just for this first request, although the queuelist of all later popped requests will be initialized. Fix it by changing to use "list_add_tail(&rq->queuelist, pending)" so rq->queuelist doesn't need to be initialized. It should be ok since rq can't be on any list when PREFLUSH or POSTFLUSH, has no move actually. Please note the commit 81ada09cc25e ("blk-flush: reuse rq queuelist in flush state machine") also has another requirement that no drivers would touch rq->queuelist after blk_mq_end_request() since we will reuse it to add rq to the post-flush pending list in POSTFLUSH. If this is not true, we will have to revert that commit IMHO. This updated version adds "list_del_init(&rq->queuelist)" in flush rq callback since the dm layer may submit request of a weird invalid format (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH), which causes double list_add if without this "list_del_init(&rq->queuelist)". The weird invalid format problem should be fixed in dm layer. Reported-by: Friedrich Weber Closes: https://lore.kernel.org/lkml/14b89dfb-505c-49f7-aebb-01c54451db40@proxmox.com/ Closes: https://lore.kernel.org/lkml/c9d03ff7-27c5-4ebd-b3f6-5a90d96f35ba@proxmox.com/ Fixes: 81ada09cc25e ("blk-flush: reuse rq queuelist in flush state machine") Cc: Christoph Hellwig Cc: ming.lei@redhat.com Cc: bvanassche@acm.org Tested-by: Friedrich Weber Signed-off-by: Chengming Zhou Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240608143115.972486-1-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- block/blk-flush.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index c17cf8ed8113db..cca4f9131f7955 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -185,7 +185,7 @@ static void blk_flush_complete_seq(struct request *rq, /* queue for flush */ if (list_empty(pending)) fq->flush_pending_since = jiffies; - list_move_tail(&rq->queuelist, pending); + list_add_tail(&rq->queuelist, pending); break; case REQ_FSEQ_DATA: @@ -263,6 +263,7 @@ static enum rq_end_io_ret flush_end_io(struct request *flush_rq, unsigned int seq = blk_flush_cur_seq(rq); BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); + list_del_init(&rq->queuelist); blk_flush_complete_seq(rq, fq, seq, error); } From e038ee6189842e9662d2fc59d09dbcf48350cf99 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Mon, 10 Jun 2024 16:41:44 +0530 Subject: [PATCH 0873/1578] block: unmap and free user mapped integrity via submitter The user mapped intergity is copied back and unpinned by bio_integrity_free which is a low-level routine. Do it via the submitter rather than doing it in the low-level block layer code, to split the submitter side from the consumer side of the bio. Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20240610111144.14647-1-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 26 ++++++++++++++++++++++++-- drivers/nvme/host/ioctl.c | 15 +++++++++++---- include/linux/bio.h | 4 ++++ 3 files changed, 39 insertions(+), 6 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 2e3e8e04961eae..8b528e12136f50 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -144,16 +144,38 @@ void bio_integrity_free(struct bio *bio) struct bio_integrity_payload *bip = bio_integrity(bio); struct bio_set *bs = bio->bi_pool; + if (bip->bip_flags & BIP_INTEGRITY_USER) + return; if (bip->bip_flags & BIP_BLOCK_INTEGRITY) kfree(bvec_virt(bip->bip_vec)); - else if (bip->bip_flags & BIP_INTEGRITY_USER) - bio_integrity_unmap_user(bip); __bio_integrity_free(bs, bip); bio->bi_integrity = NULL; bio->bi_opf &= ~REQ_INTEGRITY; } +/** + * bio_integrity_unmap_free_user - Unmap and free bio user integrity payload + * @bio: bio containing bip to be unmapped and freed + * + * Description: Used to unmap and free the user mapped integrity portion of a + * bio. Submitter attaching the user integrity buffer is responsible for + * unmapping and freeing it during completion. + */ +void bio_integrity_unmap_free_user(struct bio *bio) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + struct bio_set *bs = bio->bi_pool; + + if (WARN_ON_ONCE(!(bip->bip_flags & BIP_INTEGRITY_USER))) + return; + bio_integrity_unmap_user(bip); + __bio_integrity_free(bs, bip); + bio->bi_integrity = NULL; + bio->bi_opf &= ~REQ_INTEGRITY; +} +EXPORT_SYMBOL(bio_integrity_unmap_free_user); + /** * bio_integrity_add_page - Attach integrity metadata * @bio: bio to update diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 9d9d2a127c4ec2..8b69427a44762a 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -111,6 +111,13 @@ static struct request *nvme_alloc_user_request(struct request_queue *q, return req; } +static void nvme_unmap_bio(struct bio *bio) +{ + if (bio_integrity(bio)) + bio_integrity_unmap_free_user(bio); + blk_rq_unmap_user(bio); +} + static int nvme_map_user_request(struct request *req, u64 ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, u32 meta_seed, struct io_uring_cmd *ioucmd, unsigned int flags) @@ -157,7 +164,7 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, out_unmap: if (bio) - blk_rq_unmap_user(bio); + nvme_unmap_bio(bio); out: blk_mq_free_request(req); return ret; @@ -195,7 +202,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, if (result) *result = le64_to_cpu(nvme_req(req)->result.u64); if (bio) - blk_rq_unmap_user(bio); + nvme_unmap_bio(bio); blk_mq_free_request(req); if (effects) @@ -406,7 +413,7 @@ static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd, struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); if (pdu->bio) - blk_rq_unmap_user(pdu->bio); + nvme_unmap_bio(pdu->bio); io_uring_cmd_done(ioucmd, pdu->status, pdu->result, issue_flags); } @@ -432,7 +439,7 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, */ if (blk_rq_is_poll(req)) { if (pdu->bio) - blk_rq_unmap_user(pdu->bio); + nvme_unmap_bio(pdu->bio); io_uring_cmd_iopoll_done(ioucmd, pdu->result, pdu->status); } else { io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); diff --git a/include/linux/bio.h b/include/linux/bio.h index d5379548d684e1..818e9361294781 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -731,6 +731,7 @@ static inline bool bioset_initialized(struct bio_set *bs) bip_for_each_vec(_bvl, _bio->bi_integrity, _iter) int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len, u32 seed); +void bio_integrity_unmap_free_user(struct bio *bio); extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int); extern bool bio_integrity_prep(struct bio *); @@ -807,6 +808,9 @@ static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf, { return -EINVAL; } +static inline void bio_integrity_unmap_free_user(struct bio *bio) +{ +} #endif /* CONFIG_BLK_DEV_INTEGRITY */ From 72cacd06e47d86d89b0e7179fbc9eb3a0f39cd93 Mon Sep 17 00:00:00 2001 From: Julien Panis Date: Tue, 4 Jun 2024 18:46:58 +0200 Subject: [PATCH 0874/1578] thermal/drivers/mediatek/lvts_thermal: Return error in case of invalid efuse data This patch prevents from registering thermal entries and letting the driver misbehave if efuse data is invalid. A device is not properly calibrated if the golden temperature is zero. Fixes: f5f633b18234 ("thermal/drivers/mediatek: Add the Low Voltage Thermal Sensor driver") Signed-off-by: Julien Panis Reviewed-by: Nicolas Pitre Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20240604-mtk-thermal-calib-check-v2-1-8f258254051d@baylibre.com Signed-off-by: Daniel Lezcano --- drivers/thermal/mediatek/lvts_thermal.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/mediatek/lvts_thermal.c b/drivers/thermal/mediatek/lvts_thermal.c index 82c355c466cfec..819ed0110f3e73 100644 --- a/drivers/thermal/mediatek/lvts_thermal.c +++ b/drivers/thermal/mediatek/lvts_thermal.c @@ -769,7 +769,11 @@ static int lvts_golden_temp_init(struct device *dev, u8 *calib, */ gt = (((u32 *)calib)[0] >> lvts_data->gt_calib_bit_offset) & 0xff; - if (gt && gt < LVTS_GOLDEN_TEMP_MAX) + /* A zero value for gt means that device has invalid efuse data */ + if (!gt) + return -ENODATA; + + if (gt < LVTS_GOLDEN_TEMP_MAX) golden_temp = gt; golden_temp_offset = golden_temp * 500 + lvts_data->temp_offset; From 6f2a43e3d14f6e31a3b041a1043195d02c54d615 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Wed, 12 Jun 2024 15:12:03 +0300 Subject: [PATCH 0875/1578] ASoC: SOF: sof-audio: Skip unprepare for in-use widgets on error rollback If the ipc_prepare() callback fails for a module instance, on error rewind we must skip the ipc_unprepare() call for ones that has positive use count. The positive use count means that the module instance is in active use, it cannot be unprepared. The issue affects capture direction paths with branches (single dai with multiple PCMs), the affected widgets are in the shared part of the paths. Signed-off-by: Peter Ujfalusi Reviewed-by: Pierre-Louis Bossart Reviewed-by: Kai Vehmanen Reviewed-by: Ranjani Sridharan Link: https://lore.kernel.org/r/20240612121203.15468-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/sof-audio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/sof/sof-audio.c b/sound/soc/sof/sof-audio.c index b3ac040811e796..ef9318947d74f5 100644 --- a/sound/soc/sof/sof-audio.c +++ b/sound/soc/sof/sof-audio.c @@ -485,7 +485,7 @@ sof_prepare_widgets_in_path(struct snd_sof_dev *sdev, struct snd_soc_dapm_widget if (ret < 0) { /* unprepare the source widget */ if (widget_ops[widget->id].ipc_unprepare && - swidget && swidget->prepared) { + swidget && swidget->prepared && swidget->use_count == 0) { widget_ops[widget->id].ipc_unprepare(swidget); swidget->prepared = false; } From f3b198e4788fcc8d03ed0c8bd5e3856c6a5760c5 Mon Sep 17 00:00:00 2001 From: Jack Yu Date: Wed, 12 Jun 2024 09:01:07 +0000 Subject: [PATCH 0876/1578] ASoC: rt722-sdca-sdw: add debounce time for type detection Add debounce time in headset type detection for better performance. Signed-off-by: Jack Yu Link: https://lore.kernel.org/r/7e502e9a9dd94122a1b60deb5ceb60fb@realtek.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt722-sdca-sdw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/rt722-sdca-sdw.c b/sound/soc/codecs/rt722-sdca-sdw.c index f73ee3bf90f5e6..87354bb1564e8d 100644 --- a/sound/soc/codecs/rt722-sdca-sdw.c +++ b/sound/soc/codecs/rt722-sdca-sdw.c @@ -352,7 +352,7 @@ static int rt722_sdca_interrupt_callback(struct sdw_slave *slave, if (status->sdca_cascade && !rt722->disable_irq) mod_delayed_work(system_power_efficient_wq, - &rt722->jack_detect_work, msecs_to_jiffies(30)); + &rt722->jack_detect_work, msecs_to_jiffies(280)); mutex_unlock(&rt722->disable_irq_lock); From e5d574ab37f5f2e7937405613d9b1a724811e5ad Mon Sep 17 00:00:00 2001 From: Chunguang Xu Date: Tue, 11 Jun 2024 18:02:08 +0800 Subject: [PATCH 0877/1578] nvme: avoid double free special payload If a discard request needs to be retried, and that retry may fail before a new special payload is added, a double free will result. Clear the RQF_SPECIAL_LOAD when the request is cleaned. Signed-off-by: Chunguang Xu Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f5d150c62955d8..c40930d10bd34d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -998,6 +998,7 @@ void nvme_cleanup_cmd(struct request *req) clear_bit_unlock(0, &ctrl->discard_page_busy); else kfree(bvec_virt(&req->special_vec)); + req->rq_flags &= ~RQF_SPECIAL_PAYLOAD; } } EXPORT_SYMBOL_GPL(nvme_cleanup_cmd); From d76584e53f4244dbc154bec447c3852600acc914 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Wed, 12 Jun 2024 16:02:40 +0200 Subject: [PATCH 0878/1578] nvmet-passthru: propagate status from id override functions The id override functions return a status which is not propagated to the caller. Fixes: c1fef73f793b ("nvmet: add passthru code to process commands") Signed-off-by: Daniel Wagner Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/passthru.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index bb4a69d538fd10..f003782d4ecff4 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -226,13 +226,13 @@ static void nvmet_passthru_execute_cmd_work(struct work_struct *w) req->cmd->common.opcode == nvme_admin_identify) { switch (req->cmd->identify.cns) { case NVME_ID_CNS_CTRL: - nvmet_passthru_override_id_ctrl(req); + status = nvmet_passthru_override_id_ctrl(req); break; case NVME_ID_CNS_NS: - nvmet_passthru_override_id_ns(req); + status = nvmet_passthru_override_id_ns(req); break; case NVME_ID_CNS_NS_DESC_LIST: - nvmet_passthru_override_id_descs(req); + status = nvmet_passthru_override_id_descs(req); break; } } else if (status < 0) From cd0c1b8e045a8d2785342b385cb2684d9b48e426 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Wed, 12 Jun 2024 16:11:59 +0200 Subject: [PATCH 0879/1578] nvmet: always initialize cqe.result The spec doesn't mandate that the first two double words (aka results) for the command queue entry need to be set to 0 when they are not used (not specified). Though, the target implemention returns 0 for TCP and FC but not for RDMA. Let's make RDMA behave the same and thus explicitly initializing the result field. This prevents leaking any data from the stack. Signed-off-by: Daniel Wagner Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/core.c | 1 + drivers/nvme/target/fabrics-cmd-auth.c | 3 --- drivers/nvme/target/fabrics-cmd.c | 6 ------ 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 06f0c587f3437b..4ff460ba282633 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -957,6 +957,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, req->metadata_sg_cnt = 0; req->transfer_len = 0; req->metadata_len = 0; + req->cqe->result.u64 = 0; req->cqe->status = 0; req->cqe->sq_head = 0; req->ns = NULL; diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c index d61b8c6ff3b20e..cb34d644ed086c 100644 --- a/drivers/nvme/target/fabrics-cmd-auth.c +++ b/drivers/nvme/target/fabrics-cmd-auth.c @@ -333,7 +333,6 @@ void nvmet_execute_auth_send(struct nvmet_req *req) pr_debug("%s: ctrl %d qid %d nvme status %x error loc %d\n", __func__, ctrl->cntlid, req->sq->qid, status, req->error_loc); - req->cqe->result.u64 = 0; if (req->sq->dhchap_step != NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2 && req->sq->dhchap_step != NVME_AUTH_DHCHAP_MESSAGE_FAILURE2) { unsigned long auth_expire_secs = ctrl->kato ? ctrl->kato : 120; @@ -516,8 +515,6 @@ void nvmet_execute_auth_receive(struct nvmet_req *req) status = nvmet_copy_to_sgl(req, 0, d, al); kfree(d); done: - req->cqe->result.u64 = 0; - if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2) nvmet_auth_sq_free(req->sq); else if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE1) { diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 042b379cbb3647..69d77d34bec116 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -226,9 +226,6 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) if (status) goto out; - /* zero out initial completion result, assign values as needed */ - req->cqe->result.u32 = 0; - if (c->recfmt != 0) { pr_warn("invalid connect version (%d).\n", le16_to_cpu(c->recfmt)); @@ -305,9 +302,6 @@ static void nvmet_execute_io_connect(struct nvmet_req *req) if (status) goto out; - /* zero out initial completion result, assign values as needed */ - req->cqe->result.u32 = 0; - if (c->recfmt != 0) { pr_warn("invalid connect version (%d).\n", le16_to_cpu(c->recfmt)); From 6bfff3582416b2f809e6b08c6e9d57b18086bdbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Wed, 12 Jun 2024 15:33:57 +0200 Subject: [PATCH 0880/1578] Revert "batman-adv: prefer kfree_rcu() over call_rcu() with free-only callbacks" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 537db05da2ca8ccc1243c9dd1d0c148b84aa0432. This change seems to result in a memory leak / RCU race and the following kernel splat when the batman-adv kernel module is unloaded: ``` [ 112.208633] ============================================================================= [ 112.210359] BUG batadv_tl_cache (Tainted: G OE ): Objects remaining in batadv_tl_cache on __kmem_cache_shutdown() [ 112.211943] ----------------------------------------------------------------------------- [ 112.212517] Slab 0xffffe8afc0216d00 objects=16 used=1 fp=0xffff93f4085b4340 flags=0xfffffc0000a00(workingset|slab|node=0|zone=1|lastcpupid=0x1fffff) [ 112.212517] CPU: 1 PID: 776 Comm: rmmod Tainted: G OE 6.8.12-amd64 #1 Debian 6.8.12-1 [ 112.212517] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [ 112.212517] Call Trace: [ 112.212517] [ 112.212517] dump_stack_lvl+0x64/0x80 [ 112.212517] slab_err+0xe6/0x120 [ 112.212517] __kmem_cache_shutdown+0x160/0x2e0 [ 112.212517] kmem_cache_destroy+0x55/0x160 [ 112.220849] batadv_tt_cache_destroy+0x15/0x60 [batman_adv] [ 112.220849] __do_sys_delete_module+0x1d5/0x320 [ 112.220849] do_syscall_64+0x83/0x190 [ 112.220849] ? do_syscall_64+0x8f/0x190 [ 112.220849] ? exc_page_fault+0x7f/0x180 [ 112.220849] entry_SYSCALL_64_after_hwframe+0x78/0x80 [ 112.224478] RIP: 0033:0x7f2ac8434977 [ 112.224478] Code: 73 01 c3 48 8b 0d a9 94 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 b8 b0 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 79 94 0c 00 f7 d8 64 89 01 48 [ 112.224478] RSP: 002b:00007ffe0adf6138 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 [ 112.224478] RAX: ffffffffffffffda RBX: 000055db9018e770 RCX: 00007f2ac8434977 [ 112.224478] RDX: 0000000000000000 RSI: 0000000000000800 RDI: 000055db9018e7d8 [ 112.224478] RBP: 0000000000000000 R08: 1999999999999999 R09: 0000000000000000 [ 112.224478] R10: 00007f2ac84a6ac0 R11: 0000000000000206 R12: 00007ffe0adf6390 [ 112.224478] R13: 000055db9018e770 R14: 000055db9018d2a0 R15: 0000000000000000 [ 112.233961] [ 112.233961] Disabling lock debugging due to kernel taint [ 112.233961] Object 0xffff93f4085b4140 @offset=320 [ 112.233961] Allocated in batadv_tt_local_add+0x297/0xa20 [batman_adv] age=15835 cpu=1 pid=755 [ 112.233961] batadv_tt_local_add+0x297/0xa20 [batman_adv] [ 112.233961] batadv_interface_set_mac_addr+0xf6/0x120 [batman_adv] [ 112.233961] dev_set_mac_address+0xde/0x140 [ 112.233961] dev_set_mac_address_user+0x30/0x50 [ 112.233961] do_setlink+0x261/0x12d0 [ 112.233961] rtnl_setlink+0x11f/0x1d0 [ 112.233961] rtnetlink_rcv_msg+0x152/0x3c0 [ 112.241772] netlink_rcv_skb+0x5b/0x110 [ 112.241772] netlink_unicast+0x1a6/0x290 [ 112.241772] netlink_sendmsg+0x223/0x490 [ 112.241772] __sys_sendto+0x1df/0x1f0 [ 112.241772] __x64_sys_sendto+0x24/0x30 [ 112.241772] do_syscall_64+0x83/0x190 [ 112.241772] entry_SYSCALL_64_after_hwframe+0x78/0x80 [ 112.245994] ------------[ cut here ]------------ [ 112.246650] kmem_cache_destroy batadv_tl_cache: Slab cache still has objects when called from batadv_tt_cache_destroy+0x15/0x60 [batman_adv] [ 112.246668] WARNING: CPU: 1 PID: 776 at mm/slab_common.c:493 kmem_cache_destroy+0x14d/0x160 [ 112.249584] Modules linked in: veth batman_adv(OE-) cfg80211 rfkill bridge stp llc libcrc32c crc32c_generic crc16 rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver binfmt_misc pcspkr button joydev evdev serio_raw loop dm_mod efi_pstore nfnetlink vsock_loopback vmw_vsock_virtio_transport_common vmw_vsock_vmci_transport vsock vmw_vmci qemu_fw_cfg ip_tables x_tables autofs4 nfsv3 nfs_acl nfs lockd grace sunrpc 9pnet_rdma rdma_cm iw_cm ib_cm ib_core configfs 9p netfs ata_generic ata_piix libata psmouse scsi_mod 9pnet_virtio i2c_piix4 9pnet e1000 scsi_common floppy crypto_simd cryptd [ 112.256555] CPU: 1 PID: 776 Comm: rmmod Tainted: G B OE 6.8.12-amd64 #1 Debian 6.8.12-1 [ 112.258457] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [ 112.260410] RIP: 0010:kmem_cache_destroy+0x14d/0x160 [ 112.261687] Code: 00 eb be 5b 5d 41 5c 41 5d c3 cc cc cc cc 48 8b 53 60 48 8b 4c 24 20 48 c7 c6 60 d5 e3 98 48 c7 c7 b8 ec 2d 99 e8 43 0d d8 ff <0f> 0b e9 e2 fe ff ff c3 cc cc cc cc 0f 1f 80 00 00 00 00 90 90 90 [ 112.265219] RSP: 0018:ffffb3b2806e7e48 EFLAGS: 00010282 [ 112.266044] RAX: 0000000000000000 RBX: ffff93f4270a2640 RCX: 0000000000000027 [ 112.267157] RDX: ffff93f43c521708 RSI: 0000000000000001 RDI: ffff93f43c521700 [ 112.268268] RBP: 000055db9018e7d8 R08: 0000000000000000 R09: ffffb3b2806e7cd8 [ 112.269418] R10: ffffb3b2806e7cd0 R11: 0000000000000003 R12: 0000000080012d00 [ 112.270572] R13: ffffb3b2806e7f58 R14: 0000000000000000 R15: 0000000000000000 [ 112.271699] FS: 00007f2ac8308440(0000) GS:ffff93f43c500000(0000) knlGS:0000000000000000 [ 112.273001] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 112.273923] CR2: 00005584ef830110 CR3: 000000000787c000 CR4: 00000000000006f0 [ 112.275050] Call Trace: [ 112.275464] [ 112.275810] ? kmem_cache_destroy+0x14d/0x160 [ 112.276518] ? __warn+0x81/0x130 [ 112.277043] ? kmem_cache_destroy+0x14d/0x160 [ 112.277730] ? report_bug+0x171/0x1a0 [ 112.278315] ? prb_read_valid+0x1b/0x30 [ 112.278919] ? handle_bug+0x3c/0x80 [ 112.279467] ? exc_invalid_op+0x17/0x70 [ 112.280071] ? asm_exc_invalid_op+0x1a/0x20 [ 112.280741] ? kmem_cache_destroy+0x14d/0x160 [ 112.281603] ? kmem_cache_destroy+0x14d/0x160 [ 112.282489] batadv_tt_cache_destroy+0x15/0x60 [batman_adv] [ 112.283373] __do_sys_delete_module+0x1d5/0x320 [ 112.284080] do_syscall_64+0x83/0x190 [ 112.284696] ? do_syscall_64+0x8f/0x190 [ 112.285315] ? exc_page_fault+0x7f/0x180 [ 112.285970] entry_SYSCALL_64_after_hwframe+0x78/0x80 [ 112.286768] RIP: 0033:0x7f2ac8434977 [ 112.287355] Code: 73 01 c3 48 8b 0d a9 94 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 b8 b0 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 79 94 0c 00 f7 d8 64 89 01 48 [ 112.290282] RSP: 002b:00007ffe0adf6138 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 [ 112.291465] RAX: ffffffffffffffda RBX: 000055db9018e770 RCX: 00007f2ac8434977 [ 112.292595] RDX: 0000000000000000 RSI: 0000000000000800 RDI: 000055db9018e7d8 [ 112.293724] RBP: 0000000000000000 R08: 1999999999999999 R09: 0000000000000000 [ 112.294863] R10: 00007f2ac84a6ac0 R11: 0000000000000206 R12: 00007ffe0adf6390 [ 112.295982] R13: 000055db9018e770 R14: 000055db9018d2a0 R15: 0000000000000000 [ 112.297103] [ 112.297465] ---[ end trace 0000000000000000 ]--- ``` So far, after some debugging, the actual cause for this could not immediately be found within the batman-adv code. Therefore reverting this for now until the underlying issue can be found and better understood. Some additional debugging information and discussions can be found on our Redmine bugtracker, linked below. Link: https://www.open-mesh.org/issues/428 Fixes: 537db05da2ca ("batman-adv: prefer kfree_rcu() over call_rcu() with free-only callbacks") Signed-off-by: Linus Lüssing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/translation-table.c | 47 ++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index b21ff3c36b07d2..2243cec18ecc86 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -208,6 +208,20 @@ batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, return tt_global_entry; } +/** + * batadv_tt_local_entry_free_rcu() - free the tt_local_entry + * @rcu: rcu pointer of the tt_local_entry + */ +static void batadv_tt_local_entry_free_rcu(struct rcu_head *rcu) +{ + struct batadv_tt_local_entry *tt_local_entry; + + tt_local_entry = container_of(rcu, struct batadv_tt_local_entry, + common.rcu); + + kmem_cache_free(batadv_tl_cache, tt_local_entry); +} + /** * batadv_tt_local_entry_release() - release tt_local_entry from lists and queue * for free after rcu grace period @@ -222,7 +236,7 @@ static void batadv_tt_local_entry_release(struct kref *ref) batadv_softif_vlan_put(tt_local_entry->vlan); - kfree_rcu(tt_local_entry, common.rcu); + call_rcu(&tt_local_entry->common.rcu, batadv_tt_local_entry_free_rcu); } /** @@ -240,6 +254,20 @@ batadv_tt_local_entry_put(struct batadv_tt_local_entry *tt_local_entry) batadv_tt_local_entry_release); } +/** + * batadv_tt_global_entry_free_rcu() - free the tt_global_entry + * @rcu: rcu pointer of the tt_global_entry + */ +static void batadv_tt_global_entry_free_rcu(struct rcu_head *rcu) +{ + struct batadv_tt_global_entry *tt_global_entry; + + tt_global_entry = container_of(rcu, struct batadv_tt_global_entry, + common.rcu); + + kmem_cache_free(batadv_tg_cache, tt_global_entry); +} + /** * batadv_tt_global_entry_release() - release tt_global_entry from lists and * queue for free after rcu grace period @@ -254,7 +282,7 @@ void batadv_tt_global_entry_release(struct kref *ref) batadv_tt_global_del_orig_list(tt_global_entry); - kfree_rcu(tt_global_entry, common.rcu); + call_rcu(&tt_global_entry->common.rcu, batadv_tt_global_entry_free_rcu); } /** @@ -379,6 +407,19 @@ static void batadv_tt_global_size_dec(struct batadv_orig_node *orig_node, batadv_tt_global_size_mod(orig_node, vid, -1); } +/** + * batadv_tt_orig_list_entry_free_rcu() - free the orig_entry + * @rcu: rcu pointer of the orig_entry + */ +static void batadv_tt_orig_list_entry_free_rcu(struct rcu_head *rcu) +{ + struct batadv_tt_orig_list_entry *orig_entry; + + orig_entry = container_of(rcu, struct batadv_tt_orig_list_entry, rcu); + + kmem_cache_free(batadv_tt_orig_cache, orig_entry); +} + /** * batadv_tt_orig_list_entry_release() - release tt orig entry from lists and * queue for free after rcu grace period @@ -392,7 +433,7 @@ static void batadv_tt_orig_list_entry_release(struct kref *ref) refcount); batadv_orig_node_put(orig_entry->orig_node); - kfree_rcu(orig_entry, rcu); + call_rcu(&orig_entry->rcu, batadv_tt_orig_list_entry_free_rcu); } /** From 54559642b96116b45e4b5ca7fd9f7835b8561272 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 12 Jun 2024 13:56:38 +0100 Subject: [PATCH 0881/1578] io_uring/rsrc: don't lock while !TASK_RUNNING There is a report of io_rsrc_ref_quiesce() locking a mutex while not TASK_RUNNING, which is due to forgetting restoring the state back after io_run_task_work_sig() and attempts to break out of the waiting loop. do not call blocking ops when !TASK_RUNNING; state=1 set at [] prepare_to_wait+0xa4/0x380 kernel/sched/wait.c:237 WARNING: CPU: 2 PID: 397056 at kernel/sched/core.c:10099 __might_sleep+0x114/0x160 kernel/sched/core.c:10099 RIP: 0010:__might_sleep+0x114/0x160 kernel/sched/core.c:10099 Call Trace: __mutex_lock_common kernel/locking/mutex.c:585 [inline] __mutex_lock+0xb4/0x940 kernel/locking/mutex.c:752 io_rsrc_ref_quiesce+0x590/0x940 io_uring/rsrc.c:253 io_sqe_buffers_unregister+0xa2/0x340 io_uring/rsrc.c:799 __io_uring_register io_uring/register.c:424 [inline] __do_sys_io_uring_register+0x5b9/0x2400 io_uring/register.c:613 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xd8/0x270 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x6f/0x77 Reported-by: Li Shi Fixes: 4ea15b56f0810 ("io_uring/rsrc: use wq for quiescing") Cc: stable@vger.kernel.org Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/77966bc104e25b0534995d5dbb152332bc8f31c0.1718196953.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 65417c9553b1d8..edb9c5baf2e290 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -249,6 +249,7 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, ret = io_run_task_work_sig(ctx); if (ret < 0) { + __set_current_state(TASK_RUNNING); mutex_lock(&ctx->uring_lock); if (list_empty(&ctx->rsrc_ref_list)) ret = 0; From 0057222c45140830a7bf55e92fb67f84a2814f67 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Thu, 18 Apr 2024 01:07:33 +0100 Subject: [PATCH 0882/1578] regulator: axp20x: AXP717: fix LDO supply rails and off-by-ones The X-Powers AXP717 PMIC has separate input supply pins for each group of LDOs, so they are not all using the same DCDC1 input, as described currently. Replace the "supply" member of each LDO description with the respective group supply name, so that the supply dependencies can be correctly described in the devicetree. Also fix two off-by-ones in the regulator macros, after some double checking the numbers against the datasheet. This uncovered a bug in the datasheet: add a comment to document this. Fixes: d2ac3df75c3a ("regulator: axp20x: add support for the AXP717") Signed-off-by: Andre Przywara Reviewed-by: John Watts Link: https://lore.kernel.org/r/20240418000736.24338-3-andre.przywara@arm.com Signed-off-by: Mark Brown --- drivers/regulator/axp20x-regulator.c | 33 ++++++++++++++++------------ 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/drivers/regulator/axp20x-regulator.c b/drivers/regulator/axp20x-regulator.c index 34fcdd82b2eaa7..f3c447ecdc3bfd 100644 --- a/drivers/regulator/axp20x-regulator.c +++ b/drivers/regulator/axp20x-regulator.c @@ -140,7 +140,7 @@ #define AXP717_DCDC1_NUM_VOLTAGES 88 #define AXP717_DCDC2_NUM_VOLTAGES 107 -#define AXP717_DCDC3_NUM_VOLTAGES 104 +#define AXP717_DCDC3_NUM_VOLTAGES 103 #define AXP717_DCDC_V_OUT_MASK GENMASK(6, 0) #define AXP717_LDO_V_OUT_MASK GENMASK(4, 0) @@ -763,10 +763,15 @@ static const struct linear_range axp717_dcdc1_ranges[] = { REGULATOR_LINEAR_RANGE(1220000, 71, 87, 20000), }; +/* + * The manual says that the last voltage is 3.4V, encoded as 0b1101011 (107), + * but every other method proves that this is wrong, so it's really 106 that + * programs the final 3.4V. + */ static const struct linear_range axp717_dcdc2_ranges[] = { REGULATOR_LINEAR_RANGE(500000, 0, 70, 10000), REGULATOR_LINEAR_RANGE(1220000, 71, 87, 20000), - REGULATOR_LINEAR_RANGE(1600000, 88, 107, 100000), + REGULATOR_LINEAR_RANGE(1600000, 88, 106, 100000), }; static const struct linear_range axp717_dcdc3_ranges[] = { @@ -790,40 +795,40 @@ static const struct regulator_desc axp717_regulators[] = { AXP_DESC(AXP717, DCDC4, "dcdc4", "vin4", 1000, 3700, 100, AXP717_DCDC4_CONTROL, AXP717_DCDC_V_OUT_MASK, AXP717_DCDC_OUTPUT_CONTROL, BIT(3)), - AXP_DESC(AXP717, ALDO1, "aldo1", "vin1", 500, 3500, 100, + AXP_DESC(AXP717, ALDO1, "aldo1", "aldoin", 500, 3500, 100, AXP717_ALDO1_CONTROL, AXP717_LDO_V_OUT_MASK, AXP717_LDO0_OUTPUT_CONTROL, BIT(0)), - AXP_DESC(AXP717, ALDO2, "aldo2", "vin1", 500, 3500, 100, + AXP_DESC(AXP717, ALDO2, "aldo2", "aldoin", 500, 3500, 100, AXP717_ALDO2_CONTROL, AXP717_LDO_V_OUT_MASK, AXP717_LDO0_OUTPUT_CONTROL, BIT(1)), - AXP_DESC(AXP717, ALDO3, "aldo3", "vin1", 500, 3500, 100, + AXP_DESC(AXP717, ALDO3, "aldo3", "aldoin", 500, 3500, 100, AXP717_ALDO3_CONTROL, AXP717_LDO_V_OUT_MASK, AXP717_LDO0_OUTPUT_CONTROL, BIT(2)), - AXP_DESC(AXP717, ALDO4, "aldo4", "vin1", 500, 3500, 100, + AXP_DESC(AXP717, ALDO4, "aldo4", "aldoin", 500, 3500, 100, AXP717_ALDO4_CONTROL, AXP717_LDO_V_OUT_MASK, AXP717_LDO0_OUTPUT_CONTROL, BIT(3)), - AXP_DESC(AXP717, BLDO1, "bldo1", "vin1", 500, 3500, 100, + AXP_DESC(AXP717, BLDO1, "bldo1", "bldoin", 500, 3500, 100, AXP717_BLDO1_CONTROL, AXP717_LDO_V_OUT_MASK, AXP717_LDO0_OUTPUT_CONTROL, BIT(4)), - AXP_DESC(AXP717, BLDO2, "bldo2", "vin1", 500, 3500, 100, + AXP_DESC(AXP717, BLDO2, "bldo2", "bldoin", 500, 3500, 100, AXP717_BLDO2_CONTROL, AXP717_LDO_V_OUT_MASK, AXP717_LDO0_OUTPUT_CONTROL, BIT(5)), - AXP_DESC(AXP717, BLDO3, "bldo3", "vin1", 500, 3500, 100, + AXP_DESC(AXP717, BLDO3, "bldo3", "bldoin", 500, 3500, 100, AXP717_BLDO3_CONTROL, AXP717_LDO_V_OUT_MASK, AXP717_LDO0_OUTPUT_CONTROL, BIT(6)), - AXP_DESC(AXP717, BLDO4, "bldo4", "vin1", 500, 3500, 100, + AXP_DESC(AXP717, BLDO4, "bldo4", "bldoin", 500, 3500, 100, AXP717_BLDO4_CONTROL, AXP717_LDO_V_OUT_MASK, AXP717_LDO0_OUTPUT_CONTROL, BIT(7)), - AXP_DESC(AXP717, CLDO1, "cldo1", "vin1", 500, 3500, 100, + AXP_DESC(AXP717, CLDO1, "cldo1", "cldoin", 500, 3500, 100, AXP717_CLDO1_CONTROL, AXP717_LDO_V_OUT_MASK, AXP717_LDO1_OUTPUT_CONTROL, BIT(0)), - AXP_DESC(AXP717, CLDO2, "cldo2", "vin1", 500, 3500, 100, + AXP_DESC(AXP717, CLDO2, "cldo2", "cldoin", 500, 3500, 100, AXP717_CLDO2_CONTROL, AXP717_LDO_V_OUT_MASK, AXP717_LDO1_OUTPUT_CONTROL, BIT(1)), - AXP_DESC(AXP717, CLDO3, "cldo3", "vin1", 500, 3500, 100, + AXP_DESC(AXP717, CLDO3, "cldo3", "cldoin", 500, 3500, 100, AXP717_CLDO3_CONTROL, AXP717_LDO_V_OUT_MASK, AXP717_LDO1_OUTPUT_CONTROL, BIT(2)), - AXP_DESC(AXP717, CLDO4, "cldo4", "vin1", 500, 3500, 100, + AXP_DESC(AXP717, CLDO4, "cldo4", "cldoin", 500, 3500, 100, AXP717_CLDO4_CONTROL, AXP717_LDO_V_OUT_MASK, AXP717_LDO1_OUTPUT_CONTROL, BIT(3)), AXP_DESC(AXP717, CPUSLDO, "cpusldo", "vin1", 500, 1400, 50, From 957df9af723ccc570da835978cf7fe7863632842 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 4 Jun 2024 16:15:31 -0600 Subject: [PATCH 0883/1578] nbd: Remove __force casts Make it again possible for sparse to verify that blk_status_t and Unix error codes are used in the proper context by making nbd_send_cmd() return a blk_status_t instead of an integer. No functionality has been changed. Signed-off-by: Christoph Hellwig [ bvanassche: added description and made two small formatting changes ] Signed-off-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20240604221531.327131-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 51 +++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 22a79a62cc4eab..b87aa80a46ddaa 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -589,10 +589,11 @@ static inline int was_interrupted(int result) } /* - * Returns BLK_STS_RESOURCE if the caller should retry after a delay. Returns - * -EAGAIN if the caller should requeue @cmd. Returns -EIO if sending failed. + * Returns BLK_STS_RESOURCE if the caller should retry after a delay. + * Returns BLK_STS_IOERR if sending failed. */ -static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) +static blk_status_t nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, + int index) { struct request *req = blk_mq_rq_from_pdu(cmd); struct nbd_config *config = nbd->config; @@ -614,13 +615,13 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) type = req_to_nbd_cmd_type(req); if (type == U32_MAX) - return -EIO; + return BLK_STS_IOERR; if (rq_data_dir(req) == WRITE && (config->flags & NBD_FLAG_READ_ONLY)) { dev_err_ratelimited(disk_to_dev(nbd->disk), "Write on read-only\n"); - return -EIO; + return BLK_STS_IOERR; } if (req->cmd_flags & REQ_FUA) @@ -674,11 +675,11 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) nsock->sent = sent; } set_bit(NBD_CMD_REQUEUED, &cmd->flags); - return (__force int)BLK_STS_RESOURCE; + return BLK_STS_RESOURCE; } dev_err_ratelimited(disk_to_dev(nbd->disk), "Send control failed (result %d)\n", result); - return -EAGAIN; + goto requeue; } send_pages: if (type != NBD_CMD_WRITE) @@ -715,12 +716,12 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) nsock->pending = req; nsock->sent = sent; set_bit(NBD_CMD_REQUEUED, &cmd->flags); - return (__force int)BLK_STS_RESOURCE; + return BLK_STS_RESOURCE; } dev_err(disk_to_dev(nbd->disk), "Send data failed (result %d)\n", result); - return -EAGAIN; + goto requeue; } /* * The completion might already have come in, @@ -737,7 +738,16 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) trace_nbd_payload_sent(req, handle); nsock->pending = NULL; nsock->sent = 0; - return 0; + __set_bit(NBD_CMD_INFLIGHT, &cmd->flags); + return BLK_STS_OK; + +requeue: + /* retry on a different socket */ + dev_err_ratelimited(disk_to_dev(nbd->disk), + "Request send failed, requeueing\n"); + nbd_mark_nsock_dead(nbd, nsock, 1); + nbd_requeue_cmd(cmd); + return BLK_STS_OK; } static int nbd_read_reply(struct nbd_device *nbd, struct socket *sock, @@ -1018,7 +1028,7 @@ static blk_status_t nbd_handle_cmd(struct nbd_cmd *cmd, int index) struct nbd_device *nbd = cmd->nbd; struct nbd_config *config; struct nbd_sock *nsock; - int ret; + blk_status_t ret; lockdep_assert_held(&cmd->lock); @@ -1072,28 +1082,11 @@ static blk_status_t nbd_handle_cmd(struct nbd_cmd *cmd, int index) ret = BLK_STS_OK; goto out; } - /* - * Some failures are related to the link going down, so anything that - * returns EAGAIN can be retried on a different socket. - */ ret = nbd_send_cmd(nbd, cmd, index); - /* - * Access to this flag is protected by cmd->lock, thus it's safe to set - * the flag after nbd_send_cmd() succeed to send request to server. - */ - if (!ret) - __set_bit(NBD_CMD_INFLIGHT, &cmd->flags); - else if (ret == -EAGAIN) { - dev_err_ratelimited(disk_to_dev(nbd->disk), - "Request send failed, requeueing\n"); - nbd_mark_nsock_dead(nbd, nsock, 1); - nbd_requeue_cmd(cmd); - ret = BLK_STS_OK; - } out: mutex_unlock(&nsock->tx_lock); nbd_config_put(nbd); - return ret < 0 ? BLK_STS_IOERR : (__force blk_status_t)ret; + return ret; } static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx, From d71a989cf5d961989c273093cdff2550acdde314 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 6 Jun 2024 21:52:07 -0600 Subject: [PATCH 0884/1578] vfio/pci: Insert full vma on mmap'd MMIO fault In order to improve performance of typical scenarios we can try to insert the entire vma on fault. This accelerates typical cases, such as when the MMIO region is DMA mapped by QEMU. The vfio_iommu_type1 driver will fault in the entire DMA mapped range through fixup_user_fault(). In synthetic testing, this improves the time required to walk a PCI BAR mapping from userspace by roughly 1/3rd. This is likely an interim solution until vmf_insert_pfn_{pmd,pud}() gain support for pfnmaps. Suggested-by: Yan Zhao Link: https://lore.kernel.org/all/Zl6XdUkt%2FzMMGOLF@yzhao56-desk.sh.intel.com/ Reviewed-by: Yan Zhao Link: https://lore.kernel.org/r/20240607035213.2054226-1-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index db31c27bf78bce..987c7921affa68 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1662,6 +1662,7 @@ static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct vfio_pci_core_device *vdev = vma->vm_private_data; unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff; + unsigned long addr = vma->vm_start; vm_fault_t ret = VM_FAULT_SIGBUS; pfn = vma_to_pfn(vma); @@ -1669,11 +1670,25 @@ static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) down_read(&vdev->memory_lock); if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) - goto out_disabled; + goto out_unlock; ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff); + if (ret & VM_FAULT_ERROR) + goto out_unlock; -out_disabled: + /* + * Pre-fault the remainder of the vma, abort further insertions and + * supress error if fault is encountered during pre-fault. + */ + for (; addr < vma->vm_end; addr += PAGE_SIZE, pfn++) { + if (addr == vmf->address) + continue; + + if (vmf_insert_pfn(vma, addr, pfn) & VM_FAULT_ERROR) + break; + } + +out_unlock: up_read(&vdev->memory_lock); return ret; From 14a20e5b4ad998793c5f43b0330d9e1388446cf3 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Fri, 7 Jun 2024 13:28:28 +0200 Subject: [PATCH 0885/1578] net/ipv6: Fix the RT cache flush via sysctl using a previous delay The net.ipv6.route.flush system parameter takes a value which specifies a delay used during the flush operation for aging exception routes. The written value is however not used in the currently requested flush and instead utilized only in the next one. A problem is that ipv6_sysctl_rtcache_flush() first reads the old value of net->ipv6.sysctl.flush_delay into a local delay variable and then calls proc_dointvec() which actually updates the sysctl based on the provided input. Fix the problem by switching the order of the two operations. Fixes: 4990509f19e8 ("[NETNS][IPV6]: Make sysctls route per namespace.") Signed-off-by: Petr Pavlu Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240607112828.30285-1-petr.pavlu@suse.com Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f083d9faba6b1e..952c2bf1170942 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6343,12 +6343,12 @@ static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, if (!write) return -EINVAL; - net = (struct net *)ctl->extra1; - delay = net->ipv6.sysctl.flush_delay; ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (ret) return ret; + net = (struct net *)ctl->extra1; + delay = net->ipv6.sysctl.flush_delay; fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0); return 0; } From 26ba7c3f139f843bf46ed0779e30d84641767959 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 12 Jun 2024 15:53:29 -0700 Subject: [PATCH 0886/1578] MAINTAINERS: mailmap: Update Stanislav's email address Moving to personal address for upstream work. Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20240612225334.41869-1-sdf@google.com Signed-off-by: Alexei Starovoitov --- .mailmap | 1 + MAINTAINERS | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index 9af91bd3584bdd..6898017339756b 100644 --- a/.mailmap +++ b/.mailmap @@ -606,6 +606,7 @@ Simon Kelley Sricharan Ramabadhran Srinivas Ramana Sriram R +Stanislav Fomichev Stefan Wahren Stéphane Witzmann Stephen Hemminger diff --git a/MAINTAINERS b/MAINTAINERS index aacccb376c28a1..4582af09f2da0d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3980,7 +3980,7 @@ R: Song Liu R: Yonghong Song R: John Fastabend R: KP Singh -R: Stanislav Fomichev +R: Stanislav Fomichev R: Hao Luo R: Jiri Olsa L: bpf@vger.kernel.org From 36c92936e868601fa1f43da6758cf55805043509 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 9 Jun 2024 13:36:53 +0300 Subject: [PATCH 0887/1578] net: bridge: mst: pass vlan group directly to br_mst_vlan_set_state Pass the already obtained vlan group pointer to br_mst_vlan_set_state() instead of dereferencing it again. Each caller has already correctly dereferenced it for their context. This change is required for the following suspicious RCU dereference fix. No functional changes intended. Fixes: 3a7c1661ae13 ("net: bridge: mst: fix vlan use-after-free") Reported-by: syzbot+9bbe2de1bc9d470eb5fe@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=9bbe2de1bc9d470eb5fe Signed-off-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20240609103654.914987-2-razor@blackwall.org Signed-off-by: Jakub Kicinski --- net/bridge/br_mst.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/bridge/br_mst.c b/net/bridge/br_mst.c index 3c66141d34d62f..1de72816b0fb25 100644 --- a/net/bridge/br_mst.c +++ b/net/bridge/br_mst.c @@ -73,11 +73,10 @@ int br_mst_get_state(const struct net_device *dev, u16 msti, u8 *state) } EXPORT_SYMBOL_GPL(br_mst_get_state); -static void br_mst_vlan_set_state(struct net_bridge_port *p, struct net_bridge_vlan *v, +static void br_mst_vlan_set_state(struct net_bridge_vlan_group *vg, + struct net_bridge_vlan *v, u8 state) { - struct net_bridge_vlan_group *vg = nbp_vlan_group(p); - if (br_vlan_get_state(v) == state) return; @@ -121,7 +120,7 @@ int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state, if (v->brvlan->msti != msti) continue; - br_mst_vlan_set_state(p, v, state); + br_mst_vlan_set_state(vg, v, state); } out: @@ -140,13 +139,13 @@ static void br_mst_vlan_sync_state(struct net_bridge_vlan *pv, u16 msti) * it. */ if (v != pv && v->brvlan->msti == msti) { - br_mst_vlan_set_state(pv->port, pv, v->state); + br_mst_vlan_set_state(vg, pv, v->state); return; } } /* Otherwise, start out in a new MSTI with all ports disabled. */ - return br_mst_vlan_set_state(pv->port, pv, BR_STATE_DISABLED); + return br_mst_vlan_set_state(vg, pv, BR_STATE_DISABLED); } int br_mst_vlan_set_msti(struct net_bridge_vlan *mv, u16 msti) From 546ceb1dfdac866648ec959cbc71d9525bd73462 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 9 Jun 2024 13:36:54 +0300 Subject: [PATCH 0888/1578] net: bridge: mst: fix suspicious rcu usage in br_mst_set_state I converted br_mst_set_state to RCU to avoid a vlan use-after-free but forgot to change the vlan group dereference helper. Switch to vlan group RCU deref helper to fix the suspicious rcu usage warning. Fixes: 3a7c1661ae13 ("net: bridge: mst: fix vlan use-after-free") Reported-by: syzbot+9bbe2de1bc9d470eb5fe@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=9bbe2de1bc9d470eb5fe Signed-off-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20240609103654.914987-3-razor@blackwall.org Signed-off-by: Jakub Kicinski --- net/bridge/br_mst.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bridge/br_mst.c b/net/bridge/br_mst.c index 1de72816b0fb25..1820f09ff59ceb 100644 --- a/net/bridge/br_mst.c +++ b/net/bridge/br_mst.c @@ -102,7 +102,7 @@ int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state, int err = 0; rcu_read_lock(); - vg = nbp_vlan_group(p); + vg = nbp_vlan_group_rcu(p); if (!vg) goto out; From 6e5aee08bd2517397c9572243a816664f2ead547 Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Thu, 13 Jun 2024 10:17:09 +0200 Subject: [PATCH 0889/1578] Revert "MIPS: pci: lantiq: restore reset gpio polarity" This reverts commit 277a0363120276645ae598d8d5fea7265e076ae9. While fixing old boards with broken DTs, this change will break newer ones with correct gpio polarity annotation. Signed-off-by: Thomas Bogendoerfer --- arch/mips/pci/pci-lantiq.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/mips/pci/pci-lantiq.c b/arch/mips/pci/pci-lantiq.c index 0844db34022e45..68a8cefed420bf 100644 --- a/arch/mips/pci/pci-lantiq.c +++ b/arch/mips/pci/pci-lantiq.c @@ -124,14 +124,14 @@ static int ltq_pci_startup(struct platform_device *pdev) clk_disable(clk_external); /* setup reset gpio used by pci */ - reset_gpio = devm_gpiod_get_optional(&pdev->dev, "reset", GPIOD_ASIS); + reset_gpio = devm_gpiod_get_optional(&pdev->dev, "reset", + GPIOD_OUT_LOW); error = PTR_ERR_OR_ZERO(reset_gpio); if (error) { dev_err(&pdev->dev, "failed to request gpio: %d\n", error); return error; } gpiod_set_consumer_name(reset_gpio, "pci_reset"); - gpiod_direction_output(reset_gpio, 1); /* enable auto-switching between PCI and EBU */ ltq_pci_w32(0xa, PCI_CR_CLK_CTRL); @@ -194,10 +194,10 @@ static int ltq_pci_startup(struct platform_device *pdev) /* toggle reset pin */ if (reset_gpio) { - gpiod_set_value_cansleep(reset_gpio, 0); + gpiod_set_value_cansleep(reset_gpio, 1); wmb(); mdelay(1); - gpiod_set_value_cansleep(reset_gpio, 1); + gpiod_set_value_cansleep(reset_gpio, 0); } return 0; } From b2747f108b8034271fd5289bd8f3a7003e0775a3 Mon Sep 17 00:00:00 2001 From: Benjamin Segall Date: Wed, 12 Jun 2024 12:44:44 -0700 Subject: [PATCH 0890/1578] x86/boot: Don't add the EFI stub to targets, again This is a re-commit of da05b143a308 ("x86/boot: Don't add the EFI stub to targets") after the tagged patch incorrectly reverted it. vmlinux-objs-y is added to targets, with an assumption that they are all relative to $(obj); adding a $(objtree)/drivers/... path causes the build to incorrectly create a useless arch/x86/boot/compressed/drivers/... directory tree. Fix this just by using a different make variable for the EFI stub. Fixes: cb8bda8ad443 ("x86/boot/compressed: Rename efi_thunk_64.S to efi-mixed.S") Signed-off-by: Ben Segall Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Ard Biesheuvel Cc: stable@vger.kernel.org # v6.1+ Link: https://lore.kernel.org/r/xm267ceukksz.fsf@bsegall.svl.corp.google.com --- arch/x86/boot/compressed/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 243ee86cb1b1ef..f2051644de9432 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -105,9 +105,9 @@ vmlinux-objs-$(CONFIG_UNACCEPTED_MEMORY) += $(obj)/mem.o vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_mixed.o -vmlinux-objs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a +vmlinux-libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a -$(obj)/vmlinux: $(vmlinux-objs-y) FORCE +$(obj)/vmlinux: $(vmlinux-objs-y) $(vmlinux-libs-y) FORCE $(call if_changed,ld) OBJCOPYFLAGS_vmlinux.bin := -R .comment -S From 0298f51652be47b79780833e0b63194e1231fa34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Amadeusz=20S=C5=82awi=C5=84ski?= Date: Thu, 13 Jun 2024 11:01:26 +0200 Subject: [PATCH 0891/1578] ASoC: topology: Fix route memory corruption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It was reported that recent fix for memory corruption during topology load, causes corruption in other cases. Instead of being overeager with checking topology, assume that it is properly formatted and just duplicate strings. Reported-by: Pierre-Louis Bossart Closes: https://lore.kernel.org/linux-sound/171812236450.201359.3019210915105428447.b4-ty@kernel.org/T/#m8c4bd5abf453960fde6f826c4b7f84881da63e9d Suggested-by: Péter Ujfalusi Signed-off-by: Amadeusz Sławiński Link: https://lore.kernel.org/r/20240613090126.841189-1-amadeuszx.slawinski@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/soc-topology.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c index 2ac442644ed4f9..6951ff7bc61e7c 100644 --- a/sound/soc/soc-topology.c +++ b/sound/soc/soc-topology.c @@ -1052,21 +1052,15 @@ static int soc_tplg_dapm_graph_elems_load(struct soc_tplg *tplg, break; } - route->source = devm_kmemdup(tplg->dev, elem->source, - min(strlen(elem->source), maxlen), - GFP_KERNEL); - route->sink = devm_kmemdup(tplg->dev, elem->sink, - min(strlen(elem->sink), maxlen), - GFP_KERNEL); + route->source = devm_kstrdup(tplg->dev, elem->source, GFP_KERNEL); + route->sink = devm_kstrdup(tplg->dev, elem->sink, GFP_KERNEL); if (!route->source || !route->sink) { ret = -ENOMEM; break; } if (strnlen(elem->control, maxlen) != 0) { - route->control = devm_kmemdup(tplg->dev, elem->control, - min(strlen(elem->control), maxlen), - GFP_KERNEL); + route->control = devm_kstrdup(tplg->dev, elem->control, GFP_KERNEL); if (!route->control) { ret = -ENOMEM; break; From 12243a8115f8583a6bcada7717c01fb164e23c89 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Thu, 30 May 2024 13:36:03 -0500 Subject: [PATCH 0892/1578] iommu/amd: Fix panic accessing amd_iommu_enable_faulting This fixes a bug introduced by commit d74169ceb0d2 ("iommu/vt-d: Allocate DMAR fault interrupts locally"). The panic happens when amd_iommu_enable_faulting is called from CPUHP_AP_ONLINE_DYN context. Fixes: d74169ceb0d2 ("iommu/vt-d: Allocate DMAR fault interrupts locally") Signed-off-by: Dimitri Sivanich Tested-by: Yi Zhang Reviewed-by: Jerry Snitselaar Reviewed-by: Vasant Hegde Link: https://lore.kernel.org/r/ZljHE/R4KLzGU6vx@hpe.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 27e2937270950b..16124806777647 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -3362,7 +3362,7 @@ int amd_iommu_reenable(int mode) return 0; } -int __init amd_iommu_enable_faulting(unsigned int cpu) +int amd_iommu_enable_faulting(unsigned int cpu) { /* We enable MSI later when PCI is initialized */ return 0; From 0e6b6dedf16800df0ff73ffe2bb5066514db29c2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 12 Jun 2024 16:15:55 +0200 Subject: [PATCH 0893/1578] ACPI: EC: Evaluate orphan _REG under EC device After starting to install the EC address space handler at the ACPI namespace root, if there is an "orphan" _REG method in the EC device's scope, it will not be evaluated any more. This breaks EC operation regions on some systems, like Asus gu605. To address this, use a wrapper around an existing ACPICA function to look for an "orphan" _REG method in the EC device scope and evaluate it if present. Fixes: 60fa6ae6e6d0 ("ACPI: EC: Install address space handler at the namespace root") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218945 Reported-by: VitaliiT Tested-by: VitaliiT Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/acevents.h | 4 +++ drivers/acpi/acpica/evregion.c | 6 +--- drivers/acpi/acpica/evxfregn.c | 54 ++++++++++++++++++++++++++++++++++ drivers/acpi/ec.c | 3 ++ include/acpi/acpixf.h | 4 +++ 5 files changed, 66 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/acpica/acevents.h b/drivers/acpi/acpica/acevents.h index ddd072cbc738d4..2133085deda77f 100644 --- a/drivers/acpi/acpica/acevents.h +++ b/drivers/acpi/acpica/acevents.h @@ -191,6 +191,10 @@ void acpi_ev_execute_reg_methods(struct acpi_namespace_node *node, acpi_adr_space_type space_id, u32 function); +void +acpi_ev_execute_orphan_reg_method(struct acpi_namespace_node *node, + acpi_adr_space_type space_id); + acpi_status acpi_ev_execute_reg_method(union acpi_operand_object *region_obj, u32 function); diff --git a/drivers/acpi/acpica/evregion.c b/drivers/acpi/acpica/evregion.c index 18fdf2bc2d499a..dc6004daf624b6 100644 --- a/drivers/acpi/acpica/evregion.c +++ b/drivers/acpi/acpica/evregion.c @@ -20,10 +20,6 @@ extern u8 acpi_gbl_default_address_spaces[]; /* Local prototypes */ -static void -acpi_ev_execute_orphan_reg_method(struct acpi_namespace_node *device_node, - acpi_adr_space_type space_id); - static acpi_status acpi_ev_reg_run(acpi_handle obj_handle, u32 level, void *context, void **return_value); @@ -818,7 +814,7 @@ acpi_ev_reg_run(acpi_handle obj_handle, * ******************************************************************************/ -static void +void acpi_ev_execute_orphan_reg_method(struct acpi_namespace_node *device_node, acpi_adr_space_type space_id) { diff --git a/drivers/acpi/acpica/evxfregn.c b/drivers/acpi/acpica/evxfregn.c index 3197e6303c5b08..624361a5f34d86 100644 --- a/drivers/acpi/acpica/evxfregn.c +++ b/drivers/acpi/acpica/evxfregn.c @@ -306,3 +306,57 @@ acpi_execute_reg_methods(acpi_handle device, acpi_adr_space_type space_id) } ACPI_EXPORT_SYMBOL(acpi_execute_reg_methods) + +/******************************************************************************* + * + * FUNCTION: acpi_execute_orphan_reg_method + * + * PARAMETERS: device - Handle for the device + * space_id - The address space ID + * + * RETURN: Status + * + * DESCRIPTION: Execute an "orphan" _REG method that appears under an ACPI + * device. This is a _REG method that has no corresponding region + * within the device's scope. + * + ******************************************************************************/ +acpi_status +acpi_execute_orphan_reg_method(acpi_handle device, acpi_adr_space_type space_id) +{ + struct acpi_namespace_node *node; + acpi_status status; + + ACPI_FUNCTION_TRACE(acpi_execute_orphan_reg_method); + + /* Parameter validation */ + + if (!device) { + return_ACPI_STATUS(AE_BAD_PARAMETER); + } + + status = acpi_ut_acquire_mutex(ACPI_MTX_NAMESPACE); + if (ACPI_FAILURE(status)) { + return_ACPI_STATUS(status); + } + + /* Convert and validate the device handle */ + + node = acpi_ns_validate_handle(device); + if (node) { + + /* + * If an "orphan" _REG method is present in the device's scope + * for the given address space ID, run it. + */ + + acpi_ev_execute_orphan_reg_method(node, space_id); + } else { + status = AE_BAD_PARAMETER; + } + + (void)acpi_ut_release_mutex(ACPI_MTX_NAMESPACE); + return_ACPI_STATUS(status); +} + +ACPI_EXPORT_SYMBOL(acpi_execute_orphan_reg_method) diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index 68dd17f96f636f..299ec653388ceb 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -1507,6 +1507,9 @@ static int ec_install_handlers(struct acpi_ec *ec, struct acpi_device *device, if (call_reg && !test_bit(EC_FLAGS_EC_REG_CALLED, &ec->flags)) { acpi_execute_reg_methods(scope_handle, ACPI_ADR_SPACE_EC); + if (scope_handle != ec->handle) + acpi_execute_orphan_reg_method(ec->handle, ACPI_ADR_SPACE_EC); + set_bit(EC_FLAGS_EC_REG_CALLED, &ec->flags); } diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 94d0fc3bd412d3..80dc36f9d5274c 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -662,6 +662,10 @@ ACPI_EXTERNAL_RETURN_STATUS(acpi_status acpi_execute_reg_methods(acpi_handle device, acpi_adr_space_type space_id)) +ACPI_EXTERNAL_RETURN_STATUS(acpi_status + acpi_execute_orphan_reg_method(acpi_handle device, + acpi_adr_space_type + space_id)) ACPI_EXTERNAL_RETURN_STATUS(acpi_status acpi_remove_address_space_handler(acpi_handle device, From 7c877115da4196fa108dcfefd49f5a9b67b8d8ca Mon Sep 17 00:00:00 2001 From: Riana Tauro Date: Thu, 6 Jun 2024 15:38:41 +0530 Subject: [PATCH 0894/1578] drm/xe/xe_gt_idle: use GT forcewake domain assertion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The rc6 registers used in disable_c6 function belong to the GT forcewake domain. Hence change the forcewake assertion to check GT forcewake domain. v2: add fixes tag (Himal) Fixes: 975e4a3795d4 ("drm/xe: Manually setup C6 when skip_guc_pc is set") Signed-off-by: Riana Tauro Reviewed-by: Rodrigo Vivi Reviewed-by: Himal Prasad Ghimiray Link: https://patchwork.freedesktop.org/patch/msgid/20240606100842.956072-2-riana.tauro@intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit 21b708554648177a0078962c31629bce31ef5d83) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_gt_idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_gt_idle.c b/drivers/gpu/drm/xe/xe_gt_idle.c index 8fc0f3f6ecc530..c699f47f4700da 100644 --- a/drivers/gpu/drm/xe/xe_gt_idle.c +++ b/drivers/gpu/drm/xe/xe_gt_idle.c @@ -199,7 +199,7 @@ void xe_gt_idle_enable_c6(struct xe_gt *gt) void xe_gt_idle_disable_c6(struct xe_gt *gt) { xe_device_assert_mem_access(gt_to_xe(gt)); - xe_force_wake_assert_held(gt_to_fw(gt), XE_FORCEWAKE_ALL); + xe_force_wake_assert_held(gt_to_fw(gt), XE_FW_GT); xe_mmio_write32(gt, PG_ENABLE, 0); xe_mmio_write32(gt, RC_CONTROL, 0); From cd554e1e118a6aa1c919309cd28398b003f69c1f Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Fri, 7 Jun 2024 17:31:55 +0200 Subject: [PATCH 0895/1578] drm/xe/pf: Assert LMEM provisioning is done only on DGFX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Local Memory (aka VRAM) is only available on DGFX platforms. We shouldn't attempt to provision VFs with LMEM or attempt to update the LMTT on non-DGFX platforms. Add missing asserts that would enforce that and fix release code that could crash on iGFX due to uninitialized LMTT. Fixes: 0698ff57bf32 ("drm/xe/pf: Update the LMTT when freeing VF GT config") Signed-off-by: Michal Wajdeczko Cc: Piotr Piórkowski Reviewed-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20240607153155.1592-1-michal.wajdeczko@intel.com (cherry picked from commit b321cb83a375bcc18cd0a4b62bdeaf6905cca769) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c index 476d613333a981..6c2cfc54442cec 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c @@ -1274,6 +1274,9 @@ static void pf_reset_vf_lmtt(struct xe_device *xe, unsigned int vfid) struct xe_tile *tile; unsigned int tid; + xe_assert(xe, IS_DGFX(xe)); + xe_assert(xe, IS_SRIOV_PF(xe)); + for_each_tile(tile, xe, tid) { lmtt = &tile->sriov.pf.lmtt; xe_lmtt_drop_pages(lmtt, vfid); @@ -1292,6 +1295,9 @@ static int pf_update_vf_lmtt(struct xe_device *xe, unsigned int vfid) unsigned int tid; int err; + xe_assert(xe, IS_DGFX(xe)); + xe_assert(xe, IS_SRIOV_PF(xe)); + total = 0; for_each_tile(tile, xe, tid) total += pf_get_vf_config_lmem(tile->primary_gt, vfid); @@ -1337,6 +1343,7 @@ static int pf_update_vf_lmtt(struct xe_device *xe, unsigned int vfid) static void pf_release_vf_config_lmem(struct xe_gt *gt, struct xe_gt_sriov_config *config) { + xe_gt_assert(gt, IS_DGFX(gt_to_xe(gt))); xe_gt_assert(gt, !xe_gt_is_media_type(gt)); lockdep_assert_held(xe_gt_sriov_pf_master_mutex(gt)); @@ -1355,6 +1362,7 @@ static int pf_provision_vf_lmem(struct xe_gt *gt, unsigned int vfid, u64 size) int err; xe_gt_assert(gt, vfid); + xe_gt_assert(gt, IS_DGFX(xe)); xe_gt_assert(gt, !xe_gt_is_media_type(gt)); size = round_up(size, pf_get_lmem_alignment(gt)); @@ -1745,11 +1753,14 @@ static void pf_reset_config_sched(struct xe_gt *gt, struct xe_gt_sriov_config *c static void pf_release_vf_config(struct xe_gt *gt, unsigned int vfid) { struct xe_gt_sriov_config *config = pf_pick_vf_config(gt, vfid); + struct xe_device *xe = gt_to_xe(gt); if (!xe_gt_is_media_type(gt)) { pf_release_vf_config_ggtt(gt, config); - pf_release_vf_config_lmem(gt, config); - pf_update_vf_lmtt(gt_to_xe(gt), vfid); + if (IS_DGFX(xe)) { + pf_release_vf_config_lmem(gt, config); + pf_update_vf_lmtt(xe, vfid); + } } pf_release_config_ctxs(gt, config); pf_release_config_dbs(gt, config); From b5e3a9b83f352a737b77a01734a6661d1130ed49 Mon Sep 17 00:00:00 2001 From: Andrzej Hajda Date: Wed, 5 Jun 2024 09:29:48 +0200 Subject: [PATCH 0896/1578] drm/xe: flush engine buffers before signalling user fence on all engines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tests show that user fence signalling requires kind of write barrier, otherwise not all writes performed by the workload will be available to userspace. It is already done for render and compute, we need it also for the rest: video, gsc, copy. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Signed-off-by: Andrzej Hajda Reviewed-by: Thomas Hellström Signed-off-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20240605-fix_user_fence_posted-v3-2-06e7932f784a@intel.com (cherry picked from commit 3ad7d18c5dad75ed38098c7cc3bc9594b4701399) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_ring_ops.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c index d42b3f33bd7a2b..aca7a9af6e846e 100644 --- a/drivers/gpu/drm/xe/xe_ring_ops.c +++ b/drivers/gpu/drm/xe/xe_ring_ops.c @@ -80,6 +80,16 @@ static int emit_store_imm_ggtt(u32 addr, u32 value, u32 *dw, int i) return i; } +static int emit_flush_dw(u32 *dw, int i) +{ + dw[i++] = MI_FLUSH_DW | MI_FLUSH_IMM_DW; + dw[i++] = 0; + dw[i++] = 0; + dw[i++] = 0; + + return i; +} + static int emit_flush_imm_ggtt(u32 addr, u32 value, bool invalidate_tlb, u32 *dw, int i) { @@ -234,10 +244,12 @@ static void __emit_job_gen12_simple(struct xe_sched_job *job, struct xe_lrc *lrc i = emit_bb_start(batch_addr, ppgtt_flag, dw, i); - if (job->user_fence.used) + if (job->user_fence.used) { + i = emit_flush_dw(dw, i); i = emit_store_imm_ppgtt_posted(job->user_fence.addr, job->user_fence.value, dw, i); + } i = emit_flush_imm_ggtt(xe_lrc_seqno_ggtt_addr(lrc), seqno, false, dw, i); @@ -293,10 +305,12 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc, i = emit_bb_start(batch_addr, ppgtt_flag, dw, i); - if (job->user_fence.used) + if (job->user_fence.used) { + i = emit_flush_dw(dw, i); i = emit_store_imm_ppgtt_posted(job->user_fence.addr, job->user_fence.value, dw, i); + } i = emit_flush_imm_ggtt(xe_lrc_seqno_ggtt_addr(lrc), seqno, false, dw, i); From 2470b141bfae2b9695b5b6823e3b978b22d33dde Mon Sep 17 00:00:00 2001 From: Riana Tauro Date: Thu, 6 Jun 2024 15:38:42 +0530 Subject: [PATCH 0897/1578] drm/xe: move disable_c6 call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit disable c6 called in guc_pc_fini_hw is unreachable. GuC PC init returns earlier if skip_guc_pc is true and never registers the finish call thus making disable_c6 unreachable. move this call to gt idle. v2: rebase v3: add fixes tag (Himal) Fixes: 975e4a3795d4 ("drm/xe: Manually setup C6 when skip_guc_pc is set") Signed-off-by: Riana Tauro Reviewed-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20240606100842.956072-3-riana.tauro@intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit 6800e63cf97bae62bca56d8e691544540d945f53) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_gt_idle.c | 7 +++++++ drivers/gpu/drm/xe/xe_guc_pc.c | 6 ------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_idle.c b/drivers/gpu/drm/xe/xe_gt_idle.c index c699f47f4700da..944770fb2daff0 100644 --- a/drivers/gpu/drm/xe/xe_gt_idle.c +++ b/drivers/gpu/drm/xe/xe_gt_idle.c @@ -147,6 +147,13 @@ static const struct attribute *gt_idle_attrs[] = { static void gt_idle_sysfs_fini(struct drm_device *drm, void *arg) { struct kobject *kobj = arg; + struct xe_gt *gt = kobj_to_gt(kobj->parent); + + if (gt_to_xe(gt)->info.skip_guc_pc) { + XE_WARN_ON(xe_force_wake_get(gt_to_fw(gt), XE_FW_GT)); + xe_gt_idle_disable_c6(gt); + xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); + } sysfs_remove_files(kobj, gt_idle_attrs); kobject_put(kobj); diff --git a/drivers/gpu/drm/xe/xe_guc_pc.c b/drivers/gpu/drm/xe/xe_guc_pc.c index 509649d0e65e1c..23382ced4ea747 100644 --- a/drivers/gpu/drm/xe/xe_guc_pc.c +++ b/drivers/gpu/drm/xe/xe_guc_pc.c @@ -895,12 +895,6 @@ int xe_guc_pc_stop(struct xe_guc_pc *pc) static void xe_guc_pc_fini(struct drm_device *drm, void *arg) { struct xe_guc_pc *pc = arg; - struct xe_device *xe = pc_to_xe(pc); - - if (xe->info.skip_guc_pc) { - xe_gt_idle_disable_c6(pc_to_gt(pc)); - return; - } XE_WARN_ON(xe_force_wake_get(gt_to_fw(pc_to_gt(pc)), XE_FORCEWAKE_ALL)); XE_WARN_ON(xe_guc_pc_gucrc_disable(pc)); From ea5f8c4cffcd8a6b62b3a3bd5008275218c9d02a Mon Sep 17 00:00:00 2001 From: Andy Chi Date: Wed, 5 Jun 2024 17:22:41 +0800 Subject: [PATCH 0898/1578] ALSA: hda/realtek: fix mute/micmute LEDs don't work for ProBook 445/465 G11. HP ProBook 445/465 G11 needs ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF quirk to make mic-mute/audio-mute working. Signed-off-by: Andy Chi Cc: Link: https://lore.kernel.org/r/20240605092243.41963-1-andy.chi@canonical.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index aa76d1c885895a..54a52c1480707b 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10194,6 +10194,10 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8c70, "HP EliteBook 835 G11", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8c71, "HP EliteBook 845 G11", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8c72, "HP EliteBook 865 G11", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8c7b, "HP ProBook 445 G11", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), + SND_PCI_QUIRK(0x103c, 0x8c7c, "HP ProBook 445 G11", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), + SND_PCI_QUIRK(0x103c, 0x8c7d, "HP ProBook 465 G11", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), + SND_PCI_QUIRK(0x103c, 0x8c7e, "HP ProBook 465 G11", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x8c89, "HP ProBook 460 G11", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8c8a, "HP EliteBook 630", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8c8c, "HP EliteBook 660", ALC236_FIXUP_HP_GPIO_LED), From 86a433862912f52597263aa224a9ed82bcd533bf Mon Sep 17 00:00:00 2001 From: Edson Juliano Drosdeck Date: Wed, 5 Jun 2024 12:39:23 -0300 Subject: [PATCH 0899/1578] ALSA: hda/realtek: Limit mic boost on N14AP7 The internal mic boost on the N14AP7 is too high. Fix this by applying the ALC269_FIXUP_LIMIT_INT_MIC_BOOST fixup to the machine to limit the gain. Signed-off-by: Edson Juliano Drosdeck Cc: Link: https://lore.kernel.org/r/20240605153923.2837-1-edson.drosdeck@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 54a52c1480707b..da54300279af87 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10585,6 +10585,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1b7d, 0xa831, "Ordissimo EVE2 ", ALC269VB_FIXUP_ORDISSIMO_EVE2), /* Also known as Malata PC-B1303 */ SND_PCI_QUIRK(0x1c06, 0x2013, "Lemote A1802", ALC269_FIXUP_LEMOTE_A1802), SND_PCI_QUIRK(0x1c06, 0x2015, "Lemote A190X", ALC269_FIXUP_LEMOTE_A190X), + SND_PCI_QUIRK(0x1c6c, 0x122a, "Positivo N14AP7", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), SND_PCI_QUIRK(0x1c6c, 0x1251, "Positivo N14KP6-TG", ALC288_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1d05, 0x1132, "TongFang PHxTxX1", ALC256_FIXUP_SET_COEF_DEFAULTS), SND_PCI_QUIRK(0x1d05, 0x1096, "TongFang GMxMRxx", ALC269_FIXUP_NO_SHUTUP), From e799bdf51d54bebaf939fdb655aad424e624c1b1 Mon Sep 17 00:00:00 2001 From: "Dustin L. Howett" Date: Wed, 5 Jun 2024 12:01:32 -0500 Subject: [PATCH 0900/1578] ALSA: hda/realtek: Remove Framework Laptop 16 from quirks The Framework Laptop 16 does not have a combination headphone/headset 3.5mm jack; however, applying the pincfg from the Laptop 13 (nid=0x19) erroneously informs hda that the node is present. Fixes: 8804fa04a492 ("ALSA: hda/realtek: Add Framework laptop 16 to quirks") Signed-off-by: Dustin L. Howett Reviewed-by: Mario Limonciello Link: https://lore.kernel.org/r/20240605-alsa-hda-realtek-remove-framework-laptop-16-from-quirks-v1-1-11d47fe8ec4d@howett.net Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index da54300279af87..8408e0b0730a6f 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10610,7 +10610,6 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x8086, 0x2081, "Intel NUC 10", ALC256_FIXUP_INTEL_NUC10), SND_PCI_QUIRK(0x8086, 0x3038, "Intel NUC 13", ALC295_FIXUP_CHROME_BOOK), SND_PCI_QUIRK(0xf111, 0x0001, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), - SND_PCI_QUIRK(0xf111, 0x0005, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), SND_PCI_QUIRK(0xf111, 0x0006, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), #if 0 From 82f3daed2d3590fa286a02301573a183dd902a0f Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Thu, 6 Jun 2024 14:03:48 +0100 Subject: [PATCH 0901/1578] ALSA: hda: cs35l41: Support Lenovo Thinkbook 16P Gen 5 This laptop does not contain _DSD so needs to be supported using the configuration table. Signed-off-by: Stefan Binding Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20240606130351.333495-2-sbinding@opensource.cirrus.com --- sound/pci/hda/cs35l41_hda_property.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/pci/hda/cs35l41_hda_property.c b/sound/pci/hda/cs35l41_hda_property.c index 6a7a6d486916ad..046a94250683da 100644 --- a/sound/pci/hda/cs35l41_hda_property.c +++ b/sound/pci/hda/cs35l41_hda_property.c @@ -128,6 +128,8 @@ static const struct cs35l41_config cs35l41_config_table[] = { { "17AA38B5", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, { "17AA38B6", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, { "17AA38B7", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, + { "17AA38F9", 2, EXTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, 0, 0 }, 0, 2, -1, 0, 0, 0 }, + { "17AA38FA", 2, EXTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, 0, 0 }, 0, 2, -1, 0, 0, 0 }, {} }; @@ -529,6 +531,8 @@ static const struct cs35l41_prop_model cs35l41_prop_model_table[] = { { "CSC3551", "17AA38B5", generic_dsd_config }, { "CSC3551", "17AA38B6", generic_dsd_config }, { "CSC3551", "17AA38B7", generic_dsd_config }, + { "CSC3551", "17AA38F9", generic_dsd_config }, + { "CSC3551", "17AA38FA", generic_dsd_config }, {} }; From b32f92d1af3789038f03c2899e3be0d00b43faf2 Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Thu, 6 Jun 2024 14:03:49 +0100 Subject: [PATCH 0902/1578] ALSA: hda: cs35l41: Support Lenovo Thinkbook 13x Gen 4 This laptop does not contain _DSD so needs to be supported using the configuration table. Signed-off-by: Stefan Binding Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20240606130351.333495-3-sbinding@opensource.cirrus.com --- sound/pci/hda/cs35l41_hda_property.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/pci/hda/cs35l41_hda_property.c b/sound/pci/hda/cs35l41_hda_property.c index 046a94250683da..51998d1c72ff11 100644 --- a/sound/pci/hda/cs35l41_hda_property.c +++ b/sound/pci/hda/cs35l41_hda_property.c @@ -128,6 +128,8 @@ static const struct cs35l41_config cs35l41_config_table[] = { { "17AA38B5", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, { "17AA38B6", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, { "17AA38B7", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, + { "17AA38C7", 4, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, CS35L41_LEFT, CS35L41_RIGHT }, 0, 2, -1, 1000, 4500, 24 }, + { "17AA38C8", 4, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, CS35L41_LEFT, CS35L41_RIGHT }, 0, 2, -1, 1000, 4500, 24 }, { "17AA38F9", 2, EXTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, 0, 0 }, 0, 2, -1, 0, 0, 0 }, { "17AA38FA", 2, EXTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, 0, 0 }, 0, 2, -1, 0, 0, 0 }, {} @@ -531,6 +533,8 @@ static const struct cs35l41_prop_model cs35l41_prop_model_table[] = { { "CSC3551", "17AA38B5", generic_dsd_config }, { "CSC3551", "17AA38B6", generic_dsd_config }, { "CSC3551", "17AA38B7", generic_dsd_config }, + { "CSC3551", "17AA38C7", generic_dsd_config }, + { "CSC3551", "17AA38C8", generic_dsd_config }, { "CSC3551", "17AA38F9", generic_dsd_config }, { "CSC3551", "17AA38FA", generic_dsd_config }, {} From 75f2ea939b5c694b36aad8ef823a2f9bcf7b3d7d Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Thu, 6 Jun 2024 14:03:50 +0100 Subject: [PATCH 0903/1578] ALSA: hda/realtek: Support Lenovo Thinkbook 16P Gen 5 Add support for this laptop, which uses CS35L41 HDA amps. The laptop does not contain valid _DSD for these amps, so requires entries into the CS35L41 configuration table to function correctly. [ fixed to lower hex numbers in quirk entries -- tiwai ] Signed-off-by: Stefan Binding Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20240606130351.333495-4-sbinding@opensource.cirrus.com --- sound/pci/hda/patch_realtek.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 8408e0b0730a6f..2320f04eca5aae 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10548,6 +10548,8 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x38cd, "Y790 VECO DUAL", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38d2, "Lenovo Yoga 9 14IMH9", ALC287_FIXUP_YOGA9_14IMH9_BASS_SPK_PIN), SND_PCI_QUIRK(0x17aa, 0x38d7, "Lenovo Yoga 9 14IMH9", ALC287_FIXUP_YOGA9_14IMH9_BASS_SPK_PIN), + SND_PCI_QUIRK(0x17aa, 0x38f9, "Thinkbook 16P Gen5", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x17aa, 0x38fa, "Thinkbook 16P Gen5", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x17aa, 0x3902, "Lenovo E50-80", ALC269_FIXUP_DMIC_THINKPAD_ACPI), SND_PCI_QUIRK(0x17aa, 0x3977, "IdeaPad S210", ALC283_FIXUP_INT_MIC), SND_PCI_QUIRK(0x17aa, 0x3978, "Lenovo B50-70", ALC269_FIXUP_DMIC_THINKPAD_ACPI), From 4ecb16d9250e6fcf8818572bf317b6adae16515b Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Thu, 6 Jun 2024 14:03:51 +0100 Subject: [PATCH 0904/1578] ALSA: hda/realtek: Support Lenovo Thinkbook 13x Gen 4 Add support for this laptop, which uses CS35L41 HDA amps. The laptop does not contain valid _DSD for these amps, so requires entries into the CS35L41 configuration table to function correctly. Signed-off-by: Stefan Binding Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20240606130351.333495-5-sbinding@opensource.cirrus.com --- sound/pci/hda/patch_realtek.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 2320f04eca5aae..79736c8782a31c 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10544,6 +10544,8 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x38be, "Yoga S980-14.5 proX YC Dual", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38bf, "Yoga S980-14.5 proX LX Dual", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38c3, "Y980 DUAL", ALC287_FIXUP_TAS2781_I2C), + SND_PCI_QUIRK(0x17aa, 0x38c7, "Thinkbook 13x Gen 4", ALC287_FIXUP_CS35L41_I2C_4), + SND_PCI_QUIRK(0x17aa, 0x38c8, "Thinkbook 13x Gen 4", ALC287_FIXUP_CS35L41_I2C_4), SND_PCI_QUIRK(0x17aa, 0x38cb, "Y790 YG DUAL", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38cd, "Y790 VECO DUAL", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38d2, "Lenovo Yoga 9 14IMH9", ALC287_FIXUP_YOGA9_14IMH9_BASS_SPK_PIN), From 2646b43910c0e6d7f4ad535919b44b88f98c688d Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Fri, 7 Jun 2024 09:00:21 +0300 Subject: [PATCH 0905/1578] ALSA/hda: intel-dsp-config: Document AVS as dsp_driver option dsp_driver=4 will force the AVS driver stack to be used, it is better to docuement this. Fixes: 1affc44ea5dd ("ASoC: Intel: avs: PCI driver implementation") Signed-off-by: Peter Ujfalusi Reviewed-by: Cezary Rojewski Link: https://lore.kernel.org/r/20240607060021.11503-1-peter.ujfalusi@linux.intel.com Signed-off-by: Takashi Iwai --- sound/hda/intel-dsp-config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/hda/intel-dsp-config.c b/sound/hda/intel-dsp-config.c index 537863447358e0..478d2b50c571de 100644 --- a/sound/hda/intel-dsp-config.c +++ b/sound/hda/intel-dsp-config.c @@ -18,7 +18,7 @@ static int dsp_driver; module_param(dsp_driver, int, 0444); -MODULE_PARM_DESC(dsp_driver, "Force the DSP driver for Intel DSP (0=auto, 1=legacy, 2=SST, 3=SOF)"); +MODULE_PARM_DESC(dsp_driver, "Force the DSP driver for Intel DSP (0=auto, 1=legacy, 2=SST, 3=SOF, 4=AVS)"); #define FLAG_SST BIT(0) #define FLAG_SOF BIT(1) From 8eef5c3cea65f248c99cd9dcb3f84c6509b78162 Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Tue, 11 Jun 2024 09:24:55 -0700 Subject: [PATCH 0906/1578] Revert "igc: fix a log entry using uninitialized netdev" This reverts commit 86167183a17e03ec77198897975e9fdfbd53cb0b. igc_ptp_init() needs to be called before igc_reset(), otherwise kernel crash could be observed. Following the corresponding discussion [1] and [2] revert this commit. Link: https://lore.kernel.org/all/8fb634f8-7330-4cf4-a8ce-485af9c0a61a@intel.com/ [1] Link: https://lore.kernel.org/all/87o78rmkhu.fsf@intel.com/ [2] Fixes: 86167183a17e ("igc: fix a log entry using uninitialized netdev") Signed-off-by: Sasha Neftin Tested-by: Naama Meir Signed-off-by: Tony Nguyen Link: https://lore.kernel.org/r/20240611162456.961631-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/igc/igc_main.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 305e05294a2659..87b655b839c1c7 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -7032,6 +7032,8 @@ static int igc_probe(struct pci_dev *pdev, device_set_wakeup_enable(&adapter->pdev->dev, adapter->flags & IGC_FLAG_WOL_SUPPORTED); + igc_ptp_init(adapter); + igc_tsn_clear_schedule(adapter); /* reset the hardware with the new settings */ @@ -7053,9 +7055,6 @@ static int igc_probe(struct pci_dev *pdev, /* Check if Media Autosense is enabled */ adapter->ei = *ei; - /* do hw tstamp init after resetting */ - igc_ptp_init(adapter); - /* print pcie link status and MAC address */ pcie_print_link_status(pdev); netdev_info(netdev, "MAC: %pM\n", netdev->dev_addr); From 79f18a41dd056115d685f3b0a419c7cd40055e13 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 12 Jun 2024 06:04:46 +0000 Subject: [PATCH 0907/1578] ionic: fix use after netif_napi_del() When queues are started, netif_napi_add() and napi_enable() are called. If there are 4 queues and only 3 queues are used for the current configuration, only 3 queues' napi should be registered and enabled. The ionic_qcq_enable() checks whether the .poll pointer is not NULL for enabling only the using queue' napi. Unused queues' napi will not be registered by netif_napi_add(), so the .poll pointer indicates NULL. But it couldn't distinguish whether the napi was unregistered or not because netif_napi_del() doesn't reset the .poll pointer to NULL. So, ionic_qcq_enable() calls napi_enable() for the queue, which was unregistered by netif_napi_del(). Reproducer: ethtool -L rx 1 tx 1 combined 0 ethtool -L rx 0 tx 0 combined 1 ethtool -L rx 0 tx 0 combined 4 Splat looks like: kernel BUG at net/core/dev.c:6666! Oops: invalid opcode: 0000 [#1] PREEMPT SMP NOPTI CPU: 3 PID: 1057 Comm: kworker/3:3 Not tainted 6.10.0-rc2+ #16 Workqueue: events ionic_lif_deferred_work [ionic] RIP: 0010:napi_enable+0x3b/0x40 Code: 48 89 c2 48 83 e2 f6 80 b9 61 09 00 00 00 74 0d 48 83 bf 60 01 00 00 00 74 03 80 ce 01 f0 4f RSP: 0018:ffffb6ed83227d48 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff97560cda0828 RCX: 0000000000000029 RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff97560cda0a28 RBP: ffffb6ed83227d50 R08: 0000000000000400 R09: 0000000000000001 R10: 0000000000000001 R11: 0000000000000001 R12: 0000000000000000 R13: ffff97560ce3c1a0 R14: 0000000000000000 R15: ffff975613ba0a20 FS: 0000000000000000(0000) GS:ffff975d5f780000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f8f734ee200 CR3: 0000000103e50000 CR4: 00000000007506f0 PKRU: 55555554 Call Trace: ? die+0x33/0x90 ? do_trap+0xd9/0x100 ? napi_enable+0x3b/0x40 ? do_error_trap+0x83/0xb0 ? napi_enable+0x3b/0x40 ? napi_enable+0x3b/0x40 ? exc_invalid_op+0x4e/0x70 ? napi_enable+0x3b/0x40 ? asm_exc_invalid_op+0x16/0x20 ? napi_enable+0x3b/0x40 ionic_qcq_enable+0xb7/0x180 [ionic 59bdfc8a035436e1c4224ff7d10789e3f14643f8] ionic_start_queues+0xc4/0x290 [ionic 59bdfc8a035436e1c4224ff7d10789e3f14643f8] ionic_link_status_check+0x11c/0x170 [ionic 59bdfc8a035436e1c4224ff7d10789e3f14643f8] ionic_lif_deferred_work+0x129/0x280 [ionic 59bdfc8a035436e1c4224ff7d10789e3f14643f8] process_one_work+0x145/0x360 worker_thread+0x2bb/0x3d0 ? __pfx_worker_thread+0x10/0x10 kthread+0xcc/0x100 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2d/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 Fixes: 0f3154e6bcb3 ("ionic: Add Tx and Rx handling") Signed-off-by: Taehee Yoo Reviewed-by: Brett Creeley Reviewed-by: Shannon Nelson Link: https://lore.kernel.org/r/20240612060446.1754392-1-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/pensando/ionic/ionic_lif.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index 24870da3f48488..1934e9d6d9e4fb 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -304,10 +304,8 @@ static int ionic_qcq_enable(struct ionic_qcq *qcq) if (ret) return ret; - if (qcq->napi.poll) - napi_enable(&qcq->napi); - if (qcq->flags & IONIC_QCQ_F_INTR) { + napi_enable(&qcq->napi); irq_set_affinity_hint(qcq->intr.vector, &qcq->intr.affinity_mask); ionic_intr_mask(idev->intr_ctrl, qcq->intr.index, From 6f4d93b78ade0a4c2cafd587f7b429ce95abb02e Mon Sep 17 00:00:00 2001 From: Ziwei Xiao Date: Wed, 12 Jun 2024 00:16:54 +0000 Subject: [PATCH 0908/1578] gve: Clear napi->skb before dev_kfree_skb_any() gve_rx_free_skb incorrectly leaves napi->skb referencing an skb after it is freed with dev_kfree_skb_any(). This can result in a subsequent call to napi_get_frags returning a dangling pointer. Fix this by clearing napi->skb before the skb is freed. Fixes: 9b8dd5e5ea48 ("gve: DQO: Add RX path") Cc: stable@vger.kernel.org Reported-by: Shailend Chand Signed-off-by: Ziwei Xiao Reviewed-by: Harshitha Ramamurthy Reviewed-by: Shailend Chand Reviewed-by: Praveen Kaligineedi Link: https://lore.kernel.org/r/20240612001654.923887-1-ziweixiao@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve_rx_dqo.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve_rx_dqo.c b/drivers/net/ethernet/google/gve/gve_rx_dqo.c index c1c912de59c726..1154c1d8f66f05 100644 --- a/drivers/net/ethernet/google/gve/gve_rx_dqo.c +++ b/drivers/net/ethernet/google/gve/gve_rx_dqo.c @@ -647,11 +647,13 @@ static void gve_rx_skb_hash(struct sk_buff *skb, skb_set_hash(skb, le32_to_cpu(compl_desc->hash), hash_type); } -static void gve_rx_free_skb(struct gve_rx_ring *rx) +static void gve_rx_free_skb(struct napi_struct *napi, struct gve_rx_ring *rx) { if (!rx->ctx.skb_head) return; + if (rx->ctx.skb_head == napi->skb) + napi->skb = NULL; dev_kfree_skb_any(rx->ctx.skb_head); rx->ctx.skb_head = NULL; rx->ctx.skb_tail = NULL; @@ -950,7 +952,7 @@ int gve_rx_poll_dqo(struct gve_notify_block *block, int budget) err = gve_rx_dqo(napi, rx, compl_desc, complq->head, rx->q_num); if (err < 0) { - gve_rx_free_skb(rx); + gve_rx_free_skb(napi, rx); u64_stats_update_begin(&rx->statss); if (err == -ENOMEM) rx->rx_skb_alloc_fail++; @@ -993,7 +995,7 @@ int gve_rx_poll_dqo(struct gve_notify_block *block, int budget) /* gve_rx_complete_skb() will consume skb if successful */ if (gve_rx_complete_skb(rx, napi, compl_desc, feat) != 0) { - gve_rx_free_skb(rx); + gve_rx_free_skb(napi, rx); u64_stats_update_begin(&rx->statss); rx->rx_desc_err_dropped_pkt++; u64_stats_update_end(&rx->statss); From 7da9dfdd5a3dbfd3d2450d9c6a3d1d699d625c43 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 11 Jun 2024 09:27:38 +0200 Subject: [PATCH 0909/1578] .editorconfig: remove trim_trailing_whitespace option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some editors (like the vim variants), when seeing "trim_whitespace" decide to do just that for all of the whitespace in the file you are saving, even if it is not on a line that you have modified. This plays havoc with diffs and is NOT something that should be intended. As the "only trim whitespace on modified lines" is not part of the editorconfig standard yet, just delete these lines from the .editorconfig file so that we don't end up with diffs that are automatically rejected by maintainers for containing things they shouldn't. Cc: Danny Lin Cc: Íñigo Huguet Cc: Mickaël Salaün Cc: Masahiro Yamada Fixes: 5a602de99797 ("Add .editorconfig file for basic formatting") Acked-by: Vincent Mailhol Link: https://lore.kernel.org/r/2024061137-jawless-dipped-e789@gregkh Signed-off-by: Greg Kroah-Hartman --- .editorconfig | 3 --- 1 file changed, 3 deletions(-) diff --git a/.editorconfig b/.editorconfig index 854773350cc5a9..29a30ccfc07bf2 100644 --- a/.editorconfig +++ b/.editorconfig @@ -5,7 +5,6 @@ root = true [{*.{awk,c,dts,dtsi,dtso,h,mk,s,S},Kconfig,Makefile,Makefile.*}] charset = utf-8 end_of_line = lf -trim_trailing_whitespace = true insert_final_newline = true indent_style = tab indent_size = 8 @@ -13,7 +12,6 @@ indent_size = 8 [*.{json,py,rs}] charset = utf-8 end_of_line = lf -trim_trailing_whitespace = true insert_final_newline = true indent_style = space indent_size = 4 @@ -26,7 +24,6 @@ indent_size = 8 [*.yaml] charset = utf-8 end_of_line = lf -trim_trailing_whitespace = unset insert_final_newline = true indent_style = space indent_size = 2 From 7d9df38c9c037ab84502ce7eeae9f1e1e7e72603 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Wed, 12 Jun 2024 16:17:36 -0700 Subject: [PATCH 0910/1578] bnxt_en: Cap the size of HWRM_PORT_PHY_QCFG forwarded response Firmware interface 1.10.2.118 has increased the size of HWRM_PORT_PHY_QCFG response beyond the maximum size that can be forwarded. When the VF's link state is not the default auto state, the PF will need to forward the response back to the VF to indicate the forced state. This regression may cause the VF to fail to initialize. Fix it by capping the HWRM_PORT_PHY_QCFG response to the maximum 96 bytes. The SPEEDS2_SUPPORTED flag needs to be cleared because the new speeds2 fields are beyond the legacy structure. Also modify bnxt_hwrm_fwd_resp() to print a warning if the message size exceeds 96 bytes to make this failure more obvious. Fixes: 84a911db8305 ("bnxt_en: Update firmware interface to 1.10.2.118") Reviewed-by: Somnath Kotur Reviewed-by: Pavan Chebbi Signed-off-by: Michael Chan Link: https://lore.kernel.org/r/20240612231736.57823-1-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 51 +++++++++++++++++++ .../net/ethernet/broadcom/bnxt/bnxt_sriov.c | 12 ++++- 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 656ab81c0272d9..bbc7edccd5a4d0 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1434,6 +1434,57 @@ struct bnxt_l2_filter { atomic_t refcnt; }; +/* Compat version of hwrm_port_phy_qcfg_output capped at 96 bytes. The + * first 95 bytes are identical to hwrm_port_phy_qcfg_output in bnxt_hsi.h. + * The last valid byte in the compat version is different. + */ +struct hwrm_port_phy_qcfg_output_compat { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 link; + u8 active_fec_signal_mode; + __le16 link_speed; + u8 duplex_cfg; + u8 pause; + __le16 support_speeds; + __le16 force_link_speed; + u8 auto_mode; + u8 auto_pause; + __le16 auto_link_speed; + __le16 auto_link_speed_mask; + u8 wirespeed; + u8 lpbk; + u8 force_pause; + u8 module_status; + __le32 preemphasis; + u8 phy_maj; + u8 phy_min; + u8 phy_bld; + u8 phy_type; + u8 media_type; + u8 xcvr_pkg_type; + u8 eee_config_phy_addr; + u8 parallel_detect; + __le16 link_partner_adv_speeds; + u8 link_partner_adv_auto_mode; + u8 link_partner_adv_pause; + __le16 adv_eee_link_speed_mask; + __le16 link_partner_adv_eee_link_speed_mask; + __le32 xcvr_identifier_type_tx_lpi_timer; + __le16 fec_cfg; + u8 duplex_state; + u8 option_flags; + char phy_vendor_name[16]; + char phy_vendor_partnumber[16]; + __le16 support_pam4_speeds; + __le16 force_pam4_link_speed; + __le16 auto_pam4_link_speed_mask; + u8 link_partner_pam4_adv_speeds; + u8 valid; +}; + struct bnxt_link_info { u8 phy_type; u8 media_type; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c index 175192ebaa773a..22898d3d088b05 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c @@ -950,8 +950,11 @@ static int bnxt_hwrm_fwd_resp(struct bnxt *bp, struct bnxt_vf_info *vf, struct hwrm_fwd_resp_input *req; int rc; - if (BNXT_FWD_RESP_SIZE_ERR(msg_size)) + if (BNXT_FWD_RESP_SIZE_ERR(msg_size)) { + netdev_warn_once(bp->dev, "HWRM fwd response too big (%d bytes)\n", + msg_size); return -EINVAL; + } rc = hwrm_req_init(bp, req, HWRM_FWD_RESP); if (!rc) { @@ -1085,7 +1088,7 @@ static int bnxt_vf_set_link(struct bnxt *bp, struct bnxt_vf_info *vf) rc = bnxt_hwrm_exec_fwd_resp( bp, vf, sizeof(struct hwrm_port_phy_qcfg_input)); } else { - struct hwrm_port_phy_qcfg_output phy_qcfg_resp = {0}; + struct hwrm_port_phy_qcfg_output_compat phy_qcfg_resp = {}; struct hwrm_port_phy_qcfg_input *phy_qcfg_req; phy_qcfg_req = @@ -1096,6 +1099,11 @@ static int bnxt_vf_set_link(struct bnxt *bp, struct bnxt_vf_info *vf) mutex_unlock(&bp->link_lock); phy_qcfg_resp.resp_len = cpu_to_le16(sizeof(phy_qcfg_resp)); phy_qcfg_resp.seq_id = phy_qcfg_req->seq_id; + /* New SPEEDS2 fields are beyond the legacy structure, so + * clear the SPEEDS2_SUPPORTED flag. + */ + phy_qcfg_resp.option_flags &= + ~PORT_PHY_QCAPS_RESP_FLAGS2_SPEEDS2_SUPPORTED; phy_qcfg_resp.valid = 1; if (vf->flags & BNXT_VF_LINK_UP) { From a6736a0addd60fccc3a3508461d72314cc609772 Mon Sep 17 00:00:00 2001 From: Rao Shoaib Date: Tue, 11 Jun 2024 01:46:39 -0700 Subject: [PATCH 0911/1578] af_unix: Read with MSG_PEEK loops if the first unread byte is OOB Read with MSG_PEEK flag loops if the first byte to read is an OOB byte. commit 22dd70eb2c3d ("af_unix: Don't peek OOB data without MSG_OOB.") addresses the loop issue but does not address the issue that no data beyond OOB byte can be read. >>> from socket import * >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM) >>> c1.send(b'a', MSG_OOB) 1 >>> c1.send(b'b') 1 >>> c2.recv(1, MSG_PEEK | MSG_DONTWAIT) b'b' >>> from socket import * >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM) >>> c2.setsockopt(SOL_SOCKET, SO_OOBINLINE, 1) >>> c1.send(b'a', MSG_OOB) 1 >>> c1.send(b'b') 1 >>> c2.recv(1, MSG_PEEK | MSG_DONTWAIT) b'a' >>> c2.recv(1, MSG_PEEK | MSG_DONTWAIT) b'a' >>> c2.recv(1, MSG_DONTWAIT) b'a' >>> c2.recv(1, MSG_PEEK | MSG_DONTWAIT) b'b' >>> Fixes: 314001f0bf92 ("af_unix: Add OOB support") Signed-off-by: Rao Shoaib Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240611084639.2248934-1-Rao.Shoaib@oracle.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 80846279de9f3b..5e695a9a609c26 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2625,18 +2625,18 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, if (skb == u->oob_skb) { if (copied) { skb = NULL; - } else if (sock_flag(sk, SOCK_URGINLINE)) { - if (!(flags & MSG_PEEK)) { + } else if (!(flags & MSG_PEEK)) { + if (sock_flag(sk, SOCK_URGINLINE)) { WRITE_ONCE(u->oob_skb, NULL); consume_skb(skb); + } else { + __skb_unlink(skb, &sk->sk_receive_queue); + WRITE_ONCE(u->oob_skb, NULL); + unlinked_skb = skb; + skb = skb_peek(&sk->sk_receive_queue); } - } else if (flags & MSG_PEEK) { - skb = NULL; - } else { - __skb_unlink(skb, &sk->sk_receive_queue); - WRITE_ONCE(u->oob_skb, NULL); - unlinked_skb = skb; - skb = skb_peek(&sk->sk_receive_queue); + } else if (!sock_flag(sk, SOCK_URGINLINE)) { + skb = skb_peek_next(skb, &sk->sk_receive_queue); } } From a9b9741854a9fe9df948af49ca5514e0ed0429df Mon Sep 17 00:00:00 2001 From: Aleksandr Mishin Date: Tue, 11 Jun 2024 11:25:46 +0300 Subject: [PATCH 0912/1578] bnxt_en: Adjust logging of firmware messages in case of released token in __hwrm_send() In case of token is released due to token->state == BNXT_HWRM_DEFERRED, released token (set to NULL) is used in log messages. This issue is expected to be prevented by HWRM_ERR_CODE_PF_UNAVAILABLE error code. But this error code is returned by recent firmware. So some firmware may not return it. This may lead to NULL pointer dereference. Adjust this issue by adding token pointer check. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 8fa4219dba8e ("bnxt_en: add dynamic debug support for HWRM messages") Suggested-by: Michael Chan Signed-off-by: Aleksandr Mishin Reviewed-by: Wojciech Drewek Reviewed-by: Michael Chan Link: https://lore.kernel.org/r/20240611082547.12178-1-amishin@t-argos.ru Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.c index 1df3d56cc4b513..d2fd2d04ed474a 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.c @@ -680,7 +680,7 @@ static int __hwrm_send(struct bnxt *bp, struct bnxt_hwrm_ctx *ctx) req_type); else if (rc && rc != HWRM_ERR_CODE_PF_UNAVAILABLE) hwrm_err(bp, ctx, "hwrm req_type 0x%x seq id 0x%x error 0x%x\n", - req_type, token->seq_id, rc); + req_type, le16_to_cpu(ctx->req->seq_id), rc); rc = __hwrm_to_stderr(rc); exit: if (token) From bc69ad74867dba1377abe14356c94a946d9837a3 Mon Sep 17 00:00:00 2001 From: En-Wei Wu Date: Thu, 30 May 2024 22:21:31 +0800 Subject: [PATCH 0913/1578] ice: avoid IRQ collision to fix init failure on ACPI S3 resume A bug in https://bugzilla.kernel.org/show_bug.cgi?id=218906 describes that irdma would break and report hardware initialization failed after suspend/resume with Intel E810 NIC (tested on 6.9.0-rc5). The problem is caused due to the collision between the irq numbers requested in irdma and the irq numbers requested in other drivers after suspend/resume. The irq numbers used by irdma are derived from ice's ice_pf->msix_entries which stores mappings between MSI-X index and Linux interrupt number. It's supposed to be cleaned up when suspend and rebuilt in resume but it's not, causing irdma using the old irq numbers stored in the old ice_pf->msix_entries to request_irq() when resume. And eventually collide with other drivers. This patch fixes this problem. On suspend, we call ice_deinit_rdma() to clean up the ice_pf->msix_entries (and free the MSI-X vectors used by irdma if we've dynamically allocated them). On resume, we call ice_init_rdma() to rebuild the ice_pf->msix_entries (and allocate the MSI-X vectors if we would like to dynamically allocate them). Fixes: f9f5301e7e2d ("ice: Register auxiliary device to provide RDMA") Tested-by: Cyrus Lien Signed-off-by: En-Wei Wu Reviewed-by: Wojciech Drewek Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_main.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 1b61ca3a6eb6e1..45d850514f4c3c 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -5564,7 +5564,7 @@ static int ice_suspend(struct device *dev) */ disabled = ice_service_task_stop(pf); - ice_unplug_aux_dev(pf); + ice_deinit_rdma(pf); /* Already suspended?, then there is nothing to do */ if (test_and_set_bit(ICE_SUSPENDED, pf->state)) { @@ -5644,6 +5644,11 @@ static int ice_resume(struct device *dev) if (ret) dev_err(dev, "Cannot restore interrupt scheme: %d\n", ret); + ret = ice_init_rdma(pf); + if (ret) + dev_err(dev, "Reinitialize RDMA during resume failed: %d\n", + ret); + clear_bit(ICE_DOWN, pf->state); /* Now perform PF reset and rebuild */ reset_type = ICE_RESET_PFR; From aeccadb24d9dacdde673a0f68f0a9135c6be4993 Mon Sep 17 00:00:00 2001 From: Paul Greenwalt Date: Thu, 30 May 2024 13:06:17 -0400 Subject: [PATCH 0914/1578] ice: fix 200G link speed message log Commit 24407a01e57c ("ice: Add 200G speed/phy type use") added support for 200G PHY speeds, but did not include 200G link speed message support. As a result the driver incorrectly reports Unknown for 200G link speed. Fix this by adding 200G support to ice_print_link_msg(). Fixes: 24407a01e57c ("ice: Add 200G speed/phy type use") Reviewed-by: Michal Swiatkowski Signed-off-by: Paul Greenwalt Reviewed-by: Jesse Brandeburg Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 45d850514f4c3c..1766230abfff65 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -805,6 +805,9 @@ void ice_print_link_msg(struct ice_vsi *vsi, bool isup) } switch (vsi->port_info->phy.link_info.link_speed) { + case ICE_AQ_LINK_SPEED_200GB: + speed = "200 G"; + break; case ICE_AQ_LINK_SPEED_100GB: speed = "100 G"; break; From a27f6ac9d404ea84196639dcc456f969ef813c0f Mon Sep 17 00:00:00 2001 From: Wojciech Drewek Date: Tue, 4 Jun 2024 14:55:14 +0200 Subject: [PATCH 0915/1578] ice: implement AQ download pkg retry ice_aqc_opc_download_pkg (0x0C40) AQ sporadically returns error due to FW issue. Fix this by retrying five times before moving to Safe Mode. Sleep for 20 ms before retrying. This was tested with the 4.40 firmware. Fixes: c76488109616 ("ice: Implement Dynamic Device Personalization (DDP) download") Reviewed-by: Michal Swiatkowski Signed-off-by: Wojciech Drewek Reviewed-by: Brett Creeley Reviewed-by: Przemek Kitszel Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_ddp.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ddp.c b/drivers/net/ethernet/intel/ice/ice_ddp.c index ce5034ed2b240c..f182179529b7de 100644 --- a/drivers/net/ethernet/intel/ice/ice_ddp.c +++ b/drivers/net/ethernet/intel/ice/ice_ddp.c @@ -1339,6 +1339,7 @@ ice_dwnld_cfg_bufs_no_lock(struct ice_hw *hw, struct ice_buf *bufs, u32 start, for (i = 0; i < count; i++) { bool last = false; + int try_cnt = 0; int status; bh = (struct ice_buf_hdr *)(bufs + start + i); @@ -1346,8 +1347,26 @@ ice_dwnld_cfg_bufs_no_lock(struct ice_hw *hw, struct ice_buf *bufs, u32 start, if (indicate_last) last = ice_is_last_download_buffer(bh, i, count); - status = ice_aq_download_pkg(hw, bh, ICE_PKG_BUF_SIZE, last, - &offset, &info, NULL); + while (1) { + status = ice_aq_download_pkg(hw, bh, ICE_PKG_BUF_SIZE, + last, &offset, &info, + NULL); + if (hw->adminq.sq_last_status != ICE_AQ_RC_ENOSEC && + hw->adminq.sq_last_status != ICE_AQ_RC_EBADSIG) + break; + + try_cnt++; + + if (try_cnt == 5) + break; + + msleep(20); + } + + if (try_cnt) + dev_dbg(ice_hw_to_dev(hw), + "ice_aq_download_pkg number of retries: %d\n", + try_cnt); /* Save AQ status from download package */ if (status) { From 92424801261d1564a0bb759da3cf3ccd69fdf5a2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 13 Jun 2024 13:53:08 +0200 Subject: [PATCH 0916/1578] bpf: Fix reg_set_min_max corruption of fake_reg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Juan reported that after doing some changes to buzzer [0] and implementing a new fuzzing strategy guided by coverage, they noticed the following in one of the probes: [...] 13: (79) r6 = *(u64 *)(r0 +0) ; R0=map_value(ks=4,vs=8) R6_w=scalar() 14: (b7) r0 = 0 ; R0_w=0 15: (b4) w0 = -1 ; R0_w=0xffffffff 16: (74) w0 >>= 1 ; R0_w=0x7fffffff 17: (5c) w6 &= w0 ; R0_w=0x7fffffff R6_w=scalar(smin=smin32=0,smax=umax=umax32=0x7fffffff,var_off=(0x0; 0x7fffffff)) 18: (44) w6 |= 2 ; R6_w=scalar(smin=umin=smin32=umin32=2,smax=umax=umax32=0x7fffffff,var_off=(0x2; 0x7ffffffd)) 19: (56) if w6 != 0x7ffffffd goto pc+1 REG INVARIANTS VIOLATION (true_reg2): range bounds violation u64=[0x7fffffff, 0x7ffffffd] s64=[0x7fffffff, 0x7ffffffd] u32=[0x7fffffff, 0x7ffffffd] s32=[0x7fffffff, 0x7ffffffd] var_off=(0x7fffffff, 0x0) REG INVARIANTS VIOLATION (false_reg1): range bounds violation u64=[0x7fffffff, 0x7ffffffd] s64=[0x7fffffff, 0x7ffffffd] u32=[0x7fffffff, 0x7ffffffd] s32=[0x7fffffff, 0x7ffffffd] var_off=(0x7fffffff, 0x0) REG INVARIANTS VIOLATION (false_reg2): const tnum out of sync with range bounds u64=[0x0, 0xffffffffffffffff] s64=[0x8000000000000000, 0x7fffffffffffffff] u32=[0x0, 0xffffffff] s32=[0x80000000, 0x7fffffff] var_off=(0x7fffffff, 0x0) 19: R6_w=0x7fffffff 20: (95) exit from 19 to 21: R0=0x7fffffff R6=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=0x7ffffffe,var_off=(0x2; 0x7ffffffd)) R7=map_ptr(ks=4,vs=8) R9=ctx() R10=fp0 fp-24=map_ptr(ks=4,vs=8) fp-40=mmmmmmmm 21: R0=0x7fffffff R6=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=0x7ffffffe,var_off=(0x2; 0x7ffffffd)) R7=map_ptr(ks=4,vs=8) R9=ctx() R10=fp0 fp-24=map_ptr(ks=4,vs=8) fp-40=mmmmmmmm 21: (14) w6 -= 2147483632 ; R6_w=scalar(smin=umin=umin32=2,smax=umax=0xffffffff,smin32=0x80000012,smax32=14,var_off=(0x2; 0xfffffffd)) 22: (76) if w6 s>= 0xe goto pc+1 ; R6_w=scalar(smin=umin=umin32=2,smax=umax=0xffffffff,smin32=0x80000012,smax32=13,var_off=(0x2; 0xfffffffd)) 23: (95) exit from 22 to 24: R0=0x7fffffff R6_w=14 R7=map_ptr(ks=4,vs=8) R9=ctx() R10=fp0 fp-24=map_ptr(ks=4,vs=8) fp-40=mmmmmmmm 24: R0=0x7fffffff R6_w=14 R7=map_ptr(ks=4,vs=8) R9=ctx() R10=fp0 fp-24=map_ptr(ks=4,vs=8) fp-40=mmmmmmmm 24: (14) w6 -= 14 ; R6_w=0 [...] What can be seen here is a register invariant violation on line 19. After the binary-or in line 18, the verifier knows that bit 2 is set but knows nothing about the rest of the content which was loaded from a map value, meaning, range is [2,0x7fffffff] with var_off=(0x2; 0x7ffffffd). When in line 19 the verifier analyzes the branch, it splits the register states in reg_set_min_max() into the registers of the true branch (true_reg1, true_reg2) and the registers of the false branch (false_reg1, false_reg2). Since the test is w6 != 0x7ffffffd, the src_reg is a known constant. Internally, the verifier creates a "fake" register initialized as scalar to the value of 0x7ffffffd, and then passes it onto reg_set_min_max(). Now, for line 19, it is mathematically impossible to take the false branch of this program, yet the verifier analyzes it. It is impossible because the second bit of r6 will be set due to the prior or operation and the constant in the condition has that bit unset (hex(fd) == binary(1111 1101). When the verifier first analyzes the false / fall-through branch, it will compute an intersection between the var_off of r6 and of the constant. This is because the verifier creates a "fake" register initialized to the value of the constant. The intersection result later refines both registers in regs_refine_cond_op(): [...] t = tnum_intersect(tnum_subreg(reg1->var_off), tnum_subreg(reg2->var_off)); reg1->var_off = tnum_with_subreg(reg1->var_off, t); reg2->var_off = tnum_with_subreg(reg2->var_off, t); [...] Since the verifier is analyzing the false branch of the conditional jump, reg1 is equal to false_reg1 and reg2 is equal to false_reg2, i.e. the reg2 is the "fake" register that was meant to hold a constant value. The resulting var_off of the intersection says that both registers now hold a known value of var_off=(0x7fffffff, 0x0) or in other words: this operation manages to make the verifier think that the "constant" value that was passed in the jump operation now holds a different value. Normally this would not be an issue since it should not influence the true branch, however, false_reg2 and true_reg2 are pointers to the same "fake" register. Meaning, the false branch can influence the results of the true branch. In line 24, the verifier assumes R6_w=0, but the actual runtime value in this case is 1. The fix is simply not passing in the same "fake" register location as inputs to reg_set_min_max(), but instead making a copy. Moving the fake_reg into the env also reduces stack consumption by 120 bytes. With this, the verifier successfully rejects invalid accesses from the test program. [0] https://github.com/google/buzzer Fixes: 67420501e868 ("bpf: generalize reg_set_min_max() to handle non-const register comparisons") Reported-by: Juan José López Jaimez Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/r/20240613115310.25383-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 ++ kernel/bpf/verifier.c | 14 ++++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 50aa87f8d77ff6..e4070fb02b1100 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -746,6 +746,8 @@ struct bpf_verifier_env { /* Same as scratched_regs but for stack slots */ u64 scratched_stack_slots; u64 prev_log_pos, prev_insn_print_pos; + /* buffer used to temporary hold constants as scalar registers */ + struct bpf_reg_state fake_reg[2]; /* buffer used to generate temporary string representations, * e.g., in reg_type_str() to generate reg_type string */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 36ef8e96787ed5..f455548ba46c98 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15113,7 +15113,6 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL; struct bpf_reg_state *eq_branch_regs; - struct bpf_reg_state fake_reg = {}; u8 opcode = BPF_OP(insn->code); bool is_jmp32; int pred = -1; @@ -15179,7 +15178,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); return -EINVAL; } - src_reg = &fake_reg; + src_reg = &env->fake_reg[0]; + memset(src_reg, 0, sizeof(*src_reg)); src_reg->type = SCALAR_VALUE; __mark_reg_known(src_reg, insn->imm); } @@ -15239,10 +15239,16 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, &other_branch_regs[insn->src_reg], dst_reg, src_reg, opcode, is_jmp32); } else /* BPF_SRC(insn->code) == BPF_K */ { + /* reg_set_min_max() can mangle the fake_reg. Make a copy + * so that these are two different memory locations. The + * src_reg is not used beyond here in context of K. + */ + memcpy(&env->fake_reg[1], &env->fake_reg[0], + sizeof(env->fake_reg[0])); err = reg_set_min_max(env, &other_branch_regs[insn->dst_reg], - src_reg /* fake one */, - dst_reg, src_reg /* same fake one */, + &env->fake_reg[0], + dst_reg, &env->fake_reg[1], opcode, is_jmp32); } if (err) From e73cd1cfc2177654e562b04f514be5f0f0b96da2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 13 Jun 2024 13:53:09 +0200 Subject: [PATCH 0917/1578] bpf: Reduce stack consumption in check_stack_write_fixed_off The fake_reg moved into env->fake_reg given it consumes a lot of stack space (120 bytes). Migrate the fake_reg in check_stack_write_fixed_off() as well now that we have it. Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20240613115310.25383-2-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f455548ba46c98..e5a0ba3bc38d45 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4549,11 +4549,12 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, state->stack[spi].spilled_ptr.id = 0; } else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) && env->bpf_capable) { - struct bpf_reg_state fake_reg = {}; + struct bpf_reg_state *tmp_reg = &env->fake_reg[0]; - __mark_reg_known(&fake_reg, insn->imm); - fake_reg.type = SCALAR_VALUE; - save_register_state(env, state, spi, &fake_reg, size); + memset(tmp_reg, 0, sizeof(*tmp_reg)); + __mark_reg_known(tmp_reg, insn->imm); + tmp_reg->type = SCALAR_VALUE; + save_register_state(env, state, spi, tmp_reg, size); } else if (reg && is_spillable_regtype(reg->type)) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { From ceb65eb60026e03e1028a99f0ec94f22065e722a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 13 Jun 2024 13:53:10 +0200 Subject: [PATCH 0918/1578] selftests/bpf: Add test coverage for reg_set_min_max handling Add a test case for the jmp32/k fix to ensure selftests have coverage. Before fix: # ./vmtest.sh -- ./test_progs -t verifier_or_jmp32_k [...] ./test_progs -t verifier_or_jmp32_k tester_init:PASS:tester_log_buf 0 nsec process_subtest:PASS:obj_open_mem 0 nsec process_subtest:PASS:specs_alloc 0 nsec run_subtest:PASS:obj_open_mem 0 nsec run_subtest:FAIL:unexpected_load_success unexpected success: 0 #492/1 verifier_or_jmp32_k/or_jmp32_k: bit ops + branch on unknown value:FAIL #492 verifier_or_jmp32_k:FAIL Summary: 0/0 PASSED, 0 SKIPPED, 1 FAILED After fix: # ./vmtest.sh -- ./test_progs -t verifier_or_jmp32_k [...] ./test_progs -t verifier_or_jmp32_k #492/1 verifier_or_jmp32_k/or_jmp32_k: bit ops + branch on unknown value:OK #492 verifier_or_jmp32_k:OK Summary: 1/1 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/r/20240613115310.25383-3-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/verifier.c | 2 + .../selftests/bpf/progs/verifier_or_jmp32_k.c | 41 +++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_or_jmp32_k.c diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 1c9c4ec1be11ed..98ef39efa77e84 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -53,6 +53,7 @@ #include "verifier_movsx.skel.h" #include "verifier_netfilter_ctx.skel.h" #include "verifier_netfilter_retcode.skel.h" +#include "verifier_or_jmp32_k.skel.h" #include "verifier_precision.skel.h" #include "verifier_prevent_map_lookup.skel.h" #include "verifier_raw_stack.skel.h" @@ -170,6 +171,7 @@ void test_verifier_meta_access(void) { RUN(verifier_meta_access); } void test_verifier_movsx(void) { RUN(verifier_movsx); } void test_verifier_netfilter_ctx(void) { RUN(verifier_netfilter_ctx); } void test_verifier_netfilter_retcode(void) { RUN(verifier_netfilter_retcode); } +void test_verifier_or_jmp32_k(void) { RUN(verifier_or_jmp32_k); } void test_verifier_precision(void) { RUN(verifier_precision); } void test_verifier_prevent_map_lookup(void) { RUN(verifier_prevent_map_lookup); } void test_verifier_raw_stack(void) { RUN(verifier_raw_stack); } diff --git a/tools/testing/selftests/bpf/progs/verifier_or_jmp32_k.c b/tools/testing/selftests/bpf/progs/verifier_or_jmp32_k.c new file mode 100644 index 00000000000000..f37713a265ac70 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_or_jmp32_k.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" + +SEC("socket") +__description("or_jmp32_k: bit ops + branch on unknown value") +__failure +__msg("R0 invalid mem access 'scalar'") +__naked void or_jmp32_k(void) +{ + asm volatile (" \ + r0 = 0xffffffff; \ + r0 /= 1; \ + r1 = 0; \ + w1 = -1; \ + w1 >>= 1; \ + w0 &= w1; \ + w0 |= 2; \ + if w0 != 0x7ffffffd goto l1; \ + r0 = 1; \ + exit; \ +l3: \ + r0 = 5; \ + *(u64*)(r0 - 8) = r0; \ + exit; \ +l2: \ + w0 -= 0xe; \ + if w0 == 1 goto l3; \ + r0 = 4; \ + exit; \ +l1: \ + w0 -= 0x7ffffff0; \ + if w0 s>= 0xe goto l2; \ + r0 = 3; \ + exit; \ +" ::: __clobber_all); +} + +char _license[] SEC("license") = "GPL"; From b99a95bc56c52a428befbce12d9451fd7a0f3bc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Thu, 13 Jun 2024 10:31:46 -0700 Subject: [PATCH 0919/1578] bpf: fix UML x86_64 compile failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pcpu_hot (defined in arch/x86) is not available on user mode linux (ARCH=um) Cc: Andrii Nakryiko Cc: John Fastabend Cc: Alexei Starovoitov Fixes: 1ae6921009e5 ("bpf: inline bpf_get_smp_processor_id() helper") Signed-off-by: Maciej Żenczykowski Link: https://lore.kernel.org/r/20240613173146.2524647-1-maze@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e5a0ba3bc38d45..010cfee7ffe938 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20320,7 +20320,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto next_insn; } -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_UML) /* Implement bpf_get_smp_processor_id() inline. */ if (insn->imm == BPF_FUNC_get_smp_processor_id && prog->jit_requested && bpf_jit_supports_percpu_insn()) { From 9a95c5bfbf02a0a7f5983280fe284a0ff0836c34 Mon Sep 17 00:00:00 2001 From: GUO Zihua Date: Tue, 7 May 2024 01:25:41 +0000 Subject: [PATCH 0920/1578] ima: Avoid blocking in RCU read-side critical section A panic happens in ima_match_policy: BUG: unable to handle kernel NULL pointer dereference at 0000000000000010 PGD 42f873067 P4D 0 Oops: 0000 [#1] SMP NOPTI CPU: 5 PID: 1286325 Comm: kubeletmonit.sh Kdump: loaded Tainted: P Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.0.0 02/06/2015 RIP: 0010:ima_match_policy+0x84/0x450 Code: 49 89 fc 41 89 cf 31 ed 89 44 24 14 eb 1c 44 39 7b 18 74 26 41 83 ff 05 74 20 48 8b 1b 48 3b 1d f2 b9 f4 00 0f 84 9c 01 00 00 <44> 85 73 10 74 ea 44 8b 6b 14 41 f6 c5 01 75 d4 41 f6 c5 02 74 0f RSP: 0018:ff71570009e07a80 EFLAGS: 00010207 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000200 RDX: ffffffffad8dc7c0 RSI: 0000000024924925 RDI: ff3e27850dea2000 RBP: 0000000000000000 R08: 0000000000000000 R09: ffffffffabfce739 R10: ff3e27810cc42400 R11: 0000000000000000 R12: ff3e2781825ef970 R13: 00000000ff3e2785 R14: 000000000000000c R15: 0000000000000001 FS: 00007f5195b51740(0000) GS:ff3e278b12d40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000010 CR3: 0000000626d24002 CR4: 0000000000361ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ima_get_action+0x22/0x30 process_measurement+0xb0/0x830 ? page_add_file_rmap+0x15/0x170 ? alloc_set_pte+0x269/0x4c0 ? prep_new_page+0x81/0x140 ? simple_xattr_get+0x75/0xa0 ? selinux_file_open+0x9d/0xf0 ima_file_check+0x64/0x90 path_openat+0x571/0x1720 do_filp_open+0x9b/0x110 ? page_counter_try_charge+0x57/0xc0 ? files_cgroup_alloc_fd+0x38/0x60 ? __alloc_fd+0xd4/0x250 ? do_sys_open+0x1bd/0x250 do_sys_open+0x1bd/0x250 do_syscall_64+0x5d/0x1d0 entry_SYSCALL_64_after_hwframe+0x65/0xca Commit c7423dbdbc9e ("ima: Handle -ESTALE returned by ima_filter_rule_match()") introduced call to ima_lsm_copy_rule within a RCU read-side critical section which contains kmalloc with GFP_KERNEL. This implies a possible sleep and violates limitations of RCU read-side critical sections on non-PREEMPT systems. Sleeping within RCU read-side critical section might cause synchronize_rcu() returning early and break RCU protection, allowing a UAF to happen. The root cause of this issue could be described as follows: | Thread A | Thread B | | |ima_match_policy | | | rcu_read_lock | |ima_lsm_update_rule | | | synchronize_rcu | | | | kmalloc(GFP_KERNEL)| | | sleep | ==> synchronize_rcu returns early | kfree(entry) | | | | entry = entry->next| ==> UAF happens and entry now becomes NULL (or could be anything). | | entry->action | ==> Accessing entry might cause panic. To fix this issue, we are converting all kmalloc that is called within RCU read-side critical section to use GFP_ATOMIC. Fixes: c7423dbdbc9e ("ima: Handle -ESTALE returned by ima_filter_rule_match()") Cc: stable@vger.kernel.org Signed-off-by: GUO Zihua Acked-by: John Johansen Reviewed-by: Mimi Zohar Reviewed-by: Casey Schaufler [PM: fixed missing comment, long lines, !CONFIG_IMA_LSM_RULES case] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 2 +- include/linux/security.h | 5 +++-- kernel/auditfilter.c | 5 +++-- security/apparmor/audit.c | 6 +++--- security/apparmor/include/audit.h | 2 +- security/integrity/ima/ima.h | 2 +- security/integrity/ima/ima_policy.c | 15 +++++++++------ security/security.c | 6 ++++-- security/selinux/include/audit.h | 4 +++- security/selinux/ss/services.c | 5 +++-- security/smack/smack_lsm.c | 4 +++- 11 files changed, 34 insertions(+), 22 deletions(-) diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index f804b76cde44ee..44488b1ab9a976 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -413,7 +413,7 @@ LSM_HOOK(void, LSM_RET_VOID, key_post_create_or_update, struct key *keyring, #ifdef CONFIG_AUDIT LSM_HOOK(int, 0, audit_rule_init, u32 field, u32 op, char *rulestr, - void **lsmrule) + void **lsmrule, gfp_t gfp) LSM_HOOK(int, 0, audit_rule_known, struct audit_krule *krule) LSM_HOOK(int, 0, audit_rule_match, u32 secid, u32 field, u32 op, void *lsmrule) LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule) diff --git a/include/linux/security.h b/include/linux/security.h index 21cf70346b330e..de3af33e6ff502 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2048,7 +2048,8 @@ static inline void security_key_post_create_or_update(struct key *keyring, #ifdef CONFIG_AUDIT #ifdef CONFIG_SECURITY -int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule); +int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule, + gfp_t gfp); int security_audit_rule_known(struct audit_krule *krule); int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule); void security_audit_rule_free(void *lsmrule); @@ -2056,7 +2057,7 @@ void security_audit_rule_free(void *lsmrule); #else static inline int security_audit_rule_init(u32 field, u32 op, char *rulestr, - void **lsmrule) + void **lsmrule, gfp_t gfp) { return 0; } diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index be8c680121e466..d6ef4f4f9cba50 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -529,7 +529,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, entry->rule.buflen += f_val; f->lsm_str = str; err = security_audit_rule_init(f->type, f->op, str, - (void **)&f->lsm_rule); + (void **)&f->lsm_rule, + GFP_KERNEL); /* Keep currently invalid fields around in case they * become valid after a policy reload. */ if (err == -EINVAL) { @@ -799,7 +800,7 @@ static inline int audit_dupe_lsm_field(struct audit_field *df, /* our own (refreshed) copy of lsm_rule */ ret = security_audit_rule_init(df->type, df->op, df->lsm_str, - (void **)&df->lsm_rule); + (void **)&df->lsm_rule, GFP_KERNEL); /* Keep currently invalid fields around in case they * become valid after a policy reload. */ if (ret == -EINVAL) { diff --git a/security/apparmor/audit.c b/security/apparmor/audit.c index 45beb1c5f747a8..6b5181c668b5b7 100644 --- a/security/apparmor/audit.c +++ b/security/apparmor/audit.c @@ -217,7 +217,7 @@ void aa_audit_rule_free(void *vrule) } } -int aa_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule) +int aa_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule, gfp_t gfp) { struct aa_audit_rule *rule; @@ -230,14 +230,14 @@ int aa_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule) return -EINVAL; } - rule = kzalloc(sizeof(struct aa_audit_rule), GFP_KERNEL); + rule = kzalloc(sizeof(struct aa_audit_rule), gfp); if (!rule) return -ENOMEM; /* Currently rules are treated as coming from the root ns */ rule->label = aa_label_parse(&root_ns->unconfined->label, rulestr, - GFP_KERNEL, true, false); + gfp, true, false); if (IS_ERR(rule->label)) { int err = PTR_ERR(rule->label); aa_audit_rule_free(rule); diff --git a/security/apparmor/include/audit.h b/security/apparmor/include/audit.h index acbb03b9bd25cc..0c8cc86b417b56 100644 --- a/security/apparmor/include/audit.h +++ b/security/apparmor/include/audit.h @@ -200,7 +200,7 @@ static inline int complain_error(int error) } void aa_audit_rule_free(void *vrule); -int aa_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule); +int aa_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule, gfp_t gfp); int aa_audit_rule_known(struct audit_krule *rule); int aa_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule); diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index 3e568126cd4816..c51e24d24d1e9e 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -546,7 +546,7 @@ static inline void ima_free_modsig(struct modsig *modsig) #else static inline int ima_filter_rule_init(u32 field, u32 op, char *rulestr, - void **lsmrule) + void **lsmrule, gfp_t gfp) { return -EINVAL; } diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index c0556907c2e67a..09da8e63923956 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -401,7 +401,8 @@ static void ima_free_rule(struct ima_rule_entry *entry) kfree(entry); } -static struct ima_rule_entry *ima_lsm_copy_rule(struct ima_rule_entry *entry) +static struct ima_rule_entry *ima_lsm_copy_rule(struct ima_rule_entry *entry, + gfp_t gfp) { struct ima_rule_entry *nentry; int i; @@ -410,7 +411,7 @@ static struct ima_rule_entry *ima_lsm_copy_rule(struct ima_rule_entry *entry) * Immutable elements are copied over as pointers and data; only * lsm rules can change */ - nentry = kmemdup(entry, sizeof(*nentry), GFP_KERNEL); + nentry = kmemdup(entry, sizeof(*nentry), gfp); if (!nentry) return NULL; @@ -425,7 +426,8 @@ static struct ima_rule_entry *ima_lsm_copy_rule(struct ima_rule_entry *entry) ima_filter_rule_init(nentry->lsm[i].type, Audit_equal, nentry->lsm[i].args_p, - &nentry->lsm[i].rule); + &nentry->lsm[i].rule, + gfp); if (!nentry->lsm[i].rule) pr_warn("rule for LSM \'%s\' is undefined\n", nentry->lsm[i].args_p); @@ -438,7 +440,7 @@ static int ima_lsm_update_rule(struct ima_rule_entry *entry) int i; struct ima_rule_entry *nentry; - nentry = ima_lsm_copy_rule(entry); + nentry = ima_lsm_copy_rule(entry, GFP_KERNEL); if (!nentry) return -ENOMEM; @@ -664,7 +666,7 @@ static bool ima_match_rules(struct ima_rule_entry *rule, } if (rc == -ESTALE && !rule_reinitialized) { - lsm_rule = ima_lsm_copy_rule(rule); + lsm_rule = ima_lsm_copy_rule(rule, GFP_ATOMIC); if (lsm_rule) { rule_reinitialized = true; goto retry; @@ -1140,7 +1142,8 @@ static int ima_lsm_rule_init(struct ima_rule_entry *entry, entry->lsm[lsm_rule].type = audit_type; result = ima_filter_rule_init(entry->lsm[lsm_rule].type, Audit_equal, entry->lsm[lsm_rule].args_p, - &entry->lsm[lsm_rule].rule); + &entry->lsm[lsm_rule].rule, + GFP_KERNEL); if (!entry->lsm[lsm_rule].rule) { pr_warn("rule for LSM \'%s\' is undefined\n", entry->lsm[lsm_rule].args_p); diff --git a/security/security.c b/security/security.c index e5da848c50b915..e5ca08789f7418 100644 --- a/security/security.c +++ b/security/security.c @@ -5332,15 +5332,17 @@ void security_key_post_create_or_update(struct key *keyring, struct key *key, * @op: rule operator * @rulestr: rule context * @lsmrule: receive buffer for audit rule struct + * @gfp: GFP flag used for kmalloc * * Allocate and initialize an LSM audit rule structure. * * Return: Return 0 if @lsmrule has been successfully set, -EINVAL in case of * an invalid rule. */ -int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule) +int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule, + gfp_t gfp) { - return call_int_hook(audit_rule_init, field, op, rulestr, lsmrule); + return call_int_hook(audit_rule_init, field, op, rulestr, lsmrule, gfp); } /** diff --git a/security/selinux/include/audit.h b/security/selinux/include/audit.h index 52aca71210b476..29c7d4c86f6d57 100644 --- a/security/selinux/include/audit.h +++ b/security/selinux/include/audit.h @@ -21,12 +21,14 @@ * @op: the operator the rule uses * @rulestr: the text "target" of the rule * @rule: pointer to the new rule structure returned via this + * @gfp: GFP flag used for kmalloc * * Returns 0 if successful, -errno if not. On success, the rule structure * will be allocated internally. The caller must free this structure with * selinux_audit_rule_free() after use. */ -int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **rule); +int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **rule, + gfp_t gfp); /** * selinux_audit_rule_free - free an selinux audit rule structure. diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index f20e1968b7f7a8..e33e55384b75aa 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -3507,7 +3507,8 @@ void selinux_audit_rule_free(void *vrule) } } -int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule) +int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule, + gfp_t gfp) { struct selinux_state *state = &selinux_state; struct selinux_policy *policy; @@ -3548,7 +3549,7 @@ int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule) return -EINVAL; } - tmprule = kzalloc(sizeof(struct selinux_audit_rule), GFP_KERNEL); + tmprule = kzalloc(sizeof(struct selinux_audit_rule), gfp); if (!tmprule) return -ENOMEM; context_init(&tmprule->au_ctxt); diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 70ba2841e181d3..f5cbec1e6a9230 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -4693,11 +4693,13 @@ static int smack_post_notification(const struct cred *w_cred, * @op: required testing operator (=, !=, >, <, ...) * @rulestr: smack label to be audited * @vrule: pointer to save our own audit rule representation + * @gfp: type of the memory for the allocation * * Prepare to audit cases where (@field @op @rulestr) is true. * The label to be audited is created if necessay. */ -static int smack_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule) +static int smack_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule, + gfp_t gfp) { struct smack_known *skp; char **rule = (char **)vrule; From 4eb4e85c4f818491efc67e9373aa16b123c3f522 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Fri, 7 Jun 2024 12:50:14 -0700 Subject: [PATCH 0921/1578] btrfs: retry block group reclaim without infinite loop If inc_block_group_ro systematically fails (e.g. due to ETXTBUSY from swap) or btrfs_relocate_chunk systematically fails (from lack of space), then this worker becomes an infinite loop. At the very least, this strands the cleaner thread, but can also result in hung tasks/RCU stalls on PREEMPT_NONE kernels and if the reclaim_bgs_lock mutex is not contended. I believe the best long term fix is to manage reclaim via work queue, where we queue up a relocation on the triggering condition and re-queue on failure. In the meantime, this is an easy fix to apply to avoid the immediate pain. Fixes: 7e2718099438 ("btrfs: reinsert BGs failed to reclaim") CC: stable@vger.kernel.org # 6.6+ Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 1e09aeea69c22e..1a66be33bb0482 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1785,6 +1785,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) container_of(work, struct btrfs_fs_info, reclaim_bgs_work); struct btrfs_block_group *bg; struct btrfs_space_info *space_info; + LIST_HEAD(retry_list); if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; @@ -1921,8 +1922,11 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) } next: - if (ret) - btrfs_mark_bg_to_reclaim(bg); + if (ret) { + /* Refcount held by the reclaim_bgs list after splice. */ + btrfs_get_block_group(bg); + list_add_tail(&bg->bg_list, &retry_list); + } btrfs_put_block_group(bg); mutex_unlock(&fs_info->reclaim_bgs_lock); @@ -1942,6 +1946,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); end: + spin_lock(&fs_info->unused_bgs_lock); + list_splice_tail(&retry_list, &fs_info->reclaim_bgs); + spin_unlock(&fs_info->unused_bgs_lock); btrfs_exclop_finish(fs_info); sb_end_write(fs_info->sb); } From cebae292e0c32a228e8f2219c270a7237be24a6a Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Fri, 7 Jun 2024 13:27:48 +0200 Subject: [PATCH 0922/1578] btrfs: zoned: allocate dummy checksums for zoned NODATASUM writes Shin'ichiro reported that when he's running fstests' test-case btrfs/167 on emulated zoned devices, he's seeing the following NULL pointer dereference in 'btrfs_zone_finish_endio()': Oops: general protection fault, probably for non-canonical address 0xdffffc0000000011: 0000 [#1] PREEMPT SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000088-0x000000000000008f] CPU: 4 PID: 2332440 Comm: kworker/u80:15 Tainted: G W 6.10.0-rc2-kts+ #4 Hardware name: Supermicro Super Server/X11SPi-TF, BIOS 3.3 02/21/2020 Workqueue: btrfs-endio-write btrfs_work_helper [btrfs] RIP: 0010:btrfs_zone_finish_endio.part.0+0x34/0x160 [btrfs] RSP: 0018:ffff88867f107a90 EFLAGS: 00010206 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff893e5534 RDX: 0000000000000011 RSI: 0000000000000004 RDI: 0000000000000088 RBP: 0000000000000002 R08: 0000000000000001 R09: ffffed1081696028 R10: ffff88840b4b0143 R11: ffff88834dfff600 R12: ffff88840b4b0000 R13: 0000000000020000 R14: 0000000000000000 R15: ffff888530ad5210 FS: 0000000000000000(0000) GS:ffff888e3f800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f87223fff38 CR3: 00000007a7c6a002 CR4: 00000000007706f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? __die_body.cold+0x19/0x27 ? die_addr+0x46/0x70 ? exc_general_protection+0x14f/0x250 ? asm_exc_general_protection+0x26/0x30 ? do_raw_read_unlock+0x44/0x70 ? btrfs_zone_finish_endio.part.0+0x34/0x160 [btrfs] btrfs_finish_one_ordered+0x5d9/0x19a0 [btrfs] ? __pfx_lock_release+0x10/0x10 ? do_raw_write_lock+0x90/0x260 ? __pfx_do_raw_write_lock+0x10/0x10 ? __pfx_btrfs_finish_one_ordered+0x10/0x10 [btrfs] ? _raw_write_unlock+0x23/0x40 ? btrfs_finish_ordered_zoned+0x5a9/0x850 [btrfs] ? lock_acquire+0x435/0x500 btrfs_work_helper+0x1b1/0xa70 [btrfs] ? __schedule+0x10a8/0x60b0 ? __pfx___might_resched+0x10/0x10 process_one_work+0x862/0x1410 ? __pfx_lock_acquire+0x10/0x10 ? __pfx_process_one_work+0x10/0x10 ? assign_work+0x16c/0x240 worker_thread+0x5e6/0x1010 ? __pfx_worker_thread+0x10/0x10 kthread+0x2c3/0x3a0 ? trace_irq_enable.constprop.0+0xce/0x110 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x31/0x70 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 Enabling CONFIG_BTRFS_ASSERT revealed the following assertion to trigger: assertion failed: !list_empty(&ordered->list), in fs/btrfs/zoned.c:1815 This indicates, that we're missing the checksums list on the ordered_extent. As btrfs/167 is doing a NOCOW write this is to be expected. Further analysis with drgn confirmed the assumption: >>> inode = prog.crashed_thread().stack_trace()[11]['ordered'].inode >>> btrfs_inode = drgn.container_of(inode, "struct btrfs_inode", \ "vfs_inode") >>> print(btrfs_inode.flags) (u32)1 As zoned emulation mode simulates conventional zones on regular devices, we cannot use zone-append for writing. But we're only attaching dummy checksums if we're doing a zone-append write. So for NOCOW zoned data writes on conventional zones, also attach a dummy checksum. Reported-by: Shinichiro Kawasaki Fixes: cbfce4c7fbde ("btrfs: optimize the logical to physical mapping for zoned writes") CC: Naohiro Aota # 6.6+ Tested-by: Shin'ichiro Kawasaki Reviewed-by: Naohiro Aota Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 477f350a8bd09e..e3a57196b0ee0a 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -741,7 +741,9 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) ret = btrfs_bio_csum(bbio); if (ret) goto fail_put_bio; - } else if (use_append) { + } else if (use_append || + (btrfs_is_zoned(fs_info) && inode && + inode->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_alloc_dummy_sum(bbio); if (ret) goto fail_put_bio; From ff0ffe5b7c3c12c6e0cca16652905963ae817b44 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 13 Jun 2024 09:36:50 -0700 Subject: [PATCH 0923/1578] nvme: fix namespace removal list This function wants to move a subset of a list from one element to the tail into another list. It also needs to use the srcu synchronize instead of the regular rcu version. Do this one element at a time because that's the only to do it. Fixes: be647e2c76b27f4 ("nvme: use srcu for iterating namespace list") Reported-by: Venkat Rao Bagalkote Tested-by: Venkat Rao Bagalkote Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c40930d10bd34d..782090ce0bc10d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3960,12 +3960,13 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, mutex_lock(&ctrl->namespaces_lock); list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { - if (ns->head->ns_id > nsid) - list_splice_init_rcu(&ns->list, &rm_list, - synchronize_rcu); + if (ns->head->ns_id > nsid) { + list_del_rcu(&ns->list); + synchronize_srcu(&ctrl->srcu); + list_add_tail_rcu(&ns->list, &rm_list); + } } mutex_unlock(&ctrl->namespaces_lock); - synchronize_srcu(&ctrl->srcu); list_for_each_entry_safe(ns, next, &rm_list, list) nvme_ns_remove(ns); From 4467c09bc7a66a17ffd84d6262d48279b26106ea Mon Sep 17 00:00:00 2001 From: Aryan Srivastava Date: Thu, 13 Jun 2024 14:49:00 +1200 Subject: [PATCH 0924/1578] net: mvpp2: use slab_build_skb for oversized frames Setting frag_size to 0 to indicate kmalloc has been deprecated, use slab_build_skb directly. Fixes: ce098da1497c ("skbuff: Introduce slab_build_skb()") Signed-off-by: Aryan Srivastava Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20240613024900.3842238-1-aryan.srivastava@alliedtelesis.co.nz Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index e91486c48de382..671368d2c77e6a 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -4014,7 +4014,10 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi, } } - skb = build_skb(data, frag_size); + if (frag_size) + skb = build_skb(data, frag_size); + else + skb = slab_build_skb(data); if (!skb) { netdev_warn(port->dev, "skb build failed\n"); goto err_drop_frame; From 135c6eb27a85c8b261a2cc1f5093abcda6ee9010 Mon Sep 17 00:00:00 2001 From: Joel Slebodnick Date: Thu, 13 Jun 2024 14:27:28 -0400 Subject: [PATCH 0925/1578] scsi: ufs: core: Free memory allocated for model before reinit Under the conditions that a device is to be reinitialized within ufshcd_probe_hba(), the device must first be fully reset. Resetting the device should include freeing U8 model (member of dev_info) but does not, and this causes a memory leak. ufs_put_device_desc() is responsible for freeing model. unreferenced object 0xffff3f63008bee60 (size 32): comm "kworker/u33:1", pid 60, jiffies 4294892642 hex dump (first 32 bytes): 54 48 47 4a 46 47 54 30 54 32 35 42 41 5a 5a 41 THGJFGT0T25BAZZA 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace (crc ed7ff1a9): [] kmemleak_alloc+0x34/0x40 [] __kmalloc_noprof+0x1e4/0x2fc [] ufshcd_read_string_desc+0x94/0x190 [] ufshcd_device_init+0x480/0xdf8 [] ufshcd_probe_hba+0x3c/0x404 [] ufshcd_async_scan+0x40/0x370 [] async_run_entry_fn+0x34/0xe0 [] process_one_work+0x154/0x298 [] worker_thread+0x2f8/0x408 [] kthread+0x114/0x118 [] ret_from_fork+0x10/0x20 Fixes: 96a7141da332 ("scsi: ufs: core: Add support for reinitializing the UFS device") Cc: Reviewed-by: Andrew Halaney Reviewed-by: Bart Van Assche Signed-off-by: Joel Slebodnick Link: https://lore.kernel.org/r/20240613200202.2524194-1-jslebodn@redhat.com Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index e5e9da61f15d0b..1b65e6ae41375f 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -8787,6 +8787,7 @@ static int ufshcd_probe_hba(struct ufs_hba *hba, bool init_dev_params) (hba->quirks & UFSHCD_QUIRK_REINIT_AFTER_MAX_GEAR_SWITCH)) { /* Reset the device and controller before doing reinit */ ufshcd_device_reset(hba); + ufs_put_device_desc(hba); ufshcd_hba_stop(hba); ufshcd_vops_reinit_notify(hba); ret = ufshcd_hba_enable(hba); From 633aeefafc9c2a07a76a62be6aac1d73c3e3defa Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 13 Jun 2024 14:18:26 -0700 Subject: [PATCH 0926/1578] scsi: core: Introduce the BLIST_SKIP_IO_HINTS flag Prepare for skipping the IO Advice Hints Grouping mode page for USB storage devices. Cc: Alan Stern Cc: Joao Machado Cc: Andy Shevchenko Cc: Christian Heusel Cc: stable@vger.kernel.org Fixes: 4f53138fffc2 ("scsi: sd: Translate data lifetime information") Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240613211828.2077477-2-bvanassche@acm.org Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 4 ++++ include/scsi/scsi_devinfo.h | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index fbc11046bbf601..fe82baa924f81b 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include #include @@ -3118,6 +3119,9 @@ static void sd_read_io_hints(struct scsi_disk *sdkp, unsigned char *buffer) struct scsi_mode_data data; int res; + if (sdp->sdev_bflags & BLIST_SKIP_IO_HINTS) + return; + res = scsi_mode_sense(sdp, /*dbd=*/0x8, /*modepage=*/0x0a, /*subpage=*/0x05, buffer, SD_BUF_SIZE, SD_TIMEOUT, sdkp->max_retries, &data, &sshdr); diff --git a/include/scsi/scsi_devinfo.h b/include/scsi/scsi_devinfo.h index 6b548dc2c49654..1d79a3b536ceed 100644 --- a/include/scsi/scsi_devinfo.h +++ b/include/scsi/scsi_devinfo.h @@ -69,8 +69,10 @@ #define BLIST_RETRY_ITF ((__force blist_flags_t)(1ULL << 32)) /* Always retry ABORTED_COMMAND with ASC 0xc1 */ #define BLIST_RETRY_ASC_C1 ((__force blist_flags_t)(1ULL << 33)) +/* Do not query the IO Advice Hints Grouping mode page */ +#define BLIST_SKIP_IO_HINTS ((__force blist_flags_t)(1ULL << 34)) -#define __BLIST_LAST_USED BLIST_RETRY_ASC_C1 +#define __BLIST_LAST_USED BLIST_SKIP_IO_HINTS #define __BLIST_HIGH_UNUSED (~(__BLIST_LAST_USED | \ (__force blist_flags_t) \ From 57619f3cdeb5ae9f4252833b0ed600e9f81da722 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 13 Jun 2024 14:18:27 -0700 Subject: [PATCH 0927/1578] scsi: usb: uas: Do not query the IO Advice Hints Grouping mode page for USB/UAS devices Recently it was reported that the following USB storage devices are unusable with Linux kernel 6.9: * Kingston DataTraveler G2 * Garmin FR35 This is because attempting to read the IO Advice Hints Grouping mode page causes these devices to reset. Hence do not read the IO Advice Hints Grouping mode page from USB/UAS storage devices. Acked-by: Alan Stern Cc: stable@vger.kernel.org Fixes: 4f53138fffc2 ("scsi: sd: Translate data lifetime information") Reported-by: Joao Machado Closes: https://lore.kernel.org/linux-scsi/20240130214911.1863909-1-bvanassche@acm.org/T/#mf4e3410d8f210454d7e4c3d1fb5c0f41e651b85f Tested-by: Andy Shevchenko Bisected-by: Christian Heusel Reported-by: Andy Shevchenko Closes: https://lore.kernel.org/linux-scsi/CACLx9VdpUanftfPo2jVAqXdcWe8Y43MsDeZmMPooTzVaVJAh2w@mail.gmail.com/ Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240613211828.2077477-3-bvanassche@acm.org Signed-off-by: Martin K. Petersen --- drivers/usb/storage/scsiglue.c | 6 ++++++ drivers/usb/storage/uas.c | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/drivers/usb/storage/scsiglue.c b/drivers/usb/storage/scsiglue.c index b31464740f6c86..8c8b5e6041cc27 100644 --- a/drivers/usb/storage/scsiglue.c +++ b/drivers/usb/storage/scsiglue.c @@ -79,6 +79,12 @@ static int slave_alloc (struct scsi_device *sdev) if (us->protocol == USB_PR_BULK && us->max_lun > 0) sdev->sdev_bflags |= BLIST_FORCELUN; + /* + * Some USB storage devices reset if the IO advice hints grouping mode + * page is queried. Hence skip that mode page. + */ + sdev->sdev_bflags |= BLIST_SKIP_IO_HINTS; + return 0; } diff --git a/drivers/usb/storage/uas.c b/drivers/usb/storage/uas.c index a48870a87a293c..b610a2de4ae5dd 100644 --- a/drivers/usb/storage/uas.c +++ b/drivers/usb/storage/uas.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -820,6 +821,12 @@ static int uas_slave_alloc(struct scsi_device *sdev) struct uas_dev_info *devinfo = (struct uas_dev_info *)sdev->host->hostdata; + /* + * Some USB storage devices reset if the IO advice hints grouping mode + * page is queried. Hence skip that mode page. + */ + sdev->sdev_bflags |= BLIST_SKIP_IO_HINTS; + sdev->hostdata = devinfo; return 0; } From f4a1254f2a076afb0edd473589bf40f9b4d36b41 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 14 Jun 2024 01:04:29 +0100 Subject: [PATCH 0928/1578] io_uring: fix cancellation overwriting req->flags Only the current owner of a request is allowed to write into req->flags. Hence, the cancellation path should never touch it. Add a new field instead of the flag, move it into the 3rd cache line because it should always be initialised. poll_refs can move further as polling is an involved process anyway. It's a minimal patch, in the future we can and should find a better place for it and remove now unused REQ_F_CANCEL_SEQ. Fixes: 521223d7c229f ("io_uring/cancel: don't default to setting req->work.cancel_seq") Cc: stable@vger.kernel.org Reported-by: Li Shi Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6827b129f8f0ad76fa9d1f0a773de938b240ffab.1718323430.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 ++- io_uring/cancel.h | 4 ++-- io_uring/io_uring.c | 1 + 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 7a6b190c7da74a..b48570eaa4497f 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -648,7 +648,7 @@ struct io_kiocb { struct io_rsrc_node *rsrc_node; atomic_t refs; - atomic_t poll_refs; + bool cancel_seq_set; struct io_task_work io_task_work; /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ struct hlist_node hash_node; @@ -657,6 +657,7 @@ struct io_kiocb { /* opcode allocated if it needs to store data for async defer */ void *async_data; /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */ + atomic_t poll_refs; struct io_kiocb *link; /* custom credentials, valid IFF REQ_F_CREDS is set */ const struct cred *creds; diff --git a/io_uring/cancel.h b/io_uring/cancel.h index 76b32e65c03cd7..b33995e00ba905 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -27,10 +27,10 @@ bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd); static inline bool io_cancel_match_sequence(struct io_kiocb *req, int sequence) { - if ((req->flags & REQ_F_CANCEL_SEQ) && sequence == req->work.cancel_seq) + if (req->cancel_seq_set && sequence == req->work.cancel_seq) return true; - req->flags |= REQ_F_CANCEL_SEQ; + req->cancel_seq_set = true; req->work.cancel_seq = sequence; return false; } diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 816e93e7f94907..154b25b8a613b1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2058,6 +2058,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->file = NULL; req->rsrc_node = NULL; req->task = current; + req->cancel_seq_set = false; if (unlikely(opcode >= IORING_OP_LAST)) { req->opcode = 0; From 22f00812862564b314784167a89f27b444f82a46 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 13 Jun 2024 21:30:43 -0400 Subject: [PATCH 0929/1578] USB: class: cdc-wdm: Fix CPU lockup caused by excessive log messages The syzbot fuzzer found that the interrupt-URB completion callback in the cdc-wdm driver was taking too long, and the driver's immediate resubmission of interrupt URBs with -EPROTO status combined with the dummy-hcd emulation to cause a CPU lockup: cdc_wdm 1-1:1.0: nonzero urb status received: -71 cdc_wdm 1-1:1.0: wdm_int_callback - 0 bytes watchdog: BUG: soft lockup - CPU#0 stuck for 26s! [syz-executor782:6625] CPU#0 Utilization every 4s during lockup: #1: 98% system, 0% softirq, 3% hardirq, 0% idle #2: 98% system, 0% softirq, 3% hardirq, 0% idle #3: 98% system, 0% softirq, 3% hardirq, 0% idle #4: 98% system, 0% softirq, 3% hardirq, 0% idle #5: 98% system, 1% softirq, 3% hardirq, 0% idle Modules linked in: irq event stamp: 73096 hardirqs last enabled at (73095): [] console_emit_next_record kernel/printk/printk.c:2935 [inline] hardirqs last enabled at (73095): [] console_flush_all+0x650/0xb74 kernel/printk/printk.c:2994 hardirqs last disabled at (73096): [] __el1_irq arch/arm64/kernel/entry-common.c:533 [inline] hardirqs last disabled at (73096): [] el1_interrupt+0x24/0x68 arch/arm64/kernel/entry-common.c:551 softirqs last enabled at (73048): [] softirq_handle_end kernel/softirq.c:400 [inline] softirqs last enabled at (73048): [] handle_softirqs+0xa60/0xc34 kernel/softirq.c:582 softirqs last disabled at (73043): [] __do_softirq+0x14/0x20 kernel/softirq.c:588 CPU: 0 PID: 6625 Comm: syz-executor782 Tainted: G W 6.10.0-rc2-syzkaller-g8867bbd4a056 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/02/2024 Testing showed that the problem did not occur if the two error messages -- the first two lines above -- were removed; apparently adding material to the kernel log takes a surprisingly large amount of time. In any case, the best approach for preventing these lockups and to avoid spamming the log with thousands of error messages per second is to ratelimit the two dev_err() calls. Therefore we replace them with dev_err_ratelimited(). Signed-off-by: Alan Stern Suggested-by: Greg KH Reported-and-tested-by: syzbot+5f996b83575ef4058638@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-usb/00000000000073d54b061a6a1c65@google.com/ Reported-and-tested-by: syzbot+1b2abad17596ad03dcff@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-usb/000000000000f45085061aa9b37e@google.com/ Fixes: 9908a32e94de ("USB: remove err() macro from usb class drivers") Link: https://lore.kernel.org/linux-usb/40dfa45b-5f21-4eef-a8c1-51a2f320e267@rowland.harvard.edu/ Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/29855215-52f5-4385-b058-91f42c2bee18@rowland.harvard.edu Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-wdm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c index c553decb546107..6830be4419e20a 100644 --- a/drivers/usb/class/cdc-wdm.c +++ b/drivers/usb/class/cdc-wdm.c @@ -266,14 +266,14 @@ static void wdm_int_callback(struct urb *urb) dev_err(&desc->intf->dev, "Stall on int endpoint\n"); goto sw; /* halt is cleared in work */ default: - dev_err(&desc->intf->dev, + dev_err_ratelimited(&desc->intf->dev, "nonzero urb status received: %d\n", status); break; } } if (urb->actual_length < sizeof(struct usb_cdc_notification)) { - dev_err(&desc->intf->dev, "wdm_int_callback - %d bytes\n", + dev_err_ratelimited(&desc->intf->dev, "wdm_int_callback - %d bytes\n", urb->actual_length); goto exit; } From 41f590e31c6c8b8a0a490b7c1ad2e57c20ec3d9b Mon Sep 17 00:00:00 2001 From: pengfuyuan Date: Thu, 6 Jun 2024 20:08:42 +0800 Subject: [PATCH 0930/1578] arm/komeda: Remove all CONFIG_DEBUG_FS conditional compilations Since the debugfs functions have no-op stubs for CONFIG_DEBUG_FS=n, the compiler will optimize the rest away since they are no longer referenced. The benefit of removing the conditional compilation is that the build is actually tested for both CONFIG_DEBUG_FS configuration values. Assuming most developers have it enabled, CONFIG_DEBUG_FS=n is not tested much and may fail the build due to the conditional compilation. Reported-by: k2ci Signed-off-by: pengfuyuan Acked-by: Liviu Dudau Link: https://patchwork.freedesktop.org/patch/msgid/20240606120842.1377267-1-pengfuyuan@kylinos.cn Signed-off-by: Liviu Dudau Signed-off-by: Maxime Ripard --- drivers/gpu/drm/arm/display/komeda/komeda_dev.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/gpu/drm/arm/display/komeda/komeda_dev.c b/drivers/gpu/drm/arm/display/komeda/komeda_dev.c index 14ee79becacb56..5ba62e637a6169 100644 --- a/drivers/gpu/drm/arm/display/komeda/komeda_dev.c +++ b/drivers/gpu/drm/arm/display/komeda/komeda_dev.c @@ -12,10 +12,8 @@ #include #include #include -#ifdef CONFIG_DEBUG_FS #include #include -#endif #include @@ -43,7 +41,6 @@ static int komeda_register_show(struct seq_file *sf, void *x) DEFINE_SHOW_ATTRIBUTE(komeda_register); -#ifdef CONFIG_DEBUG_FS static void komeda_debugfs_init(struct komeda_dev *mdev) { if (!debugfs_initialized()) @@ -55,7 +52,6 @@ static void komeda_debugfs_init(struct komeda_dev *mdev) debugfs_create_x16("err_verbosity", 0664, mdev->debugfs_root, &mdev->err_verbosity); } -#endif static ssize_t core_id_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -265,9 +261,7 @@ struct komeda_dev *komeda_dev_create(struct device *dev) mdev->err_verbosity = KOMEDA_DEV_PRINT_ERR_EVENTS; -#ifdef CONFIG_DEBUG_FS komeda_debugfs_init(mdev); -#endif return mdev; @@ -286,9 +280,7 @@ void komeda_dev_destroy(struct komeda_dev *mdev) sysfs_remove_group(&dev->kobj, &komeda_sysfs_attr_group); -#ifdef CONFIG_DEBUG_FS debugfs_remove_recursive(mdev->debugfs_root); -#endif if (mdev->aclk) clk_prepare_enable(mdev->aclk); From 2663d0462eb32ae7c9b035300ab6b1523886c718 Mon Sep 17 00:00:00 2001 From: Kenton Groombridge Date: Wed, 5 Jun 2024 11:22:18 -0400 Subject: [PATCH 0931/1578] wifi: mac80211: Avoid address calculations via out of bounds array indexing req->n_channels must be set before req->channels[] can be used. This patch fixes one of the issues encountered in [1]. [ 83.964255] UBSAN: array-index-out-of-bounds in net/mac80211/scan.c:364:4 [ 83.964258] index 0 is out of range for type 'struct ieee80211_channel *[]' [...] [ 83.964264] Call Trace: [ 83.964267] [ 83.964269] dump_stack_lvl+0x3f/0xc0 [ 83.964274] __ubsan_handle_out_of_bounds+0xec/0x110 [ 83.964278] ieee80211_prep_hw_scan+0x2db/0x4b0 [ 83.964281] __ieee80211_start_scan+0x601/0x990 [ 83.964291] nl80211_trigger_scan+0x874/0x980 [ 83.964295] genl_family_rcv_msg_doit+0xe8/0x160 [ 83.964298] genl_rcv_msg+0x240/0x270 [...] [1] https://bugzilla.kernel.org/show_bug.cgi?id=218810 Co-authored-by: Kees Cook Signed-off-by: Kees Cook Signed-off-by: Kenton Groombridge Link: https://msgid.link/20240605152218.236061-1-concord@gentoo.org Signed-off-by: Johannes Berg --- net/mac80211/scan.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 8ecc4b710b0e64..b5f2df61c7f671 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -358,7 +358,8 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_sub_if_data *sdata) struct cfg80211_scan_request *req; struct cfg80211_chan_def chandef; u8 bands_used = 0; - int i, ielen, n_chans; + int i, ielen; + u32 *n_chans; u32 flags = 0; req = rcu_dereference_protected(local->scan_req, @@ -368,34 +369,34 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_sub_if_data *sdata) return false; if (ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS)) { + local->hw_scan_req->req.n_channels = req->n_channels; + for (i = 0; i < req->n_channels; i++) { local->hw_scan_req->req.channels[i] = req->channels[i]; bands_used |= BIT(req->channels[i]->band); } - - n_chans = req->n_channels; } else { do { if (local->hw_scan_band == NUM_NL80211_BANDS) return false; - n_chans = 0; + n_chans = &local->hw_scan_req->req.n_channels; + *n_chans = 0; for (i = 0; i < req->n_channels; i++) { if (req->channels[i]->band != local->hw_scan_band) continue; - local->hw_scan_req->req.channels[n_chans] = + local->hw_scan_req->req.channels[(*n_chans)++] = req->channels[i]; - n_chans++; + bands_used |= BIT(req->channels[i]->band); } local->hw_scan_band++; - } while (!n_chans); + } while (!*n_chans); } - local->hw_scan_req->req.n_channels = n_chans; ieee80211_prepare_scan_chandef(&chandef); if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) From 0d9c2beed116e623ac30810d382bd67163650f98 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 12 Jun 2024 12:23:51 +0200 Subject: [PATCH 0932/1578] wifi: mac80211: fix monitor channel with chanctx emulation After the channel context emulation, there were reports that changing the monitor channel no longer works. This is because those drivers don't have WANT_MONITOR_VIF, so the setting the channel always exits out quickly. Fix this by always allocating the virtual monitor sdata, and simply not telling the driver about it unless it wanted to. This way, we have an interface/sdata to bind the chanctx to, and the emulation can work correctly. Cc: stable@vger.kernel.org Fixes: 0a44dfc07074 ("wifi: mac80211: simplify non-chanctx drivers") Reported-and-tested-by: Savyasaachi Vanga Closes: https://lore.kernel.org/r/chwoymvpzwtbmzryrlitpwmta5j6mtndocxsyqvdyikqu63lon@gfds653hkknl Link: https://msgid.link/20240612122351.b12d4a109dde.I1831a44417faaab92bea1071209abbe4efbe3fba@changeid Signed-off-by: Johannes Berg --- net/mac80211/driver-ops.c | 17 +++++++++++++++++ net/mac80211/iface.c | 21 +++++++++------------ net/mac80211/util.c | 2 +- 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index dce37ba8ebe374..254d745832cbfe 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -311,6 +311,18 @@ int drv_assign_vif_chanctx(struct ieee80211_local *local, might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); + /* + * We should perhaps push emulate chanctx down and only + * make it call ->config() when the chanctx is actually + * assigned here (and unassigned below), but that's yet + * another change to all drivers to add assign/unassign + * emulation callbacks. Maybe later. + */ + if (sdata->vif.type == NL80211_IFTYPE_MONITOR && + local->emulate_chanctx && + !ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) + return 0; + if (!check_sdata_in_driver(sdata)) return -EIO; @@ -338,6 +350,11 @@ void drv_unassign_vif_chanctx(struct ieee80211_local *local, might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); + if (sdata->vif.type == NL80211_IFTYPE_MONITOR && + local->emulate_chanctx && + !ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) + return; + if (!check_sdata_in_driver(sdata)) return; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 0c54554bf761bf..b935bb5d8ed1f7 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1122,9 +1122,6 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) struct ieee80211_sub_if_data *sdata; int ret; - if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) - return 0; - ASSERT_RTNL(); lockdep_assert_wiphy(local->hw.wiphy); @@ -1146,11 +1143,13 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) ieee80211_set_default_queues(sdata); - ret = drv_add_interface(local, sdata); - if (WARN_ON(ret)) { - /* ok .. stupid driver, it asked for this! */ - kfree(sdata); - return ret; + if (ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) { + ret = drv_add_interface(local, sdata); + if (WARN_ON(ret)) { + /* ok .. stupid driver, it asked for this! */ + kfree(sdata); + return ret; + } } set_bit(SDATA_STATE_RUNNING, &sdata->state); @@ -1188,9 +1187,6 @@ void ieee80211_del_virtual_monitor(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; - if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) - return; - ASSERT_RTNL(); lockdep_assert_wiphy(local->hw.wiphy); @@ -1210,7 +1206,8 @@ void ieee80211_del_virtual_monitor(struct ieee80211_local *local) ieee80211_link_release_channel(&sdata->deflink); - drv_remove_interface(local, sdata); + if (ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) + drv_remove_interface(local, sdata); kfree(sdata); } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 283bfc99417e57..963ed75deb7659 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1843,7 +1843,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) /* add interfaces */ sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); - if (sdata) { + if (sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) { /* in HW restart it exists already */ WARN_ON(local->resuming); res = drv_add_interface(local, sdata); From 9f36169912331fa035d7b73a91252d7c2512eb1a Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Fri, 7 Jun 2024 18:07:52 +0200 Subject: [PATCH 0933/1578] cipso: fix total option length computation As evident from the definition of ip_options_get(), the IP option IPOPT_END is used to pad the IP option data array, not IPOPT_NOP. Yet the loop that walks the IP options to determine the total IP options length in cipso_v4_delopt() doesn't take IPOPT_END into account. Fix it by recognizing the IPOPT_END value as the end of actual options. Fixes: 014ab19a69c3 ("selinux: Set socket NetLabel based on connection endpoint") Signed-off-by: Ondrej Mosnacek Signed-off-by: David S. Miller --- net/ipv4/cipso_ipv4.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index dd6d4601505806..5e9ac68444f894 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -2013,12 +2013,16 @@ static int cipso_v4_delopt(struct ip_options_rcu __rcu **opt_ptr) * from there we can determine the new total option length */ iter = 0; optlen_new = 0; - while (iter < opt->opt.optlen) - if (opt->opt.__data[iter] != IPOPT_NOP) { + while (iter < opt->opt.optlen) { + if (opt->opt.__data[iter] == IPOPT_END) { + break; + } else if (opt->opt.__data[iter] == IPOPT_NOP) { + iter++; + } else { iter += opt->opt.__data[iter + 1]; optlen_new = iter; - } else - iter++; + } + } hdr_delta = opt->opt.optlen; opt->opt.optlen = (optlen_new + 3) & ~3; hdr_delta -= opt->opt.optlen; From 89aa3619d141d6cfb6040a561aebb6d99d3e2285 Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Fri, 7 Jun 2024 18:07:53 +0200 Subject: [PATCH 0934/1578] cipso: make cipso_v4_skbuff_delattr() fully remove the CIPSO options As the comment in this function says, the code currently just clears the CIPSO part with IPOPT_NOP, rather than removing it completely and trimming the packet. The other cipso_v4_*_delattr() functions, however, do the proper removal and also calipso_skbuff_delattr() makes an effort to remove the CALIPSO options instead of replacing them with padding. Some routers treat IPv4 packets with anything (even NOPs) in the option header as a special case and take them through a slower processing path. Consequently, hardening guides such as STIG recommend to configure such routers to drop packets with non-empty IP option headers [1][2]. Thus, users might expect NetLabel to produce packets with minimal padding (or at least with no padding when no actual options are present). Implement the proper option removal to address this and to be closer to what the peer functions do. [1] https://www.stigviewer.com/stig/juniper_router_rtr/2019-09-27/finding/V-90937 [2] https://www.stigviewer.com/stig/cisco_ios_xe_router_rtr/2021-03-26/finding/V-217001 Signed-off-by: Ondrej Mosnacek Signed-off-by: David S. Miller --- net/ipv4/cipso_ipv4.c | 79 +++++++++++++++++++++++++++++-------------- 1 file changed, 54 insertions(+), 25 deletions(-) diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 5e9ac68444f894..e9cb27061c12e5 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1810,6 +1810,29 @@ static int cipso_v4_genopt(unsigned char *buf, u32 buf_len, return CIPSO_V4_HDR_LEN + ret_val; } +static int cipso_v4_get_actual_opt_len(const unsigned char *data, int len) +{ + int iter = 0, optlen = 0; + + /* determining the new total option length is tricky because of + * the padding necessary, the only thing i can think to do at + * this point is walk the options one-by-one, skipping the + * padding at the end to determine the actual option size and + * from there we can determine the new total option length + */ + while (iter < len) { + if (data[iter] == IPOPT_END) { + break; + } else if (data[iter] == IPOPT_NOP) { + iter++; + } else { + iter += data[iter + 1]; + optlen = iter; + } + } + return optlen; +} + /** * cipso_v4_sock_setattr - Add a CIPSO option to a socket * @sk: the socket @@ -1986,7 +2009,6 @@ static int cipso_v4_delopt(struct ip_options_rcu __rcu **opt_ptr) u8 cipso_len; u8 cipso_off; unsigned char *cipso_ptr; - int iter; int optlen_new; cipso_off = opt->opt.cipso - sizeof(struct iphdr); @@ -2006,23 +2028,8 @@ static int cipso_v4_delopt(struct ip_options_rcu __rcu **opt_ptr) memmove(cipso_ptr, cipso_ptr + cipso_len, opt->opt.optlen - cipso_off - cipso_len); - /* determining the new total option length is tricky because of - * the padding necessary, the only thing i can think to do at - * this point is walk the options one-by-one, skipping the - * padding at the end to determine the actual option size and - * from there we can determine the new total option length */ - iter = 0; - optlen_new = 0; - while (iter < opt->opt.optlen) { - if (opt->opt.__data[iter] == IPOPT_END) { - break; - } else if (opt->opt.__data[iter] == IPOPT_NOP) { - iter++; - } else { - iter += opt->opt.__data[iter + 1]; - optlen_new = iter; - } - } + optlen_new = cipso_v4_get_actual_opt_len(opt->opt.__data, + opt->opt.optlen); hdr_delta = opt->opt.optlen; opt->opt.optlen = (optlen_new + 3) & ~3; hdr_delta -= opt->opt.optlen; @@ -2242,7 +2249,8 @@ int cipso_v4_skbuff_setattr(struct sk_buff *skb, */ int cipso_v4_skbuff_delattr(struct sk_buff *skb) { - int ret_val; + int ret_val, cipso_len, hdr_len_actual, new_hdr_len_actual, new_hdr_len, + hdr_len_delta; struct iphdr *iph; struct ip_options *opt = &IPCB(skb)->opt; unsigned char *cipso_ptr; @@ -2255,16 +2263,37 @@ int cipso_v4_skbuff_delattr(struct sk_buff *skb) if (ret_val < 0) return ret_val; - /* the easiest thing to do is just replace the cipso option with noop - * options since we don't change the size of the packet, although we - * still need to recalculate the checksum */ - iph = ip_hdr(skb); cipso_ptr = (unsigned char *)iph + opt->cipso; - memset(cipso_ptr, IPOPT_NOOP, cipso_ptr[1]); + cipso_len = cipso_ptr[1]; + + hdr_len_actual = sizeof(struct iphdr) + + cipso_v4_get_actual_opt_len((unsigned char *)(iph + 1), + opt->optlen); + new_hdr_len_actual = hdr_len_actual - cipso_len; + new_hdr_len = (new_hdr_len_actual + 3) & ~3; + hdr_len_delta = (iph->ihl << 2) - new_hdr_len; + + /* 1. shift any options after CIPSO to the left */ + memmove(cipso_ptr, cipso_ptr + cipso_len, + new_hdr_len_actual - opt->cipso); + /* 2. move the whole IP header to its new place */ + memmove((unsigned char *)iph + hdr_len_delta, iph, new_hdr_len_actual); + /* 3. adjust the skb layout */ + skb_pull(skb, hdr_len_delta); + skb_reset_network_header(skb); + iph = ip_hdr(skb); + /* 4. re-fill new padding with IPOPT_END (may now be longer) */ + memset((unsigned char *)iph + new_hdr_len_actual, IPOPT_END, + new_hdr_len - new_hdr_len_actual); + + opt->optlen -= hdr_len_delta; opt->cipso = 0; opt->is_changed = 1; - + if (hdr_len_delta != 0) { + iph->ihl = new_hdr_len >> 2; + iph_set_totlen(iph, skb->len); + } ip_send_check(iph); return 0; From 68f860426d500cfb697b505799244c7dfff604b1 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 14 Jun 2024 00:31:04 +0100 Subject: [PATCH 0935/1578] mfd: axp20x: AXP717: Fix missing IRQ status registers range While we list the "IRQ status *and acknowledge*" registers as volatile in the MFD description, they are missing from the writable range array, so acknowledging any interrupts was met with an -EIO error. This error propagates up, leading to the whole AXP717 driver failing to probe, which is fatal to most systems using this PMIC, since most peripherals refer one of the PMIC voltage rails. This wasn't noticed on the initial submission, since the interrupt was completely missing at this point, but the DTs now merged describe the interrupt, creating the problem. Add the five registers that hold those bits to the writable array. This fixes the boot on the Anbernic systems using the AXP717 PMIC. Fixes: b5bfc8ab2484 ("mfd: axp20x: Add support for AXP717 PMIC") Reported-by: Chris Morgan Signed-off-by: Andre Przywara Reviewed-by: John Watts Link: https://lore.kernel.org/r/20240613233104.17529-1-andre.przywara@arm.com Signed-off-by: Lee Jones --- drivers/mfd/axp20x.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mfd/axp20x.c b/drivers/mfd/axp20x.c index f2c0f144c0fc38..dacd3c96c9f57c 100644 --- a/drivers/mfd/axp20x.c +++ b/drivers/mfd/axp20x.c @@ -210,6 +210,7 @@ static const struct regmap_access_table axp313a_volatile_table = { static const struct regmap_range axp717_writeable_ranges[] = { regmap_reg_range(AXP717_IRQ0_EN, AXP717_IRQ4_EN), + regmap_reg_range(AXP717_IRQ0_STATE, AXP717_IRQ4_STATE), regmap_reg_range(AXP717_DCDC_OUTPUT_CONTROL, AXP717_CPUSLDO_CONTROL), }; From 004b8d1491b4bcbb7da1a3206d1e7e66822d47c6 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 14 Jun 2024 09:55:58 +0200 Subject: [PATCH 0936/1578] ovl: fix encoding fid for lower only root ovl_check_encode_origin() should return a positive number if the lower dentry is to be encoded, zero otherwise. If there's no upper layer at all (read-only overlay), then it obviously needs to return positive. This was broken by commit 16aac5ad1fa9 ("ovl: support encoding non-decodable file handles"), which didn't take the lower-only configuration into account. Fix by checking the no-upper-layer case up-front. Reported-and-tested-by: Youzhong Yang Closes: https://lore.kernel.org/all/CADpNCvaBimi+zCYfRJHvCOhMih8OU0rmZkwLuh24MKKroRuT8Q@mail.gmail.com/ Fixes: 16aac5ad1fa9 ("ovl: support encoding non-decodable file handles") Cc: # v6.6 Signed-off-by: Miklos Szeredi --- fs/overlayfs/export.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 063409069f56d0..5868cb2229552f 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -181,6 +181,10 @@ static int ovl_check_encode_origin(struct dentry *dentry) struct ovl_fs *ofs = OVL_FS(dentry->d_sb); bool decodable = ofs->config.nfs_export; + /* No upper layer? */ + if (!ovl_upper_mnt(ofs)) + return 1; + /* Lower file handle for non-upper non-decodable */ if (!ovl_dentry_upper(dentry) && !decodable) return 1; @@ -209,7 +213,7 @@ static int ovl_check_encode_origin(struct dentry *dentry) * ovl_connect_layer() will try to make origin's layer "connected" by * copying up a "connectable" ancestor. */ - if (d_is_dir(dentry) && ovl_upper_mnt(ofs) && decodable) + if (d_is_dir(dentry) && decodable) return ovl_connect_layer(dentry); /* Lower file handle for indexed and non-upper dir/non-dir */ From a6a75edc8669a4f030546c7390808ef0cc034742 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 13 Jun 2024 19:33:53 +0200 Subject: [PATCH 0937/1578] ata: libata-scsi: Set the RMB bit only for removable media devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SCSI Removable Media Bit (RMB) should only be set for removable media, where the device stays and the media changes, e.g. CD-ROM or floppy. The ATA removable media device bit is obsoleted since ATA-8 ACS (2006), but before that it was used to indicate that the device can have its media removed (while the device stays). Commit 8a3e33cf92c7 ("ata: ahci: find eSATA ports and flag them as removable") introduced a change to set the RMB bit if the port has either the eSATA bit or the hot-plug capable bit set. The reasoning was that the author wanted his eSATA ports to get treated like a USB stick. This is however wrong. See "20-082r23SPC-6: Removable Medium Bit Expectations" which has since been integrated to SPC, which states that: """ Reports have been received that some USB Memory Stick device servers set the removable medium (RMB) bit to one. The rub comes when the medium is actually removed, because... The device server is removed concurrently with the medium removal. If there is no device server, then there is no device server that is waiting to have removable medium inserted. Sufficient numbers of SCSI analysts see such a device: - not as a device that supports removable medium; but - as a removable, hot pluggable device. """ The definition of the RMB bit in the SPC specification has since been clarified to match this. Thus, a USB stick should not have the RMB bit set (and neither shall an eSATA nor a hot-plug capable port). Commit dc8b4afc4a04 ("ata: ahci: don't mark HotPlugCapable Ports as external/removable") then changed so that the RMB bit is only set for the eSATA bit (and not for the hot-plug capable bit), because of a lot of bug reports of SATA devices were being automounted by udisks. However, treating eSATA and hot-plug capable ports differently is not correct. From the AHCI 1.3.1 spec: Hot Plug Capable Port (HPCP): When set to '1', indicates that this port's signal and power connectors are externally accessible via a joint signal and power connector for blindmate device hot plug. So a hot-plug capable port is an external port, just like commit 45b96d65ec68 ("ata: ahci: a hotplug capable port is an external port") claims. In order to not violate the SPC specification, modify the SCSI INQUIRY data to only set the RMB bit if the ATA device can have its media removed. This fixes a reported problem where GNOME/udisks was automounting devices connected to hot-plug capable ports. Fixes: 45b96d65ec68 ("ata: ahci: a hotplug capable port is an external port") Cc: stable@vger.kernel.org Reviewed-by: Mario Limonciello Reviewed-by: Thomas Weißschuh Tested-by: Thomas Weißschuh Reported-by: Thomas Weißschuh Closes: https://lore.kernel.org/linux-ide/c0de8262-dc4b-4c22-9fac-33432e5bddd3@t-8ch.de/ Signed-off-by: Damien Le Moal [cassel: wrote commit message] Signed-off-by: Niklas Cassel --- drivers/ata/libata-scsi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index cdf29b178ddc1e..bb4d30d377ae5d 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -1831,11 +1831,11 @@ static unsigned int ata_scsiop_inq_std(struct ata_scsi_args *args, u8 *rbuf) 2 }; - /* set scsi removable (RMB) bit per ata bit, or if the - * AHCI port says it's external (Hotplug-capable, eSATA). + /* + * Set the SCSI Removable Media Bit (RMB) if the ATA removable media + * device bit (obsolete since ATA-8 ACS) is set. */ - if (ata_id_removable(args->id) || - (args->dev->link->ap->pflags & ATA_PFLAG_EXTERNAL)) + if (ata_id_removable(args->id)) hdr[1] |= (1 << 7); if (args->dev->class == ATA_DEV_ZAC) { From 5f75e081ab5cbfbe7aca2112a802e69576ee9778 Mon Sep 17 00:00:00 2001 From: Cyril Hrubis Date: Thu, 13 Jun 2024 18:38:17 +0200 Subject: [PATCH 0938/1578] loop: Disable fallocate() zero and discard if not supported If fallcate is implemented but zero and discard operations are not supported by the filesystem the backing file is on we continue to fill dmesg with errors from the blk_mq_end_request() since each time we call fallocate() on the loop device the EOPNOTSUPP error from lo_fallocate() ends up propagated into the block layer. In the end syscall succeeds since the blkdev_issue_zeroout() falls back to writing zeroes which makes the errors even more misleading and confusing. How to reproduce: 1. make sure /tmp is mounted as tmpfs 2. dd if=/dev/zero of=/tmp/disk.img bs=1M count=100 3. losetup /dev/loop0 /tmp/disk.img 4. mkfs.ext2 /dev/loop0 5. dmesg |tail [710690.898214] operation not supported error, dev loop0, sector 204672 op 0x9:(WRITE_ZEROES) flags 0x8000800 phys_seg 0 prio class 0 [710690.898279] operation not supported error, dev loop0, sector 522 op 0x9:(WRITE_ZEROES) flags 0x8000800 phys_seg 0 prio class 0 [710690.898603] operation not supported error, dev loop0, sector 16906 op 0x9:(WRITE_ZEROES) flags 0x8000800 phys_seg 0 prio class 0 [710690.898917] operation not supported error, dev loop0, sector 32774 op 0x9:(WRITE_ZEROES) flags 0x8000800 phys_seg 0 prio class 0 [710690.899218] operation not supported error, dev loop0, sector 49674 op 0x9:(WRITE_ZEROES) flags 0x8000800 phys_seg 0 prio class 0 [710690.899484] operation not supported error, dev loop0, sector 65542 op 0x9:(WRITE_ZEROES) flags 0x8000800 phys_seg 0 prio class 0 [710690.899743] operation not supported error, dev loop0, sector 82442 op 0x9:(WRITE_ZEROES) flags 0x8000800 phys_seg 0 prio class 0 [710690.900015] operation not supported error, dev loop0, sector 98310 op 0x9:(WRITE_ZEROES) flags 0x8000800 phys_seg 0 prio class 0 [710690.900276] operation not supported error, dev loop0, sector 115210 op 0x9:(WRITE_ZEROES) flags 0x8000800 phys_seg 0 prio class 0 [710690.900546] operation not supported error, dev loop0, sector 131078 op 0x9:(WRITE_ZEROES) flags 0x8000800 phys_seg 0 prio class 0 This patch changes the lo_fallocate() to clear the flags for zero and discard operations if we get EOPNOTSUPP from the backing file fallocate callback, that way we at least stop spewing errors after the first unsuccessful try. CC: Jan Kara Signed-off-by: Cyril Hrubis Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20240613163817.22640-1-chrubis@suse.cz Signed-off-by: Jens Axboe --- drivers/block/loop.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 93780f41646b75..1153721bc7c250 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -302,6 +302,21 @@ static int lo_read_simple(struct loop_device *lo, struct request *rq, return 0; } +static void loop_clear_limits(struct loop_device *lo, int mode) +{ + struct queue_limits lim = queue_limits_start_update(lo->lo_queue); + + if (mode & FALLOC_FL_ZERO_RANGE) + lim.max_write_zeroes_sectors = 0; + + if (mode & FALLOC_FL_PUNCH_HOLE) { + lim.max_hw_discard_sectors = 0; + lim.discard_granularity = 0; + } + + queue_limits_commit_update(lo->lo_queue, &lim); +} + static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos, int mode) { @@ -320,6 +335,14 @@ static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos, ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq)); if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP)) return -EIO; + + /* + * We initially configure the limits in a hope that fallocate is + * supported and clear them here if that turns out not to be true. + */ + if (unlikely(ret == -EOPNOTSUPP)) + loop_clear_limits(lo, mode); + return ret; } From 721f2e6653f5ab0cc52b3a459c4a2158b92fcf80 Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Thu, 13 Jun 2024 14:37:11 +0100 Subject: [PATCH 0939/1578] ALSA: hda: cs35l56: Component should be unbound before deconstruction The interface associated with the hda_component should be deactivated before the driver is deconstructed during removal. Fixes: 73cfbfa9caea ("ALSA: hda/cs35l56: Add driver for Cirrus Logic CS35L56 amplifier") Signed-off-by: Simon Trimmer Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20240613133713.75550-2-simont@opensource.cirrus.com --- sound/pci/hda/cs35l56_hda.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/cs35l56_hda.c b/sound/pci/hda/cs35l56_hda.c index 0923e2589f5f78..e134ede6c5aa54 100644 --- a/sound/pci/hda/cs35l56_hda.c +++ b/sound/pci/hda/cs35l56_hda.c @@ -1077,12 +1077,12 @@ void cs35l56_hda_remove(struct device *dev) { struct cs35l56_hda *cs35l56 = dev_get_drvdata(dev); + component_del(cs35l56->base.dev, &cs35l56_hda_comp_ops); + pm_runtime_dont_use_autosuspend(cs35l56->base.dev); pm_runtime_get_sync(cs35l56->base.dev); pm_runtime_disable(cs35l56->base.dev); - component_del(cs35l56->base.dev, &cs35l56_hda_comp_ops); - cs_dsp_remove(&cs35l56->cs_dsp); kfree(cs35l56->system_name); From 6f9a40d61cad0f5560e8530b4dd4a05fc4d15987 Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Thu, 13 Jun 2024 14:37:12 +0100 Subject: [PATCH 0940/1578] ALSA: hda: cs35l41: Component should be unbound before deconstruction The interface associated with the hda_component should be deactivated before the driver is deconstructed during removal. Fixes: 7b2f3eb492da ("ALSA: hda: cs35l41: Add support for CS35L41 in HDA systems") Signed-off-by: Simon Trimmer Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20240613133713.75550-3-simont@opensource.cirrus.com --- sound/pci/hda/cs35l41_hda.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c index d54d4d60b03ec7..031703f010be5b 100644 --- a/sound/pci/hda/cs35l41_hda.c +++ b/sound/pci/hda/cs35l41_hda.c @@ -2019,6 +2019,8 @@ void cs35l41_hda_remove(struct device *dev) { struct cs35l41_hda *cs35l41 = dev_get_drvdata(dev); + component_del(cs35l41->dev, &cs35l41_hda_comp_ops); + pm_runtime_get_sync(cs35l41->dev); pm_runtime_dont_use_autosuspend(cs35l41->dev); pm_runtime_disable(cs35l41->dev); @@ -2026,8 +2028,6 @@ void cs35l41_hda_remove(struct device *dev) if (cs35l41->halo_initialized) cs35l41_remove_dsp(cs35l41); - component_del(cs35l41->dev, &cs35l41_hda_comp_ops); - acpi_dev_put(cs35l41->dacpi); pm_runtime_put_noidle(cs35l41->dev); From d832b5a03e94a2a9f866dab3d04937a0f84ea116 Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Thu, 13 Jun 2024 14:37:13 +0100 Subject: [PATCH 0941/1578] ALSA: hda: tas2781: Component should be unbound before deconstruction The interface associated with the hda_component should be deactivated before the driver is deconstructed during removal. Fixes: 4e7914eb1dae ("ALSA: hda/tas2781: remove sound controls in unbind") Signed-off-by: Simon Trimmer Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20240613133713.75550-4-simont@opensource.cirrus.com --- sound/pci/hda/tas2781_hda_i2c.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/tas2781_hda_i2c.c b/sound/pci/hda/tas2781_hda_i2c.c index 75f7674c66ee7a..fdee6592c502d0 100644 --- a/sound/pci/hda/tas2781_hda_i2c.c +++ b/sound/pci/hda/tas2781_hda_i2c.c @@ -777,11 +777,11 @@ static void tas2781_hda_remove(struct device *dev) { struct tas2781_hda *tas_hda = dev_get_drvdata(dev); + component_del(tas_hda->dev, &tas2781_hda_comp_ops); + pm_runtime_get_sync(tas_hda->dev); pm_runtime_disable(tas_hda->dev); - component_del(tas_hda->dev, &tas2781_hda_comp_ops); - pm_runtime_put_noidle(tas_hda->dev); tasdevice_remove(tas_hda->priv); From 3253aba3408aa4eb2e4e09365eede3e63ef7536b Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Tue, 11 Jun 2024 13:45:49 +0200 Subject: [PATCH 0942/1578] rust: block: introduce `kernel::block::mq` module Add initial abstractions for working with blk-mq. This patch is a maintained, refactored subset of code originally published by Wedson Almeida Filho [1]. [1] https://github.com/wedsonaf/linux/tree/f2cfd2fe0e2ca4e90994f96afe268bbd4382a891/rust/kernel/blk/mq.rs Cc: Wedson Almeida Filho Signed-off-by: Andreas Hindborg Reviewed-by: Benno Lossin Link: https://lore.kernel.org/r/20240611114551.228679-2-nmi@metaspace.dk Signed-off-by: Jens Axboe --- rust/bindings/bindings_helper.h | 3 + rust/helpers.c | 16 ++ rust/kernel/block.rs | 5 + rust/kernel/block/mq.rs | 98 +++++++++++ rust/kernel/block/mq/gen_disk.rs | 215 ++++++++++++++++++++++++ rust/kernel/block/mq/operations.rs | 245 ++++++++++++++++++++++++++++ rust/kernel/block/mq/raw_writer.rs | 55 +++++++ rust/kernel/block/mq/request.rs | 253 +++++++++++++++++++++++++++++ rust/kernel/block/mq/tag_set.rs | 86 ++++++++++ rust/kernel/error.rs | 6 + rust/kernel/lib.rs | 2 + 11 files changed, 984 insertions(+) create mode 100644 rust/kernel/block.rs create mode 100644 rust/kernel/block/mq.rs create mode 100644 rust/kernel/block/mq/gen_disk.rs create mode 100644 rust/kernel/block/mq/operations.rs create mode 100644 rust/kernel/block/mq/raw_writer.rs create mode 100644 rust/kernel/block/mq/request.rs create mode 100644 rust/kernel/block/mq/tag_set.rs diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index ddb5644d4fd900..84f601d7068e37 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -7,6 +7,8 @@ */ #include +#include +#include #include #include #include @@ -20,6 +22,7 @@ /* `bindgen` gets confused at certain things. */ const size_t RUST_CONST_HELPER_ARCH_SLAB_MINALIGN = ARCH_SLAB_MINALIGN; +const size_t RUST_CONST_HELPER_PAGE_SIZE = PAGE_SIZE; const gfp_t RUST_CONST_HELPER_GFP_ATOMIC = GFP_ATOMIC; const gfp_t RUST_CONST_HELPER_GFP_KERNEL = GFP_KERNEL; const gfp_t RUST_CONST_HELPER_GFP_KERNEL_ACCOUNT = GFP_KERNEL_ACCOUNT; diff --git a/rust/helpers.c b/rust/helpers.c index 2c37a0f5d7a84f..3df5217fb2fff8 100644 --- a/rust/helpers.c +++ b/rust/helpers.c @@ -186,3 +186,19 @@ static_assert( __alignof__(size_t) == __alignof__(uintptr_t), "Rust code expects C `size_t` to match Rust `usize`" ); + +// This will soon be moved to a separate file, so no need to merge with above. +#include +#include + +void *rust_helper_blk_mq_rq_to_pdu(struct request *rq) +{ + return blk_mq_rq_to_pdu(rq); +} +EXPORT_SYMBOL_GPL(rust_helper_blk_mq_rq_to_pdu); + +struct request *rust_helper_blk_mq_rq_from_pdu(void *pdu) +{ + return blk_mq_rq_from_pdu(pdu); +} +EXPORT_SYMBOL_GPL(rust_helper_blk_mq_rq_from_pdu); diff --git a/rust/kernel/block.rs b/rust/kernel/block.rs new file mode 100644 index 00000000000000..150f710efe5b46 --- /dev/null +++ b/rust/kernel/block.rs @@ -0,0 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Types for working with the block layer. + +pub mod mq; diff --git a/rust/kernel/block/mq.rs b/rust/kernel/block/mq.rs new file mode 100644 index 00000000000000..fb0f393c1cea6c --- /dev/null +++ b/rust/kernel/block/mq.rs @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! This module provides types for implementing block drivers that interface the +//! blk-mq subsystem. +//! +//! To implement a block device driver, a Rust module must do the following: +//! +//! - Implement [`Operations`] for a type `T`. +//! - Create a [`TagSet`]. +//! - Create a [`GenDisk`], via the [`GenDiskBuilder`]. +//! - Add the disk to the system by calling [`GenDiskBuilder::build`] passing in +//! the `TagSet` reference. +//! +//! The types available in this module that have direct C counterparts are: +//! +//! - The [`TagSet`] type that abstracts the C type `struct tag_set`. +//! - The [`GenDisk`] type that abstracts the C type `struct gendisk`. +//! - The [`Request`] type that abstracts the C type `struct request`. +//! +//! The kernel will interface with the block device driver by calling the method +//! implementations of the `Operations` trait. +//! +//! IO requests are passed to the driver as [`kernel::types::ARef`] +//! instances. The `Request` type is a wrapper around the C `struct request`. +//! The driver must mark end of processing by calling one of the +//! `Request::end`, methods. Failure to do so can lead to deadlock or timeout +//! errors. Please note that the C function `blk_mq_start_request` is implicitly +//! called when the request is queued with the driver. +//! +//! The `TagSet` is responsible for creating and maintaining a mapping between +//! `Request`s and integer ids as well as carrying a pointer to the vtable +//! generated by `Operations`. This mapping is useful for associating +//! completions from hardware with the correct `Request` instance. The `TagSet` +//! determines the maximum queue depth by setting the number of `Request` +//! instances available to the driver, and it determines the number of queues to +//! instantiate for the driver. If possible, a driver should allocate one queue +//! per core, to keep queue data local to a core. +//! +//! One `TagSet` instance can be shared between multiple `GenDisk` instances. +//! This can be useful when implementing drivers where one piece of hardware +//! with one set of IO resources are represented to the user as multiple disks. +//! +//! One significant difference between block device drivers implemented with +//! these Rust abstractions and drivers implemented in C, is that the Rust +//! drivers have to own a reference count on the `Request` type when the IO is +//! in flight. This is to ensure that the C `struct request` instances backing +//! the Rust `Request` instances are live while the Rust driver holds a +//! reference to the `Request`. In addition, the conversion of an integer tag to +//! a `Request` via the `TagSet` would not be sound without this bookkeeping. +//! +//! [`GenDisk`]: gen_disk::GenDisk +//! [`GenDisk`]: gen_disk::GenDisk +//! [`GenDiskBuilder`]: gen_disk::GenDiskBuilder +//! [`GenDiskBuilder::build`]: gen_disk::GenDiskBuilder::build +//! +//! # Example +//! +//! ```rust +//! use kernel::{ +//! alloc::flags, +//! block::mq::*, +//! new_mutex, +//! prelude::*, +//! sync::{Arc, Mutex}, +//! types::{ARef, ForeignOwnable}, +//! }; +//! +//! struct MyBlkDevice; +//! +//! #[vtable] +//! impl Operations for MyBlkDevice { +//! +//! fn queue_rq(rq: ARef>, _is_last: bool) -> Result { +//! Request::end_ok(rq); +//! Ok(()) +//! } +//! +//! fn commit_rqs() {} +//! } +//! +//! let tagset: Arc> = +//! Arc::pin_init(TagSet::new(1, 256, 1), flags::GFP_KERNEL)?; +//! let mut disk = gen_disk::GenDiskBuilder::new() +//! .capacity_sectors(4096) +//! .build(format_args!("myblk"), tagset)?; +//! +//! # Ok::<(), kernel::error::Error>(()) +//! ``` + +pub mod gen_disk; +mod operations; +mod raw_writer; +mod request; +mod tag_set; + +pub use operations::Operations; +pub use request::Request; +pub use tag_set::TagSet; diff --git a/rust/kernel/block/mq/gen_disk.rs b/rust/kernel/block/mq/gen_disk.rs new file mode 100644 index 00000000000000..3b9edb96c8ff77 --- /dev/null +++ b/rust/kernel/block/mq/gen_disk.rs @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Generic disk abstraction. +//! +//! C header: [`include/linux/blkdev.h`](srctree/include/linux/blkdev.h) +//! C header: [`include/linux/blk_mq.h`](srctree/include/linux/blk_mq.h) + +use crate::block::mq::{raw_writer::RawWriter, Operations, TagSet}; +use crate::error; +use crate::{bindings, error::from_err_ptr, error::Result, sync::Arc}; +use core::fmt::{self, Write}; + +/// A builder for [`GenDisk`]. +/// +/// Use this struct to configure and add new [`GenDisk`] to the VFS. +pub struct GenDiskBuilder { + rotational: bool, + logical_block_size: u32, + physical_block_size: u32, + capacity_sectors: u64, +} + +impl Default for GenDiskBuilder { + fn default() -> Self { + Self { + rotational: false, + logical_block_size: bindings::PAGE_SIZE as u32, + physical_block_size: bindings::PAGE_SIZE as u32, + capacity_sectors: 0, + } + } +} + +impl GenDiskBuilder { + /// Create a new instance. + pub fn new() -> Self { + Self::default() + } + + /// Set the rotational media attribute for the device to be built. + pub fn rotational(mut self, rotational: bool) -> Self { + self.rotational = rotational; + self + } + + /// Validate block size by verifying that it is between 512 and `PAGE_SIZE`, + /// and that it is a power of two. + fn validate_block_size(size: u32) -> Result<()> { + if !(512..=bindings::PAGE_SIZE as u32).contains(&size) || !size.is_power_of_two() { + Err(error::code::EINVAL) + } else { + Ok(()) + } + } + + /// Set the logical block size of the device to be built. + /// + /// This method will check that block size is a power of two and between 512 + /// and 4096. If not, an error is returned and the block size is not set. + /// + /// This is the smallest unit the storage device can address. It is + /// typically 4096 bytes. + pub fn logical_block_size(mut self, block_size: u32) -> Result { + Self::validate_block_size(block_size)?; + self.logical_block_size = block_size; + Ok(self) + } + + /// Set the physical block size of the device to be built. + /// + /// This method will check that block size is a power of two and between 512 + /// and 4096. If not, an error is returned and the block size is not set. + /// + /// This is the smallest unit a physical storage device can write + /// atomically. It is usually the same as the logical block size but may be + /// bigger. One example is SATA drives with 4096 byte physical block size + /// that expose a 512 byte logical block size to the operating system. + pub fn physical_block_size(mut self, block_size: u32) -> Result { + Self::validate_block_size(block_size)?; + self.physical_block_size = block_size; + Ok(self) + } + + /// Set the capacity of the device to be built, in sectors (512 bytes). + pub fn capacity_sectors(mut self, capacity: u64) -> Self { + self.capacity_sectors = capacity; + self + } + + /// Build a new `GenDisk` and add it to the VFS. + pub fn build( + self, + name: fmt::Arguments<'_>, + tagset: Arc>, + ) -> Result> { + let lock_class_key = crate::sync::LockClassKey::new(); + + // SAFETY: `tagset.raw_tag_set()` points to a valid and initialized tag set + let gendisk = from_err_ptr(unsafe { + bindings::__blk_mq_alloc_disk( + tagset.raw_tag_set(), + core::ptr::null_mut(), // TODO: We can pass queue limits right here + core::ptr::null_mut(), + lock_class_key.as_ptr(), + ) + })?; + + const TABLE: bindings::block_device_operations = bindings::block_device_operations { + submit_bio: None, + open: None, + release: None, + ioctl: None, + compat_ioctl: None, + check_events: None, + unlock_native_capacity: None, + getgeo: None, + set_read_only: None, + swap_slot_free_notify: None, + report_zones: None, + devnode: None, + alternative_gpt_sector: None, + get_unique_id: None, + // TODO: Set to THIS_MODULE. Waiting for const_refs_to_static feature to + // be merged (unstable in rustc 1.78 which is staged for linux 6.10) + // https://github.com/rust-lang/rust/issues/119618 + owner: core::ptr::null_mut(), + pr_ops: core::ptr::null_mut(), + free_disk: None, + poll_bio: None, + }; + + // SAFETY: `gendisk` is a valid pointer as we initialized it above + unsafe { (*gendisk).fops = &TABLE }; + + let mut raw_writer = RawWriter::from_array( + // SAFETY: `gendisk` points to a valid and initialized instance. We + // have exclusive access, since the disk is not added to the VFS + // yet. + unsafe { &mut (*gendisk).disk_name }, + )?; + raw_writer.write_fmt(name)?; + raw_writer.write_char('\0')?; + + // SAFETY: `gendisk` points to a valid and initialized instance of + // `struct gendisk`. We have exclusive access, so we cannot race. + unsafe { + bindings::blk_queue_logical_block_size((*gendisk).queue, self.logical_block_size) + }; + + // SAFETY: `gendisk` points to a valid and initialized instance of + // `struct gendisk`. We have exclusive access, so we cannot race. + unsafe { + bindings::blk_queue_physical_block_size((*gendisk).queue, self.physical_block_size) + }; + + // SAFETY: `gendisk` points to a valid and initialized instance of + // `struct gendisk`. `set_capacity` takes a lock to synchronize this + // operation, so we will not race. + unsafe { bindings::set_capacity(gendisk, self.capacity_sectors) }; + + if !self.rotational { + // SAFETY: `gendisk` points to a valid and initialized instance of + // `struct gendisk`. This operation uses a relaxed atomic bit flip + // operation, so there is no race on this field. + unsafe { bindings::blk_queue_flag_set(bindings::QUEUE_FLAG_NONROT, (*gendisk).queue) }; + } else { + // SAFETY: `gendisk` points to a valid and initialized instance of + // `struct gendisk`. This operation uses a relaxed atomic bit flip + // operation, so there is no race on this field. + unsafe { + bindings::blk_queue_flag_clear(bindings::QUEUE_FLAG_NONROT, (*gendisk).queue) + }; + } + + crate::error::to_result( + // SAFETY: `gendisk` points to a valid and initialized instance of + // `struct gendisk`. + unsafe { + bindings::device_add_disk(core::ptr::null_mut(), gendisk, core::ptr::null_mut()) + }, + )?; + + // INVARIANT: `gendisk` was initialized above. + // INVARIANT: `gendisk` was added to the VFS via `device_add_disk` above. + Ok(GenDisk { + _tagset: tagset, + gendisk, + }) + } +} + +/// A generic block device. +/// +/// # Invariants +/// +/// - `gendisk` must always point to an initialized and valid `struct gendisk`. +/// - `gendisk` was added to the VFS through a call to +/// `bindings::device_add_disk`. +pub struct GenDisk { + _tagset: Arc>, + gendisk: *mut bindings::gendisk, +} + +// SAFETY: `GenDisk` is an owned pointer to a `struct gendisk` and an `Arc` to a +// `TagSet` It is safe to send this to other threads as long as T is Send. +unsafe impl Send for GenDisk {} + +impl Drop for GenDisk { + fn drop(&mut self) { + // SAFETY: By type invariant, `self.gendisk` points to a valid and + // initialized instance of `struct gendisk`, and it was previously added + // to the VFS. + unsafe { bindings::del_gendisk(self.gendisk) }; + } +} diff --git a/rust/kernel/block/mq/operations.rs b/rust/kernel/block/mq/operations.rs new file mode 100644 index 00000000000000..9ba7fdfeb4b22c --- /dev/null +++ b/rust/kernel/block/mq/operations.rs @@ -0,0 +1,245 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! This module provides an interface for blk-mq drivers to implement. +//! +//! C header: [`include/linux/blk-mq.h`](srctree/include/linux/blk-mq.h) + +use crate::{ + bindings, + block::mq::request::RequestDataWrapper, + block::mq::Request, + error::{from_result, Result}, + types::ARef, +}; +use core::{marker::PhantomData, sync::atomic::AtomicU64, sync::atomic::Ordering}; + +/// Implement this trait to interface blk-mq as block devices. +/// +/// To implement a block device driver, implement this trait as described in the +/// [module level documentation]. The kernel will use the implementation of the +/// functions defined in this trait to interface a block device driver. Note: +/// There is no need for an exit_request() implementation, because the `drop` +/// implementation of the [`Request`] type will be invoked by automatically by +/// the C/Rust glue logic. +/// +/// [module level documentation]: kernel::block::mq +#[macros::vtable] +pub trait Operations: Sized { + /// Called by the kernel to queue a request with the driver. If `is_last` is + /// `false`, the driver is allowed to defer committing the request. + fn queue_rq(rq: ARef>, is_last: bool) -> Result; + + /// Called by the kernel to indicate that queued requests should be submitted. + fn commit_rqs(); + + /// Called by the kernel to poll the device for completed requests. Only + /// used for poll queues. + fn poll() -> bool { + crate::build_error(crate::error::VTABLE_DEFAULT_ERROR) + } +} + +/// A vtable for blk-mq to interact with a block device driver. +/// +/// A `bindings::blk_mq_ops` vtable is constructed from pointers to the `extern +/// "C"` functions of this struct, exposed through the `OperationsVTable::VTABLE`. +/// +/// For general documentation of these methods, see the kernel source +/// documentation related to `struct blk_mq_operations` in +/// [`include/linux/blk-mq.h`]. +/// +/// [`include/linux/blk-mq.h`]: srctree/include/linux/blk-mq.h +pub(crate) struct OperationsVTable(PhantomData); + +impl OperationsVTable { + /// This function is called by the C kernel. A pointer to this function is + /// installed in the `blk_mq_ops` vtable for the driver. + /// + /// # Safety + /// + /// - The caller of this function must ensure that the pointee of `bd` is + /// valid for reads for the duration of this function. + /// - This function must be called for an initialized and live `hctx`. That + /// is, `Self::init_hctx_callback` was called and + /// `Self::exit_hctx_callback()` was not yet called. + /// - `(*bd).rq` must point to an initialized and live `bindings:request`. + /// That is, `Self::init_request_callback` was called but + /// `Self::exit_request_callback` was not yet called for the request. + /// - `(*bd).rq` must be owned by the driver. That is, the block layer must + /// promise to not access the request until the driver calls + /// `bindings::blk_mq_end_request` for the request. + unsafe extern "C" fn queue_rq_callback( + _hctx: *mut bindings::blk_mq_hw_ctx, + bd: *const bindings::blk_mq_queue_data, + ) -> bindings::blk_status_t { + // SAFETY: `bd.rq` is valid as required by the safety requirement for + // this function. + let request = unsafe { &*(*bd).rq.cast::>() }; + + // One refcount for the ARef, one for being in flight + request.wrapper_ref().refcount().store(2, Ordering::Relaxed); + + // SAFETY: + // - We own a refcount that we took above. We pass that to `ARef`. + // - By the safety requirements of this function, `request` is a valid + // `struct request` and the private data is properly initialized. + // - `rq` will be alive until `blk_mq_end_request` is called and is + // reference counted by `ARef` until then. + let rq = unsafe { Request::aref_from_raw((*bd).rq) }; + + // SAFETY: We have exclusive access and we just set the refcount above. + unsafe { Request::start_unchecked(&rq) }; + + let ret = T::queue_rq( + rq, + // SAFETY: `bd` is valid as required by the safety requirement for + // this function. + unsafe { (*bd).last }, + ); + + if let Err(e) = ret { + e.to_blk_status() + } else { + bindings::BLK_STS_OK as _ + } + } + + /// This function is called by the C kernel. A pointer to this function is + /// installed in the `blk_mq_ops` vtable for the driver. + /// + /// # Safety + /// + /// This function may only be called by blk-mq C infrastructure. + unsafe extern "C" fn commit_rqs_callback(_hctx: *mut bindings::blk_mq_hw_ctx) { + T::commit_rqs() + } + + /// This function is called by the C kernel. It is not currently + /// implemented, and there is no way to exercise this code path. + /// + /// # Safety + /// + /// This function may only be called by blk-mq C infrastructure. + unsafe extern "C" fn complete_callback(_rq: *mut bindings::request) {} + + /// This function is called by the C kernel. A pointer to this function is + /// installed in the `blk_mq_ops` vtable for the driver. + /// + /// # Safety + /// + /// This function may only be called by blk-mq C infrastructure. + unsafe extern "C" fn poll_callback( + _hctx: *mut bindings::blk_mq_hw_ctx, + _iob: *mut bindings::io_comp_batch, + ) -> core::ffi::c_int { + T::poll().into() + } + + /// This function is called by the C kernel. A pointer to this function is + /// installed in the `blk_mq_ops` vtable for the driver. + /// + /// # Safety + /// + /// This function may only be called by blk-mq C infrastructure. This + /// function may only be called once before `exit_hctx_callback` is called + /// for the same context. + unsafe extern "C" fn init_hctx_callback( + _hctx: *mut bindings::blk_mq_hw_ctx, + _tagset_data: *mut core::ffi::c_void, + _hctx_idx: core::ffi::c_uint, + ) -> core::ffi::c_int { + from_result(|| Ok(0)) + } + + /// This function is called by the C kernel. A pointer to this function is + /// installed in the `blk_mq_ops` vtable for the driver. + /// + /// # Safety + /// + /// This function may only be called by blk-mq C infrastructure. + unsafe extern "C" fn exit_hctx_callback( + _hctx: *mut bindings::blk_mq_hw_ctx, + _hctx_idx: core::ffi::c_uint, + ) { + } + + /// This function is called by the C kernel. A pointer to this function is + /// installed in the `blk_mq_ops` vtable for the driver. + /// + /// # Safety + /// + /// - This function may only be called by blk-mq C infrastructure. + /// - `_set` must point to an initialized `TagSet`. + /// - `rq` must point to an initialized `bindings::request`. + /// - The allocation pointed to by `rq` must be at the size of `Request` + /// plus the size of `RequestDataWrapper`. + unsafe extern "C" fn init_request_callback( + _set: *mut bindings::blk_mq_tag_set, + rq: *mut bindings::request, + _hctx_idx: core::ffi::c_uint, + _numa_node: core::ffi::c_uint, + ) -> core::ffi::c_int { + from_result(|| { + // SAFETY: By the safety requirements of this function, `rq` points + // to a valid allocation. + let pdu = unsafe { Request::wrapper_ptr(rq.cast::>()) }; + + // SAFETY: The refcount field is allocated but not initialized, so + // it is valid for writes. + unsafe { RequestDataWrapper::refcount_ptr(pdu.as_ptr()).write(AtomicU64::new(0)) }; + + Ok(0) + }) + } + + /// This function is called by the C kernel. A pointer to this function is + /// installed in the `blk_mq_ops` vtable for the driver. + /// + /// # Safety + /// + /// - This function may only be called by blk-mq C infrastructure. + /// - `_set` must point to an initialized `TagSet`. + /// - `rq` must point to an initialized and valid `Request`. + unsafe extern "C" fn exit_request_callback( + _set: *mut bindings::blk_mq_tag_set, + rq: *mut bindings::request, + _hctx_idx: core::ffi::c_uint, + ) { + // SAFETY: The tagset invariants guarantee that all requests are allocated with extra memory + // for the request data. + let pdu = unsafe { bindings::blk_mq_rq_to_pdu(rq) }.cast::(); + + // SAFETY: `pdu` is valid for read and write and is properly initialised. + unsafe { core::ptr::drop_in_place(pdu) }; + } + + const VTABLE: bindings::blk_mq_ops = bindings::blk_mq_ops { + queue_rq: Some(Self::queue_rq_callback), + queue_rqs: None, + commit_rqs: Some(Self::commit_rqs_callback), + get_budget: None, + put_budget: None, + set_rq_budget_token: None, + get_rq_budget_token: None, + timeout: None, + poll: if T::HAS_POLL { + Some(Self::poll_callback) + } else { + None + }, + complete: Some(Self::complete_callback), + init_hctx: Some(Self::init_hctx_callback), + exit_hctx: Some(Self::exit_hctx_callback), + init_request: Some(Self::init_request_callback), + exit_request: Some(Self::exit_request_callback), + cleanup_rq: None, + busy: None, + map_queues: None, + #[cfg(CONFIG_BLK_DEBUG_FS)] + show_rq: None, + }; + + pub(crate) const fn build() -> &'static bindings::blk_mq_ops { + &Self::VTABLE + } +} diff --git a/rust/kernel/block/mq/raw_writer.rs b/rust/kernel/block/mq/raw_writer.rs new file mode 100644 index 00000000000000..9222465d670bfe --- /dev/null +++ b/rust/kernel/block/mq/raw_writer.rs @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 + +use core::fmt::{self, Write}; + +use crate::error::Result; +use crate::prelude::EINVAL; + +/// A mutable reference to a byte buffer where a string can be written into. +/// +/// # Invariants +/// +/// `buffer` is always null terminated. +pub(crate) struct RawWriter<'a> { + buffer: &'a mut [u8], + pos: usize, +} + +impl<'a> RawWriter<'a> { + /// Create a new `RawWriter` instance. + fn new(buffer: &'a mut [u8]) -> Result> { + *(buffer.last_mut().ok_or(EINVAL)?) = 0; + + // INVARIANT: We null terminated the buffer above. + Ok(Self { buffer, pos: 0 }) + } + + pub(crate) fn from_array( + a: &'a mut [core::ffi::c_char; N], + ) -> Result> { + Self::new( + // SAFETY: the buffer of `a` is valid for read and write as `u8` for + // at least `N` bytes. + unsafe { core::slice::from_raw_parts_mut(a.as_mut_ptr().cast::(), N) }, + ) + } +} + +impl Write for RawWriter<'_> { + fn write_str(&mut self, s: &str) -> fmt::Result { + let bytes = s.as_bytes(); + let len = bytes.len(); + + // We do not want to overwrite our null terminator + if self.pos + len > self.buffer.len() - 1 { + return Err(fmt::Error); + } + + // INVARIANT: We are not overwriting the last byte + self.buffer[self.pos..self.pos + len].copy_from_slice(bytes); + + self.pos += len; + + Ok(()) + } +} diff --git a/rust/kernel/block/mq/request.rs b/rust/kernel/block/mq/request.rs new file mode 100644 index 00000000000000..a0e22827f3f4ec --- /dev/null +++ b/rust/kernel/block/mq/request.rs @@ -0,0 +1,253 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! This module provides a wrapper for the C `struct request` type. +//! +//! C header: [`include/linux/blk-mq.h`](srctree/include/linux/blk-mq.h) + +use crate::{ + bindings, + block::mq::Operations, + error::Result, + types::{ARef, AlwaysRefCounted, Opaque}, +}; +use core::{ + marker::PhantomData, + ptr::{addr_of_mut, NonNull}, + sync::atomic::{AtomicU64, Ordering}, +}; + +/// A wrapper around a blk-mq `struct request`. This represents an IO request. +/// +/// # Implementation details +/// +/// There are four states for a request that the Rust bindings care about: +/// +/// A) Request is owned by block layer (refcount 0) +/// B) Request is owned by driver but with zero `ARef`s in existence +/// (refcount 1) +/// C) Request is owned by driver with exactly one `ARef` in existence +/// (refcount 2) +/// D) Request is owned by driver with more than one `ARef` in existence +/// (refcount > 2) +/// +/// +/// We need to track A and B to ensure we fail tag to request conversions for +/// requests that are not owned by the driver. +/// +/// We need to track C and D to ensure that it is safe to end the request and hand +/// back ownership to the block layer. +/// +/// The states are tracked through the private `refcount` field of +/// `RequestDataWrapper`. This structure lives in the private data area of the C +/// `struct request`. +/// +/// # Invariants +/// +/// * `self.0` is a valid `struct request` created by the C portion of the kernel. +/// * The private data area associated with this request must be an initialized +/// and valid `RequestDataWrapper`. +/// * `self` is reference counted by atomic modification of +/// self.wrapper_ref().refcount(). +/// +#[repr(transparent)] +pub struct Request(Opaque, PhantomData); + +impl Request { + /// Create an `ARef` from a `struct request` pointer. + /// + /// # Safety + /// + /// * The caller must own a refcount on `ptr` that is transferred to the + /// returned `ARef`. + /// * The type invariants for `Request` must hold for the pointee of `ptr`. + pub(crate) unsafe fn aref_from_raw(ptr: *mut bindings::request) -> ARef { + // INVARIANT: By the safety requirements of this function, invariants are upheld. + // SAFETY: By the safety requirement of this function, we own a + // reference count that we can pass to `ARef`. + unsafe { ARef::from_raw(NonNull::new_unchecked(ptr as *const Self as *mut Self)) } + } + + /// Notify the block layer that a request is going to be processed now. + /// + /// The block layer uses this hook to do proper initializations such as + /// starting the timeout timer. It is a requirement that block device + /// drivers call this function when starting to process a request. + /// + /// # Safety + /// + /// The caller must have exclusive ownership of `self`, that is + /// `self.wrapper_ref().refcount() == 2`. + pub(crate) unsafe fn start_unchecked(this: &ARef) { + // SAFETY: By type invariant, `self.0` is a valid `struct request` and + // we have exclusive access. + unsafe { bindings::blk_mq_start_request(this.0.get()) }; + } + + /// Try to take exclusive ownership of `this` by dropping the refcount to 0. + /// This fails if `this` is not the only `ARef` pointing to the underlying + /// `Request`. + /// + /// If the operation is successful, `Ok` is returned with a pointer to the + /// C `struct request`. If the operation fails, `this` is returned in the + /// `Err` variant. + fn try_set_end(this: ARef) -> Result<*mut bindings::request, ARef> { + // We can race with `TagSet::tag_to_rq` + if let Err(_old) = this.wrapper_ref().refcount().compare_exchange( + 2, + 0, + Ordering::Relaxed, + Ordering::Relaxed, + ) { + return Err(this); + } + + let request_ptr = this.0.get(); + core::mem::forget(this); + + Ok(request_ptr) + } + + /// Notify the block layer that the request has been completed without errors. + /// + /// This function will return `Err` if `this` is not the only `ARef` + /// referencing the request. + pub fn end_ok(this: ARef) -> Result<(), ARef> { + let request_ptr = Self::try_set_end(this)?; + + // SAFETY: By type invariant, `this.0` was a valid `struct request`. The + // success of the call to `try_set_end` guarantees that there are no + // `ARef`s pointing to this request. Therefore it is safe to hand it + // back to the block layer. + unsafe { bindings::blk_mq_end_request(request_ptr, bindings::BLK_STS_OK as _) }; + + Ok(()) + } + + /// Return a pointer to the `RequestDataWrapper` stored in the private area + /// of the request structure. + /// + /// # Safety + /// + /// - `this` must point to a valid allocation of size at least size of + /// `Self` plus size of `RequestDataWrapper`. + pub(crate) unsafe fn wrapper_ptr(this: *mut Self) -> NonNull { + let request_ptr = this.cast::(); + // SAFETY: By safety requirements for this function, `this` is a + // valid allocation. + let wrapper_ptr = + unsafe { bindings::blk_mq_rq_to_pdu(request_ptr).cast::() }; + // SAFETY: By C API contract, wrapper_ptr points to a valid allocation + // and is not null. + unsafe { NonNull::new_unchecked(wrapper_ptr) } + } + + /// Return a reference to the `RequestDataWrapper` stored in the private + /// area of the request structure. + pub(crate) fn wrapper_ref(&self) -> &RequestDataWrapper { + // SAFETY: By type invariant, `self.0` is a valid allocation. Further, + // the private data associated with this request is initialized and + // valid. The existence of `&self` guarantees that the private data is + // valid as a shared reference. + unsafe { Self::wrapper_ptr(self as *const Self as *mut Self).as_ref() } + } +} + +/// A wrapper around data stored in the private area of the C `struct request`. +pub(crate) struct RequestDataWrapper { + /// The Rust request refcount has the following states: + /// + /// - 0: The request is owned by C block layer. + /// - 1: The request is owned by Rust abstractions but there are no ARef references to it. + /// - 2+: There are `ARef` references to the request. + refcount: AtomicU64, +} + +impl RequestDataWrapper { + /// Return a reference to the refcount of the request that is embedding + /// `self`. + pub(crate) fn refcount(&self) -> &AtomicU64 { + &self.refcount + } + + /// Return a pointer to the refcount of the request that is embedding the + /// pointee of `this`. + /// + /// # Safety + /// + /// - `this` must point to a live allocation of at least the size of `Self`. + pub(crate) unsafe fn refcount_ptr(this: *mut Self) -> *mut AtomicU64 { + // SAFETY: Because of the safety requirements of this function, the + // field projection is safe. + unsafe { addr_of_mut!((*this).refcount) } + } +} + +// SAFETY: Exclusive access is thread-safe for `Request`. `Request` has no `&mut +// self` methods and `&self` methods that mutate `self` are internally +// synchronized. +unsafe impl Send for Request {} + +// SAFETY: Shared access is thread-safe for `Request`. `&self` methods that +// mutate `self` are internally synchronized` +unsafe impl Sync for Request {} + +/// Store the result of `op(target.load())` in target, returning new value of +/// target. +fn atomic_relaxed_op_return(target: &AtomicU64, op: impl Fn(u64) -> u64) -> u64 { + let old = target.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |x| Some(op(x))); + + // SAFETY: Because the operation passed to `fetch_update` above always + // return `Some`, `old` will always be `Ok`. + let old = unsafe { old.unwrap_unchecked() }; + + op(old) +} + +/// Store the result of `op(target.load)` in `target` if `target.load() != +/// pred`, returning true if the target was updated. +fn atomic_relaxed_op_unless(target: &AtomicU64, op: impl Fn(u64) -> u64, pred: u64) -> bool { + target + .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |x| { + if x == pred { + None + } else { + Some(op(x)) + } + }) + .is_ok() +} + +// SAFETY: All instances of `Request` are reference counted. This +// implementation of `AlwaysRefCounted` ensure that increments to the ref count +// keeps the object alive in memory at least until a matching reference count +// decrement is executed. +unsafe impl AlwaysRefCounted for Request { + fn inc_ref(&self) { + let refcount = &self.wrapper_ref().refcount(); + + #[cfg_attr(not(CONFIG_DEBUG_MISC), allow(unused_variables))] + let updated = atomic_relaxed_op_unless(refcount, |x| x + 1, 0); + + #[cfg(CONFIG_DEBUG_MISC)] + if !updated { + panic!("Request refcount zero on clone") + } + } + + unsafe fn dec_ref(obj: core::ptr::NonNull) { + // SAFETY: The type invariants of `ARef` guarantee that `obj` is valid + // for read. + let wrapper_ptr = unsafe { Self::wrapper_ptr(obj.as_ptr()).as_ptr() }; + // SAFETY: The type invariant of `Request` guarantees that the private + // data area is initialized and valid. + let refcount = unsafe { &*RequestDataWrapper::refcount_ptr(wrapper_ptr) }; + + #[cfg_attr(not(CONFIG_DEBUG_MISC), allow(unused_variables))] + let new_refcount = atomic_relaxed_op_return(refcount, |x| x - 1); + + #[cfg(CONFIG_DEBUG_MISC)] + if new_refcount == 0 { + panic!("Request reached refcount zero in Rust abstractions"); + } + } +} diff --git a/rust/kernel/block/mq/tag_set.rs b/rust/kernel/block/mq/tag_set.rs new file mode 100644 index 00000000000000..f9a1ca655a35be --- /dev/null +++ b/rust/kernel/block/mq/tag_set.rs @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! This module provides the `TagSet` struct to wrap the C `struct blk_mq_tag_set`. +//! +//! C header: [`include/linux/blk-mq.h`](srctree/include/linux/blk-mq.h) + +use core::pin::Pin; + +use crate::{ + bindings, + block::mq::{operations::OperationsVTable, request::RequestDataWrapper, Operations}, + error, + prelude::PinInit, + try_pin_init, + types::Opaque, +}; +use core::{convert::TryInto, marker::PhantomData}; +use macros::{pin_data, pinned_drop}; + +/// A wrapper for the C `struct blk_mq_tag_set`. +/// +/// `struct blk_mq_tag_set` contains a `struct list_head` and so must be pinned. +/// +/// # Invariants +/// +/// - `inner` is initialized and valid. +#[pin_data(PinnedDrop)] +#[repr(transparent)] +pub struct TagSet { + #[pin] + inner: Opaque, + _p: PhantomData, +} + +impl TagSet { + /// Try to create a new tag set + pub fn new( + nr_hw_queues: u32, + num_tags: u32, + num_maps: u32, + ) -> impl PinInit { + // SAFETY: `blk_mq_tag_set` only contains integers and pointers, which + // all are allowed to be 0. + let tag_set: bindings::blk_mq_tag_set = unsafe { core::mem::zeroed() }; + let tag_set = core::mem::size_of::() + .try_into() + .map(|cmd_size| { + bindings::blk_mq_tag_set { + ops: OperationsVTable::::build(), + nr_hw_queues, + timeout: 0, // 0 means default which is 30Hz in C + numa_node: bindings::NUMA_NO_NODE, + queue_depth: num_tags, + cmd_size, + flags: bindings::BLK_MQ_F_SHOULD_MERGE, + driver_data: core::ptr::null_mut::(), + nr_maps: num_maps, + ..tag_set + } + }); + + try_pin_init!(TagSet { + inner <- PinInit::<_, error::Error>::pin_chain(Opaque::new(tag_set?), |tag_set| { + // SAFETY: we do not move out of `tag_set`. + let tag_set = unsafe { Pin::get_unchecked_mut(tag_set) }; + // SAFETY: `tag_set` is a reference to an initialized `blk_mq_tag_set`. + error::to_result( unsafe { bindings::blk_mq_alloc_tag_set(tag_set.get())}) + }), + _p: PhantomData, + }) + } + + /// Return the pointer to the wrapped `struct blk_mq_tag_set` + pub(crate) fn raw_tag_set(&self) -> *mut bindings::blk_mq_tag_set { + self.inner.get() + } +} + +#[pinned_drop] +impl PinnedDrop for TagSet { + fn drop(self: Pin<&mut Self>) { + // SAFETY: By type invariant `inner` is valid and has been properly + // initialized during construction. + unsafe { bindings::blk_mq_free_tag_set(self.inner.get()) }; + } +} diff --git a/rust/kernel/error.rs b/rust/kernel/error.rs index 55280ae9fe40e1..145f5c3970096f 100644 --- a/rust/kernel/error.rs +++ b/rust/kernel/error.rs @@ -126,6 +126,12 @@ impl Error { self.0 } + #[cfg(CONFIG_BLOCK)] + pub(crate) fn to_blk_status(self) -> bindings::blk_status_t { + // SAFETY: `self.0` is a valid error due to its invariant. + unsafe { bindings::errno_to_blk_status(self.0) } + } + /// Returns the error encoded as a pointer. #[allow(dead_code)] pub(crate) fn to_ptr(self) -> *mut T { diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index fbd91a48ff8bc5..2cf7c6b6f66b9d 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -27,6 +27,8 @@ compile_error!("Missing kernel configuration for conditional compilation"); extern crate self as kernel; pub mod alloc; +#[cfg(CONFIG_BLOCK)] +pub mod block; mod build_assert; pub mod error; pub mod init; From bc5b533b91ef0b8a09fe507e23d1c6c43c1fb0f5 Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Tue, 11 Jun 2024 13:45:50 +0200 Subject: [PATCH 0943/1578] rust: block: add rnull, Rust null_blk implementation This patch adds an initial version of the Rust null block driver. Signed-off-by: Andreas Hindborg Reviewed-by: Benno Lossin Link: https://lore.kernel.org/r/20240611114551.228679-3-nmi@metaspace.dk Signed-off-by: Jens Axboe --- drivers/block/Kconfig | 9 ++++++ drivers/block/Makefile | 3 ++ drivers/block/rnull.rs | 73 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+) create mode 100644 drivers/block/rnull.rs diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 5b9d4aaebb81d4..ed209f4f279839 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -354,6 +354,15 @@ config VIRTIO_BLK This is the virtual block driver for virtio. It can be used with QEMU based VMMs (like KVM or Xen). Say Y or M. +config BLK_DEV_RUST_NULL + tristate "Rust null block driver (Experimental)" + depends on RUST + help + This is the Rust implementation of the null block driver. For now it + is only a minimal stub. + + If unsure, say N. + config BLK_DEV_RBD tristate "Rados block device (RBD)" depends on INET && BLOCK diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 101612cba303a3..1105a2d4fdcb00 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -9,6 +9,9 @@ # needed for trace events ccflags-y += -I$(src) +obj-$(CONFIG_BLK_DEV_RUST_NULL) += rnull_mod.o +rnull_mod-y := rnull.o + obj-$(CONFIG_MAC_FLOPPY) += swim3.o obj-$(CONFIG_BLK_DEV_SWIM) += swim_mod.o obj-$(CONFIG_BLK_DEV_FD) += floppy.o diff --git a/drivers/block/rnull.rs b/drivers/block/rnull.rs new file mode 100644 index 00000000000000..b0227cf9ddd387 --- /dev/null +++ b/drivers/block/rnull.rs @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! This is a Rust implementation of the C null block driver. +//! +//! Supported features: +//! +//! - blk-mq interface +//! - direct completion +//! - block size 4k +//! +//! The driver is not configurable. + +use kernel::{ + alloc::flags, + block::mq::{ + self, + gen_disk::{self, GenDisk}, + Operations, TagSet, + }, + error::Result, + new_mutex, pr_info, + prelude::*, + sync::{Arc, Mutex}, + types::ARef, +}; + +module! { + type: NullBlkModule, + name: "rnull_mod", + author: "Andreas Hindborg", + license: "GPL v2", +} + +struct NullBlkModule { + _disk: Pin>>>, +} + +impl kernel::Module for NullBlkModule { + fn init(_module: &'static ThisModule) -> Result { + pr_info!("Rust null_blk loaded\n"); + let tagset = Arc::pin_init(TagSet::new(1, 256, 1), flags::GFP_KERNEL)?; + + let disk = gen_disk::GenDiskBuilder::new() + .capacity_sectors(4096 << 11) + .logical_block_size(4096)? + .physical_block_size(4096)? + .rotational(false) + .build(format_args!("rnullb{}", 0), tagset)?; + + let disk = Box::pin_init(new_mutex!(disk, "nullb:disk"), flags::GFP_KERNEL)?; + + Ok(Self { _disk: disk }) + } +} + +struct NullBlkDevice; + +#[vtable] +impl Operations for NullBlkDevice { + #[inline(always)] + fn queue_rq(rq: ARef>, _is_last: bool) -> Result { + mq::Request::end_ok(rq) + .map_err(|_e| kernel::error::code::EIO) + // We take no refcounts on the request, so we expect to be able to + // end the request. The request reference must be unique at this + // point, and so `end_ok` cannot fail. + .expect("Fatal error - expected to be able to end request"); + + Ok(()) + } + + fn commit_rqs() {} +} From d37a9ab8331cfc0fc2eac0480f0af624c0144a92 Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Tue, 11 Jun 2024 13:45:51 +0200 Subject: [PATCH 0944/1578] MAINTAINERS: add entry for Rust block device driver API Add an entry for the Rust block device driver abstractions. Signed-off-by: Andreas Hindborg Link: https://lore.kernel.org/r/20240611114551.228679-4-nmi@metaspace.dk Signed-off-by: Jens Axboe --- MAINTAINERS | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index aacccb376c28a1..aa1321fdc30010 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3781,6 +3781,20 @@ F: include/linux/blk* F: kernel/trace/blktrace.c F: lib/sbitmap.c +BLOCK LAYER DEVICE DRIVER API [RUST] +M: Andreas Hindborg +R: Boqun Feng +L: linux-block@vger.kernel.org +L: rust-for-linux@vger.kernel.org +S: Supported +W: https://rust-for-linux.com +B: https://github.com/Rust-for-Linux/linux/issues +C: https://rust-for-linux.zulipchat.com/#narrow/stream/Block +T: git https://github.com/Rust-for-Linux/linux.git rust-block-next +F: drivers/block/rnull.rs +F: rust/kernel/block.rs +F: rust/kernel/block/ + BLOCK2MTD DRIVER M: Joern Engel L: linux-mtd@lists.infradead.org From 8af49868e51ed1ba117b74728af12abe1eda82e5 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 13 Jun 2024 14:25:27 +0100 Subject: [PATCH 0945/1578] ASoC: cs35l56: Disconnect ASP1 TX sources when ASP1 DAI is hooked up If the ASP1 DAI is hooked up by the machine driver the ASP TX mixer sources should be initialized to disconnected. There aren't currently any available products using the ASP so this doesn't affect any existing systems. The cs35l56 does not have any fixed default for the mixer source registers. When the cs35l56 boots, its firmware patches these registers to setup a system-specific routing; this is so that Windows can use generic SDCA drivers instead of needing knowledge of chip-specific registers. The setup varies between end-products, which each have customized firmware, and so the default register state varies between end-products. It can also change if the firmware on an end-product is upgraded - for example if a change was needed to the routing for Windows use-cases. It must be emphasized that the settings applied by the firmware are not internal magic tuning; they are statically implementing use-case setup that on Linux would be done via ALSA controls. The driver is currently syncing the mixer controls with whatever initial state the firmware wrote to the registers, so that they report the actual audio routing. But if the ASP DAI is hooked up this can create a powered-up DAPM graph without anything intentionally setting up a path. This can lead to parts of the audio system powering up unexpectedly. For example when cs35l56 is connected to cs42l43 using a codec-codec link, this can create a complete DAPM graph which then powers-up cs42l43. But the cs42l43 can only be clocked from its SoundWire bus so this causes a bunch of errors in the kernel log where cs42l43 is unexpectedly powered-up without a clock. If the host is taking ownership of the ASP (either directly or as a codec-to-codec link) there is no need to keep the mixer settings that the firmware wrote. The driver has ALSA controls for setting these using standard Linux mechanisms. So if the machine driver hooks up the ASP the ASP mixers are initialized to "None" (no input). This prevents unintended DAPM-graph power-ups, and means the initial state of the mixers is always going to be None. Since the initial state of the mixers can vary from system to system and potentially between firmware upgrades, no use-case manager can currently assume that cs35l56 has a known initial state. The firmware could just as easily default them to "None" as to any input source. So defaulting them to "None" in the driver is not increasing the entropy of the system. Signed-off-by: Richard Fitzgerald Link: https://lore.kernel.org/r/20240613132527.46537-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs35l56-shared.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/soc/codecs/cs35l56-shared.c b/sound/soc/codecs/cs35l56-shared.c index 8af89a2635949c..30497152e02a74 100644 --- a/sound/soc/codecs/cs35l56-shared.c +++ b/sound/soc/codecs/cs35l56-shared.c @@ -215,6 +215,10 @@ static const struct reg_sequence cs35l56_asp1_defaults[] = { REG_SEQ0(CS35L56_ASP1_FRAME_CONTROL5, 0x00020100), REG_SEQ0(CS35L56_ASP1_DATA_CONTROL1, 0x00000018), REG_SEQ0(CS35L56_ASP1_DATA_CONTROL5, 0x00000018), + REG_SEQ0(CS35L56_ASP1TX1_INPUT, 0x00000000), + REG_SEQ0(CS35L56_ASP1TX2_INPUT, 0x00000000), + REG_SEQ0(CS35L56_ASP1TX3_INPUT, 0x00000000), + REG_SEQ0(CS35L56_ASP1TX4_INPUT, 0x00000000), }; /* From be1fae62cf253a5b67526cee9fbc07689b97c125 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Thu, 13 Jun 2024 13:13:05 +0100 Subject: [PATCH 0946/1578] ASoC: q6apm-lpass-dai: close graph on prepare errors There is an issue around with error handling and graph management with the exising code, none of the error paths close the graph, which result in leaving the loaded graph in dsp, however the driver thinks otherwise. This can have a nasty side effect specially when we try to load the same graph to dsp, dsp returns error which leaves the board with no sound and requires restart. Fix this by properly closing the graph when we hit errors between open and close. Fixes: 30ad723b93ad ("ASoC: qdsp6: audioreach: add q6apm lpass dai support") Signed-off-by: Srinivas Kandagatla Reviewed-by: Dmitry Baryshkov Tested-by: Dmitry Baryshkov # X13s Link: https://lore.kernel.org/r/20240613-q6apm-fixes-v1-1-d88953675ab3@linaro.org Signed-off-by: Mark Brown --- sound/soc/qcom/qdsp6/q6apm-lpass-dais.c | 32 +++++++++++++++---------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c b/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c index 68a38f63a2dbf5..66b911b49e3f41 100644 --- a/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c +++ b/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c @@ -141,14 +141,17 @@ static void q6apm_lpass_dai_shutdown(struct snd_pcm_substream *substream, struct struct q6apm_lpass_dai_data *dai_data = dev_get_drvdata(dai->dev); int rc; - if (!dai_data->is_port_started[dai->id]) - return; - rc = q6apm_graph_stop(dai_data->graph[dai->id]); - if (rc < 0) - dev_err(dai->dev, "fail to close APM port (%d)\n", rc); + if (dai_data->is_port_started[dai->id]) { + rc = q6apm_graph_stop(dai_data->graph[dai->id]); + dai_data->is_port_started[dai->id] = false; + if (rc < 0) + dev_err(dai->dev, "fail to close APM port (%d)\n", rc); + } - q6apm_graph_close(dai_data->graph[dai->id]); - dai_data->is_port_started[dai->id] = false; + if (dai_data->graph[dai->id]) { + q6apm_graph_close(dai_data->graph[dai->id]); + dai_data->graph[dai->id] = NULL; + } } static int q6apm_lpass_dai_prepare(struct snd_pcm_substream *substream, struct snd_soc_dai *dai) @@ -163,8 +166,10 @@ static int q6apm_lpass_dai_prepare(struct snd_pcm_substream *substream, struct s q6apm_graph_stop(dai_data->graph[dai->id]); dai_data->is_port_started[dai->id] = false; - if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) + if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) { q6apm_graph_close(dai_data->graph[dai->id]); + dai_data->graph[dai->id] = NULL; + } } /** @@ -183,26 +188,29 @@ static int q6apm_lpass_dai_prepare(struct snd_pcm_substream *substream, struct s cfg->direction = substream->stream; rc = q6apm_graph_media_format_pcm(dai_data->graph[dai->id], cfg); - if (rc) { dev_err(dai->dev, "Failed to set media format %d\n", rc); - return rc; + goto err; } rc = q6apm_graph_prepare(dai_data->graph[dai->id]); if (rc) { dev_err(dai->dev, "Failed to prepare Graph %d\n", rc); - return rc; + goto err; } rc = q6apm_graph_start(dai_data->graph[dai->id]); if (rc < 0) { dev_err(dai->dev, "fail to start APM port %x\n", dai->id); - return rc; + goto err; } dai_data->is_port_started[dai->id] = true; return 0; +err: + q6apm_graph_close(dai_data->graph[dai->id]); + dai_data->graph[dai->id] = NULL; + return rc; } static int q6apm_lpass_dai_startup(struct snd_pcm_substream *substream, struct snd_soc_dai *dai) From 2bbe3e5a2f4ef69d13be54f1cf895b4658287080 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 14 Jun 2024 12:17:33 +0200 Subject: [PATCH 0947/1578] bpf: Avoid splat in pskb_pull_reason syzkaller builds (CONFIG_DEBUG_NET=y) frequently trigger a debug hint in pskb_may_pull. We'd like to retain this debug check because it might hint at integer overflows and other issues (kernel code should pull headers, not huge value). In bpf case, this splat isn't interesting at all: such (nonsensical) bpf programs are typically generated by a fuzzer anyway. Do what Eric suggested and suppress such warning. For CONFIG_DEBUG_NET=n we don't need the extra check because pskb_may_pull will do the right thing: return an error without the WARN() backtrace. Fixes: 219eee9c0d16 ("net: skbuff: add overflow debug check to pull/push helpers") Reported-by: syzbot+0c4150bff9fff3bf023c@syzkaller.appspotmail.com Suggested-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: Daniel Borkmann Reviewed-by: Eric Dumazet Acked-by: Daniel Borkmann Closes: https://syzkaller.appspot.com/bug?extid=0c4150bff9fff3bf023c Link: https://lore.kernel.org/netdev/9f254c96-54f2-4457-b7ab-1d9f6187939c@gmail.com/ Link: https://lore.kernel.org/bpf/20240614101801.9496-1-fw@strlen.de --- net/core/filter.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 2510464692af0f..9933851c685e73 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1665,6 +1665,11 @@ static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); static inline int __bpf_try_make_writable(struct sk_buff *skb, unsigned int write_len) { +#ifdef CONFIG_DEBUG_NET + /* Avoid a splat in pskb_may_pull_reason() */ + if (write_len > INT_MAX) + return -EINVAL; +#endif return skb_ensure_writable(skb, write_len); } From d2278f3533a8c4933c52f85784ffa73e8250c524 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 14 Jun 2024 17:22:25 +0200 Subject: [PATCH 0948/1578] thermal: core: Synchronize suspend-prepare and post-suspend actions After commit 5a5efdaffda5 ("thermal: core: Resume thermal zones asynchronously") it is theoretically possible that, if a system suspend starts immediately after a system resume, thermal_zone_device_resume() spawned by the thermal PM notifier for one of the thermal zones at the end of the system resume will run after the PM thermal notifier for the suspend-prepare action. If that happens, tz->suspended set by the latter will be reset by the former which may lead to unexpected consequences. To avoid that race, synchronize thermal_zone_device_resume() with the suspend-prepare thermal PM notifier with the help of additional bool field and completion in struct thermal_zone_device. Note that this also ensures running __thermal_zone_device_update() at least once for each thermal zone between system resume and the following system suspend in case it is needed to start thermal mitigation. Fixes: 5a5efdaffda5 ("thermal: core: Resume thermal zones asynchronously") Signed-off-by: Rafael J. Wysocki --- drivers/thermal/thermal_core.c | 21 +++++++++++++++++++++ drivers/thermal/thermal_core.h | 4 ++++ 2 files changed, 25 insertions(+) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 30567b4994551b..f92529fb0d10e4 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -1397,6 +1397,7 @@ thermal_zone_device_register_with_trips(const char *type, ida_init(&tz->ida); mutex_init(&tz->lock); init_completion(&tz->removal); + init_completion(&tz->resume); id = ida_alloc(&thermal_tz_ida, GFP_KERNEL); if (id < 0) { result = id; @@ -1642,6 +1643,9 @@ static void thermal_zone_device_resume(struct work_struct *work) thermal_zone_device_init(tz); __thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); + complete(&tz->resume); + tz->resuming = false; + mutex_unlock(&tz->lock); } @@ -1659,6 +1663,20 @@ static int thermal_pm_notify(struct notifier_block *nb, list_for_each_entry(tz, &thermal_tz_list, node) { mutex_lock(&tz->lock); + if (tz->resuming) { + /* + * thermal_zone_device_resume() queued up for + * this zone has not acquired the lock yet, so + * release it to let the function run and wait + * util it has done the work. + */ + mutex_unlock(&tz->lock); + + wait_for_completion(&tz->resume); + + mutex_lock(&tz->lock); + } + tz->suspended = true; mutex_unlock(&tz->lock); @@ -1676,6 +1694,9 @@ static int thermal_pm_notify(struct notifier_block *nb, cancel_delayed_work(&tz->poll_queue); + reinit_completion(&tz->resume); + tz->resuming = true; + /* * Replace the work function with the resume one, which * will restore the original work function and schedule diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index 20e7b45673d683..66f67e54e0c8dc 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -55,6 +55,7 @@ struct thermal_governor { * @type: the thermal zone device type * @device: &struct device for this thermal zone * @removal: removal completion + * @resume: resume completion * @trip_temp_attrs: attributes for trip points for sysfs: trip temperature * @trip_type_attrs: attributes for trip points for sysfs: trip type * @trip_hyst_attrs: attributes for trip points for sysfs: trip hysteresis @@ -89,6 +90,7 @@ struct thermal_governor { * @poll_queue: delayed work for polling * @notify_event: Last notification event * @suspended: thermal zone suspend indicator + * @resuming: indicates whether or not thermal zone resume is in progress * @trips: array of struct thermal_trip objects */ struct thermal_zone_device { @@ -96,6 +98,7 @@ struct thermal_zone_device { char type[THERMAL_NAME_LENGTH]; struct device device; struct completion removal; + struct completion resume; struct attribute_group trips_attribute_group; struct thermal_attr *trip_temp_attrs; struct thermal_attr *trip_type_attrs; @@ -123,6 +126,7 @@ struct thermal_zone_device { struct delayed_work poll_queue; enum thermal_notify_event notify_event; bool suspended; + bool resuming; #ifdef CONFIG_THERMAL_DEBUGFS struct thermal_debugfs *debugfs; #endif From 494c7d055081da066424706b28faa9a4c719d852 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 14 Jun 2024 17:26:00 +0200 Subject: [PATCH 0949/1578] thermal: core: Change PM notifier priority to the minimum It is reported that commit 5a5efdaffda5 ("thermal: core: Resume thermal zones asynchronously") causes battery data in sysfs on Thinkpad P1 Gen2 to become invalid after a resume from S3 (and it is necessary to reboot the machine to restore correct battery data). Some investigation into the problem indicated that it happened because, after the commit in question, the ACPI battery PM notifier ran in parallel with thermal_zone_device_resume() for one of the thermal zones which apparently confused the platform firmware on the affected system. While the exact reason for the firmware confusion remains unclear, it is arguably not particularly relevant, and the expected behavior of the affected system can be restored by making the thermal PM notifier run at the lowest priority which avoids interference between work items spawned by it and the other PM notifiers (that will run before those work items now). Fixes: 5a5efdaffda5 ("thermal: core: Resume thermal zones asynchronously") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218881 Reported-by: fhortner@yahoo.de Tested-by: fhortner@yahoo.de Cc: 6.8+ # 6.8+ Signed-off-by: Rafael J. Wysocki --- drivers/thermal/thermal_core.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index f92529fb0d10e4..f7c38c0d6199b3 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -1721,6 +1721,12 @@ static int thermal_pm_notify(struct notifier_block *nb, static struct notifier_block thermal_pm_nb = { .notifier_call = thermal_pm_notify, + /* + * Run at the lowest priority to avoid interference between the thermal + * zone resume work items spawned by thermal_pm_notify() and the other + * PM notifiers. + */ + .priority = INT_MIN, }; static int __init thermal_init(void) From 5db755fbb1a0de4a4cfd5d5edfaa19853b9c56e6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:47:56 +0200 Subject: [PATCH 0950/1578] ubd: refactor the interrupt handler Instead of a separate handler function that leaves no work in the interrupt hanler itself, split out a per-request end I/O helper and clean up the coding style and variable naming while we're at it. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Acked-By: Anton Ivanov Link: https://lore.kernel.org/r/20240531074837.1648501-2-hch@lst.de Signed-off-by: Jens Axboe --- arch/um/drivers/ubd_kern.c | 49 ++++++++++++++------------------------ 1 file changed, 18 insertions(+), 31 deletions(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index ef805eaa9e013d..0c9542d58c01b7 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -447,43 +447,30 @@ static int bulk_req_safe_read( return n; } -/* Called without dev->lock held, and only in interrupt context. */ -static void ubd_handler(void) +static void ubd_end_request(struct io_thread_req *io_req) { - int n; - int count; - - while(1){ - n = bulk_req_safe_read( - thread_fd, - irq_req_buffer, - &irq_remainder, - &irq_remainder_size, - UBD_REQ_BUFFER_SIZE - ); - if (n < 0) { - if(n == -EAGAIN) - break; - printk(KERN_ERR "spurious interrupt in ubd_handler, " - "err = %d\n", -n); - return; - } - for (count = 0; count < n/sizeof(struct io_thread_req *); count++) { - struct io_thread_req *io_req = (*irq_req_buffer)[count]; - - if ((io_req->error == BLK_STS_NOTSUPP) && (req_op(io_req->req) == REQ_OP_DISCARD)) { - blk_queue_max_discard_sectors(io_req->req->q, 0); - blk_queue_max_write_zeroes_sectors(io_req->req->q, 0); - } - blk_mq_end_request(io_req->req, io_req->error); - kfree(io_req); - } + if (io_req->error == BLK_STS_NOTSUPP && + req_op(io_req->req) == REQ_OP_DISCARD) { + blk_queue_max_discard_sectors(io_req->req->q, 0); + blk_queue_max_write_zeroes_sectors(io_req->req->q, 0); } + blk_mq_end_request(io_req->req, io_req->error); + kfree(io_req); } static irqreturn_t ubd_intr(int irq, void *dev) { - ubd_handler(); + int len, i; + + while ((len = bulk_req_safe_read(thread_fd, irq_req_buffer, + &irq_remainder, &irq_remainder_size, + UBD_REQ_BUFFER_SIZE)) >= 0) { + for (i = 0; i < len / sizeof(struct io_thread_req *); i++) + ubd_end_request((*irq_req_buffer)[i]); + } + + if (len < 0 && len != -EAGAIN) + pr_err("spurious interrupt in %s, err = %d\n", __func__, len); return IRQ_HANDLED; } From 31ade7d4fdcf382beb8cb229a1f5d77e0f239672 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:47:57 +0200 Subject: [PATCH 0951/1578] ubd: untagle discard vs write zeroes not support handling Discard and Write Zeroes are different operation and implemented by different fallocate opcodes for ubd. If one fails the other one can work and vice versa. Split the code to disable the operations in ubd_handler to only disable the operation that actually failed. Fixes: 50109b5a03b4 ("um: Add support for DISCARD in the UBD Driver") Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Acked-By: Anton Ivanov Link: https://lore.kernel.org/r/20240531074837.1648501-3-hch@lst.de Signed-off-by: Jens Axboe --- arch/um/drivers/ubd_kern.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 0c9542d58c01b7..093c87879d08ba 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -449,10 +449,11 @@ static int bulk_req_safe_read( static void ubd_end_request(struct io_thread_req *io_req) { - if (io_req->error == BLK_STS_NOTSUPP && - req_op(io_req->req) == REQ_OP_DISCARD) { - blk_queue_max_discard_sectors(io_req->req->q, 0); - blk_queue_max_write_zeroes_sectors(io_req->req->q, 0); + if (io_req->error == BLK_STS_NOTSUPP) { + if (req_op(io_req->req) == REQ_OP_DISCARD) + blk_queue_max_discard_sectors(io_req->req->q, 0); + else if (req_op(io_req->req) == REQ_OP_WRITE_ZEROES) + blk_queue_max_write_zeroes_sectors(io_req->req->q, 0); } blk_mq_end_request(io_req->req, io_req->error); kfree(io_req); From a00d4bfce7c6d7fa4712b8133ec195c9bd142ae6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:47:58 +0200 Subject: [PATCH 0952/1578] rbd: increase io_opt again Commit 16d80c54ad42 ("rbd: set io_min, io_opt and discard_granularity to alloc_size") lowered the io_opt size for rbd from objset_bytes which is 4MB for typical setup to alloc_size which is typically 64KB. The commit mostly talks about discard behavior and does mention io_min in passing. Reducing io_opt means reducing the readahead size, which seems counter-intuitive given that rbd currently abuses the user max_sectors setting to actually increase the I/O size. Switch back to the old setting to allow larger reads (the readahead size despite it's name actually limits the size of any buffered read) and to prepare for using io_opt in the max_sectors calculation and getting drivers out of the business of overriding the max_user_sectors value. Signed-off-by: Christoph Hellwig Acked-by: Ilya Dryomov Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240531074837.1648501-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/rbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 26ff5cd2bf0abc..46dc487ccc17eb 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4955,8 +4955,8 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) struct queue_limits lim = { .max_hw_sectors = objset_bytes >> SECTOR_SHIFT, .max_user_sectors = objset_bytes >> SECTOR_SHIFT, + .io_opt = objset_bytes, .io_min = rbd_dev->opts->alloc_size, - .io_opt = rbd_dev->opts->alloc_size, .max_segments = USHRT_MAX, .max_segment_size = UINT_MAX, }; From a23634644afc2f7c1bac98776440a1f3b161819e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:47:59 +0200 Subject: [PATCH 0953/1578] block: take io_opt and io_min into account for max_sectors The soft max_sectors limit is normally capped by the hardware limits and an arbitrary upper limit enforced by the kernel, but can be modified by the user. A few drivers want to increase this limit (nbd, rbd) or adjust it up or down based on hardware capabilities (sd). Change blk_validate_limits to default max_sectors to the optimal I/O size, or upgrade it to the preferred minimal I/O size if that is larger than the kernel default if no optimal I/O size is provided based on the logic in the SD driver. This keeps the existing kernel default for drivers that do not provide an io_opt or very big io_min value, but picks a much more useful default for those who provide these hints, and allows to remove the hacks to set the user max_sectors limit in nbd, rbd and sd. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Acked-by: Ilya Dryomov Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240531074837.1648501-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 7 +++++++ drivers/block/nbd.c | 2 +- drivers/block/rbd.c | 1 - drivers/scsi/sd.c | 29 +++++------------------------ 4 files changed, 13 insertions(+), 26 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index effeb9a639bb45..a49abdb3554834 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -153,6 +153,13 @@ static int blk_validate_limits(struct queue_limits *lim) if (lim->max_user_sectors < PAGE_SIZE / SECTOR_SIZE) return -EINVAL; lim->max_sectors = min(max_hw_sectors, lim->max_user_sectors); + } else if (lim->io_opt) { + lim->max_sectors = + min(max_hw_sectors, lim->io_opt >> SECTOR_SHIFT); + } else if (lim->io_min && + lim->io_min > (BLK_DEF_MAX_SECTORS_CAP << SECTOR_SHIFT)) { + lim->max_sectors = + min(max_hw_sectors, lim->io_min >> SECTOR_SHIFT); } else { lim->max_sectors = min(max_hw_sectors, BLK_DEF_MAX_SECTORS_CAP); } diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 22a79a62cc4eab..ad887d614d5b3f 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1808,7 +1808,7 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) { struct queue_limits lim = { .max_hw_sectors = 65536, - .max_user_sectors = 256, + .io_opt = 256 << SECTOR_SHIFT, .max_segments = USHRT_MAX, .max_segment_size = UINT_MAX, }; diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 46dc487ccc17eb..22ad704f81d8b9 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4954,7 +4954,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) rbd_dev->layout.object_size * rbd_dev->layout.stripe_count; struct queue_limits lim = { .max_hw_sectors = objset_bytes >> SECTOR_SHIFT, - .max_user_sectors = objset_bytes >> SECTOR_SHIFT, .io_opt = objset_bytes, .io_min = rbd_dev->opts->alloc_size, .max_segments = USHRT_MAX, diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index f6c822c9cbd2d3..3dff9150ce11e2 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3593,7 +3593,7 @@ static int sd_revalidate_disk(struct gendisk *disk) struct request_queue *q = sdkp->disk->queue; sector_t old_capacity = sdkp->capacity; unsigned char *buffer; - unsigned int dev_max, rw_max; + unsigned int dev_max; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_revalidate_disk\n")); @@ -3675,34 +3675,15 @@ static int sd_revalidate_disk(struct gendisk *disk) else blk_queue_io_min(sdkp->disk->queue, 0); - if (sd_validate_opt_xfer_size(sdkp, dev_max)) { - q->limits.io_opt = logical_to_bytes(sdp, sdkp->opt_xfer_blocks); - rw_max = logical_to_sectors(sdp, sdkp->opt_xfer_blocks); - } else { - q->limits.io_opt = 0; - rw_max = min_not_zero(logical_to_sectors(sdp, dev_max), - (sector_t)BLK_DEF_MAX_SECTORS_CAP); - } - /* * Limit default to SCSI host optimal sector limit if set. There may be * an impact on performance for when the size of a request exceeds this * host limit. */ - rw_max = min_not_zero(rw_max, sdp->host->opt_sectors); - - /* Do not exceed controller limit */ - rw_max = min(rw_max, queue_max_hw_sectors(q)); - - /* - * Only update max_sectors if previously unset or if the current value - * exceeds the capabilities of the hardware. - */ - if (sdkp->first_scan || - q->limits.max_sectors > q->limits.max_dev_sectors || - q->limits.max_sectors > q->limits.max_hw_sectors) { - q->limits.max_sectors = rw_max; - q->limits.max_user_sectors = rw_max; + q->limits.io_opt = sdp->host->opt_sectors << SECTOR_SHIFT; + if (sd_validate_opt_xfer_size(sdkp, dev_max)) { + q->limits.io_opt = min_not_zero(q->limits.io_opt, + logical_to_bytes(sdp, sdkp->opt_xfer_blocks)); } sdkp->first_scan = 0; From b3491b0db165c0cbe25874da66d97652c03db654 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:48:00 +0200 Subject: [PATCH 0954/1578] sd: simplify the ZBC case in provisioning_mode_store Don't reset the discard settings to no-op over and over when a user writes to the provisioning attribute as that is already the default mode for ZBC devices. In hindsight we should have made writing to the attribute fail for ZBC devices, but the code has probably been around for far too long to change this now. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240531074837.1648501-6-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 3dff9150ce11e2..83aa17fea39d39 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -461,14 +461,13 @@ provisioning_mode_store(struct device *dev, struct device_attribute *attr, if (!capable(CAP_SYS_ADMIN)) return -EACCES; - if (sd_is_zoned(sdkp)) { - sd_config_discard(sdkp, SD_LBP_DISABLE); - return count; - } - if (sdp->type != TYPE_DISK) return -EINVAL; + /* ignore the provisioning mode for ZBC devices */ + if (sd_is_zoned(sdkp)) + return count; + mode = sysfs_match_string(lbp_mode, buf); if (mode < 0) return -EINVAL; From b0dadb86a90bd5a7b723f9d3a6cf69da9b596496 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:48:01 +0200 Subject: [PATCH 0955/1578] sd: add a sd_disable_discard helper Add helper to disable discard when it is not supported and use it instead of sd_config_discard in the I/O completion handler. This avoids touching more fields than required in the I/O completion handler and prepares for converting sd to use the atomic queue limits API. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240531074837.1648501-7-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 83aa17fea39d39..f07d90474e682b 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -821,6 +821,12 @@ static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd *scmd, return protect; } +static void sd_disable_discard(struct scsi_disk *sdkp) +{ + sdkp->provisioning_mode = SD_LBP_DISABLE; + blk_queue_max_discard_sectors(sdkp->disk->queue, 0); +} + static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode) { struct request_queue *q = sdkp->disk->queue; @@ -2245,12 +2251,12 @@ static int sd_done(struct scsi_cmnd *SCpnt) case 0x24: /* INVALID FIELD IN CDB */ switch (SCpnt->cmnd[0]) { case UNMAP: - sd_config_discard(sdkp, SD_LBP_DISABLE); + sd_disable_discard(sdkp); break; case WRITE_SAME_16: case WRITE_SAME: if (SCpnt->cmnd[1] & 8) { /* UNMAP */ - sd_config_discard(sdkp, SD_LBP_DISABLE); + sd_disable_discard(sdkp); } else { sdkp->device->no_write_same = 1; sd_config_write_same(sdkp); From 9972b8ce0d4ba373901bdd1e15e4de58fcd7f662 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:48:02 +0200 Subject: [PATCH 0956/1578] sd: add a sd_disable_write_same helper Add helper to disable WRITE SAME when it is not supported and use it instead of sd_config_write_same in the I/O completion handler. This avoids touching more fields than required in the I/O completion handler and prepares for converting sd to use the atomic queue limits API. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240531074837.1648501-8-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index f07d90474e682b..70211d0b187652 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1004,6 +1004,13 @@ static blk_status_t sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd) return sd_setup_write_same10_cmnd(cmd, false); } +static void sd_disable_write_same(struct scsi_disk *sdkp) +{ + sdkp->device->no_write_same = 1; + sdkp->max_ws_blocks = 0; + blk_queue_max_write_zeroes_sectors(sdkp->disk->queue, 0); +} + static void sd_config_write_same(struct scsi_disk *sdkp) { struct request_queue *q = sdkp->disk->queue; @@ -2258,8 +2265,7 @@ static int sd_done(struct scsi_cmnd *SCpnt) if (SCpnt->cmnd[1] & 8) { /* UNMAP */ sd_disable_discard(sdkp); } else { - sdkp->device->no_write_same = 1; - sd_config_write_same(sdkp); + sd_disable_write_same(sdkp); req->rq_flags |= RQF_QUIET; } break; From d15b9bd42cd3b2077812d4bf32f532a9bd5c4914 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:48:03 +0200 Subject: [PATCH 0957/1578] sd: simplify the disable case in sd_config_discard Fall through to the main call to blk_queue_max_discard_sectors given that max_blocks has been initialized to zero above instead of duplicating the call. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240531074837.1648501-9-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 70211d0b187652..0dbc6eb7a7cac3 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -844,8 +844,7 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode) case SD_LBP_FULL: case SD_LBP_DISABLE: - blk_queue_max_discard_sectors(q, 0); - return; + break; case SD_LBP_UNMAP: max_blocks = min_not_zero(sdkp->max_unmap_blocks, From f1e8185fc12c699c3abf4f39b1ff5d7793da3a95 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:48:04 +0200 Subject: [PATCH 0958/1578] sd: factor out a sd_discard_mode helper Split the logic to pick the right discard mode into a little helper to prepare for further changes. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240531074837.1648501-10-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 0dbc6eb7a7cac3..39eddfac09ef8f 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3201,6 +3201,25 @@ static void sd_read_app_tag_own(struct scsi_disk *sdkp, unsigned char *buffer) return; } +static unsigned int sd_discard_mode(struct scsi_disk *sdkp) +{ + if (!sdkp->lbpvpd) { + /* LBP VPD page not provided */ + if (sdkp->max_unmap_blocks) + return SD_LBP_UNMAP; + return SD_LBP_WS16; + } + + /* LBP VPD page tells us what to use */ + if (sdkp->lbpu && sdkp->max_unmap_blocks) + return SD_LBP_UNMAP; + if (sdkp->lbpws) + return SD_LBP_WS16; + if (sdkp->lbpws10) + return SD_LBP_WS10; + return SD_LBP_DISABLE; +} + /** * sd_read_block_limits - Query disk device for preferred I/O sizes. * @sdkp: disk to query @@ -3239,23 +3258,7 @@ static void sd_read_block_limits(struct scsi_disk *sdkp) sdkp->unmap_alignment = get_unaligned_be32(&vpd->data[32]) & ~(1 << 31); - if (!sdkp->lbpvpd) { /* LBP VPD page not provided */ - - if (sdkp->max_unmap_blocks) - sd_config_discard(sdkp, SD_LBP_UNMAP); - else - sd_config_discard(sdkp, SD_LBP_WS16); - - } else { /* LBP VPD page tells us what to use */ - if (sdkp->lbpu && sdkp->max_unmap_blocks) - sd_config_discard(sdkp, SD_LBP_UNMAP); - else if (sdkp->lbpws) - sd_config_discard(sdkp, SD_LBP_WS16); - else if (sdkp->lbpws10) - sd_config_discard(sdkp, SD_LBP_WS10); - else - sd_config_discard(sdkp, SD_LBP_DISABLE); - } + sd_config_discard(sdkp, sd_discard_mode(sdkp)); } out: From 9c1d339a1bf45f4d3a2e77bbf24b0ec51f02551c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:48:05 +0200 Subject: [PATCH 0959/1578] sd: cleanup zoned queue limits initialization Consolidate setting zone-related queue limits in sd_zbc_read_zones instead of splitting them between sd_zbc_revalidate_zones and sd_zbc_read_zones, and move the early_zone_information initialization in sd_zbc_read_zones above setting up the queue limits. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240531074837.1648501-11-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sd_zbc.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 806036e48abeda..1c24c844e8d178 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -565,12 +565,6 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) sdkp->zone_info.zone_blocks = zone_blocks; sdkp->zone_info.nr_zones = nr_zones; - blk_queue_chunk_sectors(q, - logical_to_sectors(sdkp->device, zone_blocks)); - - /* Enable block layer zone append emulation */ - blk_queue_max_zone_append_sectors(q, 0); - flags = memalloc_noio_save(); ret = blk_revalidate_disk_zones(disk); memalloc_noio_restore(flags); @@ -625,6 +619,10 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]) if (ret != 0) goto err; + nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks); + sdkp->early_zone_info.nr_zones = nr_zones; + sdkp->early_zone_info.zone_blocks = zone_blocks; + /* The drive satisfies the kernel restrictions: set it up */ blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); if (sdkp->zones_max_open == U32_MAX) @@ -632,10 +630,10 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]) else disk_set_max_open_zones(disk, sdkp->zones_max_open); disk_set_max_active_zones(disk, 0); - nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks); - - sdkp->early_zone_info.nr_zones = nr_zones; - sdkp->early_zone_info.zone_blocks = zone_blocks; + blk_queue_chunk_sectors(q, + logical_to_sectors(sdkp->device, zone_blocks)); + /* Enable block layer zone append emulation */ + blk_queue_max_zone_append_sectors(q, 0); return 0; From 804e498e0496d889090f32f812b5ce1a4f2aa63e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:48:06 +0200 Subject: [PATCH 0960/1578] sd: convert to the atomic queue limits API Assign all queue limits through a local queue_limits variable and queue_limits_commit_update so that we can't race updating them from multiple places, and freeze the queue when updating them so that in-progress I/O submissions don't see half-updated limits. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: John Garry Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240531074837.1648501-12-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 130 ++++++++++++++++++++++++------------------ drivers/scsi/sd.h | 6 +- drivers/scsi/sd_zbc.c | 15 ++--- 3 files changed, 85 insertions(+), 66 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 39eddfac09ef8f..049071b5681989 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -101,12 +101,13 @@ MODULE_ALIAS_SCSI_DEVICE(TYPE_ZBC); #define SD_MINORS 16 -static void sd_config_discard(struct scsi_disk *, unsigned int); -static void sd_config_write_same(struct scsi_disk *); +static void sd_config_discard(struct scsi_disk *sdkp, struct queue_limits *lim, + unsigned int mode); +static void sd_config_write_same(struct scsi_disk *sdkp, + struct queue_limits *lim); static int sd_revalidate_disk(struct gendisk *); static void sd_unlock_native_capacity(struct gendisk *disk); static void sd_shutdown(struct device *); -static void sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer); static void scsi_disk_release(struct device *cdev); static DEFINE_IDA(sd_index_ida); @@ -456,7 +457,8 @@ provisioning_mode_store(struct device *dev, struct device_attribute *attr, { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; - int mode; + struct queue_limits lim; + int mode, err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -472,8 +474,13 @@ provisioning_mode_store(struct device *dev, struct device_attribute *attr, if (mode < 0) return -EINVAL; - sd_config_discard(sdkp, mode); - + lim = queue_limits_start_update(sdkp->disk->queue); + sd_config_discard(sdkp, &lim, mode); + blk_mq_freeze_queue(sdkp->disk->queue); + err = queue_limits_commit_update(sdkp->disk->queue, &lim); + blk_mq_unfreeze_queue(sdkp->disk->queue); + if (err) + return err; return count; } static DEVICE_ATTR_RW(provisioning_mode); @@ -556,6 +563,7 @@ max_write_same_blocks_store(struct device *dev, struct device_attribute *attr, { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; + struct queue_limits lim; unsigned long max; int err; @@ -577,8 +585,13 @@ max_write_same_blocks_store(struct device *dev, struct device_attribute *attr, sdkp->max_ws_blocks = max; } - sd_config_write_same(sdkp); - + lim = queue_limits_start_update(sdkp->disk->queue); + sd_config_write_same(sdkp, &lim); + blk_mq_freeze_queue(sdkp->disk->queue); + err = queue_limits_commit_update(sdkp->disk->queue, &lim); + blk_mq_unfreeze_queue(sdkp->disk->queue); + if (err) + return err; return count; } static DEVICE_ATTR_RW(max_write_same_blocks); @@ -827,17 +840,15 @@ static void sd_disable_discard(struct scsi_disk *sdkp) blk_queue_max_discard_sectors(sdkp->disk->queue, 0); } -static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode) +static void sd_config_discard(struct scsi_disk *sdkp, struct queue_limits *lim, + unsigned int mode) { - struct request_queue *q = sdkp->disk->queue; unsigned int logical_block_size = sdkp->device->sector_size; unsigned int max_blocks = 0; - q->limits.discard_alignment = - sdkp->unmap_alignment * logical_block_size; - q->limits.discard_granularity = - max(sdkp->physical_block_size, - sdkp->unmap_granularity * logical_block_size); + lim->discard_alignment = sdkp->unmap_alignment * logical_block_size; + lim->discard_granularity = max(sdkp->physical_block_size, + sdkp->unmap_granularity * logical_block_size); sdkp->provisioning_mode = mode; switch (mode) { @@ -875,7 +886,8 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode) break; } - blk_queue_max_discard_sectors(q, max_blocks * (logical_block_size >> 9)); + lim->max_hw_discard_sectors = max_blocks * + (logical_block_size >> SECTOR_SHIFT); } static void *sd_set_special_bvec(struct request *rq, unsigned int data_len) @@ -1010,9 +1022,9 @@ static void sd_disable_write_same(struct scsi_disk *sdkp) blk_queue_max_write_zeroes_sectors(sdkp->disk->queue, 0); } -static void sd_config_write_same(struct scsi_disk *sdkp) +static void sd_config_write_same(struct scsi_disk *sdkp, + struct queue_limits *lim) { - struct request_queue *q = sdkp->disk->queue; unsigned int logical_block_size = sdkp->device->sector_size; if (sdkp->device->no_write_same) { @@ -1066,8 +1078,8 @@ static void sd_config_write_same(struct scsi_disk *sdkp) } out: - blk_queue_max_write_zeroes_sectors(q, sdkp->max_ws_blocks * - (logical_block_size >> 9)); + lim->max_write_zeroes_sectors = + sdkp->max_ws_blocks * (logical_block_size >> SECTOR_SHIFT); } static blk_status_t sd_setup_flush_cmnd(struct scsi_cmnd *cmd) @@ -2523,7 +2535,7 @@ static void read_capacity_error(struct scsi_disk *sdkp, struct scsi_device *sdp, #define READ_CAPACITY_RETRIES_ON_RESET 10 static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp, - unsigned char *buffer) + struct queue_limits *lim, unsigned char *buffer) { unsigned char cmd[16]; struct scsi_sense_hdr sshdr; @@ -2597,7 +2609,7 @@ static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp, /* Lowest aligned logical block */ alignment = ((buffer[14] & 0x3f) << 8 | buffer[15]) * sector_size; - blk_queue_alignment_offset(sdp->request_queue, alignment); + lim->alignment_offset = alignment; if (alignment && sdkp->first_scan) sd_printk(KERN_NOTICE, sdkp, "physical block alignment offset: %u\n", alignment); @@ -2608,7 +2620,7 @@ static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp, if (buffer[14] & 0x40) /* LBPRZ */ sdkp->lbprz = 1; - sd_config_discard(sdkp, SD_LBP_WS16); + sd_config_discard(sdkp, lim, SD_LBP_WS16); } sdkp->capacity = lba + 1; @@ -2711,13 +2723,14 @@ static int sd_try_rc16_first(struct scsi_device *sdp) * read disk capacity */ static void -sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer) +sd_read_capacity(struct scsi_disk *sdkp, struct queue_limits *lim, + unsigned char *buffer) { int sector_size; struct scsi_device *sdp = sdkp->device; if (sd_try_rc16_first(sdp)) { - sector_size = read_capacity_16(sdkp, sdp, buffer); + sector_size = read_capacity_16(sdkp, sdp, lim, buffer); if (sector_size == -EOVERFLOW) goto got_data; if (sector_size == -ENODEV) @@ -2737,7 +2750,7 @@ sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer) int old_sector_size = sector_size; sd_printk(KERN_NOTICE, sdkp, "Very big device. " "Trying to use READ CAPACITY(16).\n"); - sector_size = read_capacity_16(sdkp, sdp, buffer); + sector_size = read_capacity_16(sdkp, sdp, lim, buffer); if (sector_size < 0) { sd_printk(KERN_NOTICE, sdkp, "Using 0xffffffff as device size\n"); @@ -2796,9 +2809,8 @@ sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer) */ sector_size = 512; } - blk_queue_logical_block_size(sdp->request_queue, sector_size); - blk_queue_physical_block_size(sdp->request_queue, - sdkp->physical_block_size); + lim->logical_block_size = sector_size; + lim->physical_block_size = sdkp->physical_block_size; sdkp->device->sector_size = sector_size; if (sdkp->capacity > 0xffffffff) @@ -3220,11 +3232,11 @@ static unsigned int sd_discard_mode(struct scsi_disk *sdkp) return SD_LBP_DISABLE; } -/** - * sd_read_block_limits - Query disk device for preferred I/O sizes. - * @sdkp: disk to query +/* + * Query disk device for preferred I/O sizes. */ -static void sd_read_block_limits(struct scsi_disk *sdkp) +static void sd_read_block_limits(struct scsi_disk *sdkp, + struct queue_limits *lim) { struct scsi_vpd *vpd; @@ -3258,7 +3270,7 @@ static void sd_read_block_limits(struct scsi_disk *sdkp) sdkp->unmap_alignment = get_unaligned_be32(&vpd->data[32]) & ~(1 << 31); - sd_config_discard(sdkp, sd_discard_mode(sdkp)); + sd_config_discard(sdkp, lim, sd_discard_mode(sdkp)); } out: @@ -3277,11 +3289,9 @@ static void sd_read_block_limits_ext(struct scsi_disk *sdkp) rcu_read_unlock(); } -/** - * sd_read_block_characteristics - Query block dev. characteristics - * @sdkp: disk to query - */ -static void sd_read_block_characteristics(struct scsi_disk *sdkp) +/* Query block device characteristics */ +static void sd_read_block_characteristics(struct scsi_disk *sdkp, + struct queue_limits *lim) { struct request_queue *q = sdkp->disk->queue; struct scsi_vpd *vpd; @@ -3307,29 +3317,26 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp) #ifdef CONFIG_BLK_DEV_ZONED /* sd_probe rejects ZBD devices early otherwise */ if (sdkp->device->type == TYPE_ZBC) { - /* - * Host-managed. - */ - disk_set_zoned(sdkp->disk); + lim->zoned = true; /* * Per ZBC and ZAC specifications, writes in sequential write * required zones of host-managed devices must be aligned to * the device physical block size. */ - blk_queue_zone_write_granularity(q, sdkp->physical_block_size); + lim->zone_write_granularity = sdkp->physical_block_size; } else { /* * Host-aware devices are treated as conventional. */ - WARN_ON_ONCE(blk_queue_is_zoned(q)); + lim->zoned = false; } #endif /* CONFIG_BLK_DEV_ZONED */ if (!sdkp->first_scan) return; - if (blk_queue_is_zoned(q)) + if (lim->zoned) sd_printk(KERN_NOTICE, sdkp, "Host-managed zoned block device\n"); else if (sdkp->zoned == 1) sd_printk(KERN_NOTICE, sdkp, "Host-aware SMR disk used as regular disk\n"); @@ -3605,8 +3612,10 @@ static int sd_revalidate_disk(struct gendisk *disk) struct scsi_device *sdp = sdkp->device; struct request_queue *q = sdkp->disk->queue; sector_t old_capacity = sdkp->capacity; + struct queue_limits lim; unsigned char *buffer; unsigned int dev_max; + int err; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_revalidate_disk\n")); @@ -3627,12 +3636,14 @@ static int sd_revalidate_disk(struct gendisk *disk) sd_spinup_disk(sdkp); + lim = queue_limits_start_update(sdkp->disk->queue); + /* * Without media there is no reason to ask; moreover, some devices * react badly if we do. */ if (sdkp->media_present) { - sd_read_capacity(sdkp, buffer); + sd_read_capacity(sdkp, &lim, buffer); /* * Some USB/UAS devices return generic values for mode pages * until the media has been accessed. Trigger a READ operation @@ -3651,10 +3662,10 @@ static int sd_revalidate_disk(struct gendisk *disk) if (scsi_device_supports_vpd(sdp)) { sd_read_block_provisioning(sdkp); - sd_read_block_limits(sdkp); + sd_read_block_limits(sdkp, &lim); sd_read_block_limits_ext(sdkp); - sd_read_block_characteristics(sdkp); - sd_zbc_read_zones(sdkp, buffer); + sd_read_block_characteristics(sdkp, &lim); + sd_zbc_read_zones(sdkp, &lim, buffer); sd_read_cpr(sdkp); } @@ -3680,31 +3691,36 @@ static int sd_revalidate_disk(struct gendisk *disk) /* Some devices report a maximum block count for READ/WRITE requests. */ dev_max = min_not_zero(dev_max, sdkp->max_xfer_blocks); - q->limits.max_dev_sectors = logical_to_sectors(sdp, dev_max); + lim.max_dev_sectors = logical_to_sectors(sdp, dev_max); if (sd_validate_min_xfer_size(sdkp)) - blk_queue_io_min(sdkp->disk->queue, - logical_to_bytes(sdp, sdkp->min_xfer_blocks)); + lim.io_min = logical_to_bytes(sdp, sdkp->min_xfer_blocks); else - blk_queue_io_min(sdkp->disk->queue, 0); + lim.io_min = 0; /* * Limit default to SCSI host optimal sector limit if set. There may be * an impact on performance for when the size of a request exceeds this * host limit. */ - q->limits.io_opt = sdp->host->opt_sectors << SECTOR_SHIFT; + lim.io_opt = sdp->host->opt_sectors << SECTOR_SHIFT; if (sd_validate_opt_xfer_size(sdkp, dev_max)) { - q->limits.io_opt = min_not_zero(q->limits.io_opt, + lim.io_opt = min_not_zero(lim.io_opt, logical_to_bytes(sdp, sdkp->opt_xfer_blocks)); } sdkp->first_scan = 0; set_capacity_and_notify(disk, logical_to_sectors(sdp, sdkp->capacity)); - sd_config_write_same(sdkp); + sd_config_write_same(sdkp, &lim); kfree(buffer); + blk_mq_freeze_queue(sdkp->disk->queue); + err = queue_limits_commit_update(sdkp->disk->queue, &lim); + blk_mq_unfreeze_queue(sdkp->disk->queue); + if (err) + return err; + /* * For a zoned drive, revalidating the zones can be done only once * the gendisk capacity is set. So if this fails, set back the gendisk diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 49dd600bfa4825..b4170b17bad47a 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -239,7 +239,8 @@ static inline int sd_is_zoned(struct scsi_disk *sdkp) #ifdef CONFIG_BLK_DEV_ZONED -int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]); +int sd_zbc_read_zones(struct scsi_disk *sdkp, struct queue_limits *lim, + u8 buf[SD_BUF_SIZE]); int sd_zbc_revalidate_zones(struct scsi_disk *sdkp); blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, unsigned char op, bool all); @@ -250,7 +251,8 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, #else /* CONFIG_BLK_DEV_ZONED */ -static inline int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]) +static inline int sd_zbc_read_zones(struct scsi_disk *sdkp, + struct queue_limits *lim, u8 buf[SD_BUF_SIZE]) { return 0; } diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 1c24c844e8d178..f685838d9ed214 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -582,13 +582,15 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) /** * sd_zbc_read_zones - Read zone information and update the request queue * @sdkp: SCSI disk pointer. + * @lim: queue limits to read into * @buf: 512 byte buffer used for storing SCSI command output. * * Read zone information and update the request queue zone characteristics and * also the zoned device information in *sdkp. Called by sd_revalidate_disk() * before the gendisk capacity has been set. */ -int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]) +int sd_zbc_read_zones(struct scsi_disk *sdkp, struct queue_limits *lim, + u8 buf[SD_BUF_SIZE]) { struct gendisk *disk = sdkp->disk; struct request_queue *q = disk->queue; @@ -626,14 +628,13 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]) /* The drive satisfies the kernel restrictions: set it up */ blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); if (sdkp->zones_max_open == U32_MAX) - disk_set_max_open_zones(disk, 0); + lim->max_open_zones = 0; else - disk_set_max_open_zones(disk, sdkp->zones_max_open); - disk_set_max_active_zones(disk, 0); - blk_queue_chunk_sectors(q, - logical_to_sectors(sdkp->device, zone_blocks)); + lim->max_open_zones = sdkp->zones_max_open; + lim->max_active_zones = 0; + lim->chunk_sectors = logical_to_sectors(sdkp->device, zone_blocks); /* Enable block layer zone append emulation */ - blk_queue_max_zone_append_sectors(q, 0); + lim->max_zone_append_sectors = 0; return 0; From 969f17e10f5b732c05186ee0126c8a08166d0cda Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:48:07 +0200 Subject: [PATCH 0961/1578] sr: convert to the atomic queue limits API Assign all queue limits through a local queue_limits variable and queue_limits_commit_update so that we can't race updating them from multiple places, and free the queue when updating them so that in-progress I/O submissions don't see half-updated limits. Also use the chance to clean up variable names to standard ones. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240531074837.1648501-13-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sr.c | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 7ab000942b97fc..3f491019103e0c 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -111,7 +111,7 @@ static struct lock_class_key sr_bio_compl_lkclass; static int sr_open(struct cdrom_device_info *, int); static void sr_release(struct cdrom_device_info *); -static void get_sectorsize(struct scsi_cd *); +static int get_sectorsize(struct scsi_cd *); static int get_capabilities(struct scsi_cd *); static unsigned int sr_check_events(struct cdrom_device_info *cdi, @@ -473,15 +473,15 @@ static blk_status_t sr_init_command(struct scsi_cmnd *SCpnt) return BLK_STS_IOERR; } -static void sr_revalidate_disk(struct scsi_cd *cd) +static int sr_revalidate_disk(struct scsi_cd *cd) { struct scsi_sense_hdr sshdr; /* if the unit is not ready, nothing more to do */ if (scsi_test_unit_ready(cd->device, SR_TIMEOUT, MAX_RETRIES, &sshdr)) - return; + return 0; sr_cd_check(&cd->cdi); - get_sectorsize(cd); + return get_sectorsize(cd); } static int sr_block_open(struct gendisk *disk, blk_mode_t mode) @@ -494,13 +494,16 @@ static int sr_block_open(struct gendisk *disk, blk_mode_t mode) return -ENXIO; scsi_autopm_get_device(sdev); - if (disk_check_media_change(disk)) - sr_revalidate_disk(cd); + if (disk_check_media_change(disk)) { + ret = sr_revalidate_disk(cd); + if (ret) + goto out; + } mutex_lock(&cd->lock); ret = cdrom_open(&cd->cdi, mode); mutex_unlock(&cd->lock); - +out: scsi_autopm_put_device(sdev); if (ret) scsi_device_put(cd->device); @@ -685,7 +688,9 @@ static int sr_probe(struct device *dev) blk_pm_runtime_init(sdev->request_queue, dev); dev_set_drvdata(dev, cd); - sr_revalidate_disk(cd); + error = sr_revalidate_disk(cd); + if (error) + goto unregister_cdrom; error = device_add_disk(&sdev->sdev_gendev, disk, NULL); if (error) @@ -714,13 +719,14 @@ static int sr_probe(struct device *dev) } -static void get_sectorsize(struct scsi_cd *cd) +static int get_sectorsize(struct scsi_cd *cd) { + struct request_queue *q = cd->device->request_queue; static const u8 cmd[10] = { READ_CAPACITY }; unsigned char buffer[8] = { }; - int the_result; + struct queue_limits lim; + int err; int sector_size; - struct request_queue *queue; struct scsi_failure failure_defs[] = { { .result = SCMD_FAILURE_RESULT_ANY, @@ -736,10 +742,10 @@ static void get_sectorsize(struct scsi_cd *cd) }; /* Do the command and wait.. */ - the_result = scsi_execute_cmd(cd->device, cmd, REQ_OP_DRV_IN, buffer, + err = scsi_execute_cmd(cd->device, cmd, REQ_OP_DRV_IN, buffer, sizeof(buffer), SR_TIMEOUT, MAX_RETRIES, &exec_args); - if (the_result) { + if (err) { cd->capacity = 0x1fffff; sector_size = 2048; /* A guess, just in case */ } else { @@ -789,10 +795,12 @@ static void get_sectorsize(struct scsi_cd *cd) set_capacity(cd->disk, cd->capacity); } - queue = cd->device->request_queue; - blk_queue_logical_block_size(queue, sector_size); - - return; + lim = queue_limits_start_update(q); + lim.logical_block_size = sector_size; + blk_mq_freeze_queue(q); + err = queue_limits_commit_update(q, &lim); + blk_mq_unfreeze_queue(q); + return err; } static int get_capabilities(struct scsi_cd *cd) From 1652b0bafeaa8281ca9a805d81e13d7647bd2f44 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:48:08 +0200 Subject: [PATCH 0962/1578] block: remove unused queue limits API Remove all APIs that are unused now that sd and sr have been converted to the atomic queue limits API. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: John Garry Reviewed-by: Nitesh Shetty Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240531074837.1648501-14-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 190 ----------------------------------------- include/linux/blkdev.h | 24 ------ 2 files changed, 214 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index a49abdb3554834..0b038729608f4b 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -293,24 +293,6 @@ int queue_limits_set(struct request_queue *q, struct queue_limits *lim) } EXPORT_SYMBOL_GPL(queue_limits_set); -/** - * blk_queue_chunk_sectors - set size of the chunk for this queue - * @q: the request queue for the device - * @chunk_sectors: chunk sectors in the usual 512b unit - * - * Description: - * If a driver doesn't want IOs to cross a given chunk size, it can set - * this limit and prevent merging across chunks. Note that the block layer - * must accept a page worth of data at any offset. So if the crossing of - * chunks is a hard limitation in the driver, it must still be prepared - * to split single page bios. - **/ -void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors) -{ - q->limits.chunk_sectors = chunk_sectors; -} -EXPORT_SYMBOL(blk_queue_chunk_sectors); - /** * blk_queue_max_discard_sectors - set max sectors for a single discard * @q: the request queue for the device @@ -352,139 +334,6 @@ void blk_queue_max_write_zeroes_sectors(struct request_queue *q, } EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors); -/** - * blk_queue_max_zone_append_sectors - set max sectors for a single zone append - * @q: the request queue for the device - * @max_zone_append_sectors: maximum number of sectors to write per command - * - * Sets the maximum number of sectors allowed for zone append commands. If - * Specifying 0 for @max_zone_append_sectors indicates that the queue does - * not natively support zone append operations and that the block layer must - * emulate these operations using regular writes. - **/ -void blk_queue_max_zone_append_sectors(struct request_queue *q, - unsigned int max_zone_append_sectors) -{ - unsigned int max_sectors = 0; - - if (WARN_ON(!blk_queue_is_zoned(q))) - return; - - if (max_zone_append_sectors) { - max_sectors = min(q->limits.max_hw_sectors, - max_zone_append_sectors); - max_sectors = min(q->limits.chunk_sectors, max_sectors); - - /* - * Signal eventual driver bugs resulting in the max_zone_append - * sectors limit being 0 due to the chunk_sectors limit (zone - * size) not set or the max_hw_sectors limit not set. - */ - WARN_ON_ONCE(!max_sectors); - } - - q->limits.max_zone_append_sectors = max_sectors; -} -EXPORT_SYMBOL_GPL(blk_queue_max_zone_append_sectors); - -/** - * blk_queue_logical_block_size - set logical block size for the queue - * @q: the request queue for the device - * @size: the logical block size, in bytes - * - * Description: - * This should be set to the lowest possible block size that the - * storage device can address. The default of 512 covers most - * hardware. - **/ -void blk_queue_logical_block_size(struct request_queue *q, unsigned int size) -{ - struct queue_limits *limits = &q->limits; - - limits->logical_block_size = size; - - if (limits->discard_granularity < limits->logical_block_size) - limits->discard_granularity = limits->logical_block_size; - - if (limits->physical_block_size < size) - limits->physical_block_size = size; - - if (limits->io_min < limits->physical_block_size) - limits->io_min = limits->physical_block_size; - - limits->max_hw_sectors = - round_down(limits->max_hw_sectors, size >> SECTOR_SHIFT); - limits->max_sectors = - round_down(limits->max_sectors, size >> SECTOR_SHIFT); -} -EXPORT_SYMBOL(blk_queue_logical_block_size); - -/** - * blk_queue_physical_block_size - set physical block size for the queue - * @q: the request queue for the device - * @size: the physical block size, in bytes - * - * Description: - * This should be set to the lowest possible sector size that the - * hardware can operate on without reverting to read-modify-write - * operations. - */ -void blk_queue_physical_block_size(struct request_queue *q, unsigned int size) -{ - q->limits.physical_block_size = size; - - if (q->limits.physical_block_size < q->limits.logical_block_size) - q->limits.physical_block_size = q->limits.logical_block_size; - - if (q->limits.discard_granularity < q->limits.physical_block_size) - q->limits.discard_granularity = q->limits.physical_block_size; - - if (q->limits.io_min < q->limits.physical_block_size) - q->limits.io_min = q->limits.physical_block_size; -} -EXPORT_SYMBOL(blk_queue_physical_block_size); - -/** - * blk_queue_zone_write_granularity - set zone write granularity for the queue - * @q: the request queue for the zoned device - * @size: the zone write granularity size, in bytes - * - * Description: - * This should be set to the lowest possible size allowing to write in - * sequential zones of a zoned block device. - */ -void blk_queue_zone_write_granularity(struct request_queue *q, - unsigned int size) -{ - if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) - return; - - q->limits.zone_write_granularity = size; - - if (q->limits.zone_write_granularity < q->limits.logical_block_size) - q->limits.zone_write_granularity = q->limits.logical_block_size; -} -EXPORT_SYMBOL_GPL(blk_queue_zone_write_granularity); - -/** - * blk_queue_alignment_offset - set physical block alignment offset - * @q: the request queue for the device - * @offset: alignment offset in bytes - * - * Description: - * Some devices are naturally misaligned to compensate for things like - * the legacy DOS partition table 63-sector offset. Low-level drivers - * should call this function for devices whose first sector is not - * naturally aligned. - */ -void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset) -{ - q->limits.alignment_offset = - offset & (q->limits.physical_block_size - 1); - q->limits.misaligned = 0; -} -EXPORT_SYMBOL(blk_queue_alignment_offset); - void disk_update_readahead(struct gendisk *disk) { blk_apply_bdi_limits(disk->bdi, &disk->queue->limits); @@ -514,26 +363,6 @@ void blk_limits_io_min(struct queue_limits *limits, unsigned int min) } EXPORT_SYMBOL(blk_limits_io_min); -/** - * blk_queue_io_min - set minimum request size for the queue - * @q: the request queue for the device - * @min: smallest I/O size in bytes - * - * Description: - * Storage devices may report a granularity or preferred minimum I/O - * size which is the smallest request the device can perform without - * incurring a performance penalty. For disk drives this is often the - * physical block size. For RAID arrays it is often the stripe chunk - * size. A properly aligned multiple of minimum_io_size is the - * preferred request size for workloads where a high number of I/O - * operations is desired. - */ -void blk_queue_io_min(struct request_queue *q, unsigned int min) -{ - blk_limits_io_min(&q->limits, min); -} -EXPORT_SYMBOL(blk_queue_io_min); - /** * blk_limits_io_opt - set optimal request size for a device * @limits: the queue limits @@ -841,25 +670,6 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) } EXPORT_SYMBOL_GPL(blk_queue_write_cache); -/** - * disk_set_zoned - inidicate a zoned device - * @disk: gendisk to configure - */ -void disk_set_zoned(struct gendisk *disk) -{ - struct request_queue *q = disk->queue; - - WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED)); - - /* - * Set the zone write granularity to the device logical block - * size by default. The driver can change this value if needed. - */ - q->limits.zoned = true; - blk_queue_zone_write_granularity(q, queue_logical_block_size(q)); -} -EXPORT_SYMBOL_GPL(disk_set_zoned); - int bdev_alignment_offset(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 24c36929920b76..ad995e7a769811 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -332,8 +332,6 @@ struct queue_limits { typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, void *data); -void disk_set_zoned(struct gendisk *disk); - #define BLK_ALL_ZONES ((unsigned int)-1) int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); @@ -638,18 +636,6 @@ static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) return sector >> ilog2(disk->queue->limits.chunk_sectors); } -static inline void disk_set_max_open_zones(struct gendisk *disk, - unsigned int max_open_zones) -{ - disk->queue->limits.max_open_zones = max_open_zones; -} - -static inline void disk_set_max_active_zones(struct gendisk *disk, - unsigned int max_active_zones) -{ - disk->queue->limits.max_active_zones = max_active_zones; -} - static inline unsigned int bdev_max_open_zones(struct block_device *bdev) { return bdev->bd_disk->queue->limits.max_open_zones; @@ -929,24 +915,14 @@ static inline void queue_limits_cancel_update(struct request_queue *q) /* * Access functions for manipulating queue properties */ -extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int); void blk_queue_max_secure_erase_sectors(struct request_queue *q, unsigned int max_sectors); extern void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors); extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q, unsigned int max_write_same_sectors); -extern void blk_queue_logical_block_size(struct request_queue *, unsigned int); -extern void blk_queue_max_zone_append_sectors(struct request_queue *q, - unsigned int max_zone_append_sectors); -extern void blk_queue_physical_block_size(struct request_queue *, unsigned int); -void blk_queue_zone_write_granularity(struct request_queue *q, - unsigned int size); -extern void blk_queue_alignment_offset(struct request_queue *q, - unsigned int alignment); void disk_update_readahead(struct gendisk *disk); extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); -extern void blk_queue_io_min(struct request_queue *q, unsigned int min); extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); extern void blk_set_stacking_limits(struct queue_limits *lim); From 73e3715ed14844067c5c598e72777641004a7f60 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:48:09 +0200 Subject: [PATCH 0963/1578] block: add special APIs for run-time disabling of discard and friends A few drivers optimistically try to support discard, write zeroes and secure erase and disable the features from the I/O completion handler if the hardware can't support them. This disable can't be done using the atomic queue limits API because the I/O completion handlers can't take sleeping locks or freeze the queue. Keep the existing clearing of the relevant field to zero, but replace the old blk_queue_max_* APIs with new disable APIs that force the value to 0. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Reviewed-by: John Garry Reviewed-by: Nitesh Shetty Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240531074837.1648501-15-hch@lst.de Signed-off-by: Jens Axboe --- arch/um/drivers/ubd_kern.c | 4 ++-- block/blk-settings.c | 41 ------------------------------------ drivers/block/xen-blkfront.c | 4 ++-- drivers/scsi/sd.c | 4 ++-- include/linux/blkdev.h | 28 ++++++++++++++++++------ 5 files changed, 28 insertions(+), 53 deletions(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 093c87879d08ba..cdcb75a68989dd 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -451,9 +451,9 @@ static void ubd_end_request(struct io_thread_req *io_req) { if (io_req->error == BLK_STS_NOTSUPP) { if (req_op(io_req->req) == REQ_OP_DISCARD) - blk_queue_max_discard_sectors(io_req->req->q, 0); + blk_queue_disable_discard(io_req->req->q); else if (req_op(io_req->req) == REQ_OP_WRITE_ZEROES) - blk_queue_max_write_zeroes_sectors(io_req->req->q, 0); + blk_queue_disable_write_zeroes(io_req->req->q); } blk_mq_end_request(io_req->req, io_req->error); kfree(io_req); diff --git a/block/blk-settings.c b/block/blk-settings.c index 0b038729608f4b..996f247fc98e80 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -293,47 +293,6 @@ int queue_limits_set(struct request_queue *q, struct queue_limits *lim) } EXPORT_SYMBOL_GPL(queue_limits_set); -/** - * blk_queue_max_discard_sectors - set max sectors for a single discard - * @q: the request queue for the device - * @max_discard_sectors: maximum number of sectors to discard - **/ -void blk_queue_max_discard_sectors(struct request_queue *q, - unsigned int max_discard_sectors) -{ - struct queue_limits *lim = &q->limits; - - lim->max_hw_discard_sectors = max_discard_sectors; - lim->max_discard_sectors = - min(max_discard_sectors, lim->max_user_discard_sectors); -} -EXPORT_SYMBOL(blk_queue_max_discard_sectors); - -/** - * blk_queue_max_secure_erase_sectors - set max sectors for a secure erase - * @q: the request queue for the device - * @max_sectors: maximum number of sectors to secure_erase - **/ -void blk_queue_max_secure_erase_sectors(struct request_queue *q, - unsigned int max_sectors) -{ - q->limits.max_secure_erase_sectors = max_sectors; -} -EXPORT_SYMBOL(blk_queue_max_secure_erase_sectors); - -/** - * blk_queue_max_write_zeroes_sectors - set max sectors for a single - * write zeroes - * @q: the request queue for the device - * @max_write_zeroes_sectors: maximum number of sectors to write per command - **/ -void blk_queue_max_write_zeroes_sectors(struct request_queue *q, - unsigned int max_write_zeroes_sectors) -{ - q->limits.max_write_zeroes_sectors = max_write_zeroes_sectors; -} -EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors); - void disk_update_readahead(struct gendisk *disk) { blk_apply_bdi_limits(disk->bdi, &disk->queue->limits); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index fd7c0ff2139cee..9b4ec3e4908cce 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1605,8 +1605,8 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) blkif_req(req)->error = BLK_STS_NOTSUPP; info->feature_discard = 0; info->feature_secdiscard = 0; - blk_queue_max_discard_sectors(rq, 0); - blk_queue_max_secure_erase_sectors(rq, 0); + blk_queue_disable_discard(rq); + blk_queue_disable_secure_erase(rq); } break; case BLKIF_OP_FLUSH_DISKCACHE: diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 049071b5681989..d957e29b17a98a 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -837,7 +837,7 @@ static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd *scmd, static void sd_disable_discard(struct scsi_disk *sdkp) { sdkp->provisioning_mode = SD_LBP_DISABLE; - blk_queue_max_discard_sectors(sdkp->disk->queue, 0); + blk_queue_disable_discard(sdkp->disk->queue); } static void sd_config_discard(struct scsi_disk *sdkp, struct queue_limits *lim, @@ -1019,7 +1019,7 @@ static void sd_disable_write_same(struct scsi_disk *sdkp) { sdkp->device->no_write_same = 1; sdkp->max_ws_blocks = 0; - blk_queue_max_write_zeroes_sectors(sdkp->disk->queue, 0); + blk_queue_disable_write_zeroes(sdkp->disk->queue); } static void sd_config_write_same(struct scsi_disk *sdkp, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ad995e7a769811..ac8e0cb2353a0e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -912,15 +912,31 @@ static inline void queue_limits_cancel_update(struct request_queue *q) mutex_unlock(&q->limits_lock); } +/* + * These helpers are for drivers that have sloppy feature negotiation and might + * have to disable DISCARD, WRITE_ZEROES or SECURE_DISCARD from the I/O + * completion handler when the device returned an indicator that the respective + * feature is not actually supported. They are racy and the driver needs to + * cope with that. Try to avoid this scheme if you can. + */ +static inline void blk_queue_disable_discard(struct request_queue *q) +{ + q->limits.max_discard_sectors = 0; +} + +static inline void blk_queue_disable_secure_erase(struct request_queue *q) +{ + q->limits.max_secure_erase_sectors = 0; +} + +static inline void blk_queue_disable_write_zeroes(struct request_queue *q) +{ + q->limits.max_write_zeroes_sectors = 0; +} + /* * Access functions for manipulating queue properties */ -void blk_queue_max_secure_erase_sectors(struct request_queue *q, - unsigned int max_sectors); -extern void blk_queue_max_discard_sectors(struct request_queue *q, - unsigned int max_discard_sectors); -extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q, - unsigned int max_write_same_sectors); void disk_update_readahead(struct gendisk *disk); extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); From 899ee2c3829c5ac14bfc7d3c4a5846c0b709b78f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2024 10:48:11 +0200 Subject: [PATCH 0964/1578] block: initialize integrity buffer to zero before writing it to media Metadata added by bio_integrity_prep is using plain kmalloc, which leads to random kernel memory being written media. For PI metadata this is limited to the app tag that isn't used by kernel generated metadata, but for non-PI metadata the entire buffer leaks kernel memory. Fix this by adding the __GFP_ZERO flag to allocations for writes. Fixes: 7ba1ba12eeef ("block: Block layer data integrity support") Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Kanchan Joshi Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20240613084839.1044015-2-hch@lst.de Signed-off-by: Jens Axboe --- block/bio-integrity.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 2e3e8e04961eae..af7f71d16114de 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -432,6 +432,7 @@ bool bio_integrity_prep(struct bio *bio) unsigned long start, end; unsigned int len, nr_pages; unsigned int bytes, offset, i; + gfp_t gfp = GFP_NOIO; if (!bi) return true; @@ -454,11 +455,19 @@ bool bio_integrity_prep(struct bio *bio) if (!bi->profile->generate_fn || !(bi->flags & BLK_INTEGRITY_GENERATE)) return true; + + /* + * Zero the memory allocated to not leak uninitialized kernel + * memory to disk. For PI this only affects the app tag, but + * for non-integrity metadata it affects the entire metadata + * buffer. + */ + gfp |= __GFP_ZERO; } /* Allocate kernel buffer for protection data */ len = bio_integrity_bytes(bi, bio_sectors(bio)); - buf = kmalloc(len, GFP_NOIO); + buf = kmalloc(len, gfp); if (unlikely(buf == NULL)) { printk(KERN_ERR "could not allocate integrity buffer\n"); goto err_end_io; From d11854ed05635e4a73fa61a988ffdd0978c9e202 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2024 10:48:12 +0200 Subject: [PATCH 0965/1578] md/raid0: don't free conf on raid0_run failure The core md code calls the ->free method which already frees conf. Fixes: 0c031fd37f69 ("md: Move alloc/free acct bioset in to personality") Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20240613084839.1044015-3-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/raid0.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c5d4aeb68404c9..81c01347cd24e6 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -365,18 +365,13 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks return array_sectors; } -static void free_conf(struct mddev *mddev, struct r0conf *conf) -{ - kfree(conf->strip_zone); - kfree(conf->devlist); - kfree(conf); -} - static void raid0_free(struct mddev *mddev, void *priv) { struct r0conf *conf = priv; - free_conf(mddev, conf); + kfree(conf->strip_zone); + kfree(conf->devlist); + kfree(conf); } static int raid0_set_limits(struct mddev *mddev) @@ -415,7 +410,7 @@ static int raid0_run(struct mddev *mddev) if (!mddev_is_dm(mddev)) { ret = raid0_set_limits(mddev); if (ret) - goto out_free_conf; + return ret; } /* calculate array device size */ @@ -427,13 +422,7 @@ static int raid0_run(struct mddev *mddev) dump_zones(mddev); - ret = md_integrity_register(mddev); - if (ret) - goto out_free_conf; - return 0; -out_free_conf: - free_conf(mddev, conf); - return ret; + return md_integrity_register(mddev); } /* From 799af947ed132956d6de6d77a5bc053817ccb06b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2024 10:48:13 +0200 Subject: [PATCH 0966/1578] md/raid1: don't free conf on raid0_run failure The core md code calls the ->free method which already frees conf. Fixes: 07f1a6850c5d ("md/raid1: fail run raid1 array when active disk less than one") Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240613084839.1044015-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/raid1.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 7b8a71ca66dde0..1f321826ef02ba 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -3204,7 +3204,6 @@ static int raid1_set_limits(struct mddev *mddev) return queue_limits_set(mddev->gendisk->queue, &lim); } -static void raid1_free(struct mddev *mddev, void *priv); static int raid1_run(struct mddev *mddev) { struct r1conf *conf; @@ -3238,7 +3237,7 @@ static int raid1_run(struct mddev *mddev) if (!mddev_is_dm(mddev)) { ret = raid1_set_limits(mddev); if (ret) - goto abort; + return ret; } mddev->degraded = 0; @@ -3252,8 +3251,7 @@ static int raid1_run(struct mddev *mddev) */ if (conf->raid_disks - mddev->degraded < 1) { md_unregister_thread(mddev, &conf->thread); - ret = -EINVAL; - goto abort; + return -EINVAL; } if (conf->raid_disks - mddev->degraded == 1) @@ -3277,14 +3275,8 @@ static int raid1_run(struct mddev *mddev) md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); ret = md_integrity_register(mddev); - if (ret) { + if (ret) md_unregister_thread(mddev, &mddev->thread); - goto abort; - } - return 0; - -abort: - raid1_free(mddev, conf); return ret; } From 63e649594ab19cc3122a2d0fc2c94b19932f0b19 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2024 10:48:14 +0200 Subject: [PATCH 0967/1578] dm-integrity: use the nop integrity profile Use the block layer built-in nop profile instead of duplicating it. Tested by: $ dd if=/dev/urandom of=key.bin bs=512 count=1 $ cryptsetup luksFormat -q --type luks2 --integrity hmac-sha256 \ --integrity-no-wipe /dev/nvme0n1 key.bin $ cryptsetup luksOpen /dev/nvme0n1 luks-integrity --key-file key.bin and then doing mkfs.xfs and simple I/O on the mount file system. Signed-off-by: Christoph Hellwig Reviewed-by: Milan Broz Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240613084839.1044015-5-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-crypt.c | 4 ++-- drivers/md/dm-integrity.c | 20 -------------------- 2 files changed, 2 insertions(+), 22 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 1b7a97cc377943..1dfc462f29cd6f 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1176,8 +1176,8 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti) struct blk_integrity *bi = blk_get_integrity(cc->dev->bdev->bd_disk); struct mapped_device *md = dm_table_get_md(ti->table); - /* From now we require underlying device with our integrity profile */ - if (!bi || strcasecmp(bi->profile->name, "DM-DIF-EXT-TAG")) { + /* We require an underlying device with non-PI metadata */ + if (!bi || strcmp(bi->profile->name, "nop")) { ti->error = "Integrity profile not supported."; return -EINVAL; } diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 417fddebe367a2..c1cc27541673c7 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -350,25 +350,6 @@ static struct kmem_cache *journal_io_cache; #define DEBUG_bytes(bytes, len, msg, ...) do { } while (0) #endif -static void dm_integrity_prepare(struct request *rq) -{ -} - -static void dm_integrity_complete(struct request *rq, unsigned int nr_bytes) -{ -} - -/* - * DM Integrity profile, protection is performed layer above (dm-crypt) - */ -static const struct blk_integrity_profile dm_integrity_profile = { - .name = "DM-DIF-EXT-TAG", - .generate_fn = NULL, - .verify_fn = NULL, - .prepare_fn = dm_integrity_prepare, - .complete_fn = dm_integrity_complete, -}; - static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map); static void integrity_bio_wait(struct work_struct *w); static void dm_integrity_dtr(struct dm_target *ti); @@ -3656,7 +3637,6 @@ static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic) struct blk_integrity bi; memset(&bi, 0, sizeof(bi)); - bi.profile = &dm_integrity_profile; bi.tuple_size = ic->tag_size; bi.tag_size = bi.tuple_size; bi.interval_exp = ic->sb->log2_sectors_per_block + SECTOR_SHIFT; From e9f5f44ad3725335d9c559c3c22cd3726152a7b1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2024 10:48:15 +0200 Subject: [PATCH 0968/1578] block: remove the blk_integrity_profile structure Block layer integrity configuration is a bit complex right now, as it indirects through operation vectors for a simple two-dimensional configuration: a) the checksum type of none, ip checksum, crc, crc64 b) the presence or absence of a reference tag Remove the integrity profile, and instead add a separate csum_type flag which replaces the existing ip-checksum field and a new flag that indicates the presence of the reference tag. This removes up to two layers of indirect calls, remove the need to offload the no-op verification of non-PI metadata to a workqueue and generally simplifies the code. The downside is that block/t10-pi.c now has to be built into the kernel when CONFIG_BLK_DEV_INTEGRITY is supported. Given that both nvme and SCSI require t10-pi.ko, it is loaded for all usual configurations that enabled CONFIG_BLK_DEV_INTEGRITY already, though. Signed-off-by: Christoph Hellwig Reviewed-by: Kanchan Joshi Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240613084839.1044015-6-hch@lst.de Signed-off-by: Jens Axboe --- block/Kconfig | 8 +- block/Makefile | 3 +- block/bio-integrity.c | 32 ++-- block/blk-integrity.c | 66 ++++---- block/blk-mq.c | 13 +- block/blk.h | 8 + block/t10-pi.c | 241 ++++++++++------------------ drivers/md/dm-crypt.c | 2 +- drivers/nvme/host/Kconfig | 1 - drivers/nvme/host/core.c | 17 +- drivers/nvme/target/Kconfig | 1 - drivers/nvme/target/io-cmd-bdev.c | 16 +- drivers/scsi/Kconfig | 1 - drivers/scsi/sd_dif.c | 19 +-- drivers/target/target_core_iblock.c | 49 +++--- include/linux/blk-integrity.h | 35 ++-- include/linux/blkdev.h | 9 +- include/linux/t10-pi.h | 8 - 18 files changed, 215 insertions(+), 314 deletions(-) diff --git a/block/Kconfig b/block/Kconfig index dc12af58dbaeca..5b623b876d3b4a 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -62,6 +62,8 @@ config BLK_DEV_BSGLIB config BLK_DEV_INTEGRITY bool "Block layer data integrity support" + select CRC_T10DIF + select CRC64_ROCKSOFT help Some storage devices allow extra information to be stored/retrieved to help protect the data. The block layer @@ -72,12 +74,6 @@ config BLK_DEV_INTEGRITY T10/SCSI Data Integrity Field or the T13/ATA External Path Protection. If in doubt, say N. -config BLK_DEV_INTEGRITY_T10 - tristate - depends on BLK_DEV_INTEGRITY - select CRC_T10DIF - select CRC64_ROCKSOFT - config BLK_DEV_WRITE_MOUNTED bool "Allow writing to mounted block devices" default y diff --git a/block/Makefile b/block/Makefile index 168150b9c51025..ddfd21c1a9ffc9 100644 --- a/block/Makefile +++ b/block/Makefile @@ -26,8 +26,7 @@ obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o obj-$(CONFIG_IOSCHED_BFQ) += bfq.o -obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o -obj-$(CONFIG_BLK_DEV_INTEGRITY_T10) += t10-pi.o +obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o obj-$(CONFIG_BLK_MQ_VIRTIO) += blk-mq-virtio.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o diff --git a/block/bio-integrity.c b/block/bio-integrity.c index af7f71d16114de..31dbc2853f92e3 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -378,10 +378,9 @@ EXPORT_SYMBOL_GPL(bio_integrity_map_user); * bio_integrity_process - Process integrity metadata for a bio * @bio: bio to generate/verify integrity metadata for * @proc_iter: iterator to process - * @proc_fn: Pointer to the relevant processing function */ static blk_status_t bio_integrity_process(struct bio *bio, - struct bvec_iter *proc_iter, integrity_processing_fn *proc_fn) + struct bvec_iter *proc_iter) { struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct blk_integrity_iter iter; @@ -392,17 +391,18 @@ static blk_status_t bio_integrity_process(struct bio *bio, iter.disk_name = bio->bi_bdev->bd_disk->disk_name; iter.interval = 1 << bi->interval_exp; - iter.tuple_size = bi->tuple_size; iter.seed = proc_iter->bi_sector; iter.prot_buf = bvec_virt(bip->bip_vec); - iter.pi_offset = bi->pi_offset; __bio_for_each_segment(bv, bio, bviter, *proc_iter) { void *kaddr = bvec_kmap_local(&bv); iter.data_buf = kaddr; iter.data_size = bv.bv_len; - ret = proc_fn(&iter); + if (bio_data_dir(bio) == WRITE) + blk_integrity_generate(&iter, bi); + else + ret = blk_integrity_verify(&iter, bi); kunmap_local(kaddr); if (ret) @@ -448,12 +448,10 @@ bool bio_integrity_prep(struct bio *bio) return true; if (bio_data_dir(bio) == READ) { - if (!bi->profile->verify_fn || - !(bi->flags & BLK_INTEGRITY_VERIFY)) + if (!(bi->flags & BLK_INTEGRITY_VERIFY)) return true; } else { - if (!bi->profile->generate_fn || - !(bi->flags & BLK_INTEGRITY_GENERATE)) + if (!(bi->flags & BLK_INTEGRITY_GENERATE)) return true; /* @@ -488,7 +486,7 @@ bool bio_integrity_prep(struct bio *bio) bip->bip_flags |= BIP_BLOCK_INTEGRITY; bip_set_seed(bip, bio->bi_iter.bi_sector); - if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM) + if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) bip->bip_flags |= BIP_IP_CHECKSUM; /* Map it */ @@ -511,12 +509,10 @@ bool bio_integrity_prep(struct bio *bio) } /* Auto-generate integrity metadata if this is a write */ - if (bio_data_dir(bio) == WRITE) { - bio_integrity_process(bio, &bio->bi_iter, - bi->profile->generate_fn); - } else { + if (bio_data_dir(bio) == WRITE) + bio_integrity_process(bio, &bio->bi_iter); + else bip->bio_iter = bio->bi_iter; - } return true; err_end_io: @@ -539,15 +535,13 @@ static void bio_integrity_verify_fn(struct work_struct *work) struct bio_integrity_payload *bip = container_of(work, struct bio_integrity_payload, bip_work); struct bio *bio = bip->bip_bio; - struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); /* * At the moment verify is called bio's iterator was advanced * during split and completion, we need to rewind iterator to * it's original position. */ - bio->bi_status = bio_integrity_process(bio, &bip->bio_iter, - bi->profile->verify_fn); + bio->bi_status = bio_integrity_process(bio, &bip->bio_iter); bio_integrity_free(bio); bio_endio(bio); } @@ -569,7 +563,7 @@ bool __bio_integrity_endio(struct bio *bio) struct bio_integrity_payload *bip = bio_integrity(bio); if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && - (bip->bip_flags & BIP_BLOCK_INTEGRITY) && bi->profile->verify_fn) { + (bip->bip_flags & BIP_BLOCK_INTEGRITY) && bi->csum_type) { INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); queue_work(kintegrityd_wq, &bip->bip_work); return false; diff --git a/block/blk-integrity.c b/block/blk-integrity.c index ccbeb6dfa87a4d..17d37badfbb8bc 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -123,10 +123,10 @@ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2) struct blk_integrity *b1 = &gd1->queue->integrity; struct blk_integrity *b2 = &gd2->queue->integrity; - if (!b1->profile && !b2->profile) + if (!b1->tuple_size && !b2->tuple_size) return 0; - if (!b1->profile || !b2->profile) + if (!b1->tuple_size || !b2->tuple_size) return -1; if (b1->interval_exp != b2->interval_exp) { @@ -150,10 +150,13 @@ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2) return -1; } - if (b1->profile != b2->profile) { + if (b1->csum_type != b2->csum_type || + (b1->flags & BLK_INTEGRITY_REF_TAG) != + (b2->flags & BLK_INTEGRITY_REF_TAG)) { pr_err("%s: %s/%s type %s != %s\n", __func__, gd1->disk_name, gd2->disk_name, - b1->profile->name, b2->profile->name); + blk_integrity_profile_name(b1), + blk_integrity_profile_name(b2)); return -1; } @@ -217,14 +220,37 @@ static inline struct blk_integrity *dev_to_bi(struct device *dev) return &dev_to_disk(dev)->queue->integrity; } +const char *blk_integrity_profile_name(struct blk_integrity *bi) +{ + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_IP: + if (bi->flags & BLK_INTEGRITY_REF_TAG) + return "T10-DIF-TYPE1-IP"; + return "T10-DIF-TYPE3-IP"; + case BLK_INTEGRITY_CSUM_CRC: + if (bi->flags & BLK_INTEGRITY_REF_TAG) + return "T10-DIF-TYPE1-CRC"; + return "T10-DIF-TYPE3-CRC"; + case BLK_INTEGRITY_CSUM_CRC64: + if (bi->flags & BLK_INTEGRITY_REF_TAG) + return "EXT-DIF-TYPE1-CRC64"; + return "EXT-DIF-TYPE3-CRC64"; + case BLK_INTEGRITY_CSUM_NONE: + break; + } + + return "nop"; +} +EXPORT_SYMBOL_GPL(blk_integrity_profile_name); + static ssize_t format_show(struct device *dev, struct device_attribute *attr, char *page) { struct blk_integrity *bi = dev_to_bi(dev); - if (bi->profile && bi->profile->name) - return sysfs_emit(page, "%s\n", bi->profile->name); - return sysfs_emit(page, "none\n"); + if (!bi->tuple_size) + return sysfs_emit(page, "none\n"); + return sysfs_emit(page, "%s\n", blk_integrity_profile_name(bi)); } static ssize_t tag_size_show(struct device *dev, struct device_attribute *attr, @@ -326,28 +352,6 @@ const struct attribute_group blk_integrity_attr_group = { .attrs = integrity_attrs, }; -static blk_status_t blk_integrity_nop_fn(struct blk_integrity_iter *iter) -{ - return BLK_STS_OK; -} - -static void blk_integrity_nop_prepare(struct request *rq) -{ -} - -static void blk_integrity_nop_complete(struct request *rq, - unsigned int nr_bytes) -{ -} - -static const struct blk_integrity_profile nop_profile = { - .name = "nop", - .generate_fn = blk_integrity_nop_fn, - .verify_fn = blk_integrity_nop_fn, - .prepare_fn = blk_integrity_nop_prepare, - .complete_fn = blk_integrity_nop_complete, -}; - /** * blk_integrity_register - Register a gendisk as being integrity-capable * @disk: struct gendisk pointer to make integrity-aware @@ -363,11 +367,11 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template { struct blk_integrity *bi = &disk->queue->integrity; + bi->csum_type = template->csum_type; bi->flags = BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE | template->flags; bi->interval_exp = template->interval_exp ? : ilog2(queue_logical_block_size(disk->queue)); - bi->profile = template->profile ? template->profile : &nop_profile; bi->tuple_size = template->tuple_size; bi->tag_size = template->tag_size; bi->pi_offset = template->pi_offset; @@ -394,7 +398,7 @@ void blk_integrity_unregister(struct gendisk *disk) { struct blk_integrity *bi = &disk->queue->integrity; - if (!bi->profile) + if (!bi->tuple_size) return; /* ensure all bios are off the integrity workqueue */ diff --git a/block/blk-mq.c b/block/blk-mq.c index 3b4df8e5ac9e5f..0d4cd39c3d25da 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -804,10 +804,8 @@ static void blk_complete_request(struct request *req) if (!bio) return; -#ifdef CONFIG_BLK_DEV_INTEGRITY if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ) - req->q->integrity.profile->complete_fn(req, total_bytes); -#endif + blk_integrity_complete(req, total_bytes); /* * Upper layers may call blk_crypto_evict_key() anytime after the last @@ -875,11 +873,9 @@ bool blk_update_request(struct request *req, blk_status_t error, if (!req->bio) return false; -#ifdef CONFIG_BLK_DEV_INTEGRITY if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && error == BLK_STS_OK) - req->q->integrity.profile->complete_fn(req, nr_bytes); -#endif + blk_integrity_complete(req, nr_bytes); /* * Upper layers may call blk_crypto_evict_key() anytime after the last @@ -1264,10 +1260,9 @@ void blk_mq_start_request(struct request *rq) WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT); rq->mq_hctx->tags->rqs[rq->tag] = rq; -#ifdef CONFIG_BLK_DEV_INTEGRITY if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) - q->integrity.profile->prepare_fn(rq); -#endif + blk_integrity_prepare(rq); + if (rq->bio && rq->bio->bi_opf & REQ_POLLED) WRITE_ONCE(rq->bio->bi_cookie, rq->mq_hctx->queue_num); } diff --git a/block/blk.h b/block/blk.h index 189bc25beb502a..79e8d5d4fe0caf 100644 --- a/block/blk.h +++ b/block/blk.h @@ -9,6 +9,7 @@ #include #include "blk-crypto-internal.h" +struct blk_integrity_iter; struct elevator_type; /* Max future timer expiry for timeouts */ @@ -673,4 +674,11 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops, struct file *bdev_file); int bdev_permission(dev_t dev, blk_mode_t mode, void *holder); +void blk_integrity_generate(struct blk_integrity_iter *iter, + struct blk_integrity *bi); +blk_status_t blk_integrity_verify(struct blk_integrity_iter *iter, + struct blk_integrity *bi); +void blk_integrity_prepare(struct request *rq); +void blk_integrity_complete(struct request *rq, unsigned int nr_bytes); + #endif /* BLK_INTERNAL_H */ diff --git a/block/t10-pi.c b/block/t10-pi.c index f4cc91156da1f2..dadecf621497bb 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -11,17 +11,14 @@ #include #include #include +#include "blk.h" -typedef __be16 (csum_fn) (__be16, void *, unsigned int); - -static __be16 t10_pi_crc_fn(__be16 crc, void *data, unsigned int len) -{ - return cpu_to_be16(crc_t10dif_update(be16_to_cpu(crc), data, len)); -} - -static __be16 t10_pi_ip_fn(__be16 csum, void *data, unsigned int len) +static __be16 t10_pi_csum(__be16 csum, void *data, unsigned int len, + unsigned char csum_type) { - return (__force __be16)ip_compute_csum(data, len); + if (csum_type == BLK_INTEGRITY_CSUM_IP) + return (__force __be16)ip_compute_csum(data, len); + return cpu_to_be16(crc_t10dif_update(be16_to_cpu(csum), data, len)); } /* @@ -29,48 +26,44 @@ static __be16 t10_pi_ip_fn(__be16 csum, void *data, unsigned int len) * 16 bit app tag, 32 bit reference tag. Type 3 does not define the ref * tag. */ -static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, - csum_fn *fn, enum t10_dif_type type) +static void t10_pi_generate(struct blk_integrity_iter *iter, + struct blk_integrity *bi) { - u8 offset = iter->pi_offset; + u8 offset = bi->pi_offset; unsigned int i; for (i = 0 ; i < iter->data_size ; i += iter->interval) { struct t10_pi_tuple *pi = iter->prot_buf + offset; - pi->guard_tag = fn(0, iter->data_buf, iter->interval); + pi->guard_tag = t10_pi_csum(0, iter->data_buf, iter->interval, + bi->csum_type); if (offset) - pi->guard_tag = fn(pi->guard_tag, iter->prot_buf, - offset); + pi->guard_tag = t10_pi_csum(pi->guard_tag, + iter->prot_buf, offset, bi->csum_type); pi->app_tag = 0; - if (type == T10_PI_TYPE1_PROTECTION) + if (bi->flags & BLK_INTEGRITY_REF_TAG) pi->ref_tag = cpu_to_be32(lower_32_bits(iter->seed)); else pi->ref_tag = 0; iter->data_buf += iter->interval; - iter->prot_buf += iter->tuple_size; + iter->prot_buf += bi->tuple_size; iter->seed++; } - - return BLK_STS_OK; } static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, - csum_fn *fn, enum t10_dif_type type) + struct blk_integrity *bi) { - u8 offset = iter->pi_offset; + u8 offset = bi->pi_offset; unsigned int i; - BUG_ON(type == T10_PI_TYPE0_PROTECTION); - for (i = 0 ; i < iter->data_size ; i += iter->interval) { struct t10_pi_tuple *pi = iter->prot_buf + offset; __be16 csum; - if (type == T10_PI_TYPE1_PROTECTION || - type == T10_PI_TYPE2_PROTECTION) { + if (bi->flags & BLK_INTEGRITY_REF_TAG) { if (pi->app_tag == T10_PI_APP_ESCAPE) goto next; @@ -82,15 +75,17 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, iter->seed, be32_to_cpu(pi->ref_tag)); return BLK_STS_PROTECTION; } - } else if (type == T10_PI_TYPE3_PROTECTION) { + } else { if (pi->app_tag == T10_PI_APP_ESCAPE && pi->ref_tag == T10_PI_REF_ESCAPE) goto next; } - csum = fn(0, iter->data_buf, iter->interval); + csum = t10_pi_csum(0, iter->data_buf, iter->interval, + bi->csum_type); if (offset) - csum = fn(csum, iter->prot_buf, offset); + csum = t10_pi_csum(csum, iter->prot_buf, offset, + bi->csum_type); if (pi->guard_tag != csum) { pr_err("%s: guard tag error at sector %llu " \ @@ -102,33 +97,13 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, next: iter->data_buf += iter->interval; - iter->prot_buf += iter->tuple_size; + iter->prot_buf += bi->tuple_size; iter->seed++; } return BLK_STS_OK; } -static blk_status_t t10_pi_type1_generate_crc(struct blk_integrity_iter *iter) -{ - return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE1_PROTECTION); -} - -static blk_status_t t10_pi_type1_generate_ip(struct blk_integrity_iter *iter) -{ - return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE1_PROTECTION); -} - -static blk_status_t t10_pi_type1_verify_crc(struct blk_integrity_iter *iter) -{ - return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE1_PROTECTION); -} - -static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter) -{ - return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE1_PROTECTION); -} - /** * t10_pi_type1_prepare - prepare PI prior submitting request to device * @rq: request with PI that should be prepared @@ -225,81 +200,15 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) } } -static blk_status_t t10_pi_type3_generate_crc(struct blk_integrity_iter *iter) -{ - return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); -} - -static blk_status_t t10_pi_type3_generate_ip(struct blk_integrity_iter *iter) -{ - return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); -} - -static blk_status_t t10_pi_type3_verify_crc(struct blk_integrity_iter *iter) -{ - return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); -} - -static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) -{ - return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); -} - -/* Type 3 does not have a reference tag so no remapping is required. */ -static void t10_pi_type3_prepare(struct request *rq) -{ -} - -/* Type 3 does not have a reference tag so no remapping is required. */ -static void t10_pi_type3_complete(struct request *rq, unsigned int nr_bytes) -{ -} - -const struct blk_integrity_profile t10_pi_type1_crc = { - .name = "T10-DIF-TYPE1-CRC", - .generate_fn = t10_pi_type1_generate_crc, - .verify_fn = t10_pi_type1_verify_crc, - .prepare_fn = t10_pi_type1_prepare, - .complete_fn = t10_pi_type1_complete, -}; -EXPORT_SYMBOL(t10_pi_type1_crc); - -const struct blk_integrity_profile t10_pi_type1_ip = { - .name = "T10-DIF-TYPE1-IP", - .generate_fn = t10_pi_type1_generate_ip, - .verify_fn = t10_pi_type1_verify_ip, - .prepare_fn = t10_pi_type1_prepare, - .complete_fn = t10_pi_type1_complete, -}; -EXPORT_SYMBOL(t10_pi_type1_ip); - -const struct blk_integrity_profile t10_pi_type3_crc = { - .name = "T10-DIF-TYPE3-CRC", - .generate_fn = t10_pi_type3_generate_crc, - .verify_fn = t10_pi_type3_verify_crc, - .prepare_fn = t10_pi_type3_prepare, - .complete_fn = t10_pi_type3_complete, -}; -EXPORT_SYMBOL(t10_pi_type3_crc); - -const struct blk_integrity_profile t10_pi_type3_ip = { - .name = "T10-DIF-TYPE3-IP", - .generate_fn = t10_pi_type3_generate_ip, - .verify_fn = t10_pi_type3_verify_ip, - .prepare_fn = t10_pi_type3_prepare, - .complete_fn = t10_pi_type3_complete, -}; -EXPORT_SYMBOL(t10_pi_type3_ip); - static __be64 ext_pi_crc64(u64 crc, void *data, unsigned int len) { return cpu_to_be64(crc64_rocksoft_update(crc, data, len)); } -static blk_status_t ext_pi_crc64_generate(struct blk_integrity_iter *iter, - enum t10_dif_type type) +static void ext_pi_crc64_generate(struct blk_integrity_iter *iter, + struct blk_integrity *bi) { - u8 offset = iter->pi_offset; + u8 offset = bi->pi_offset; unsigned int i; for (i = 0 ; i < iter->data_size ; i += iter->interval) { @@ -311,17 +220,15 @@ static blk_status_t ext_pi_crc64_generate(struct blk_integrity_iter *iter, iter->prot_buf, offset); pi->app_tag = 0; - if (type == T10_PI_TYPE1_PROTECTION) + if (bi->flags & BLK_INTEGRITY_REF_TAG) put_unaligned_be48(iter->seed, pi->ref_tag); else put_unaligned_be48(0ULL, pi->ref_tag); iter->data_buf += iter->interval; - iter->prot_buf += iter->tuple_size; + iter->prot_buf += bi->tuple_size; iter->seed++; } - - return BLK_STS_OK; } static bool ext_pi_ref_escape(u8 *ref_tag) @@ -332,9 +239,9 @@ static bool ext_pi_ref_escape(u8 *ref_tag) } static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, - enum t10_dif_type type) + struct blk_integrity *bi) { - u8 offset = iter->pi_offset; + u8 offset = bi->pi_offset; unsigned int i; for (i = 0; i < iter->data_size; i += iter->interval) { @@ -342,7 +249,7 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, u64 ref, seed; __be64 csum; - if (type == T10_PI_TYPE1_PROTECTION) { + if (bi->flags & BLK_INTEGRITY_REF_TAG) { if (pi->app_tag == T10_PI_APP_ESCAPE) goto next; @@ -353,7 +260,7 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, iter->disk_name, seed, ref); return BLK_STS_PROTECTION; } - } else if (type == T10_PI_TYPE3_PROTECTION) { + } else { if (pi->app_tag == T10_PI_APP_ESCAPE && ext_pi_ref_escape(pi->ref_tag)) goto next; @@ -374,23 +281,13 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, next: iter->data_buf += iter->interval; - iter->prot_buf += iter->tuple_size; + iter->prot_buf += bi->tuple_size; iter->seed++; } return BLK_STS_OK; } -static blk_status_t ext_pi_type1_verify_crc64(struct blk_integrity_iter *iter) -{ - return ext_pi_crc64_verify(iter, T10_PI_TYPE1_PROTECTION); -} - -static blk_status_t ext_pi_type1_generate_crc64(struct blk_integrity_iter *iter) -{ - return ext_pi_crc64_generate(iter, T10_PI_TYPE1_PROTECTION); -} - static void ext_pi_type1_prepare(struct request *rq) { struct blk_integrity *bi = &rq->q->integrity; @@ -467,33 +364,61 @@ static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes) } } -static blk_status_t ext_pi_type3_verify_crc64(struct blk_integrity_iter *iter) +void blk_integrity_generate(struct blk_integrity_iter *iter, + struct blk_integrity *bi) { - return ext_pi_crc64_verify(iter, T10_PI_TYPE3_PROTECTION); + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + ext_pi_crc64_generate(iter, bi); + break; + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + t10_pi_generate(iter, bi); + break; + default: + break; + } } -static blk_status_t ext_pi_type3_generate_crc64(struct blk_integrity_iter *iter) +blk_status_t blk_integrity_verify(struct blk_integrity_iter *iter, + struct blk_integrity *bi) { - return ext_pi_crc64_generate(iter, T10_PI_TYPE3_PROTECTION); + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + return ext_pi_crc64_verify(iter, bi); + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + return t10_pi_verify(iter, bi); + default: + return BLK_STS_OK; + } } -const struct blk_integrity_profile ext_pi_type1_crc64 = { - .name = "EXT-DIF-TYPE1-CRC64", - .generate_fn = ext_pi_type1_generate_crc64, - .verify_fn = ext_pi_type1_verify_crc64, - .prepare_fn = ext_pi_type1_prepare, - .complete_fn = ext_pi_type1_complete, -}; -EXPORT_SYMBOL_GPL(ext_pi_type1_crc64); - -const struct blk_integrity_profile ext_pi_type3_crc64 = { - .name = "EXT-DIF-TYPE3-CRC64", - .generate_fn = ext_pi_type3_generate_crc64, - .verify_fn = ext_pi_type3_verify_crc64, - .prepare_fn = t10_pi_type3_prepare, - .complete_fn = t10_pi_type3_complete, -}; -EXPORT_SYMBOL_GPL(ext_pi_type3_crc64); +void blk_integrity_prepare(struct request *rq) +{ + struct blk_integrity *bi = &rq->q->integrity; + + if (!(bi->flags & BLK_INTEGRITY_REF_TAG)) + return; + + if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC64) + ext_pi_type1_prepare(rq); + else + t10_pi_type1_prepare(rq); +} + +void blk_integrity_complete(struct request *rq, unsigned int nr_bytes) +{ + struct blk_integrity *bi = &rq->q->integrity; + + if (!(bi->flags & BLK_INTEGRITY_REF_TAG)) + return; + + if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC64) + ext_pi_type1_complete(rq, nr_bytes); + else + t10_pi_type1_complete(rq, nr_bytes); +} MODULE_DESCRIPTION("T10 Protection Information module"); MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 1dfc462f29cd6f..6c013ceb0e5f1d 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1177,7 +1177,7 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti) struct mapped_device *md = dm_table_get_md(ti->table); /* We require an underlying device with non-PI metadata */ - if (!bi || strcmp(bi->profile->name, "nop")) { + if (!bi || bi->csum_type != BLK_INTEGRITY_CSUM_NONE) { ti->error = "Integrity profile not supported."; return -EINVAL; } diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index b309c8be720f47..a3caef75aa0a83 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only config NVME_CORE tristate - select BLK_DEV_INTEGRITY_T10 if BLK_DEV_INTEGRITY config BLK_DEV_NVME tristate "NVM Express block device" diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f5d150c62955d8..14bac248cde4ca 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1744,17 +1744,16 @@ static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head) case NVME_NS_DPS_PI_TYPE3: switch (head->guard_type) { case NVME_NVM_NS_16B_GUARD: - integrity.profile = &t10_pi_type3_crc; + integrity.csum_type = BLK_INTEGRITY_CSUM_CRC; integrity.tag_size = sizeof(u16) + sizeof(u32); integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; break; case NVME_NVM_NS_64B_GUARD: - integrity.profile = &ext_pi_type3_crc64; + integrity.csum_type = BLK_INTEGRITY_CSUM_CRC64; integrity.tag_size = sizeof(u16) + 6; integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; break; default: - integrity.profile = NULL; break; } break; @@ -1762,22 +1761,22 @@ static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head) case NVME_NS_DPS_PI_TYPE2: switch (head->guard_type) { case NVME_NVM_NS_16B_GUARD: - integrity.profile = &t10_pi_type1_crc; + integrity.csum_type = BLK_INTEGRITY_CSUM_CRC; integrity.tag_size = sizeof(u16); - integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; + integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE | + BLK_INTEGRITY_REF_TAG; break; case NVME_NVM_NS_64B_GUARD: - integrity.profile = &ext_pi_type1_crc64; + integrity.csum_type = BLK_INTEGRITY_CSUM_CRC64; integrity.tag_size = sizeof(u16); - integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; + integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE | + BLK_INTEGRITY_REF_TAG; break; default: - integrity.profile = NULL; break; } break; default: - integrity.profile = NULL; break; } diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index 872dd1a0acd804..c42aec41cc7b1f 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig @@ -6,7 +6,6 @@ config NVME_TARGET depends on CONFIGFS_FS select NVME_KEYRING if NVME_TARGET_TCP_TLS select KEYS if NVME_TARGET_TCP_TLS - select BLK_DEV_INTEGRITY_T10 if BLK_DEV_INTEGRITY select SGL_ALLOC help This enabled target side support for the NVMe protocol, that is diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 6426aac2634aeb..b628bc5ee99847 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -61,15 +61,17 @@ static void nvmet_bdev_ns_enable_integrity(struct nvmet_ns *ns) { struct blk_integrity *bi = bdev_get_integrity(ns->bdev); - if (bi) { + if (!bi) + return; + + if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC) { ns->metadata_size = bi->tuple_size; - if (bi->profile == &t10_pi_type1_crc) + if (bi->flags & BLK_INTEGRITY_REF_TAG) ns->pi_type = NVME_NS_DPS_PI_TYPE1; - else if (bi->profile == &t10_pi_type3_crc) - ns->pi_type = NVME_NS_DPS_PI_TYPE3; else - /* Unsupported metadata type */ - ns->metadata_size = 0; + ns->pi_type = NVME_NS_DPS_PI_TYPE3; + } else { + ns->metadata_size = 0; } } @@ -102,7 +104,7 @@ int nvmet_bdev_ns_enable(struct nvmet_ns *ns) ns->pi_type = 0; ns->metadata_size = 0; - if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY_T10)) + if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) nvmet_bdev_ns_enable_integrity(ns); if (bdev_is_zoned(ns->bdev)) { diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index 065db86d602164..37c24ffea65cc0 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -82,7 +82,6 @@ comment "SCSI support type (disk, tape, CD-ROM)" config BLK_DEV_SD tristate "SCSI disk support" depends on SCSI - select BLK_DEV_INTEGRITY_T10 if BLK_DEV_INTEGRITY help If you want to use SCSI hard disks, Fibre Channel disks, Serial ATA (SATA) or Parallel ATA (PATA) hard disks, diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c index 1df847b5f74764..6f0921c7db787b 100644 --- a/drivers/scsi/sd_dif.c +++ b/drivers/scsi/sd_dif.c @@ -47,18 +47,13 @@ void sd_dif_config_host(struct scsi_disk *sdkp) memset(&bi, 0, sizeof(bi)); /* Enable DMA of protection information */ - if (scsi_host_get_guard(sdkp->device->host) & SHOST_DIX_GUARD_IP) { - if (type == T10_PI_TYPE3_PROTECTION) - bi.profile = &t10_pi_type3_ip; - else - bi.profile = &t10_pi_type1_ip; + if (scsi_host_get_guard(sdkp->device->host) & SHOST_DIX_GUARD_IP) + bi.csum_type = BLK_INTEGRITY_CSUM_IP; + else + bi.csum_type = BLK_INTEGRITY_CSUM_CRC; - bi.flags |= BLK_INTEGRITY_IP_CHECKSUM; - } else - if (type == T10_PI_TYPE3_PROTECTION) - bi.profile = &t10_pi_type3_crc; - else - bi.profile = &t10_pi_type1_crc; + if (type != T10_PI_TYPE3_PROTECTION) + bi.flags |= BLK_INTEGRITY_REF_TAG; bi.tuple_size = sizeof(struct t10_pi_tuple); @@ -76,7 +71,7 @@ void sd_dif_config_host(struct scsi_disk *sdkp) sd_first_printk(KERN_NOTICE, sdkp, "Enabling DIX %s, application tag size %u bytes\n", - bi.profile->name, bi.tag_size); + blk_integrity_profile_name(&bi), bi.tag_size); out: blk_integrity_register(disk, &bi); } diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 7f6ca81778453b..a3e09adc4e767c 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -148,35 +148,38 @@ static int iblock_configure_device(struct se_device *dev) dev->dev_attrib.is_nonrot = 1; bi = bdev_get_integrity(bd); - if (bi) { - struct bio_set *bs = &ib_dev->ibd_bio_set; - - if (!strcmp(bi->profile->name, "T10-DIF-TYPE3-IP") || - !strcmp(bi->profile->name, "T10-DIF-TYPE1-IP")) { - pr_err("IBLOCK export of blk_integrity: %s not" - " supported\n", bi->profile->name); - ret = -ENOSYS; - goto out_blkdev_put; - } + if (!bi) + return 0; - if (!strcmp(bi->profile->name, "T10-DIF-TYPE3-CRC")) { - dev->dev_attrib.pi_prot_type = TARGET_DIF_TYPE3_PROT; - } else if (!strcmp(bi->profile->name, "T10-DIF-TYPE1-CRC")) { + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_IP: + pr_err("IBLOCK export of blk_integrity: %s not supported\n", + blk_integrity_profile_name(bi)); + ret = -ENOSYS; + goto out_blkdev_put; + case BLK_INTEGRITY_CSUM_CRC: + if (bi->flags & BLK_INTEGRITY_REF_TAG) dev->dev_attrib.pi_prot_type = TARGET_DIF_TYPE1_PROT; - } + else + dev->dev_attrib.pi_prot_type = TARGET_DIF_TYPE3_PROT; + break; + default: + break; + } - if (dev->dev_attrib.pi_prot_type) { - if (bioset_integrity_create(bs, IBLOCK_BIO_POOL_SIZE) < 0) { - pr_err("Unable to allocate bioset for PI\n"); - ret = -ENOMEM; - goto out_blkdev_put; - } - pr_debug("IBLOCK setup BIP bs->bio_integrity_pool: %p\n", - &bs->bio_integrity_pool); + if (dev->dev_attrib.pi_prot_type) { + struct bio_set *bs = &ib_dev->ibd_bio_set; + + if (bioset_integrity_create(bs, IBLOCK_BIO_POOL_SIZE) < 0) { + pr_err("Unable to allocate bioset for PI\n"); + ret = -ENOMEM; + goto out_blkdev_put; } - dev->dev_attrib.hw_pi_prot_type = dev->dev_attrib.pi_prot_type; + pr_debug("IBLOCK setup BIP bs->bio_integrity_pool: %p\n", + &bs->bio_integrity_pool); } + dev->dev_attrib.hw_pi_prot_type = dev->dev_attrib.pi_prot_type; return 0; out_blkdev_put: diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index 7428cb43952da0..56ce1ae355805d 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -10,7 +10,7 @@ enum blk_integrity_flags { BLK_INTEGRITY_VERIFY = 1 << 0, BLK_INTEGRITY_GENERATE = 1 << 1, BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2, - BLK_INTEGRITY_IP_CHECKSUM = 1 << 3, + BLK_INTEGRITY_REF_TAG = 1 << 3, }; struct blk_integrity_iter { @@ -19,22 +19,10 @@ struct blk_integrity_iter { sector_t seed; unsigned int data_size; unsigned short interval; - unsigned char tuple_size; - unsigned char pi_offset; const char *disk_name; }; -typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *); -typedef void (integrity_prepare_fn) (struct request *); -typedef void (integrity_complete_fn) (struct request *, unsigned int); - -struct blk_integrity_profile { - integrity_processing_fn *generate_fn; - integrity_processing_fn *verify_fn; - integrity_prepare_fn *prepare_fn; - integrity_complete_fn *complete_fn; - const char *name; -}; +const char *blk_integrity_profile_name(struct blk_integrity *bi); #ifdef CONFIG_BLK_DEV_INTEGRITY void blk_integrity_register(struct gendisk *, struct blk_integrity *); @@ -44,14 +32,17 @@ int blk_rq_map_integrity_sg(struct request_queue *, struct bio *, struct scatterlist *); int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); -static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) +static inline bool +blk_integrity_queue_supports_integrity(struct request_queue *q) { - struct blk_integrity *bi = &disk->queue->integrity; + return q->integrity.tuple_size; +} - if (!bi->profile) +static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) +{ + if (!blk_integrity_queue_supports_integrity(disk->queue)) return NULL; - - return bi; + return &disk->queue->integrity; } static inline struct blk_integrity * @@ -60,12 +51,6 @@ bdev_get_integrity(struct block_device *bdev) return blk_get_integrity(bdev->bd_disk); } -static inline bool -blk_integrity_queue_supports_integrity(struct request_queue *q) -{ - return q->integrity.profile; -} - static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ac8e0cb2353a0e..bdd33388e1ced8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -105,9 +105,16 @@ enum { struct disk_events; struct badblocks; +enum blk_integrity_checksum { + BLK_INTEGRITY_CSUM_NONE = 0, + BLK_INTEGRITY_CSUM_IP = 1, + BLK_INTEGRITY_CSUM_CRC = 2, + BLK_INTEGRITY_CSUM_CRC64 = 3, +} __packed ; + struct blk_integrity { - const struct blk_integrity_profile *profile; unsigned char flags; + enum blk_integrity_checksum csum_type; unsigned char tuple_size; unsigned char pi_offset; unsigned char interval_exp; diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index 248f4ac9564258..d2bafb76badfb9 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -48,11 +48,6 @@ static inline u32 t10_pi_ref_tag(struct request *rq) return blk_rq_pos(rq) >> (shift - SECTOR_SHIFT) & 0xffffffff; } -extern const struct blk_integrity_profile t10_pi_type1_crc; -extern const struct blk_integrity_profile t10_pi_type1_ip; -extern const struct blk_integrity_profile t10_pi_type3_crc; -extern const struct blk_integrity_profile t10_pi_type3_ip; - struct crc64_pi_tuple { __be64 guard_tag; __be16 app_tag; @@ -79,7 +74,4 @@ static inline u64 ext_pi_ref_tag(struct request *rq) return lower_48_bits(blk_rq_pos(rq) >> (shift - SECTOR_SHIFT)); } -extern const struct blk_integrity_profile ext_pi_type1_crc64; -extern const struct blk_integrity_profile ext_pi_type3_crc64; - #endif From e8bc14d116aeac8f0f133ec8d249acf4e0658da7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2024 10:48:16 +0200 Subject: [PATCH 0969/1578] block: remove the blk_flush_integrity call in blk_integrity_unregister Now that there are no indirect calls for PI processing there is no way to dereference a NULL pointer here. Additionally drivers now always freeze the queue (or in case of stacking drivers use their internal equivalent) around changing the integrity profile. This is effectively a revert of commit 3df49967f6f1 ("block: flush the integrity workqueue in blk_integrity_unregister"). Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240613084839.1044015-7-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-integrity.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 17d37badfbb8bc..24f04575096d39 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -401,8 +401,6 @@ void blk_integrity_unregister(struct gendisk *disk) if (!bi->tuple_size) return; - /* ensure all bios are off the integrity workqueue */ - blk_flush_integrity(); blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, disk->queue); memset(bi, 0, sizeof(*bi)); } From 1366251a794b149a132ef8423c8946b6e565a923 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2024 10:48:17 +0200 Subject: [PATCH 0970/1578] block: factor out flag_{store,show} helper for integrity Factor the duplicate code for the generate and verify attributes into common helpers. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240613084839.1044015-8-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-integrity.c | 53 +++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 24f04575096d39..24671d9f90a124 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -243,6 +243,28 @@ const char *blk_integrity_profile_name(struct blk_integrity *bi) } EXPORT_SYMBOL_GPL(blk_integrity_profile_name); +static ssize_t flag_store(struct device *dev, struct device_attribute *attr, + const char *page, size_t count, unsigned char flag) +{ + struct blk_integrity *bi = dev_to_bi(dev); + char *p = (char *) page; + unsigned long val = simple_strtoul(p, &p, 10); + + if (val) + bi->flags |= flag; + else + bi->flags &= ~flag; + return count; +} + +static ssize_t flag_show(struct device *dev, struct device_attribute *attr, + char *page, unsigned char flag) +{ + struct blk_integrity *bi = dev_to_bi(dev); + + return sysfs_emit(page, "%d\n", !!(bi->flags & flag)); +} + static ssize_t format_show(struct device *dev, struct device_attribute *attr, char *page) { @@ -275,49 +297,26 @@ static ssize_t read_verify_store(struct device *dev, struct device_attribute *attr, const char *page, size_t count) { - struct blk_integrity *bi = dev_to_bi(dev); - char *p = (char *) page; - unsigned long val = simple_strtoul(p, &p, 10); - - if (val) - bi->flags |= BLK_INTEGRITY_VERIFY; - else - bi->flags &= ~BLK_INTEGRITY_VERIFY; - - return count; + return flag_store(dev, attr, page, count, BLK_INTEGRITY_VERIFY); } static ssize_t read_verify_show(struct device *dev, struct device_attribute *attr, char *page) { - struct blk_integrity *bi = dev_to_bi(dev); - - return sysfs_emit(page, "%d\n", !!(bi->flags & BLK_INTEGRITY_VERIFY)); + return flag_show(dev, attr, page, BLK_INTEGRITY_VERIFY); } static ssize_t write_generate_store(struct device *dev, struct device_attribute *attr, const char *page, size_t count) { - struct blk_integrity *bi = dev_to_bi(dev); - - char *p = (char *) page; - unsigned long val = simple_strtoul(p, &p, 10); - - if (val) - bi->flags |= BLK_INTEGRITY_GENERATE; - else - bi->flags &= ~BLK_INTEGRITY_GENERATE; - - return count; + return flag_store(dev, attr, page, count, BLK_INTEGRITY_GENERATE); } static ssize_t write_generate_show(struct device *dev, struct device_attribute *attr, char *page) { - struct blk_integrity *bi = dev_to_bi(dev); - - return sysfs_emit(page, "%d\n", !!(bi->flags & BLK_INTEGRITY_GENERATE)); + return flag_show(dev, attr, page, BLK_INTEGRITY_GENERATE); } static ssize_t device_is_integrity_capable_show(struct device *dev, From 1d59857ed2ec4d506e346859713c4325b5053da3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2024 10:48:18 +0200 Subject: [PATCH 0971/1578] block: use kstrtoul in flag_store Use the text to integer helper that has error handling and doesn't modify the input pointer. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Kanchan Joshi Link: https://lore.kernel.org/r/20240613084839.1044015-9-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-integrity.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 24671d9f90a124..58760a6d6b2209 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -247,8 +247,12 @@ static ssize_t flag_store(struct device *dev, struct device_attribute *attr, const char *page, size_t count, unsigned char flag) { struct blk_integrity *bi = dev_to_bi(dev); - char *p = (char *) page; - unsigned long val = simple_strtoul(p, &p, 10); + unsigned long val; + int err; + + err = kstrtoul(page, 10, &val); + if (err) + return err; if (val) bi->flags |= flag; From 43c5dbe98a3953e07f4fbf89aa137b9207d52378 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2024 10:48:19 +0200 Subject: [PATCH 0972/1578] block: don't require stable pages for non-PI metadata Non-PI metadata doesn't contain checksums and thus doesn't require stable pages. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240613084839.1044015-10-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-integrity.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 58760a6d6b2209..1d2d371cd632d3 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -379,7 +379,8 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template bi->tag_size = template->tag_size; bi->pi_offset = template->pi_offset; - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); + if (bi->csum_type != BLK_INTEGRITY_CSUM_NONE) + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); #ifdef CONFIG_BLK_INLINE_ENCRYPTION if (disk->queue->crypto_profile) { @@ -404,7 +405,8 @@ void blk_integrity_unregister(struct gendisk *disk) if (!bi->tuple_size) return; - blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, disk->queue); + if (bi->csum_type != BLK_INTEGRITY_CSUM_NONE) + blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, disk->queue); memset(bi, 0, sizeof(*bi)); } EXPORT_SYMBOL(blk_integrity_unregister); From 3c3e85ddffae93eba1a257eb6939bf5dc1e93b9e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2024 10:48:20 +0200 Subject: [PATCH 0973/1578] block: bypass the STABLE_WRITES flag for protection information Currently registering a checksum-enabled (aka PI) integrity profile sets the QUEUE_FLAG_STABLE_WRITE flag, and unregistering it clears the flag. This can incorrectly clear the flag when the driver requires stable writes even without PI, e.g. in case of iSCSI or NVMe/TCP with data digest enabled. Fix this by looking at the csum_type directly in bdev_stable_writes and not setting the queue flag. Also remove the blk_queue_stable_writes helper as the only user in nvme wants to only look at the actual QUEUE_FLAG_STABLE_WRITE flag as it inherits the integrity configuration by other means. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240613084839.1044015-11-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-integrity.c | 6 ------ drivers/nvme/host/multipath.c | 3 ++- include/linux/blkdev.h | 12 ++++++++---- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 1d2d371cd632d3..bec0d1df387ce9 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -379,9 +379,6 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template bi->tag_size = template->tag_size; bi->pi_offset = template->pi_offset; - if (bi->csum_type != BLK_INTEGRITY_CSUM_NONE) - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); - #ifdef CONFIG_BLK_INLINE_ENCRYPTION if (disk->queue->crypto_profile) { pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n"); @@ -404,9 +401,6 @@ void blk_integrity_unregister(struct gendisk *disk) if (!bi->tuple_size) return; - - if (bi->csum_type != BLK_INTEGRITY_CSUM_NONE) - blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, disk->queue); memset(bi, 0, sizeof(*bi)); } EXPORT_SYMBOL(blk_integrity_unregister); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index d8b6b4648eaff9..12c59db02539e5 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -875,7 +875,8 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid) nvme_mpath_set_live(ns); } - if (blk_queue_stable_writes(ns->queue) && ns->head->disk) + if (test_bit(QUEUE_FLAG_STABLE_WRITES, &ns->queue->queue_flags) && + ns->head->disk) blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->head->disk->queue); #ifdef CONFIG_BLK_DEV_ZONED diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bdd33388e1ced8..f9089750919c6b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -571,8 +571,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_noxmerges(q) \ test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) #define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) -#define blk_queue_stable_writes(q) \ - test_bit(QUEUE_FLAG_STABLE_WRITES, &(q)->queue_flags) #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) #define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) #define blk_queue_zone_resetall(q) \ @@ -1300,8 +1298,14 @@ static inline bool bdev_synchronous(struct block_device *bdev) static inline bool bdev_stable_writes(struct block_device *bdev) { - return test_bit(QUEUE_FLAG_STABLE_WRITES, - &bdev_get_queue(bdev)->queue_flags); + struct request_queue *q = bdev_get_queue(bdev); + +#ifdef CONFIG_BLK_DEV_INTEGRITY + /* BLK_INTEGRITY_CSUM_NONE is not available in blkdev.h */ + if (q->integrity.csum_type != 0) + return true; +#endif + return test_bit(QUEUE_FLAG_STABLE_WRITES, &q->queue_flags); } static inline bool bdev_write_cache(struct block_device *bdev) From 9f4aa46f2a7401025d8561495cf8740f773310fc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2024 10:48:21 +0200 Subject: [PATCH 0974/1578] block: invert the BLK_INTEGRITY_{GENERATE,VERIFY} flags Invert the flags so that user set values will be able to persist revalidating the integrity information once we switch the integrity information to queue_limits. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240613084839.1044015-12-hch@lst.de Signed-off-by: Jens Axboe --- block/bio-integrity.c | 4 ++-- block/blk-integrity.c | 18 +++++++++--------- include/linux/blk-integrity.h | 4 ++-- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 31dbc2853f92e3..173ffd4d623788 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -448,10 +448,10 @@ bool bio_integrity_prep(struct bio *bio) return true; if (bio_data_dir(bio) == READ) { - if (!(bi->flags & BLK_INTEGRITY_VERIFY)) + if (bi->flags & BLK_INTEGRITY_NOVERIFY) return true; } else { - if (!(bi->flags & BLK_INTEGRITY_GENERATE)) + if (bi->flags & BLK_INTEGRITY_NOGENERATE) return true; /* diff --git a/block/blk-integrity.c b/block/blk-integrity.c index bec0d1df387ce9..b37b8855eed147 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -254,10 +254,11 @@ static ssize_t flag_store(struct device *dev, struct device_attribute *attr, if (err) return err; + /* the flags are inverted vs the values in the sysfs files */ if (val) - bi->flags |= flag; - else bi->flags &= ~flag; + else + bi->flags |= flag; return count; } @@ -266,7 +267,7 @@ static ssize_t flag_show(struct device *dev, struct device_attribute *attr, { struct blk_integrity *bi = dev_to_bi(dev); - return sysfs_emit(page, "%d\n", !!(bi->flags & flag)); + return sysfs_emit(page, "%d\n", !(bi->flags & flag)); } static ssize_t format_show(struct device *dev, struct device_attribute *attr, @@ -301,26 +302,26 @@ static ssize_t read_verify_store(struct device *dev, struct device_attribute *attr, const char *page, size_t count) { - return flag_store(dev, attr, page, count, BLK_INTEGRITY_VERIFY); + return flag_store(dev, attr, page, count, BLK_INTEGRITY_NOVERIFY); } static ssize_t read_verify_show(struct device *dev, struct device_attribute *attr, char *page) { - return flag_show(dev, attr, page, BLK_INTEGRITY_VERIFY); + return flag_show(dev, attr, page, BLK_INTEGRITY_NOVERIFY); } static ssize_t write_generate_store(struct device *dev, struct device_attribute *attr, const char *page, size_t count) { - return flag_store(dev, attr, page, count, BLK_INTEGRITY_GENERATE); + return flag_store(dev, attr, page, count, BLK_INTEGRITY_NOGENERATE); } static ssize_t write_generate_show(struct device *dev, struct device_attribute *attr, char *page) { - return flag_show(dev, attr, page, BLK_INTEGRITY_GENERATE); + return flag_show(dev, attr, page, BLK_INTEGRITY_NOGENERATE); } static ssize_t device_is_integrity_capable_show(struct device *dev, @@ -371,8 +372,7 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template struct blk_integrity *bi = &disk->queue->integrity; bi->csum_type = template->csum_type; - bi->flags = BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE | - template->flags; + bi->flags = template->flags; bi->interval_exp = template->interval_exp ? : ilog2(queue_logical_block_size(disk->queue)); bi->tuple_size = template->tuple_size; diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index 56ce1ae355805d..bafa01d4e7f95b 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -7,8 +7,8 @@ struct request; enum blk_integrity_flags { - BLK_INTEGRITY_VERIFY = 1 << 0, - BLK_INTEGRITY_GENERATE = 1 << 1, + BLK_INTEGRITY_NOVERIFY = 1 << 0, + BLK_INTEGRITY_NOGENERATE = 1 << 1, BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2, BLK_INTEGRITY_REF_TAG = 1 << 3, }; From c6e56cf6b2e79a463af21286ba951714ed20828c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2024 10:48:22 +0200 Subject: [PATCH 0975/1578] block: move integrity information into queue_limits Move the integrity information into the queue limits so that it can be set atomically with other queue limits, and that the sysfs changes to the read_verify and write_generate flags are properly synchronized. This also allows to provide a more useful helper to stack the integrity fields, although it still is separate from the main stacking function as not all stackable devices want to inherit the integrity settings. Even with that it greatly simplifies the code in md and dm. Note that the integrity field is moved as-is into the queue limits. While there are good arguments for removing the separate blk_integrity structure, this would cause a lot of churn and might better be done at a later time if desired. However the integrity field in the queue_limits structure is now unconditional so that various ifdefs can be avoided or replaced with IS_ENABLED(). Given that tiny size of it that seems like a worthwhile trade off. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240613084839.1044015-13-hch@lst.de Signed-off-by: Jens Axboe --- Documentation/block/data-integrity.rst | 49 +------- block/blk-integrity.c | 124 ++----------------- block/blk-settings.c | 118 +++++++++++++++++- block/t10-pi.c | 12 +- drivers/md/dm-core.h | 1 - drivers/md/dm-integrity.c | 27 ++--- drivers/md/dm-table.c | 161 +++++-------------------- drivers/md/md.c | 72 +++-------- drivers/md/md.h | 5 +- drivers/md/raid0.c | 7 +- drivers/md/raid1.c | 10 +- drivers/md/raid10.c | 10 +- drivers/md/raid5.c | 2 +- drivers/nvdimm/btt.c | 13 +- drivers/nvme/host/core.c | 70 +++++------ drivers/scsi/sd.c | 8 +- drivers/scsi/sd.h | 12 +- drivers/scsi/sd_dif.c | 34 +++--- include/linux/blk-integrity.h | 27 ++--- include/linux/blkdev.h | 12 +- include/linux/t10-pi.h | 12 +- 21 files changed, 289 insertions(+), 497 deletions(-) diff --git a/Documentation/block/data-integrity.rst b/Documentation/block/data-integrity.rst index 6a760c0eb1924e..99905e880a0e56 100644 --- a/Documentation/block/data-integrity.rst +++ b/Documentation/block/data-integrity.rst @@ -153,18 +153,11 @@ bio_free() will automatically free the bip. 4.2 Block Device ---------------- -Because the format of the protection data is tied to the physical -disk, each block device has been extended with a block integrity -profile (struct blk_integrity). This optional profile is registered -with the block layer using blk_integrity_register(). - -The profile contains callback functions for generating and verifying -the protection data, as well as getting and setting application tags. -The profile also contains a few constants to aid in completing, -merging and splitting the integrity metadata. +Block devices can set up the integrity information in the integrity +sub-struture of the queue_limits structure. Layered block devices will need to pick a profile that's appropriate -for all subdevices. blk_integrity_compare() can help with that. DM +for all subdevices. queue_limits_stack_integrity() can help with that. DM and MD linear, RAID0 and RAID1 are currently supported. RAID4/5/6 will require extra work due to the application tag. @@ -250,42 +243,6 @@ will require extra work due to the application tag. integrity upon completion. -5.4 Registering A Block Device As Capable Of Exchanging Integrity Metadata --------------------------------------------------------------------------- - - To enable integrity exchange on a block device the gendisk must be - registered as capable: - - `int blk_integrity_register(gendisk, blk_integrity);` - - The blk_integrity struct is a template and should contain the - following:: - - static struct blk_integrity my_profile = { - .name = "STANDARDSBODY-TYPE-VARIANT-CSUM", - .generate_fn = my_generate_fn, - .verify_fn = my_verify_fn, - .tuple_size = sizeof(struct my_tuple_size), - .tag_size = , - }; - - 'name' is a text string which will be visible in sysfs. This is - part of the userland API so chose it carefully and never change - it. The format is standards body-type-variant. - E.g. T10-DIF-TYPE1-IP or T13-EPP-0-CRC. - - 'generate_fn' generates appropriate integrity metadata (for WRITE). - - 'verify_fn' verifies that the data buffer matches the integrity - metadata. - - 'tuple_size' must be set to match the size of the integrity - metadata per sector. I.e. 8 for DIF and EPP. - - 'tag_size' must be set to identify how many bytes of tag space - are available per hardware sector. For DIF this is either 2 or - 0 depending on the value of the Control Mode Page ATO bit. - ---------------------------------------------------------------------- 2007-12-24 Martin K. Petersen diff --git a/block/blk-integrity.c b/block/blk-integrity.c index b37b8855eed147..05a48689a424b2 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -107,63 +107,6 @@ int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio, } EXPORT_SYMBOL(blk_rq_map_integrity_sg); -/** - * blk_integrity_compare - Compare integrity profile of two disks - * @gd1: Disk to compare - * @gd2: Disk to compare - * - * Description: Meta-devices like DM and MD need to verify that all - * sub-devices use the same integrity format before advertising to - * upper layers that they can send/receive integrity metadata. This - * function can be used to check whether two gendisk devices have - * compatible integrity formats. - */ -int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2) -{ - struct blk_integrity *b1 = &gd1->queue->integrity; - struct blk_integrity *b2 = &gd2->queue->integrity; - - if (!b1->tuple_size && !b2->tuple_size) - return 0; - - if (!b1->tuple_size || !b2->tuple_size) - return -1; - - if (b1->interval_exp != b2->interval_exp) { - pr_err("%s: %s/%s protection interval %u != %u\n", - __func__, gd1->disk_name, gd2->disk_name, - 1 << b1->interval_exp, 1 << b2->interval_exp); - return -1; - } - - if (b1->tuple_size != b2->tuple_size) { - pr_err("%s: %s/%s tuple sz %u != %u\n", __func__, - gd1->disk_name, gd2->disk_name, - b1->tuple_size, b2->tuple_size); - return -1; - } - - if (b1->tag_size && b2->tag_size && (b1->tag_size != b2->tag_size)) { - pr_err("%s: %s/%s tag sz %u != %u\n", __func__, - gd1->disk_name, gd2->disk_name, - b1->tag_size, b2->tag_size); - return -1; - } - - if (b1->csum_type != b2->csum_type || - (b1->flags & BLK_INTEGRITY_REF_TAG) != - (b2->flags & BLK_INTEGRITY_REF_TAG)) { - pr_err("%s: %s/%s type %s != %s\n", __func__, - gd1->disk_name, gd2->disk_name, - blk_integrity_profile_name(b1), - blk_integrity_profile_name(b2)); - return -1; - } - - return 0; -} -EXPORT_SYMBOL(blk_integrity_compare); - bool blk_integrity_merge_rq(struct request_queue *q, struct request *req, struct request *next) { @@ -217,7 +160,7 @@ bool blk_integrity_merge_bio(struct request_queue *q, struct request *req, static inline struct blk_integrity *dev_to_bi(struct device *dev) { - return &dev_to_disk(dev)->queue->integrity; + return &dev_to_disk(dev)->queue->limits.integrity; } const char *blk_integrity_profile_name(struct blk_integrity *bi) @@ -246,7 +189,8 @@ EXPORT_SYMBOL_GPL(blk_integrity_profile_name); static ssize_t flag_store(struct device *dev, struct device_attribute *attr, const char *page, size_t count, unsigned char flag) { - struct blk_integrity *bi = dev_to_bi(dev); + struct request_queue *q = dev_to_disk(dev)->queue; + struct queue_limits lim; unsigned long val; int err; @@ -254,11 +198,18 @@ static ssize_t flag_store(struct device *dev, struct device_attribute *attr, if (err) return err; - /* the flags are inverted vs the values in the sysfs files */ + /* note that the flags are inverted vs the values in the sysfs files */ + lim = queue_limits_start_update(q); if (val) - bi->flags &= ~flag; + lim.integrity.flags &= ~flag; else - bi->flags |= flag; + lim.integrity.flags |= flag; + + blk_mq_freeze_queue(q); + err = queue_limits_commit_update(q, &lim); + blk_mq_unfreeze_queue(q); + if (err) + return err; return count; } @@ -355,52 +306,3 @@ const struct attribute_group blk_integrity_attr_group = { .name = "integrity", .attrs = integrity_attrs, }; - -/** - * blk_integrity_register - Register a gendisk as being integrity-capable - * @disk: struct gendisk pointer to make integrity-aware - * @template: block integrity profile to register - * - * Description: When a device needs to advertise itself as being able to - * send/receive integrity metadata it must use this function to register - * the capability with the block layer. The template is a blk_integrity - * struct with values appropriate for the underlying hardware. See - * Documentation/block/data-integrity.rst. - */ -void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) -{ - struct blk_integrity *bi = &disk->queue->integrity; - - bi->csum_type = template->csum_type; - bi->flags = template->flags; - bi->interval_exp = template->interval_exp ? : - ilog2(queue_logical_block_size(disk->queue)); - bi->tuple_size = template->tuple_size; - bi->tag_size = template->tag_size; - bi->pi_offset = template->pi_offset; - -#ifdef CONFIG_BLK_INLINE_ENCRYPTION - if (disk->queue->crypto_profile) { - pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n"); - disk->queue->crypto_profile = NULL; - } -#endif -} -EXPORT_SYMBOL(blk_integrity_register); - -/** - * blk_integrity_unregister - Unregister block integrity profile - * @disk: disk whose integrity profile to unregister - * - * Description: This function unregisters the integrity capability from - * a block device. - */ -void blk_integrity_unregister(struct gendisk *disk) -{ - struct blk_integrity *bi = &disk->queue->integrity; - - if (!bi->tuple_size) - return; - memset(bi, 0, sizeof(*bi)); -} -EXPORT_SYMBOL(blk_integrity_unregister); diff --git a/block/blk-settings.c b/block/blk-settings.c index 996f247fc98e80..f11c8676eb4c67 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include #include @@ -97,6 +97,36 @@ static int blk_validate_zoned_limits(struct queue_limits *lim) return 0; } +static int blk_validate_integrity_limits(struct queue_limits *lim) +{ + struct blk_integrity *bi = &lim->integrity; + + if (!bi->tuple_size) { + if (bi->csum_type != BLK_INTEGRITY_CSUM_NONE || + bi->tag_size || ((bi->flags & BLK_INTEGRITY_REF_TAG))) { + pr_warn("invalid PI settings.\n"); + return -EINVAL; + } + return 0; + } + + if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) { + pr_warn("integrity support disabled.\n"); + return -EINVAL; + } + + if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE && + (bi->flags & BLK_INTEGRITY_REF_TAG)) { + pr_warn("ref tag not support without checksum.\n"); + return -EINVAL; + } + + if (!bi->interval_exp) + bi->interval_exp = ilog2(lim->logical_block_size); + + return 0; +} + /* * Check that the limits in lim are valid, initialize defaults for unset * values, and cap values based on others where needed. @@ -105,6 +135,7 @@ static int blk_validate_limits(struct queue_limits *lim) { unsigned int max_hw_sectors; unsigned int logical_block_sectors; + int err; /* * Unless otherwise specified, default to 512 byte logical blocks and a @@ -230,6 +261,9 @@ static int blk_validate_limits(struct queue_limits *lim) lim->misaligned = 0; } + err = blk_validate_integrity_limits(lim); + if (err) + return err; return blk_validate_zoned_limits(lim); } @@ -263,13 +297,24 @@ int queue_limits_commit_update(struct request_queue *q, struct queue_limits *lim) __releases(q->limits_lock) { - int error = blk_validate_limits(lim); + int error; - if (!error) { - q->limits = *lim; - if (q->disk) - blk_apply_bdi_limits(q->disk->bdi, lim); + error = blk_validate_limits(lim); + if (error) + goto out_unlock; + +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + if (q->crypto_profile && lim->integrity.tag_size) { + pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together.\n"); + error = -EINVAL; + goto out_unlock; } +#endif + + q->limits = *lim; + if (q->disk) + blk_apply_bdi_limits(q->disk->bdi, lim); +out_unlock: mutex_unlock(&q->limits_lock); return error; } @@ -575,6 +620,67 @@ void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, } EXPORT_SYMBOL_GPL(queue_limits_stack_bdev); +/** + * queue_limits_stack_integrity - stack integrity profile + * @t: target queue limits + * @b: base queue limits + * + * Check if the integrity profile in the @b can be stacked into the + * target @t. Stacking is possible if either: + * + * a) does not have any integrity information stacked into it yet + * b) the integrity profile in @b is identical to the one in @t + * + * If @b can be stacked into @t, return %true. Else return %false and clear the + * integrity information in @t. + */ +bool queue_limits_stack_integrity(struct queue_limits *t, + struct queue_limits *b) +{ + struct blk_integrity *ti = &t->integrity; + struct blk_integrity *bi = &b->integrity; + + if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) + return true; + + if (!ti->tuple_size) { + /* inherit the settings from the first underlying device */ + if (!(ti->flags & BLK_INTEGRITY_STACKED)) { + ti->flags = BLK_INTEGRITY_DEVICE_CAPABLE | + (bi->flags & BLK_INTEGRITY_REF_TAG); + ti->csum_type = bi->csum_type; + ti->tuple_size = bi->tuple_size; + ti->pi_offset = bi->pi_offset; + ti->interval_exp = bi->interval_exp; + ti->tag_size = bi->tag_size; + goto done; + } + if (!bi->tuple_size) + goto done; + } + + if (ti->tuple_size != bi->tuple_size) + goto incompatible; + if (ti->interval_exp != bi->interval_exp) + goto incompatible; + if (ti->tag_size != bi->tag_size) + goto incompatible; + if (ti->csum_type != bi->csum_type) + goto incompatible; + if ((ti->flags & BLK_INTEGRITY_REF_TAG) != + (bi->flags & BLK_INTEGRITY_REF_TAG)) + goto incompatible; + +done: + ti->flags |= BLK_INTEGRITY_STACKED; + return true; + +incompatible: + memset(ti, 0, sizeof(*ti)); + return false; +} +EXPORT_SYMBOL_GPL(queue_limits_stack_integrity); + /** * blk_queue_update_dma_pad - update pad mask * @q: the request queue for the device diff --git a/block/t10-pi.c b/block/t10-pi.c index dadecf621497bb..cd7fa60d63ff21 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -116,7 +116,7 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, */ static void t10_pi_type1_prepare(struct request *rq) { - struct blk_integrity *bi = &rq->q->integrity; + struct blk_integrity *bi = &rq->q->limits.integrity; const int tuple_sz = bi->tuple_size; u32 ref_tag = t10_pi_ref_tag(rq); u8 offset = bi->pi_offset; @@ -167,7 +167,7 @@ static void t10_pi_type1_prepare(struct request *rq) */ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) { - struct blk_integrity *bi = &rq->q->integrity; + struct blk_integrity *bi = &rq->q->limits.integrity; unsigned intervals = nr_bytes >> bi->interval_exp; const int tuple_sz = bi->tuple_size; u32 ref_tag = t10_pi_ref_tag(rq); @@ -290,7 +290,7 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, static void ext_pi_type1_prepare(struct request *rq) { - struct blk_integrity *bi = &rq->q->integrity; + struct blk_integrity *bi = &rq->q->limits.integrity; const int tuple_sz = bi->tuple_size; u64 ref_tag = ext_pi_ref_tag(rq); u8 offset = bi->pi_offset; @@ -330,7 +330,7 @@ static void ext_pi_type1_prepare(struct request *rq) static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes) { - struct blk_integrity *bi = &rq->q->integrity; + struct blk_integrity *bi = &rq->q->limits.integrity; unsigned intervals = nr_bytes >> bi->interval_exp; const int tuple_sz = bi->tuple_size; u64 ref_tag = ext_pi_ref_tag(rq); @@ -396,7 +396,7 @@ blk_status_t blk_integrity_verify(struct blk_integrity_iter *iter, void blk_integrity_prepare(struct request *rq) { - struct blk_integrity *bi = &rq->q->integrity; + struct blk_integrity *bi = &rq->q->limits.integrity; if (!(bi->flags & BLK_INTEGRITY_REF_TAG)) return; @@ -409,7 +409,7 @@ void blk_integrity_prepare(struct request *rq) void blk_integrity_complete(struct request *rq, unsigned int nr_bytes) { - struct blk_integrity *bi = &rq->q->integrity; + struct blk_integrity *bi = &rq->q->limits.integrity; if (!(bi->flags & BLK_INTEGRITY_REF_TAG)) return; diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 08700bfc3e2343..14a44c0f82868b 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -206,7 +206,6 @@ struct dm_table { bool integrity_supported:1; bool singleton:1; - unsigned integrity_added:1; /* * Indicates the rw permissions for the new logical device. This diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index c1cc27541673c7..2a89f8eb4713c9 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -3475,6 +3475,17 @@ static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *lim limits->dma_alignment = limits->logical_block_size - 1; limits->discard_granularity = ic->sectors_per_block << SECTOR_SHIFT; } + + if (!ic->internal_hash) { + struct blk_integrity *bi = &limits->integrity; + + memset(bi, 0, sizeof(*bi)); + bi->tuple_size = ic->tag_size; + bi->tag_size = bi->tuple_size; + bi->interval_exp = + ic->sb->log2_sectors_per_block + SECTOR_SHIFT; + } + limits->max_integrity_segments = USHRT_MAX; } @@ -3631,19 +3642,6 @@ static int initialize_superblock(struct dm_integrity_c *ic, return 0; } -static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic) -{ - struct gendisk *disk = dm_disk(dm_table_get_md(ti->table)); - struct blk_integrity bi; - - memset(&bi, 0, sizeof(bi)); - bi.tuple_size = ic->tag_size; - bi.tag_size = bi.tuple_size; - bi.interval_exp = ic->sb->log2_sectors_per_block + SECTOR_SHIFT; - - blk_integrity_register(disk, &bi); -} - static void dm_integrity_free_page_list(struct page_list *pl) { unsigned int i; @@ -4629,9 +4627,6 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv } } - if (!ic->internal_hash) - dm_integrity_set(ti, ic); - ti->num_flush_bios = 1; ti->flush_supported = true; if (ic->discard) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index b2d5246cff2102..fd789eeb62d943 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -425,6 +425,13 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, q->limits.logical_block_size, q->limits.alignment_offset, (unsigned long long) start << SECTOR_SHIFT); + + /* + * Only stack the integrity profile if the target doesn't have native + * integrity support. + */ + if (!dm_target_has_integrity(ti->type)) + queue_limits_stack_integrity_bdev(limits, bdev); return 0; } @@ -702,9 +709,6 @@ int dm_table_add_target(struct dm_table *t, const char *type, t->immutable_target_type = ti->type; } - if (dm_target_has_integrity(ti->type)) - t->integrity_added = 1; - ti->table = t; ti->begin = start; ti->len = len; @@ -1119,99 +1123,6 @@ static int dm_table_build_index(struct dm_table *t) return r; } -static bool integrity_profile_exists(struct gendisk *disk) -{ - return !!blk_get_integrity(disk); -} - -/* - * Get a disk whose integrity profile reflects the table's profile. - * Returns NULL if integrity support was inconsistent or unavailable. - */ -static struct gendisk *dm_table_get_integrity_disk(struct dm_table *t) -{ - struct list_head *devices = dm_table_get_devices(t); - struct dm_dev_internal *dd = NULL; - struct gendisk *prev_disk = NULL, *template_disk = NULL; - - for (unsigned int i = 0; i < t->num_targets; i++) { - struct dm_target *ti = dm_table_get_target(t, i); - - if (!dm_target_passes_integrity(ti->type)) - goto no_integrity; - } - - list_for_each_entry(dd, devices, list) { - template_disk = dd->dm_dev->bdev->bd_disk; - if (!integrity_profile_exists(template_disk)) - goto no_integrity; - else if (prev_disk && - blk_integrity_compare(prev_disk, template_disk) < 0) - goto no_integrity; - prev_disk = template_disk; - } - - return template_disk; - -no_integrity: - if (prev_disk) - DMWARN("%s: integrity not set: %s and %s profile mismatch", - dm_device_name(t->md), - prev_disk->disk_name, - template_disk->disk_name); - return NULL; -} - -/* - * Register the mapped device for blk_integrity support if the - * underlying devices have an integrity profile. But all devices may - * not have matching profiles (checking all devices isn't reliable - * during table load because this table may use other DM device(s) which - * must be resumed before they will have an initialized integity - * profile). Consequently, stacked DM devices force a 2 stage integrity - * profile validation: First pass during table load, final pass during - * resume. - */ -static int dm_table_register_integrity(struct dm_table *t) -{ - struct mapped_device *md = t->md; - struct gendisk *template_disk = NULL; - - /* If target handles integrity itself do not register it here. */ - if (t->integrity_added) - return 0; - - template_disk = dm_table_get_integrity_disk(t); - if (!template_disk) - return 0; - - if (!integrity_profile_exists(dm_disk(md))) { - t->integrity_supported = true; - /* - * Register integrity profile during table load; we can do - * this because the final profile must match during resume. - */ - blk_integrity_register(dm_disk(md), - blk_get_integrity(template_disk)); - return 0; - } - - /* - * If DM device already has an initialized integrity - * profile the new profile should not conflict. - */ - if (blk_integrity_compare(dm_disk(md), template_disk) < 0) { - DMERR("%s: conflict with existing integrity profile: %s profile mismatch", - dm_device_name(t->md), - template_disk->disk_name); - return 1; - } - - /* Preserve existing integrity profile */ - t->integrity_supported = true; - return 0; -} - #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct dm_crypto_profile { @@ -1423,12 +1334,6 @@ int dm_table_complete(struct dm_table *t) return r; } - r = dm_table_register_integrity(t); - if (r) { - DMERR("could not register integrity profile."); - return r; - } - r = dm_table_construct_crypto_profile(t); if (r) { DMERR("could not construct crypto profile."); @@ -1688,6 +1593,14 @@ int dm_calculate_queue_limits(struct dm_table *t, blk_set_stacking_limits(limits); + t->integrity_supported = true; + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (!dm_target_passes_integrity(ti->type)) + t->integrity_supported = false; + } + for (unsigned int i = 0; i < t->num_targets; i++) { struct dm_target *ti = dm_table_get_target(t, i); @@ -1738,6 +1651,18 @@ int dm_calculate_queue_limits(struct dm_table *t, dm_device_name(t->md), (unsigned long long) ti->begin, (unsigned long long) ti->len); + + if (t->integrity_supported || + dm_target_has_integrity(ti->type)) { + if (!queue_limits_stack_integrity(limits, &ti_limits)) { + DMWARN("%s: adding target device (start sect %llu len %llu) " + "disabled integrity support due to incompatibility", + dm_device_name(t->md), + (unsigned long long) ti->begin, + (unsigned long long) ti->len); + t->integrity_supported = false; + } + } } /* @@ -1761,36 +1686,6 @@ int dm_calculate_queue_limits(struct dm_table *t, return validate_hardware_logical_block_alignment(t, limits); } -/* - * Verify that all devices have an integrity profile that matches the - * DM device's registered integrity profile. If the profiles don't - * match then unregister the DM device's integrity profile. - */ -static void dm_table_verify_integrity(struct dm_table *t) -{ - struct gendisk *template_disk = NULL; - - if (t->integrity_added) - return; - - if (t->integrity_supported) { - /* - * Verify that the original integrity profile - * matches all the devices in this table. - */ - template_disk = dm_table_get_integrity_disk(t); - if (template_disk && - blk_integrity_compare(dm_disk(t->md), template_disk) >= 0) - return; - } - - if (integrity_profile_exists(dm_disk(t->md))) { - DMWARN("%s: unable to establish an integrity profile", - dm_device_name(t->md)); - blk_integrity_unregister(dm_disk(t->md)); - } -} - static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { @@ -2004,8 +1899,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, else blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - dm_table_verify_integrity(t); - /* * Some devices don't use blk_integrity but still want stable pages * because they do their own checksumming. diff --git a/drivers/md/md.c b/drivers/md/md.c index aff9118ff69750..67ece2cd725f50 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2410,36 +2410,10 @@ static LIST_HEAD(pending_raid_disks); */ int md_integrity_register(struct mddev *mddev) { - struct md_rdev *rdev, *reference = NULL; - if (list_empty(&mddev->disks)) return 0; /* nothing to do */ - if (mddev_is_dm(mddev) || blk_get_integrity(mddev->gendisk)) - return 0; /* shouldn't register, or already is */ - rdev_for_each(rdev, mddev) { - /* skip spares and non-functional disks */ - if (test_bit(Faulty, &rdev->flags)) - continue; - if (rdev->raid_disk < 0) - continue; - if (!reference) { - /* Use the first rdev as the reference */ - reference = rdev; - continue; - } - /* does this rdev's profile match the reference profile? */ - if (blk_integrity_compare(reference->bdev->bd_disk, - rdev->bdev->bd_disk) < 0) - return -EINVAL; - } - if (!reference || !bdev_get_integrity(reference->bdev)) - return 0; - /* - * All component devices are integrity capable and have matching - * profiles, register the common profile for the md device. - */ - blk_integrity_register(mddev->gendisk, - bdev_get_integrity(reference->bdev)); + if (mddev_is_dm(mddev) || !blk_get_integrity(mddev->gendisk)) + return 0; /* shouldn't register */ pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || @@ -2459,32 +2433,6 @@ int md_integrity_register(struct mddev *mddev) } EXPORT_SYMBOL(md_integrity_register); -/* - * Attempt to add an rdev, but only if it is consistent with the current - * integrity profile - */ -int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) -{ - struct blk_integrity *bi_mddev; - - if (mddev_is_dm(mddev)) - return 0; - - bi_mddev = blk_get_integrity(mddev->gendisk); - - if (!bi_mddev) /* nothing to do */ - return 0; - - if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { - pr_err("%s: incompatible integrity profile for %pg\n", - mdname(mddev), rdev->bdev); - return -ENXIO; - } - - return 0; -} -EXPORT_SYMBOL(md_integrity_add_rdev); - static bool rdev_read_only(struct md_rdev *rdev) { return bdev_read_only(rdev->bdev) || @@ -5755,14 +5703,20 @@ static const struct kobj_type md_ktype = { int mdp_major = 0; /* stack the limit for all rdevs into lim */ -void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim) +int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim, + unsigned int flags) { struct md_rdev *rdev; rdev_for_each(rdev, mddev) { queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset, mddev->gendisk->disk_name); + if ((flags & MDDEV_STACK_INTEGRITY) && + !queue_limits_stack_integrity_bdev(lim, rdev->bdev)) + return -EINVAL; } + + return 0; } EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits); @@ -5777,6 +5731,14 @@ int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev) lim = queue_limits_start_update(mddev->gendisk->queue); queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset, mddev->gendisk->disk_name); + + if (!queue_limits_stack_integrity_bdev(&lim, rdev->bdev)) { + pr_err("%s: incompatible integrity profile for %pg\n", + mdname(mddev), rdev->bdev); + queue_limits_cancel_update(mddev->gendisk->queue); + return -ENXIO; + } + return queue_limits_commit_update(mddev->gendisk->queue, &lim); } EXPORT_SYMBOL_GPL(mddev_stack_new_rdev); diff --git a/drivers/md/md.h b/drivers/md/md.h index ca085ecad50449..6733b0b0abf999 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -809,7 +809,6 @@ extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors); extern int md_check_no_bitmap(struct mddev *mddev); extern int md_integrity_register(struct mddev *mddev); -extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); extern int mddev_init(struct mddev *mddev); @@ -908,7 +907,9 @@ void md_autostart_arrays(int part); int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info); int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info); int do_md_run(struct mddev *mddev); -void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim); +#define MDDEV_STACK_INTEGRITY (1u << 0) +int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim, + unsigned int flags); int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev); void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 81c01347cd24e6..62634e2a33bd0f 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -377,13 +377,18 @@ static void raid0_free(struct mddev *mddev, void *priv) static int raid0_set_limits(struct mddev *mddev) { struct queue_limits lim; + int err; blk_set_stacking_limits(&lim); lim.max_hw_sectors = mddev->chunk_sectors; lim.max_write_zeroes_sectors = mddev->chunk_sectors; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * mddev->raid_disks; - mddev_stack_rdev_limits(mddev, &lim); + err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); + if (err) { + queue_limits_cancel_update(mddev->gendisk->queue); + return err; + } return queue_limits_set(mddev->gendisk->queue, &lim); } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 1f321826ef02ba..779cad62f6f8c0 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1907,9 +1907,6 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (mddev->recovery_disabled == conf->recovery_disabled) return -EBUSY; - if (md_integrity_add_rdev(rdev, mddev)) - return -ENXIO; - if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; @@ -3197,10 +3194,15 @@ static struct r1conf *setup_conf(struct mddev *mddev) static int raid1_set_limits(struct mddev *mddev) { struct queue_limits lim; + int err; blk_set_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; - mddev_stack_rdev_limits(mddev, &lim); + err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); + if (err) { + queue_limits_cancel_update(mddev->gendisk->queue); + return err; + } return queue_limits_set(mddev->gendisk->queue, &lim); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index a4556d2e46bf95..5f6885b53b691a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2083,9 +2083,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1)) return -EINVAL; - if (md_integrity_add_rdev(rdev, mddev)) - return -ENXIO; - if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; @@ -3980,12 +3977,17 @@ static int raid10_set_queue_limits(struct mddev *mddev) { struct r10conf *conf = mddev->private; struct queue_limits lim; + int err; blk_set_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * raid10_nr_stripes(conf); - mddev_stack_rdev_limits(mddev, &lim); + err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); + if (err) { + queue_limits_cancel_update(mddev->gendisk->queue); + return err; + } return queue_limits_set(mddev->gendisk->queue, &lim); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2bd1ce9b39226a..675c68fa6c6403 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7708,7 +7708,7 @@ static int raid5_set_limits(struct mddev *mddev) lim.raid_partial_stripes_expensive = 1; lim.discard_granularity = stripe; lim.max_write_zeroes_sectors = 0; - mddev_stack_rdev_limits(mddev, &lim); + mddev_stack_rdev_limits(mddev, &lim, 0); rdev_for_each(rdev, mddev) queue_limits_stack_bdev(&lim, rdev->bdev, rdev->new_data_offset, mddev->gendisk->disk_name); diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 1e5aedaf8c7bd9..c5f8451b494d6c 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1504,6 +1504,11 @@ static int btt_blk_init(struct btt *btt) }; int rc; + if (btt_meta_size(btt) && IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) { + lim.integrity.tuple_size = btt_meta_size(btt); + lim.integrity.tag_size = btt_meta_size(btt); + } + btt->btt_disk = blk_alloc_disk(&lim, NUMA_NO_NODE); if (IS_ERR(btt->btt_disk)) return PTR_ERR(btt->btt_disk); @@ -1516,14 +1521,6 @@ static int btt_blk_init(struct btt *btt) blk_queue_flag_set(QUEUE_FLAG_NONROT, btt->btt_disk->queue); blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, btt->btt_disk->queue); - if (btt_meta_size(btt) && IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) { - struct blk_integrity bi = { - .tuple_size = btt_meta_size(btt), - .tag_size = btt_meta_size(btt), - }; - blk_integrity_register(btt->btt_disk, &bi); - } - set_capacity(btt->btt_disk, btt->nlba * btt->sector_size >> 9); rc = device_add_disk(&btt->nd_btt->dev, btt->btt_disk, NULL); if (rc) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 14bac248cde4ca..5a673fa5cb2612 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1723,11 +1723,12 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) return 0; } -static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head) +static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head, + struct queue_limits *lim) { - struct blk_integrity integrity = { }; + struct blk_integrity *bi = &lim->integrity; - blk_integrity_unregister(disk); + memset(bi, 0, sizeof(*bi)); if (!head->ms) return true; @@ -1744,14 +1745,14 @@ static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head) case NVME_NS_DPS_PI_TYPE3: switch (head->guard_type) { case NVME_NVM_NS_16B_GUARD: - integrity.csum_type = BLK_INTEGRITY_CSUM_CRC; - integrity.tag_size = sizeof(u16) + sizeof(u32); - integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; + bi->csum_type = BLK_INTEGRITY_CSUM_CRC; + bi->tag_size = sizeof(u16) + sizeof(u32); + bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE; break; case NVME_NVM_NS_64B_GUARD: - integrity.csum_type = BLK_INTEGRITY_CSUM_CRC64; - integrity.tag_size = sizeof(u16) + 6; - integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; + bi->csum_type = BLK_INTEGRITY_CSUM_CRC64; + bi->tag_size = sizeof(u16) + 6; + bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE; break; default: break; @@ -1761,16 +1762,16 @@ static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head) case NVME_NS_DPS_PI_TYPE2: switch (head->guard_type) { case NVME_NVM_NS_16B_GUARD: - integrity.csum_type = BLK_INTEGRITY_CSUM_CRC; - integrity.tag_size = sizeof(u16); - integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE | - BLK_INTEGRITY_REF_TAG; + bi->csum_type = BLK_INTEGRITY_CSUM_CRC; + bi->tag_size = sizeof(u16); + bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE | + BLK_INTEGRITY_REF_TAG; break; case NVME_NVM_NS_64B_GUARD: - integrity.csum_type = BLK_INTEGRITY_CSUM_CRC64; - integrity.tag_size = sizeof(u16); - integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE | - BLK_INTEGRITY_REF_TAG; + bi->csum_type = BLK_INTEGRITY_CSUM_CRC64; + bi->tag_size = sizeof(u16); + bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE | + BLK_INTEGRITY_REF_TAG; break; default: break; @@ -1780,9 +1781,8 @@ static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head) break; } - integrity.tuple_size = head->ms; - integrity.pi_offset = head->pi_offset; - blk_integrity_register(disk, &integrity); + bi->tuple_size = head->ms; + bi->pi_offset = head->pi_offset; return true; } @@ -2105,11 +2105,6 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && ns->head->ids.csi == NVME_CSI_ZNS) nvme_update_zone_info(ns, &lim, &zi); - ret = queue_limits_commit_update(ns->disk->queue, &lim); - if (ret) { - blk_mq_unfreeze_queue(ns->disk->queue); - goto out; - } /* * Register a metadata profile for PI, or the plain non-integrity NVMe @@ -2117,9 +2112,15 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, * I/O to namespaces with metadata except when the namespace supports * PI, as it can strip/insert in that case. */ - if (!nvme_init_integrity(ns->disk, ns->head)) + if (!nvme_init_integrity(ns->disk, ns->head, &lim)) capacity = 0; + ret = queue_limits_commit_update(ns->disk->queue, &lim); + if (ret) { + blk_mq_unfreeze_queue(ns->disk->queue); + goto out; + } + set_capacity_and_notify(ns->disk, capacity); /* @@ -2191,14 +2192,6 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info) struct queue_limits lim; blk_mq_freeze_queue(ns->head->disk->queue); - if (unsupported) - ns->head->disk->flags |= GENHD_FL_HIDDEN; - else - nvme_init_integrity(ns->head->disk, ns->head); - set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk)); - set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info)); - nvme_mpath_revalidate_paths(ns); - /* * queue_limits mixes values that are the hardware limitations * for bio splitting with what is the device configuration. @@ -2221,7 +2214,16 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info) lim.io_opt = ns_lim->io_opt; queue_limits_stack_bdev(&lim, ns->disk->part0, 0, ns->head->disk->disk_name); + if (unsupported) + ns->head->disk->flags |= GENHD_FL_HIDDEN; + else + nvme_init_integrity(ns->head->disk, ns->head, &lim); ret = queue_limits_commit_update(ns->head->disk->queue, &lim); + + set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk)); + set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info)); + nvme_mpath_revalidate_paths(ns); + blk_mq_unfreeze_queue(ns->head->disk->queue); } diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index d957e29b17a98a..e01393ed42076b 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -2482,11 +2482,13 @@ static int sd_read_protection_type(struct scsi_disk *sdkp, unsigned char *buffer return 0; } -static void sd_config_protection(struct scsi_disk *sdkp) +static void sd_config_protection(struct scsi_disk *sdkp, + struct queue_limits *lim) { struct scsi_device *sdp = sdkp->device; - sd_dif_config_host(sdkp); + if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) + sd_dif_config_host(sdkp, lim); if (!sdkp->protection_type) return; @@ -3677,7 +3679,7 @@ static int sd_revalidate_disk(struct gendisk *disk) sd_read_app_tag_own(sdkp, buffer); sd_read_write_same(sdkp, buffer); sd_read_security(sdkp, buffer); - sd_config_protection(sdkp); + sd_config_protection(sdkp, &lim); } /* diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index b4170b17bad47a..726f1613f6cb56 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -220,17 +220,7 @@ static inline sector_t sectors_to_logical(struct scsi_device *sdev, sector_t sec return sector >> (ilog2(sdev->sector_size) - 9); } -#ifdef CONFIG_BLK_DEV_INTEGRITY - -extern void sd_dif_config_host(struct scsi_disk *); - -#else /* CONFIG_BLK_DEV_INTEGRITY */ - -static inline void sd_dif_config_host(struct scsi_disk *disk) -{ -} - -#endif /* CONFIG_BLK_DEV_INTEGRITY */ +void sd_dif_config_host(struct scsi_disk *sdkp, struct queue_limits *lim); static inline int sd_is_zoned(struct scsi_disk *sdkp) { diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c index 6f0921c7db787b..ae6ce6f5d622d9 100644 --- a/drivers/scsi/sd_dif.c +++ b/drivers/scsi/sd_dif.c @@ -24,14 +24,15 @@ /* * Configure exchange of protection information between OS and HBA. */ -void sd_dif_config_host(struct scsi_disk *sdkp) +void sd_dif_config_host(struct scsi_disk *sdkp, struct queue_limits *lim) { struct scsi_device *sdp = sdkp->device; - struct gendisk *disk = sdkp->disk; u8 type = sdkp->protection_type; - struct blk_integrity bi; + struct blk_integrity *bi = &lim->integrity; int dif, dix; + memset(bi, 0, sizeof(*bi)); + dif = scsi_host_dif_capable(sdp->host, type); dix = scsi_host_dix_capable(sdp->host, type); @@ -39,40 +40,33 @@ void sd_dif_config_host(struct scsi_disk *sdkp) dif = 0; dix = 1; } - if (!dix) { - blk_integrity_unregister(disk); + if (!dix) return; - } - - memset(&bi, 0, sizeof(bi)); /* Enable DMA of protection information */ if (scsi_host_get_guard(sdkp->device->host) & SHOST_DIX_GUARD_IP) - bi.csum_type = BLK_INTEGRITY_CSUM_IP; + bi->csum_type = BLK_INTEGRITY_CSUM_IP; else - bi.csum_type = BLK_INTEGRITY_CSUM_CRC; + bi->csum_type = BLK_INTEGRITY_CSUM_CRC; if (type != T10_PI_TYPE3_PROTECTION) - bi.flags |= BLK_INTEGRITY_REF_TAG; + bi->flags |= BLK_INTEGRITY_REF_TAG; - bi.tuple_size = sizeof(struct t10_pi_tuple); + bi->tuple_size = sizeof(struct t10_pi_tuple); if (dif && type) { - bi.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; + bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE; if (!sdkp->ATO) - goto out; + return; if (type == T10_PI_TYPE3_PROTECTION) - bi.tag_size = sizeof(u16) + sizeof(u32); + bi->tag_size = sizeof(u16) + sizeof(u32); else - bi.tag_size = sizeof(u16); + bi->tag_size = sizeof(u16); } sd_first_printk(KERN_NOTICE, sdkp, "Enabling DIX %s, application tag size %u bytes\n", - blk_integrity_profile_name(&bi), bi.tag_size); -out: - blk_integrity_register(disk, &bi); + blk_integrity_profile_name(bi), bi->tag_size); } - diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index bafa01d4e7f95b..d201140d77a336 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -11,6 +11,7 @@ enum blk_integrity_flags { BLK_INTEGRITY_NOGENERATE = 1 << 1, BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2, BLK_INTEGRITY_REF_TAG = 1 << 3, + BLK_INTEGRITY_STACKED = 1 << 4, }; struct blk_integrity_iter { @@ -23,11 +24,15 @@ struct blk_integrity_iter { }; const char *blk_integrity_profile_name(struct blk_integrity *bi); +bool queue_limits_stack_integrity(struct queue_limits *t, + struct queue_limits *b); +static inline bool queue_limits_stack_integrity_bdev(struct queue_limits *t, + struct block_device *bdev) +{ + return queue_limits_stack_integrity(t, &bdev->bd_disk->queue->limits); +} #ifdef CONFIG_BLK_DEV_INTEGRITY -void blk_integrity_register(struct gendisk *, struct blk_integrity *); -void blk_integrity_unregister(struct gendisk *); -int blk_integrity_compare(struct gendisk *, struct gendisk *); int blk_rq_map_integrity_sg(struct request_queue *, struct bio *, struct scatterlist *); int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); @@ -35,14 +40,14 @@ int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); static inline bool blk_integrity_queue_supports_integrity(struct request_queue *q) { - return q->integrity.tuple_size; + return q->limits.integrity.tuple_size; } static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) { if (!blk_integrity_queue_supports_integrity(disk->queue)) return NULL; - return &disk->queue->integrity; + return &disk->queue->limits.integrity; } static inline struct blk_integrity * @@ -119,17 +124,6 @@ blk_integrity_queue_supports_integrity(struct request_queue *q) { return false; } -static inline int blk_integrity_compare(struct gendisk *a, struct gendisk *b) -{ - return 0; -} -static inline void blk_integrity_register(struct gendisk *d, - struct blk_integrity *b) -{ -} -static inline void blk_integrity_unregister(struct gendisk *d) -{ -} static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) { @@ -157,4 +151,5 @@ static inline struct bio_vec *rq_integrity_vec(struct request *rq) return NULL; } #endif /* CONFIG_BLK_DEV_INTEGRITY */ + #endif /* _LINUX_BLK_INTEGRITY_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f9089750919c6b..0c247a71688561 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -334,6 +334,8 @@ struct queue_limits { * due to possible offsets. */ unsigned int dma_alignment; + + struct blk_integrity integrity; }; typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, @@ -419,10 +421,6 @@ struct request_queue { struct queue_limits limits; -#ifdef CONFIG_BLK_DEV_INTEGRITY - struct blk_integrity integrity; -#endif /* CONFIG_BLK_DEV_INTEGRITY */ - #ifdef CONFIG_PM struct device *dev; enum rpm_status rpm_status; @@ -1300,11 +1298,9 @@ static inline bool bdev_stable_writes(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); -#ifdef CONFIG_BLK_DEV_INTEGRITY - /* BLK_INTEGRITY_CSUM_NONE is not available in blkdev.h */ - if (q->integrity.csum_type != 0) + if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && + q->limits.integrity.csum_type != BLK_INTEGRITY_CSUM_NONE) return true; -#endif return test_bit(QUEUE_FLAG_STABLE_WRITES, &q->queue_flags); } diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index d2bafb76badfb9..1773610010ebaf 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -39,12 +39,8 @@ struct t10_pi_tuple { static inline u32 t10_pi_ref_tag(struct request *rq) { - unsigned int shift = ilog2(queue_logical_block_size(rq->q)); + unsigned int shift = rq->q->limits.integrity.interval_exp; -#ifdef CONFIG_BLK_DEV_INTEGRITY - if (rq->q->integrity.interval_exp) - shift = rq->q->integrity.interval_exp; -#endif return blk_rq_pos(rq) >> (shift - SECTOR_SHIFT) & 0xffffffff; } @@ -65,12 +61,8 @@ static inline u64 lower_48_bits(u64 n) static inline u64 ext_pi_ref_tag(struct request *rq) { - unsigned int shift = ilog2(queue_logical_block_size(rq->q)); + unsigned int shift = rq->q->limits.integrity.interval_exp; -#ifdef CONFIG_BLK_DEV_INTEGRITY - if (rq->q->integrity.interval_exp) - shift = rq->q->integrity.interval_exp; -#endif return lower_48_bits(blk_rq_pos(rq) >> (shift - SECTOR_SHIFT)); } From 52734206b2273aa0b3448980a58ec920a8e571a5 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Thu, 23 May 2024 17:45:17 -0400 Subject: [PATCH 0976/1578] io_uring: Drop per-ctx dummy_ubuf Commit 19a63c402170 ("io_uring/rsrc: keep one global dummy_ubuf") replaced it with a global static object but this stayed behind. Fixes: 19a63c402170 ("io_uring/rsrc: keep one global dummy_ubuf") Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20240523214517.31803-1-krisman@suse.de Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 7a6b190c7da74a..91224bbcfa73ff 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -373,7 +373,6 @@ struct io_ring_ctx { struct io_restriction restrictions; /* slow path rsrc auxilary data, used by update/register */ - struct io_mapped_ubuf *dummy_ubuf; struct io_rsrc_data *file_data; struct io_rsrc_data *buf_data; From 4faf204386db68d8457b5ecd6cbcef56fcca328a Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Thu, 23 May 2024 17:45:35 -0400 Subject: [PATCH 0977/1578] io_uring/rsrc: Drop io_copy_iov in favor of iovec API Instead of open coding an io_uring function to copy iovs from userspace, rely on the existing iovec_from_user function. While there, avoid repeatedly zeroing the iov in the !arg case for io_sqe_buffer_register. tested with liburing testsuite. Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20240523214535.31890-1-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 60 +++++++++++++++++-------------------------------- 1 file changed, 21 insertions(+), 39 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 65417c9553b1d8..338e771bcafb3b 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -85,31 +85,6 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) return 0; } -static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, - void __user *arg, unsigned index) -{ - struct iovec __user *src; - -#ifdef CONFIG_COMPAT - if (ctx->compat) { - struct compat_iovec __user *ciovs; - struct compat_iovec ciov; - - ciovs = (struct compat_iovec __user *) arg; - if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) - return -EFAULT; - - dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base); - dst->iov_len = ciov.iov_len; - return 0; - } -#endif - src = (struct iovec __user *) arg; - if (copy_from_user(dst, &src[index], sizeof(*dst))) - return -EFAULT; - return 0; -} - static int io_buffer_validate(struct iovec *iov) { unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); @@ -419,8 +394,9 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update2 *up, unsigned int nr_args) { + struct iovec __user *uvec = u64_to_user_ptr(up->data); u64 __user *tags = u64_to_user_ptr(up->tags); - struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); + struct iovec fast_iov, *iov; struct page *last_hpage = NULL; __u32 done; int i, err; @@ -434,21 +410,23 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu; u64 tag = 0; - err = io_copy_iov(ctx, &iov, iovs, done); - if (err) + iov = iovec_from_user(&uvec[done], 1, 1, &fast_iov, ctx->compat); + if (IS_ERR(iov)) { + err = PTR_ERR(iov); break; + } if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { err = -EFAULT; break; } - err = io_buffer_validate(&iov); + err = io_buffer_validate(iov); if (err) break; - if (!iov.iov_base && tag) { + if (!iov->iov_base && tag) { err = -EINVAL; break; } - err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage); + err = io_sqe_buffer_register(ctx, iov, &imu, &last_hpage); if (err) break; @@ -970,8 +948,9 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, { struct page *last_hpage = NULL; struct io_rsrc_data *data; + struct iovec fast_iov, *iov = &fast_iov; + const struct iovec __user *uvec = (struct iovec * __user) arg; int i, ret; - struct iovec iov; BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); @@ -988,24 +967,27 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } + if (!arg) + memset(iov, 0, sizeof(*iov)); + for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { if (arg) { - ret = io_copy_iov(ctx, &iov, arg, i); - if (ret) + iov = iovec_from_user(&uvec[i], 1, 1, &fast_iov, ctx->compat); + if (IS_ERR(iov)) { + ret = PTR_ERR(iov); break; - ret = io_buffer_validate(&iov); + } + ret = io_buffer_validate(iov); if (ret) break; - } else { - memset(&iov, 0, sizeof(iov)); } - if (!iov.iov_base && *io_get_tag_slot(data, i)) { + if (!iov->iov_base && *io_get_tag_slot(data, i)) { ret = -EINVAL; break; } - ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i], + ret = io_sqe_buffer_register(ctx, iov, &ctx->user_bufs[i], &last_hpage); if (ret) break; From 4313c8d79a6d8b6213fa3271f7b758bb64f87a24 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 3 Jun 2024 11:19:10 -0600 Subject: [PATCH 0978/1578] io_uring/eventfd: move to more idiomatic RCU free usage In some ways, it just "happens to work" currently with using the ops field for both the free and signaling bit. But it depends on ordering of operations in terms of freeing and signaling. Clean it up and use the usual refs == 0 under RCU read side lock to determine if the ev_fd is still valid, and use the reference to gate the freeing as well. Fixes: 21a091b970cd ("io_uring: signal registered eventfd to process deferred task work") Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 49 ++++++++++++++++++++++++--------------------- io_uring/io_uring.h | 4 ++-- io_uring/register.c | 6 +++--- 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 816e93e7f94907..b874836ee49db5 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -541,29 +541,33 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx) } } -void io_eventfd_ops(struct rcu_head *rcu) +void io_eventfd_free(struct rcu_head *rcu) { struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); - int ops = atomic_xchg(&ev_fd->ops, 0); - if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT)) - eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); + eventfd_ctx_put(ev_fd->cq_ev_fd); + kfree(ev_fd); +} - /* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback - * ordering in a race but if references are 0 we know we have to free - * it regardless. - */ - if (atomic_dec_and_test(&ev_fd->refs)) { - eventfd_ctx_put(ev_fd->cq_ev_fd); - kfree(ev_fd); - } +void io_eventfd_do_signal(struct rcu_head *rcu) +{ + struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); + + eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); + + if (atomic_dec_and_test(&ev_fd->refs)) + io_eventfd_free(rcu); } static void io_eventfd_signal(struct io_ring_ctx *ctx) { struct io_ev_fd *ev_fd = NULL; - rcu_read_lock(); + if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) + return; + + guard(rcu)(); + /* * rcu_dereference ctx->io_ev_fd once and use it for both for checking * and eventfd_signal @@ -576,24 +580,23 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx) * the function and rcu_read_lock. */ if (unlikely(!ev_fd)) - goto out; - if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) - goto out; + return; + if (!atomic_inc_not_zero(&ev_fd->refs)) + return; if (ev_fd->eventfd_async && !io_wq_current_is_worker()) goto out; if (likely(eventfd_signal_allowed())) { eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); } else { - atomic_inc(&ev_fd->refs); - if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) - call_rcu_hurry(&ev_fd->rcu, io_eventfd_ops); - else - atomic_dec(&ev_fd->refs); + if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) { + call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal); + return; + } } - out: - rcu_read_unlock(); + if (atomic_dec_and_test(&ev_fd->refs)) + call_rcu(&ev_fd->rcu, io_eventfd_free); } static void io_eventfd_flush_signal(struct io_ring_ctx *ctx) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 726e6367af4d37..2b08b402b716d8 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -106,10 +106,10 @@ bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, enum { IO_EVENTFD_OP_SIGNAL_BIT, - IO_EVENTFD_OP_FREE_BIT, }; -void io_eventfd_ops(struct rcu_head *rcu); +void io_eventfd_do_signal(struct rcu_head *rcu); +void io_eventfd_free(struct rcu_head *rcu); void io_activate_pollwq(struct io_ring_ctx *ctx); static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) diff --git a/io_uring/register.c b/io_uring/register.c index c0010a66a6f2c2..212711e9bc8a67 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -63,9 +63,9 @@ static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, ev_fd->eventfd_async = eventfd_async; ctx->has_evfd = true; - rcu_assign_pointer(ctx->io_ev_fd, ev_fd); atomic_set(&ev_fd->refs, 1); atomic_set(&ev_fd->ops, 0); + rcu_assign_pointer(ctx->io_ev_fd, ev_fd); return 0; } @@ -78,8 +78,8 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx) if (ev_fd) { ctx->has_evfd = false; rcu_assign_pointer(ctx->io_ev_fd, NULL); - if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops)) - call_rcu(&ev_fd->rcu, io_eventfd_ops); + if (atomic_dec_and_test(&ev_fd->refs)) + call_rcu(&ev_fd->rcu, io_eventfd_free); return 0; } From 31718d0ba5777340c23604708b471e02fb9e0c2f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 3 Jun 2024 11:51:19 -0600 Subject: [PATCH 0979/1578] io_uring/eventfd: move eventfd handling to separate file This is pretty nicely abstracted already, but let's move it to a separate file rather than have it in the main io_uring file. With that, we can also move the io_ev_fd struct and enum out of global scope. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 8 -- io_uring/Makefile | 6 +- io_uring/eventfd.c | 160 +++++++++++++++++++++++++++++++++ io_uring/eventfd.h | 8 ++ io_uring/io_uring.c | 82 +---------------- io_uring/io_uring.h | 6 -- io_uring/register.c | 56 +----------- 7 files changed, 173 insertions(+), 153 deletions(-) create mode 100644 io_uring/eventfd.c create mode 100644 io_uring/eventfd.h diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 91224bbcfa73ff..a2227ab7fd1666 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -211,14 +211,6 @@ struct io_submit_state { struct blk_plug plug; }; -struct io_ev_fd { - struct eventfd_ctx *cq_ev_fd; - unsigned int eventfd_async: 1; - struct rcu_head rcu; - atomic_t refs; - atomic_t ops; -}; - struct io_alloc_cache { void **entries; unsigned int nr_cached; diff --git a/io_uring/Makefile b/io_uring/Makefile index fc1b23c524e83b..61923e11c76772 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -4,9 +4,9 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ tctx.o filetable.o rw.o net.o poll.o \ - uring_cmd.o openclose.o sqpoll.o \ - xattr.o nop.o fs.o splice.o sync.o \ - msg_ring.o advise.o openclose.o \ + eventfd.o uring_cmd.o openclose.o \ + sqpoll.o xattr.o nop.o fs.o splice.o \ + sync.o msg_ring.o advise.o openclose.o \ epoll.o statx.o timeout.o fdinfo.o \ cancel.o waitid.o register.o \ truncate.o memmap.o diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c new file mode 100644 index 00000000000000..b9384503a2b752 --- /dev/null +++ b/io_uring/eventfd.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +#include "io-wq.h" +#include "eventfd.h" + +struct io_ev_fd { + struct eventfd_ctx *cq_ev_fd; + unsigned int eventfd_async: 1; + struct rcu_head rcu; + atomic_t refs; + atomic_t ops; +}; + +enum { + IO_EVENTFD_OP_SIGNAL_BIT, +}; + +static void io_eventfd_free(struct rcu_head *rcu) +{ + struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); + + eventfd_ctx_put(ev_fd->cq_ev_fd); + kfree(ev_fd); +} + +static void io_eventfd_do_signal(struct rcu_head *rcu) +{ + struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); + + eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); + + if (atomic_dec_and_test(&ev_fd->refs)) + io_eventfd_free(rcu); +} + +void io_eventfd_signal(struct io_ring_ctx *ctx) +{ + struct io_ev_fd *ev_fd = NULL; + + if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) + return; + + guard(rcu)(); + + /* + * rcu_dereference ctx->io_ev_fd once and use it for both for checking + * and eventfd_signal + */ + ev_fd = rcu_dereference(ctx->io_ev_fd); + + /* + * Check again if ev_fd exists incase an io_eventfd_unregister call + * completed between the NULL check of ctx->io_ev_fd at the start of + * the function and rcu_read_lock. + */ + if (unlikely(!ev_fd)) + return; + if (!atomic_inc_not_zero(&ev_fd->refs)) + return; + if (ev_fd->eventfd_async && !io_wq_current_is_worker()) + goto out; + + if (likely(eventfd_signal_allowed())) { + eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); + } else { + if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) { + call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal); + return; + } + } +out: + if (atomic_dec_and_test(&ev_fd->refs)) + call_rcu(&ev_fd->rcu, io_eventfd_free); +} + +void io_eventfd_flush_signal(struct io_ring_ctx *ctx) +{ + bool skip; + + spin_lock(&ctx->completion_lock); + + /* + * Eventfd should only get triggered when at least one event has been + * posted. Some applications rely on the eventfd notification count + * only changing IFF a new CQE has been added to the CQ ring. There's + * no depedency on 1:1 relationship between how many times this + * function is called (and hence the eventfd count) and number of CQEs + * posted to the CQ ring. + */ + skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail; + ctx->evfd_last_cq_tail = ctx->cached_cq_tail; + spin_unlock(&ctx->completion_lock); + if (skip) + return; + + io_eventfd_signal(ctx); +} + +int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned int eventfd_async) +{ + struct io_ev_fd *ev_fd; + __s32 __user *fds = arg; + int fd; + + ev_fd = rcu_dereference_protected(ctx->io_ev_fd, + lockdep_is_held(&ctx->uring_lock)); + if (ev_fd) + return -EBUSY; + + if (copy_from_user(&fd, fds, sizeof(*fds))) + return -EFAULT; + + ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); + if (!ev_fd) + return -ENOMEM; + + ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); + if (IS_ERR(ev_fd->cq_ev_fd)) { + int ret = PTR_ERR(ev_fd->cq_ev_fd); + kfree(ev_fd); + return ret; + } + + spin_lock(&ctx->completion_lock); + ctx->evfd_last_cq_tail = ctx->cached_cq_tail; + spin_unlock(&ctx->completion_lock); + + ev_fd->eventfd_async = eventfd_async; + ctx->has_evfd = true; + atomic_set(&ev_fd->refs, 1); + atomic_set(&ev_fd->ops, 0); + rcu_assign_pointer(ctx->io_ev_fd, ev_fd); + return 0; +} + +int io_eventfd_unregister(struct io_ring_ctx *ctx) +{ + struct io_ev_fd *ev_fd; + + ev_fd = rcu_dereference_protected(ctx->io_ev_fd, + lockdep_is_held(&ctx->uring_lock)); + if (ev_fd) { + ctx->has_evfd = false; + rcu_assign_pointer(ctx->io_ev_fd, NULL); + if (atomic_dec_and_test(&ev_fd->refs)) + call_rcu(&ev_fd->rcu, io_eventfd_free); + return 0; + } + + return -ENXIO; +} diff --git a/io_uring/eventfd.h b/io_uring/eventfd.h new file mode 100644 index 00000000000000..d394f49c632105 --- /dev/null +++ b/io_uring/eventfd.h @@ -0,0 +1,8 @@ + +struct io_ring_ctx; +int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned int eventfd_async); +int io_eventfd_unregister(struct io_ring_ctx *ctx); + +void io_eventfd_flush_signal(struct io_ring_ctx *ctx); +void io_eventfd_signal(struct io_ring_ctx *ctx); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b874836ee49db5..96f6da0bf5cd98 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -101,6 +101,7 @@ #include "poll.h" #include "rw.h" #include "alloc_cache.h" +#include "eventfd.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -541,87 +542,6 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx) } } -void io_eventfd_free(struct rcu_head *rcu) -{ - struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); - - eventfd_ctx_put(ev_fd->cq_ev_fd); - kfree(ev_fd); -} - -void io_eventfd_do_signal(struct rcu_head *rcu) -{ - struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); - - eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); - - if (atomic_dec_and_test(&ev_fd->refs)) - io_eventfd_free(rcu); -} - -static void io_eventfd_signal(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd = NULL; - - if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) - return; - - guard(rcu)(); - - /* - * rcu_dereference ctx->io_ev_fd once and use it for both for checking - * and eventfd_signal - */ - ev_fd = rcu_dereference(ctx->io_ev_fd); - - /* - * Check again if ev_fd exists incase an io_eventfd_unregister call - * completed between the NULL check of ctx->io_ev_fd at the start of - * the function and rcu_read_lock. - */ - if (unlikely(!ev_fd)) - return; - if (!atomic_inc_not_zero(&ev_fd->refs)) - return; - if (ev_fd->eventfd_async && !io_wq_current_is_worker()) - goto out; - - if (likely(eventfd_signal_allowed())) { - eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); - } else { - if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) { - call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal); - return; - } - } -out: - if (atomic_dec_and_test(&ev_fd->refs)) - call_rcu(&ev_fd->rcu, io_eventfd_free); -} - -static void io_eventfd_flush_signal(struct io_ring_ctx *ctx) -{ - bool skip; - - spin_lock(&ctx->completion_lock); - - /* - * Eventfd should only get triggered when at least one event has been - * posted. Some applications rely on the eventfd notification count - * only changing IFF a new CQE has been added to the CQ ring. There's - * no depedency on 1:1 relationship between how many times this - * function is called (and hence the eventfd count) and number of CQEs - * posted to the CQ ring. - */ - skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail; - ctx->evfd_last_cq_tail = ctx->cached_cq_tail; - spin_unlock(&ctx->completion_lock); - if (skip) - return; - - io_eventfd_signal(ctx); -} - void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { if (ctx->poll_activated) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 2b08b402b716d8..cd43924eed04e1 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -104,12 +104,6 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx); bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all); -enum { - IO_EVENTFD_OP_SIGNAL_BIT, -}; - -void io_eventfd_do_signal(struct rcu_head *rcu); -void io_eventfd_free(struct rcu_head *rcu); void io_activate_pollwq(struct io_ring_ctx *ctx); static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) diff --git a/io_uring/register.c b/io_uring/register.c index 212711e9bc8a67..f121e02f5e10e6 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -27,65 +27,11 @@ #include "cancel.h" #include "kbuf.h" #include "napi.h" +#include "eventfd.h" #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) -static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, - unsigned int eventfd_async) -{ - struct io_ev_fd *ev_fd; - __s32 __user *fds = arg; - int fd; - - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, - lockdep_is_held(&ctx->uring_lock)); - if (ev_fd) - return -EBUSY; - - if (copy_from_user(&fd, fds, sizeof(*fds))) - return -EFAULT; - - ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); - if (!ev_fd) - return -ENOMEM; - - ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); - if (IS_ERR(ev_fd->cq_ev_fd)) { - int ret = PTR_ERR(ev_fd->cq_ev_fd); - kfree(ev_fd); - return ret; - } - - spin_lock(&ctx->completion_lock); - ctx->evfd_last_cq_tail = ctx->cached_cq_tail; - spin_unlock(&ctx->completion_lock); - - ev_fd->eventfd_async = eventfd_async; - ctx->has_evfd = true; - atomic_set(&ev_fd->refs, 1); - atomic_set(&ev_fd->ops, 0); - rcu_assign_pointer(ctx->io_ev_fd, ev_fd); - return 0; -} - -int io_eventfd_unregister(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd; - - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, - lockdep_is_held(&ctx->uring_lock)); - if (ev_fd) { - ctx->has_evfd = false; - rcu_assign_pointer(ctx->io_ev_fd, NULL); - if (atomic_dec_and_test(&ev_fd->refs)) - call_rcu(&ev_fd->rcu, io_eventfd_free); - return 0; - } - - return -ENXIO; -} - static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { From 7cc167a8abaa1f4b904bd85a2927b44c5602ad7c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 14 Jun 2024 10:57:03 -0600 Subject: [PATCH 0980/1578] io_uring: use 'state' consistently __io_submit_flush_completions() assigns ctx->submit_state to a local variable and uses it in all but one spot, switch that forgotten statement to using 'state' as well. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 96f6da0bf5cd98..0c86f504fc6659 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1390,7 +1390,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) } __io_cq_unlock_post(ctx); - if (!wq_list_empty(&ctx->submit_state.compl_reqs)) { + if (!wq_list_empty(&state->compl_reqs)) { io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); } From 940b8c41dd4ecbc3e334dfe46b3694ba7a91406e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 13 Jun 2024 19:28:27 +0000 Subject: [PATCH 0981/1578] io_uring/io-wq: make io_wq_work flags atomic The work flags can be set/accessed from different tasks, both the originator of the request, and the io-wq workers. While modifications aren't concurrent, it still makes KMSAN unhappy. There's no real downside to just making the flag reading/manipulation use proper atomics here. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- io_uring/io-wq.c | 19 ++++++++++--------- io_uring/io-wq.h | 2 +- io_uring/io_uring.c | 12 ++++++------ 4 files changed, 18 insertions(+), 17 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index a2227ab7fd1666..d3f88b5e11955b 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -50,7 +50,7 @@ struct io_wq_work_list { struct io_wq_work { struct io_wq_work_node list; - unsigned flags; + atomic_t flags; /* place it here instead of io_kiocb as it fills padding and saves 4B */ int cancel_seq; }; diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 7d3316fe9bfc46..913c92249522e8 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -159,7 +159,7 @@ static inline struct io_wq_acct *io_get_acct(struct io_wq *wq, bool bound) static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq, struct io_wq_work *work) { - return io_get_acct(wq, !(work->flags & IO_WQ_WORK_UNBOUND)); + return io_get_acct(wq, !(atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND)); } static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker) @@ -451,7 +451,7 @@ static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker) static inline unsigned int io_get_work_hash(struct io_wq_work *work) { - return work->flags >> IO_WQ_HASH_SHIFT; + return atomic_read(&work->flags) >> IO_WQ_HASH_SHIFT; } static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash) @@ -592,8 +592,9 @@ static void io_worker_handle_work(struct io_wq_acct *acct, next_hashed = wq_next_work(work); - if (unlikely(do_kill) && (work->flags & IO_WQ_WORK_UNBOUND)) - work->flags |= IO_WQ_WORK_CANCEL; + if (do_kill && + (atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND)) + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); wq->do_work(work); io_assign_current_work(worker, NULL); @@ -891,7 +892,7 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data) static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq) { do { - work->flags |= IO_WQ_WORK_CANCEL; + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); wq->do_work(work); work = wq->free_work(work); } while (work); @@ -926,7 +927,7 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data) void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) { struct io_wq_acct *acct = io_work_get_acct(wq, work); - unsigned long work_flags = work->flags; + unsigned int work_flags = atomic_read(&work->flags); struct io_cb_cancel_data match = { .fn = io_wq_work_match_item, .data = work, @@ -939,7 +940,7 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) * been marked as one that should not get executed, cancel it here. */ if (test_bit(IO_WQ_BIT_EXIT, &wq->state) || - (work->flags & IO_WQ_WORK_CANCEL)) { + (work_flags & IO_WQ_WORK_CANCEL)) { io_run_cancel(work, wq); return; } @@ -982,7 +983,7 @@ void io_wq_hash_work(struct io_wq_work *work, void *val) unsigned int bit; bit = hash_ptr(val, IO_WQ_HASH_ORDER); - work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); + atomic_or(IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT), &work->flags); } static bool __io_wq_worker_cancel(struct io_worker *worker, @@ -990,7 +991,7 @@ static bool __io_wq_worker_cancel(struct io_worker *worker, struct io_wq_work *work) { if (work && match->fn(work, match->data)) { - work->flags |= IO_WQ_WORK_CANCEL; + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); __set_notify_signal(worker->task); return true; } diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h index 2b2a6406dd8ee8..b3b004a7b62528 100644 --- a/io_uring/io-wq.h +++ b/io_uring/io-wq.h @@ -56,7 +56,7 @@ bool io_wq_worker_stopped(void); static inline bool io_wq_is_hashed(struct io_wq_work *work) { - return work->flags & IO_WQ_WORK_HASHED; + return atomic_read(&work->flags) & IO_WQ_WORK_HASHED; } typedef bool (work_cancel_fn)(struct io_wq_work *, void *); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0c86f504fc6659..872477412726eb 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -462,9 +462,9 @@ static void io_prep_async_work(struct io_kiocb *req) } req->work.list.next = NULL; - req->work.flags = 0; + atomic_set(&req->work.flags, 0); if (req->flags & REQ_F_FORCE_ASYNC) - req->work.flags |= IO_WQ_WORK_CONCURRENT; + atomic_or(IO_WQ_WORK_CONCURRENT, &req->work.flags); if (req->file && !(req->flags & REQ_F_FIXED_FILE)) req->flags |= io_file_get_flags(req->file); @@ -480,7 +480,7 @@ static void io_prep_async_work(struct io_kiocb *req) io_wq_hash_work(&req->work, file_inode(req->file)); } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { if (def->unbound_nonreg_file) - req->work.flags |= IO_WQ_WORK_UNBOUND; + atomic_or(IO_WQ_WORK_UNBOUND, &req->work.flags); } } @@ -520,7 +520,7 @@ static void io_queue_iowq(struct io_kiocb *req) * worker for it). */ if (WARN_ON_ONCE(!same_thread_group(req->task, current))) - req->work.flags |= IO_WQ_WORK_CANCEL; + atomic_or(IO_WQ_WORK_CANCEL, &req->work.flags); trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work)); io_wq_enqueue(tctx->io_wq, &req->work); @@ -1736,14 +1736,14 @@ void io_wq_submit_work(struct io_wq_work *work) io_arm_ltimeout(req); /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ - if (work->flags & IO_WQ_WORK_CANCEL) { + if (atomic_read(&work->flags) & IO_WQ_WORK_CANCEL) { fail: io_req_task_queue_fail(req, err); return; } if (!io_assign_file(req, def, issue_flags)) { err = -EBADF; - work->flags |= IO_WQ_WORK_CANCEL; + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); goto fail; } From 0a5d3258d7c97295a89d22e54733b54aacb62562 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 3 Jun 2024 22:23:15 -0700 Subject: [PATCH 0982/1578] compiler_types.h: Define __retain for __attribute__((__retain__)) Some code includes the __used macro to prevent functions and data from being optimized out. This macro implements __attribute__((__used__)), which operates at the compiler and IR-level, and so still allows a linker to remove objects intended to be kept. Compilers supporting __attribute__((__retain__)) can address this gap by setting the flag SHF_GNU_RETAIN on the section of a function/variable, indicating to the linker the object should be retained. This attribute is available since gcc 11, clang 13, and binutils 2.36. Provide a __retain macro implementing __attribute__((__retain__)), whose first user will be the '__bpf_kfunc' tag. [ Additional remark from discussion: Why is CONFIG_LTO_CLANG added here? The __used macro permits garbage collection at section level, so CLANG_LTO_CLANG without CONFIG_LD_DEAD_CODE_DATA_ELIMINATION should not change final section dynamics? The conditional guard was included to ensure consistent behaviour between __retain and other features forcing split sections. In particular, the same guard is used in vmlinux.lds.h to merge split sections where needed. For example, using __retain in LLVM builds without CONFIG_LTO was failing CI tests on kernel-patches/bpf because the kernel didn't boot properly. And in further testing, the kernel had no issues loading BPF kfunc modules with such split sections, so the module (partial) linking scripts were left alone. ] Signed-off-by: Tony Ambardar Signed-off-by: Daniel Borkmann Cc: Yonghong Song Link: https://lore.kernel.org/bpf/ZlmGoT9KiYLZd91S@krava/T/ Link: https://lore.kernel.org/bpf/b31bca5a5e6765a0f32cc8c19b1d9cdbfaa822b5.1717477560.git.Tony.Ambardar@gmail.com --- include/linux/compiler_types.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 93600de3800bfb..f14c275950b5bc 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -143,6 +143,29 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } # define __preserve_most #endif +/* + * Annotating a function/variable with __retain tells the compiler to place + * the object in its own section and set the flag SHF_GNU_RETAIN. This flag + * instructs the linker to retain the object during garbage-cleanup or LTO + * phases. + * + * Note that the __used macro is also used to prevent functions or data + * being optimized out, but operates at the compiler/IR-level and may still + * allow unintended removal of objects during linking. + * + * Optional: only supported since gcc >= 11, clang >= 13 + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-retain-function-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#retain + */ +#if __has_attribute(__retain__) && \ + (defined(CONFIG_LD_DEAD_CODE_DATA_ELIMINATION) || \ + defined(CONFIG_LTO_CLANG)) +# define __retain __attribute__((__retain__)) +#else +# define __retain +#endif + /* Compiler specific macros. */ #ifdef __clang__ #include From 7bdcedd5c8fb88e7176b93812b139eca5fe0aa46 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 3 Jun 2024 22:23:16 -0700 Subject: [PATCH 0983/1578] bpf: Harden __bpf_kfunc tag against linker kfunc removal BPF kfuncs are often not directly referenced and may be inadvertently removed by optimization steps during kernel builds, thus the __bpf_kfunc tag mitigates against this removal by including the __used macro. However, this macro alone does not prevent removal during linking, and may still yield build warnings (e.g. on mips64el): [...] LD vmlinux BTFIDS vmlinux WARN: resolve_btfids: unresolved symbol bpf_verify_pkcs7_signature WARN: resolve_btfids: unresolved symbol bpf_lookup_user_key WARN: resolve_btfids: unresolved symbol bpf_lookup_system_key WARN: resolve_btfids: unresolved symbol bpf_key_put WARN: resolve_btfids: unresolved symbol bpf_iter_task_next WARN: resolve_btfids: unresolved symbol bpf_iter_css_task_new WARN: resolve_btfids: unresolved symbol bpf_get_file_xattr WARN: resolve_btfids: unresolved symbol bpf_ct_insert_entry WARN: resolve_btfids: unresolved symbol bpf_cgroup_release WARN: resolve_btfids: unresolved symbol bpf_cgroup_from_id WARN: resolve_btfids: unresolved symbol bpf_cgroup_acquire WARN: resolve_btfids: unresolved symbol bpf_arena_free_pages NM System.map SORTTAB vmlinux OBJCOPY vmlinux.32 [...] Update the __bpf_kfunc tag to better guard against linker optimization by including the new __retain compiler macro, which fixes the warnings above. Verify the __retain macro with readelf by checking object flags for 'R': $ readelf -Wa kernel/trace/bpf_trace.o Section Headers: [Nr] Name Type Address Off Size ES Flg Lk Inf Al [...] [178] .text.bpf_key_put PROGBITS 00000000 6420 0050 00 AXR 0 0 8 [...] Key to Flags: [...] R (retain), D (mbind), p (processor specific) Fixes: 57e7c169cd6a ("bpf: Add __bpf_kfunc tag for marking kernel functions as kfuncs") Reported-by: kernel test robot Signed-off-by: Tony Ambardar Signed-off-by: Daniel Borkmann Tested-by: Jiri Olsa Reviewed-by: Jiri Olsa Cc: Yonghong Song Closes: https://lore.kernel.org/r/202401211357.OCX9yllM-lkp@intel.com/ Link: https://lore.kernel.org/bpf/ZlmGoT9KiYLZd91S@krava/T/ Link: https://lore.kernel.org/bpf/e9c64e9b5c073dabd457ff45128aabcab7630098.1717477560.git.Tony.Ambardar@gmail.com --- include/linux/btf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/btf.h b/include/linux/btf.h index f9e56fd12a9fd5..7c3e40c3295ef9 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -82,7 +82,7 @@ * as to avoid issues such as the compiler inlining or eliding either a static * kfunc, or a global kfunc in an LTO build. */ -#define __bpf_kfunc __used noinline +#define __bpf_kfunc __used __retain noinline #define __bpf_kfunc_start_defs() \ __diag_push(); \ From 5e3b7009f116f684ac6b93d8924506154f3b1f6d Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Sat, 15 Jun 2024 01:53:50 +0200 Subject: [PATCH 0984/1578] rust: block: do not use removed queue limit API The Rust block layer API was using the old queue limit API, which was just removed. Use the new API instead. Reported-by: Boqun Feng Fixes: 3253aba3408a ("rust: block: introduce `kernel::block::mq` module") Signed-off-by: Andreas Hindborg Link: https://lore.kernel.org/r/20240614235350.621121-1-nmi@metaspace.dk Signed-off-by: Jens Axboe --- rust/kernel/block/mq/gen_disk.rs | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/rust/kernel/block/mq/gen_disk.rs b/rust/kernel/block/mq/gen_disk.rs index 3b9edb96c8ff77..e06044b549e0be 100644 --- a/rust/kernel/block/mq/gen_disk.rs +++ b/rust/kernel/block/mq/gen_disk.rs @@ -95,11 +95,17 @@ impl GenDiskBuilder { ) -> Result> { let lock_class_key = crate::sync::LockClassKey::new(); + // SAFETY: `bindings::queue_limits` contain only fields that are valid when zeroed. + let mut lim: bindings::queue_limits = unsafe { core::mem::zeroed() }; + + lim.logical_block_size = self.logical_block_size; + lim.physical_block_size = self.physical_block_size; + // SAFETY: `tagset.raw_tag_set()` points to a valid and initialized tag set let gendisk = from_err_ptr(unsafe { bindings::__blk_mq_alloc_disk( tagset.raw_tag_set(), - core::ptr::null_mut(), // TODO: We can pass queue limits right here + &mut lim, core::ptr::null_mut(), lock_class_key.as_ptr(), ) @@ -141,18 +147,6 @@ impl GenDiskBuilder { raw_writer.write_fmt(name)?; raw_writer.write_char('\0')?; - // SAFETY: `gendisk` points to a valid and initialized instance of - // `struct gendisk`. We have exclusive access, so we cannot race. - unsafe { - bindings::blk_queue_logical_block_size((*gendisk).queue, self.logical_block_size) - }; - - // SAFETY: `gendisk` points to a valid and initialized instance of - // `struct gendisk`. We have exclusive access, so we cannot race. - unsafe { - bindings::blk_queue_physical_block_size((*gendisk).queue, self.physical_block_size) - }; - // SAFETY: `gendisk` points to a valid and initialized instance of // `struct gendisk`. `set_capacity` takes a lock to synchronize this // operation, so we will not race. From 7ed352d34f1a09a7659c53de07785115587499fe Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 13 Jun 2024 14:30:44 -0700 Subject: [PATCH 0985/1578] netdev-genl: fix error codes when outputting XDP features -EINVAL will interrupt the dump. The correct error to return if we have more data to dump is -EMSGSIZE. Discovered by doing: for i in `seq 80`; do ip link add type veth; done ./cli.py --dbg-small-recv 5300 --spec netdev.yaml --dump dev-get >> /dev/null [...] nl_len = 64 (48) nl_flags = 0x0 nl_type = 19 nl_len = 20 (4) nl_flags = 0x2 nl_type = 3 error: -22 Fixes: d3d854fd6a1d ("netdev-genl: create a simple family for netdev stuff") Reviewed-by: Amritha Nambiar Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240613213044.3675745-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 1f6ae6379e0fc2..05f9515d2c05c1 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -59,22 +59,22 @@ XDP_METADATA_KFUNC_xxx nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, xdp_rx_meta, NETDEV_A_DEV_PAD) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES, - xsk_features, NETDEV_A_DEV_PAD)) { - genlmsg_cancel(rsp, hdr); - return -EINVAL; - } + xsk_features, NETDEV_A_DEV_PAD)) + goto err_cancel_msg; if (netdev->xdp_features & NETDEV_XDP_ACT_XSK_ZEROCOPY) { if (nla_put_u32(rsp, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, - netdev->xdp_zc_max_segs)) { - genlmsg_cancel(rsp, hdr); - return -EINVAL; - } + netdev->xdp_zc_max_segs)) + goto err_cancel_msg; } genlmsg_end(rsp, hdr); return 0; + +err_cancel_msg: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; } static void From e789523fe248a80839f8dfe736ca96e5c442a9e8 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Thu, 13 Jun 2024 18:03:43 +0900 Subject: [PATCH 0986/1578] firewire: fix website URL in Kconfig The wiki in kernel.org is no longer updated. This commit replaces the website URL with the latest one. Link: https://lore.kernel.org/r/20240613090343.416198-1-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firewire/Kconfig b/drivers/firewire/Kconfig index 869598b20e3a05..5268b3f0a25a5c 100644 --- a/drivers/firewire/Kconfig +++ b/drivers/firewire/Kconfig @@ -11,7 +11,7 @@ config FIREWIRE This is the new-generation IEEE 1394 (FireWire) driver stack a.k.a. Juju, a new implementation designed for robustness and simplicity. - See http://ieee1394.wiki.kernel.org/index.php/Juju_Migration + See http://ieee1394.docs.kernel.org/en/latest/migration.html for information about migration from the older Linux 1394 stack to the new driver stack. From e7da16abf030b39c3598574d5457f324a6f0e4a7 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Thu, 13 Jun 2024 22:14:33 +0900 Subject: [PATCH 0987/1578] firewire: core: record card index in tracepoinrts events derived from async_outbound_complete_template The asynchronous transaction is initiated on one of 1394 OHCI controller, however the existing tracepoints events has the lack of data about it. This commit adds card_index member into event structure to store the index of host controller in use, and prints it. Link: https://lore.kernel.org/r/20240613131440.431766-2-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-transaction.c | 6 +++--- include/trace/events/firewire.h | 17 ++++++++++------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index 571fdff65c2bcf..de75e758fd0721 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -174,8 +174,8 @@ static void transmit_complete_callback(struct fw_packet *packet, struct fw_transaction *t = container_of(packet, struct fw_transaction, packet); - trace_async_request_outbound_complete((uintptr_t)t, packet->generation, packet->speed, - status, packet->timestamp); + trace_async_request_outbound_complete((uintptr_t)t, card->index, packet->generation, + packet->speed, status, packet->timestamp); switch (status) { case ACK_COMPLETE: @@ -674,7 +674,7 @@ static void free_response_callback(struct fw_packet *packet, { struct fw_request *request = container_of(packet, struct fw_request, response); - trace_async_response_outbound_complete((uintptr_t)request, packet->generation, + trace_async_response_outbound_complete((uintptr_t)request, card->index, packet->generation, packet->speed, status, packet->timestamp); // Decrease the reference count since not at in-flight. diff --git a/include/trace/events/firewire.h b/include/trace/events/firewire.h index d695a560673f7d..ca6ea9bd1ebafe 100644 --- a/include/trace/events/firewire.h +++ b/include/trace/events/firewire.h @@ -71,10 +71,11 @@ DECLARE_EVENT_CLASS(async_outbound_initiate_template, // The value of status is one of ack codes and rcodes specific to Linux FireWire subsystem. DECLARE_EVENT_CLASS(async_outbound_complete_template, - TP_PROTO(u64 transaction, unsigned int generation, unsigned int scode, unsigned int status, unsigned int timestamp), - TP_ARGS(transaction, generation, scode, status, timestamp), + TP_PROTO(u64 transaction, unsigned int card_index, unsigned int generation, unsigned int scode, unsigned int status, unsigned int timestamp), + TP_ARGS(transaction, card_index, generation, scode, status, timestamp), TP_STRUCT__entry( __field(u64, transaction) + __field(u8, card_index) __field(u8, generation) __field(u8, scode) __field(u8, status) @@ -82,14 +83,16 @@ DECLARE_EVENT_CLASS(async_outbound_complete_template, ), TP_fast_assign( __entry->transaction = transaction; + __entry->card_index = card_index; __entry->generation = generation; __entry->scode = scode; __entry->status = status; __entry->timestamp = timestamp; ), TP_printk( - "transaction=0x%llx generation=%u scode=%u status=%u timestamp=0x%04x", + "transaction=0x%llx card_index=%u generation=%u scode=%u status=%u timestamp=0x%04x", __entry->transaction, + __entry->card_index, __entry->generation, __entry->scode, __entry->status, @@ -144,8 +147,8 @@ DEFINE_EVENT(async_outbound_initiate_template, async_request_outbound_initiate, ); DEFINE_EVENT(async_outbound_complete_template, async_request_outbound_complete, - TP_PROTO(u64 transaction, unsigned int generation, unsigned int scode, unsigned int status, unsigned int timestamp), - TP_ARGS(transaction, generation, scode, status, timestamp) + TP_PROTO(u64 transaction, unsigned int card_index, unsigned int generation, unsigned int scode, unsigned int status, unsigned int timestamp), + TP_ARGS(transaction, card_index, generation, scode, status, timestamp) ); DEFINE_EVENT(async_inbound_template, async_response_inbound, @@ -194,8 +197,8 @@ DEFINE_EVENT_PRINT(async_outbound_initiate_template, async_response_outbound_ini ); DEFINE_EVENT(async_outbound_complete_template, async_response_outbound_complete, - TP_PROTO(u64 transaction, unsigned int generation, unsigned int scode, unsigned int status, unsigned int timestamp), - TP_ARGS(transaction, generation, scode, status, timestamp) + TP_PROTO(u64 transaction, unsigned int card_index, unsigned int generation, unsigned int scode, unsigned int status, unsigned int timestamp), + TP_ARGS(transaction, card_index, generation, scode, status, timestamp) ); #undef ASYNC_HEADER_GET_DESTINATION From 64e02b64fb1d832a9a5254055a829db64750421a Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Thu, 13 Jun 2024 22:14:34 +0900 Subject: [PATCH 0988/1578] firewire: core: record card index in tracepoinrts events derived from async_outbound_initiate_template The asynchronous transaction is initiated on one of 1394 OHCI controller, however the existing tracepoints events has the lack of data about it. This commit adds card_index member into event structure to store the index of host controller in use, and prints it. Link: https://lore.kernel.org/r/20240613131440.431766-3-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-transaction.c | 10 ++++++---- include/trace/events/firewire.h | 20 ++++++++++++-------- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index de75e758fd0721..3f9361d1560762 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -398,7 +398,8 @@ void __fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode spin_unlock_irqrestore(&card->lock, flags); - trace_async_request_outbound_initiate((uintptr_t)t, generation, speed, t->packet.header, payload, + trace_async_request_outbound_initiate((uintptr_t)t, card->index, generation, speed, + t->packet.header, payload, tcode_is_read_request(tcode) ? 0 : length / 4); card->driver->send_request(card, &t->packet); @@ -879,9 +880,10 @@ void fw_send_response(struct fw_card *card, // Increase the reference count so that the object is kept during in-flight. fw_request_get(request); - trace_async_response_outbound_initiate((uintptr_t)request, request->response.generation, - request->response.speed, request->response.header, - data, data ? data_length / 4 : 0); + trace_async_response_outbound_initiate((uintptr_t)request, card->index, + request->response.generation, request->response.speed, + request->response.header, data, + data ? data_length / 4 : 0); card->driver->send_response(card, &request->response); } diff --git a/include/trace/events/firewire.h b/include/trace/events/firewire.h index ca6ea9bd1ebafe..a3d9916cbad19f 100644 --- a/include/trace/events/firewire.h +++ b/include/trace/events/firewire.h @@ -36,10 +36,11 @@ #define QUADLET_SIZE 4 DECLARE_EVENT_CLASS(async_outbound_initiate_template, - TP_PROTO(u64 transaction, unsigned int generation, unsigned int scode, const u32 *header, const u32 *data, unsigned int data_count), - TP_ARGS(transaction, generation, scode, header, data, data_count), + TP_PROTO(u64 transaction, unsigned int card_index, unsigned int generation, unsigned int scode, const u32 *header, const u32 *data, unsigned int data_count), + TP_ARGS(transaction, card_index, generation, scode, header, data, data_count), TP_STRUCT__entry( __field(u64, transaction) + __field(u8, card_index) __field(u8, generation) __field(u8, scode) __array(u32, header, ASYNC_HEADER_QUADLET_COUNT) @@ -47,6 +48,7 @@ DECLARE_EVENT_CLASS(async_outbound_initiate_template, ), TP_fast_assign( __entry->transaction = transaction; + __entry->card_index = card_index; __entry->generation = generation; __entry->scode = scode; memcpy(__entry->header, header, QUADLET_SIZE * ASYNC_HEADER_QUADLET_COUNT); @@ -54,8 +56,9 @@ DECLARE_EVENT_CLASS(async_outbound_initiate_template, ), // This format is for the request subaction. TP_printk( - "transaction=0x%llx generation=%u scode=%u dst_id=0x%04x tlabel=%u tcode=%u src_id=0x%04x offset=0x%012llx header=%s data=%s", + "transaction=0x%llx card_index=%u generation=%u scode=%u dst_id=0x%04x tlabel=%u tcode=%u src_id=0x%04x offset=0x%012llx header=%s data=%s", __entry->transaction, + __entry->card_index, __entry->generation, __entry->scode, ASYNC_HEADER_GET_DESTINATION(__entry->header), @@ -142,8 +145,8 @@ DECLARE_EVENT_CLASS(async_inbound_template, ); DEFINE_EVENT(async_outbound_initiate_template, async_request_outbound_initiate, - TP_PROTO(u64 transaction, unsigned int generation, unsigned int scode, const u32 *header, const u32 *data, unsigned int data_count), - TP_ARGS(transaction, generation, scode, header, data, data_count) + TP_PROTO(u64 transaction, unsigned int card_index, unsigned int generation, unsigned int scode, const u32 *header, const u32 *data, unsigned int data_count), + TP_ARGS(transaction, card_index, generation, scode, header, data, data_count) ); DEFINE_EVENT(async_outbound_complete_template, async_request_outbound_complete, @@ -178,11 +181,12 @@ DEFINE_EVENT_PRINT(async_inbound_template, async_request_inbound, ); DEFINE_EVENT_PRINT(async_outbound_initiate_template, async_response_outbound_initiate, - TP_PROTO(u64 transaction, unsigned int generation, unsigned int scode, const u32 *header, const u32 *data, unsigned int data_count), - TP_ARGS(transaction, generation, scode, header, data, data_count), + TP_PROTO(u64 transaction, unsigned int card_index, unsigned int generation, unsigned int scode, const u32 *header, const u32 *data, unsigned int data_count), + TP_ARGS(transaction, card_index, generation, scode, header, data, data_count), TP_printk( - "transaction=0x%llx generation=%u scode=%u dst_id=0x%04x tlabel=%u tcode=%u src_id=0x%04x rcode=%u header=%s data=%s", + "transaction=0x%llx card_index=%u generation=%u scode=%u dst_id=0x%04x tlabel=%u tcode=%u src_id=0x%04x rcode=%u header=%s data=%s", __entry->transaction, + __entry->card_index, __entry->generation, __entry->scode, ASYNC_HEADER_GET_DESTINATION(__entry->header), From 65ec7ebefe7de01281cce1f552ebd4dd00386665 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Thu, 13 Jun 2024 22:14:35 +0900 Subject: [PATCH 0989/1578] firewire: core: record card index in tracepoinrts events derived from async_inbound_template The asynchronous transaction is initiated on one of 1394 OHCI controller, however the existing tracepoints events has the lack of data about it. This commit adds card_index member into event structure to store the index of host controller in use, and prints it. Link: https://lore.kernel.org/r/20240613131440.431766-4-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-transaction.c | 8 ++++---- include/trace/events/firewire.h | 20 ++++++++++++-------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index 3f9361d1560762..3503c238f8ae27 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -1009,8 +1009,8 @@ void fw_core_handle_request(struct fw_card *card, struct fw_packet *p) return; } - trace_async_request_inbound((uintptr_t)request, p->generation, p->speed, p->ack, - p->timestamp, p->header, request->data, + trace_async_request_inbound((uintptr_t)request, card->index, p->generation, p->speed, + p->ack, p->timestamp, p->header, request->data, tcode_is_read_request(tcode) ? 0 : request->length / 4); offset = async_header_get_offset(p->header); @@ -1080,8 +1080,8 @@ void fw_core_handle_response(struct fw_card *card, struct fw_packet *p) } spin_unlock_irqrestore(&card->lock, flags); - trace_async_response_inbound((uintptr_t)t, p->generation, p->speed, p->ack, p->timestamp, - p->header, data, data_length / 4); + trace_async_response_inbound((uintptr_t)t, card->index, p->generation, p->speed, p->ack, + p->timestamp, p->header, data, data_length / 4); if (!t) { timed_out: diff --git a/include/trace/events/firewire.h b/include/trace/events/firewire.h index a3d9916cbad19f..b72f613cfa0255 100644 --- a/include/trace/events/firewire.h +++ b/include/trace/events/firewire.h @@ -105,10 +105,11 @@ DECLARE_EVENT_CLASS(async_outbound_complete_template, // The value of status is one of ack codes and rcodes specific to Linux FireWire subsystem. DECLARE_EVENT_CLASS(async_inbound_template, - TP_PROTO(u64 transaction, unsigned int generation, unsigned int scode, unsigned int status, unsigned int timestamp, const u32 *header, const u32 *data, unsigned int data_count), - TP_ARGS(transaction, generation, scode, status, timestamp, header, data, data_count), + TP_PROTO(u64 transaction, unsigned int card_index, unsigned int generation, unsigned int scode, unsigned int status, unsigned int timestamp, const u32 *header, const u32 *data, unsigned int data_count), + TP_ARGS(transaction, card_index, generation, scode, status, timestamp, header, data, data_count), TP_STRUCT__entry( __field(u64, transaction) + __field(u8, card_index) __field(u8, generation) __field(u8, scode) __field(u8, status) @@ -118,6 +119,7 @@ DECLARE_EVENT_CLASS(async_inbound_template, ), TP_fast_assign( __entry->transaction = transaction; + __entry->card_index = card_index; __entry->generation = generation; __entry->scode = scode; __entry->status = status; @@ -127,8 +129,9 @@ DECLARE_EVENT_CLASS(async_inbound_template, ), // This format is for the response subaction. TP_printk( - "transaction=0x%llx generation=%u scode=%u status=%u timestamp=0x%04x dst_id=0x%04x tlabel=%u tcode=%u src_id=0x%04x rcode=%u header=%s data=%s", + "transaction=0x%llx card_index=%u generation=%u scode=%u status=%u timestamp=0x%04x dst_id=0x%04x tlabel=%u tcode=%u src_id=0x%04x rcode=%u header=%s data=%s", __entry->transaction, + __entry->card_index, __entry->generation, __entry->scode, __entry->status, @@ -155,16 +158,17 @@ DEFINE_EVENT(async_outbound_complete_template, async_request_outbound_complete, ); DEFINE_EVENT(async_inbound_template, async_response_inbound, - TP_PROTO(u64 transaction, unsigned int generation, unsigned int scode, unsigned int status, unsigned int timestamp, const u32 *header, const u32 *data, unsigned int data_count), - TP_ARGS(transaction, generation, scode, status, timestamp, header, data, data_count) + TP_PROTO(u64 transaction, unsigned int card_index, unsigned int generation, unsigned int scode, unsigned int status, unsigned int timestamp, const u32 *header, const u32 *data, unsigned int data_count), + TP_ARGS(transaction, card_index, generation, scode, status, timestamp, header, data, data_count) ); DEFINE_EVENT_PRINT(async_inbound_template, async_request_inbound, - TP_PROTO(u64 transaction, unsigned int generation, unsigned int scode, unsigned int status, unsigned int timestamp, const u32 *header, const u32 *data, unsigned int data_count), - TP_ARGS(transaction, generation, scode, status, timestamp, header, data, data_count), + TP_PROTO(u64 transaction, unsigned int card_index, unsigned int generation, unsigned int scode, unsigned int status, unsigned int timestamp, const u32 *header, const u32 *data, unsigned int data_count), + TP_ARGS(transaction, card_index, generation, scode, status, timestamp, header, data, data_count), TP_printk( - "transaction=0x%llx generation=%u scode=%u status=%u timestamp=0x%04x dst_id=0x%04x tlabel=%u tcode=%u src_id=0x%04x offset=0x%012llx header=%s data=%s", + "transaction=0x%llx card_index=%u generation=%u scode=%u status=%u timestamp=0x%04x dst_id=0x%04x tlabel=%u tcode=%u src_id=0x%04x offset=0x%012llx header=%s data=%s", __entry->transaction, + __entry->card_index, __entry->generation, __entry->scode, __entry->status, From 3cb44a72a39835b368ab78d739819330089aa2bf Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Thu, 13 Jun 2024 22:14:36 +0900 Subject: [PATCH 0990/1578] firewire: core: record card index in async_phy_outbound_initiate tracepoints event The asynchronous transaction is initiated on one of 1394 OHCI controller, however the existing tracepoints events has the lack of data about it. This commit adds card_index member into event structure to store the index of host controller in use, and prints it. Link: https://lore.kernel.org/r/20240613131440.431766-5-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-cdev.c | 4 ++-- drivers/firewire/core-transaction.c | 2 +- include/trace/events/firewire.h | 9 ++++++--- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 55993c9e0b9043..ff8739f96af5bd 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -1659,8 +1659,8 @@ static int ioctl_send_phy_packet(struct client *client, union ioctl_arg *arg) memcpy(pp->data, a->data, sizeof(a->data)); } - trace_async_phy_outbound_initiate((uintptr_t)&e->p, e->p.generation, e->p.header[1], - e->p.header[2]); + trace_async_phy_outbound_initiate((uintptr_t)&e->p, card->index, e->p.generation, + e->p.header[1], e->p.header[2]); card->driver->send_request(card, &e->p); diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index 3503c238f8ae27..e522dc3d989700 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -504,7 +504,7 @@ void fw_send_phy_config(struct fw_card *card, phy_config_packet.generation = generation; reinit_completion(&phy_config_done); - trace_async_phy_outbound_initiate((uintptr_t)&phy_config_packet, + trace_async_phy_outbound_initiate((uintptr_t)&phy_config_packet, card->index, phy_config_packet.generation, phy_config_packet.header[1], phy_config_packet.header[2]); diff --git a/include/trace/events/firewire.h b/include/trace/events/firewire.h index b72f613cfa0255..a59dc26b2a536a 100644 --- a/include/trace/events/firewire.h +++ b/include/trace/events/firewire.h @@ -217,23 +217,26 @@ DEFINE_EVENT(async_outbound_complete_template, async_response_outbound_complete, #undef ASYNC_HEADER_GET_RCODE TRACE_EVENT(async_phy_outbound_initiate, - TP_PROTO(u64 packet, unsigned int generation, u32 first_quadlet, u32 second_quadlet), - TP_ARGS(packet, generation, first_quadlet, second_quadlet), + TP_PROTO(u64 packet, unsigned int card_index, unsigned int generation, u32 first_quadlet, u32 second_quadlet), + TP_ARGS(packet, card_index, generation, first_quadlet, second_quadlet), TP_STRUCT__entry( __field(u64, packet) + __field(u8, card_index) __field(u8, generation) __field(u32, first_quadlet) __field(u32, second_quadlet) ), TP_fast_assign( __entry->packet = packet; + __entry->card_index = card_index; __entry->generation = generation; __entry->first_quadlet = first_quadlet; __entry->second_quadlet = second_quadlet ), TP_printk( - "packet=0x%llx generation=%u first_quadlet=0x%08x second_quadlet=0x%08x", + "packet=0x%llx card_index=%u generation=%u first_quadlet=0x%08x second_quadlet=0x%08x", __entry->packet, + __entry->card_index, __entry->generation, __entry->first_quadlet, __entry->second_quadlet From 810f2aa83563020d834425018303f2aaecef1e6e Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Thu, 13 Jun 2024 22:14:37 +0900 Subject: [PATCH 0991/1578] firewire: core: record card index in async_phy_outbound_complete tracepoints event The asynchronous transmission of phy packet is initiated on one of 1394 OHCI controller, however the existing tracepoints events has the lack of data about it. This commit adds card_index member into event structure to store the index of host controller in use, and prints it. Link: https://lore.kernel.org/r/20240613131440.431766-6-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-cdev.c | 2 +- drivers/firewire/core-transaction.c | 2 +- include/trace/events/firewire.h | 9 ++++++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index ff8739f96af5bd..9a7dc90330a351 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -1559,7 +1559,7 @@ static void outbound_phy_packet_callback(struct fw_packet *packet, struct client *e_client = e->client; u32 rcode; - trace_async_phy_outbound_complete((uintptr_t)packet, status, packet->generation, + trace_async_phy_outbound_complete((uintptr_t)packet, card->index, status, packet->generation, packet->timestamp); switch (status) { diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index e522dc3d989700..bd5a467cfd6032 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -464,7 +464,7 @@ static DECLARE_COMPLETION(phy_config_done); static void transmit_phy_packet_callback(struct fw_packet *packet, struct fw_card *card, int status) { - trace_async_phy_outbound_complete((uintptr_t)packet, packet->generation, status, + trace_async_phy_outbound_complete((uintptr_t)packet, card->index, packet->generation, status, packet->timestamp); complete(&phy_config_done); } diff --git a/include/trace/events/firewire.h b/include/trace/events/firewire.h index a59dc26b2a536a..61c7a2461fbc6c 100644 --- a/include/trace/events/firewire.h +++ b/include/trace/events/firewire.h @@ -244,23 +244,26 @@ TRACE_EVENT(async_phy_outbound_initiate, ); TRACE_EVENT(async_phy_outbound_complete, - TP_PROTO(u64 packet, unsigned int generation, unsigned int status, unsigned int timestamp), - TP_ARGS(packet, generation, status, timestamp), + TP_PROTO(u64 packet, unsigned int card_index, unsigned int generation, unsigned int status, unsigned int timestamp), + TP_ARGS(packet, card_index, generation, status, timestamp), TP_STRUCT__entry( __field(u64, packet) + __field(u8, card_index) __field(u8, generation) __field(u8, status) __field(u16, timestamp) ), TP_fast_assign( __entry->packet = packet; + __entry->card_index = card_index; __entry->generation = generation; __entry->status = status; __entry->timestamp = timestamp; ), TP_printk( - "packet=0x%llx generation=%u status=%u timestamp=0x%04x", + "packet=0x%llx card_index=%u generation=%u status=%u timestamp=0x%04x", __entry->packet, + __entry->card_index, __entry->generation, __entry->status, __entry->timestamp From abbb4bd96d7f871b10bd7058f7284ffaf4e8257f Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Thu, 13 Jun 2024 22:14:38 +0900 Subject: [PATCH 0992/1578] firewire: core: record card index in async_phy_inbound tracepoints event The asynchronous transmission of phy packet is initiated on one of 1394 OHCI controller, however the existing tracepoints events has the lack of data about it. This commit adds card_index member into event structure to store the index of host controller in use, and prints it. Link: https://lore.kernel.org/r/20240613131440.431766-7-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-transaction.c | 2 +- include/trace/events/firewire.h | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index bd5a467cfd6032..76ab6a2097685e 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -997,7 +997,7 @@ void fw_core_handle_request(struct fw_card *card, struct fw_packet *p) tcode = async_header_get_tcode(p->header); if (tcode_is_link_internal(tcode)) { - trace_async_phy_inbound((uintptr_t)p, p->generation, p->ack, p->timestamp, + trace_async_phy_inbound((uintptr_t)p, card->index, p->generation, p->ack, p->timestamp, p->header[1], p->header[2]); fw_cdev_handle_phy_packet(card, p); return; diff --git a/include/trace/events/firewire.h b/include/trace/events/firewire.h index 61c7a2461fbc6c..e5524fc7188016 100644 --- a/include/trace/events/firewire.h +++ b/include/trace/events/firewire.h @@ -271,10 +271,11 @@ TRACE_EVENT(async_phy_outbound_complete, ); TRACE_EVENT(async_phy_inbound, - TP_PROTO(u64 packet, unsigned int generation, unsigned int status, unsigned int timestamp, u32 first_quadlet, u32 second_quadlet), - TP_ARGS(packet, generation, status, timestamp, first_quadlet, second_quadlet), + TP_PROTO(u64 packet, unsigned int card_index, unsigned int generation, unsigned int status, unsigned int timestamp, u32 first_quadlet, u32 second_quadlet), + TP_ARGS(packet, card_index, generation, status, timestamp, first_quadlet, second_quadlet), TP_STRUCT__entry( __field(u64, packet) + __field(u8, card_index) __field(u8, generation) __field(u8, status) __field(u16, timestamp) @@ -290,8 +291,9 @@ TRACE_EVENT(async_phy_inbound, __entry->second_quadlet = second_quadlet ), TP_printk( - "packet=0x%llx generation=%u status=%u timestamp=0x%04x first_quadlet=0x%08x second_quadlet=0x%08x", + "packet=0x%llx card_index=%u generation=%u status=%u timestamp=0x%04x first_quadlet=0x%08x second_quadlet=0x%08x", __entry->packet, + __entry->card_index, __entry->generation, __entry->status, __entry->timestamp, From 7507dbc46b780e9fa2ec3d4db7cd9ce07e722927 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Thu, 13 Jun 2024 22:14:39 +0900 Subject: [PATCH 0993/1578] firewire: core: record card index in tracepoinrts events derived from bus_reset_arrange_template The asynchronous transmission of phy packet is initiated on one of 1394 OHCI controller, however the existing tracepoints events has the lack of data about it. This commit adds card_index member into event structure to store the index of host controller in use, and prints it. Link: https://lore.kernel.org/r/20240613131440.431766-8-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 6 +++--- include/trace/events/firewire.h | 21 ++++++++++++--------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 127d87e3a1530f..f8b99dd6cd827d 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -222,14 +222,14 @@ static int reset_bus(struct fw_card *card, bool short_reset) int reg = short_reset ? 5 : 1; int bit = short_reset ? PHY_BUS_SHORT_RESET : PHY_BUS_RESET; - trace_bus_reset_initiate(card->generation, short_reset); + trace_bus_reset_initiate(card->index, card->generation, short_reset); return card->driver->update_phy_reg(card, reg, 0, bit); } void fw_schedule_bus_reset(struct fw_card *card, bool delayed, bool short_reset) { - trace_bus_reset_schedule(card->generation, short_reset); + trace_bus_reset_schedule(card->index, card->generation, short_reset); /* We don't try hard to sort out requests of long vs. short resets. */ card->br_short = short_reset; @@ -249,7 +249,7 @@ static void br_work(struct work_struct *work) /* Delay for 2s after last reset per IEEE 1394 clause 8.2.1. */ if (card->reset_jiffies != 0 && time_before64(get_jiffies_64(), card->reset_jiffies + 2 * HZ)) { - trace_bus_reset_postpone(card->generation, card->br_short); + trace_bus_reset_postpone(card->index, card->generation, card->br_short); if (!queue_delayed_work(fw_workqueue, &card->br_work, 2 * HZ)) fw_card_put(card); diff --git a/include/trace/events/firewire.h b/include/trace/events/firewire.h index e5524fc7188016..e6485051f546df 100644 --- a/include/trace/events/firewire.h +++ b/include/trace/events/firewire.h @@ -303,36 +303,39 @@ TRACE_EVENT(async_phy_inbound, ); DECLARE_EVENT_CLASS(bus_reset_arrange_template, - TP_PROTO(unsigned int generation, bool short_reset), - TP_ARGS(generation, short_reset), + TP_PROTO(unsigned int card_index, unsigned int generation, bool short_reset), + TP_ARGS(card_index, generation, short_reset), TP_STRUCT__entry( + __field(u8, card_index) __field(u8, generation) __field(bool, short_reset) ), TP_fast_assign( + __entry->card_index = card_index; __entry->generation = generation; __entry->short_reset = short_reset; ), TP_printk( - "generation=%u short_reset=%s", + "card_index=%u generation=%u short_reset=%s", + __entry->card_index, __entry->generation, __entry->short_reset ? "true" : "false" ) ); DEFINE_EVENT(bus_reset_arrange_template, bus_reset_initiate, - TP_PROTO(unsigned int generation, bool short_reset), - TP_ARGS(generation, short_reset) + TP_PROTO(unsigned int card_index, unsigned int generation, bool short_reset), + TP_ARGS(card_index, generation, short_reset) ); DEFINE_EVENT(bus_reset_arrange_template, bus_reset_schedule, - TP_PROTO(unsigned int generation, bool short_reset), - TP_ARGS(generation, short_reset) + TP_PROTO(unsigned int card_index, unsigned int generation, bool short_reset), + TP_ARGS(card_index, generation, short_reset) ); DEFINE_EVENT(bus_reset_arrange_template, bus_reset_postpone, - TP_PROTO(unsigned int generation, bool short_reset), - TP_ARGS(generation, short_reset) + TP_PROTO(unsigned int card_index, unsigned int generation, bool short_reset), + TP_ARGS(card_index, generation, short_reset) ); TRACE_EVENT(bus_reset_handle, From 893098b2af3ea12bab2f505aa825662b379df67d Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Thu, 13 Jun 2024 22:14:40 +0900 Subject: [PATCH 0994/1578] firewire: core: record card index in bus_reset_handle tracepoints event The bus reset event occurs in the bus managed by one of 1394 OHCI controller in Linux system, however the existing tracepoints events has the lack of data about it to distinguish the issued hardware from the others. This commit adds card_index member into event structure to store the index of host controller in use, and prints it. Link: https://lore.kernel.org/r/20240613131440.431766-9-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-topology.c | 2 +- include/trace/events/firewire.h | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c index 837cc44d8d9f2a..8107eebd429670 100644 --- a/drivers/firewire/core-topology.c +++ b/drivers/firewire/core-topology.c @@ -508,7 +508,7 @@ void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation, struct fw_node *local_node; unsigned long flags; - trace_bus_reset_handle(generation, node_id, bm_abdicate, self_ids, self_id_count); + trace_bus_reset_handle(card->index, generation, node_id, bm_abdicate, self_ids, self_id_count); spin_lock_irqsave(&card->lock, flags); diff --git a/include/trace/events/firewire.h b/include/trace/events/firewire.h index e6485051f546df..5ccc0d91b220fe 100644 --- a/include/trace/events/firewire.h +++ b/include/trace/events/firewire.h @@ -339,22 +339,25 @@ DEFINE_EVENT(bus_reset_arrange_template, bus_reset_postpone, ); TRACE_EVENT(bus_reset_handle, - TP_PROTO(unsigned int generation, unsigned int node_id, bool bm_abdicate, u32 *self_ids, unsigned int self_id_count), - TP_ARGS(generation, node_id, bm_abdicate, self_ids, self_id_count), + TP_PROTO(unsigned int card_index, unsigned int generation, unsigned int node_id, bool bm_abdicate, u32 *self_ids, unsigned int self_id_count), + TP_ARGS(card_index, generation, node_id, bm_abdicate, self_ids, self_id_count), TP_STRUCT__entry( + __field(u8, card_index) __field(u8, generation) __field(u8, node_id) __field(bool, bm_abdicate) __dynamic_array(u32, self_ids, self_id_count) ), TP_fast_assign( + __entry->card_index = card_index; __entry->generation = generation; __entry->node_id = node_id; __entry->bm_abdicate = bm_abdicate; memcpy(__get_dynamic_array(self_ids), self_ids, __get_dynamic_array_len(self_ids)); ), TP_printk( - "generation=%u node_id=0x%04x bm_abdicate=%s self_ids=%s", + "card_index=%u generation=%u node_id=0x%04x bm_abdicate=%s self_ids=%s", + __entry->card_index, __entry->generation, __entry->node_id, __entry->bm_abdicate ? "true" : "false", From c03984d43a9dd9282da54ccf275419f666029452 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sat, 15 Jun 2024 16:00:43 +0800 Subject: [PATCH 0995/1578] arm64: dts: imx8mp: Fix TC9595 input clock on DH i.MX8M Plus DHCOM SoM The IMX8MP_CLK_CLKOUT2 supplies the TC9595 bridge with 13 MHz reference clock. The IMX8MP_CLK_CLKOUT2 is supplied from IMX8MP_AUDIO_PLL2_OUT. The IMX8MP_CLK_CLKOUT2 operates only as a power-of-two divider, and the current 156 MHz is not power-of-two divisible to achieve 13 MHz. To achieve 13 MHz output from IMX8MP_CLK_CLKOUT2, set IMX8MP_AUDIO_PLL2_OUT to 208 MHz, because 208 MHz / 16 = 13 MHz. Fixes: 20d0b83e712b ("arm64: dts: imx8mp: Add TC9595 bridge on DH electronics i.MX8M Plus DHCOM") Signed-off-by: Marek Vasut Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mp-dhcom-som.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mp-dhcom-som.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-dhcom-som.dtsi index 43f1d45ccc96f0..f5115f9e8c473b 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-dhcom-som.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-dhcom-som.dtsi @@ -254,7 +254,7 @@ <&clk IMX8MP_CLK_CLKOUT2>, <&clk IMX8MP_AUDIO_PLL2_OUT>; assigned-clock-parents = <&clk IMX8MP_AUDIO_PLL2_OUT>; - assigned-clock-rates = <13000000>, <13000000>, <156000000>; + assigned-clock-rates = <13000000>, <13000000>, <208000000>; reset-gpios = <&gpio4 1 GPIO_ACTIVE_HIGH>; status = "disabled"; From bcdea3e81ea51c9e89e3b11aac2612e1b4330bee Mon Sep 17 00:00:00 2001 From: Liu Ying Date: Tue, 14 May 2024 11:07:18 +0800 Subject: [PATCH 0996/1578] arm: dts: imx53-qsb-hdmi: Disable panel instead of deleting node We cannot use /delete-node/ directive to delete a node in a DT overlay. The node won't be deleted effectively. Instead, set the node's status property to "disabled" to achieve something similar. Fixes: eeb403df953f ("ARM: dts: imx53-qsb: add support for the HDMI expander") Signed-off-by: Liu Ying Reviewed-by: Dmitry Baryshkov Signed-off-by: Shawn Guo --- arch/arm/boot/dts/nxp/imx/imx53-qsb-common.dtsi | 2 +- arch/arm/boot/dts/nxp/imx/imx53-qsb-hdmi.dtso | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/arm/boot/dts/nxp/imx/imx53-qsb-common.dtsi b/arch/arm/boot/dts/nxp/imx/imx53-qsb-common.dtsi index d8044044647371..05d7a462ea25a7 100644 --- a/arch/arm/boot/dts/nxp/imx/imx53-qsb-common.dtsi +++ b/arch/arm/boot/dts/nxp/imx/imx53-qsb-common.dtsi @@ -85,7 +85,7 @@ }; }; - panel { + panel_dpi: panel { compatible = "sii,43wvf1g"; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_display_power>; diff --git a/arch/arm/boot/dts/nxp/imx/imx53-qsb-hdmi.dtso b/arch/arm/boot/dts/nxp/imx/imx53-qsb-hdmi.dtso index c84e9b0525276a..151e9cee3c87ee 100644 --- a/arch/arm/boot/dts/nxp/imx/imx53-qsb-hdmi.dtso +++ b/arch/arm/boot/dts/nxp/imx/imx53-qsb-hdmi.dtso @@ -10,8 +10,6 @@ /plugin/; &{/} { - /delete-node/ panel; - hdmi: connector-hdmi { compatible = "hdmi-connector"; label = "hdmi"; @@ -82,6 +80,10 @@ }; }; +&panel_dpi { + status = "disabled"; +}; + &tve { status = "disabled"; }; From a1439d89480754ddbc0a837544129ff5100f4087 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 10 Jun 2024 14:12:45 +0200 Subject: [PATCH 0997/1578] efi/arm: Disable LPAE PAN when calling EFI runtime services EFI runtime services are remapped into the lower 1 GiB of virtual address space at boot, so they are guaranteed to be able to co-exist with the kernel virtual mappings without the need to allocate space for them in the kernel's vmalloc region, which is rather small. This means those mappings are covered by TTBR0 when LPAE PAN is enabled, and so 'user' access must be enabled while such calls are in progress. Reviewed-by: Linus Walleij Signed-off-by: Ard Biesheuvel --- arch/arm/include/asm/efi.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/arm/include/asm/efi.h b/arch/arm/include/asm/efi.h index 78282ced50387d..e408399d5f0e65 100644 --- a/arch/arm/include/asm/efi.h +++ b/arch/arm/include/asm/efi.h @@ -14,6 +14,7 @@ #include #include #include +#include #ifdef CONFIG_EFI void efi_init(void); @@ -25,6 +26,18 @@ int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md, boo #define arch_efi_call_virt_setup() efi_virtmap_load() #define arch_efi_call_virt_teardown() efi_virtmap_unload() +#ifdef CONFIG_CPU_TTBR0_PAN +#undef arch_efi_call_virt +#define arch_efi_call_virt(p, f, args...) ({ \ + unsigned int flags = uaccess_save_and_enable(); \ + efi_status_t res = _Generic((p)->f(args), \ + efi_status_t: (p)->f(args), \ + default: ((p)->f(args), EFI_ABORTED)); \ + uaccess_restore(flags); \ + res; \ +}) +#endif + #define ARCH_EFI_IRQ_FLAGS_MASK \ (PSR_J_BIT | PSR_E_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | \ PSR_T_BIT | MODE_MASK) From 75dde792d6f6c2d0af50278bd374bf0c512fe196 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 10 Jun 2024 16:02:13 +0200 Subject: [PATCH 0998/1578] efi/x86: Free EFI memory map only when installing a new one. The logic in __efi_memmap_init() is shared between two different execution flows: - mapping the EFI memory map early or late into the kernel VA space, so that its entries can be accessed; - the x86 specific cloning of the EFI memory map in order to insert new entries that are created as a result of making a memory reservation via a call to efi_mem_reserve(). In the former case, the underlying memory containing the kernel's view of the EFI memory map (which may be heavily modified by the kernel itself on x86) is not modified at all, and the only thing that changes is the virtual mapping of this memory, which is different between early and late boot. In the latter case, an entirely new allocation is created that carries a new, updated version of the kernel's view of the EFI memory map. When installing this new version, the old version will no longer be referenced, and if the memory was allocated by the kernel, it will leak unless it gets freed. The logic that implements this freeing currently lives on the code path that is shared between these two use cases, but it should only apply to the latter. So move it to the correct spot. While at it, drop the dummy definition for non-x86 architectures, as that is no longer needed. Cc: Fixes: f0ef6523475f ("efi: Fix efi_memmap_alloc() leaks") Tested-by: Ashish Kalra Link: https://lore.kernel.org/all/36ad5079-4326-45ed-85f6-928ff76483d3@amd.com Signed-off-by: Ard Biesheuvel --- arch/x86/include/asm/efi.h | 1 - arch/x86/platform/efi/memmap.c | 12 +++++++++++- drivers/firmware/efi/memmap.c | 9 --------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 1dc600fa3ba536..481096177500e1 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -401,7 +401,6 @@ extern int __init efi_memmap_alloc(unsigned int num_entries, struct efi_memory_map_data *data); extern void __efi_memmap_free(u64 phys, unsigned long size, unsigned long flags); -#define __efi_memmap_free __efi_memmap_free extern int __init efi_memmap_install(struct efi_memory_map_data *data); extern int __init efi_memmap_split_count(efi_memory_desc_t *md, diff --git a/arch/x86/platform/efi/memmap.c b/arch/x86/platform/efi/memmap.c index 4ef20b49eb5e72..6ed1935504b96e 100644 --- a/arch/x86/platform/efi/memmap.c +++ b/arch/x86/platform/efi/memmap.c @@ -92,12 +92,22 @@ int __init efi_memmap_alloc(unsigned int num_entries, */ int __init efi_memmap_install(struct efi_memory_map_data *data) { + unsigned long size = efi.memmap.desc_size * efi.memmap.nr_map; + unsigned long flags = efi.memmap.flags; + u64 phys = efi.memmap.phys_map; + int ret; + efi_memmap_unmap(); if (efi_enabled(EFI_PARAVIRT)) return 0; - return __efi_memmap_init(data); + ret = __efi_memmap_init(data); + if (ret) + return ret; + + __efi_memmap_free(phys, size, flags); + return 0; } /** diff --git a/drivers/firmware/efi/memmap.c b/drivers/firmware/efi/memmap.c index 3365944f796544..34109fd86c55d2 100644 --- a/drivers/firmware/efi/memmap.c +++ b/drivers/firmware/efi/memmap.c @@ -15,10 +15,6 @@ #include #include -#ifndef __efi_memmap_free -#define __efi_memmap_free(phys, size, flags) do { } while (0) -#endif - /** * __efi_memmap_init - Common code for mapping the EFI memory map * @data: EFI memory map data @@ -51,11 +47,6 @@ int __init __efi_memmap_init(struct efi_memory_map_data *data) return -ENOMEM; } - if (efi.memmap.flags & (EFI_MEMMAP_MEMBLOCK | EFI_MEMMAP_SLAB)) - __efi_memmap_free(efi.memmap.phys_map, - efi.memmap.desc_size * efi.memmap.nr_map, - efi.memmap.flags); - map.phys_map = data->phys_map; map.nr_map = data->size / data->desc_size; map.map_end = map.map + data->size; From 46e27b9961d8712bc89234444ede314cec0e8bae Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 13 Jun 2024 12:20:31 -0400 Subject: [PATCH 0999/1578] efi/arm64: Fix kmemleak false positive in arm64_efi_rt_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kmemleak code sometimes complains about the following leak: unreferenced object 0xffff8000102e0000 (size 32768):   comm "swapper/0", pid 1, jiffies 4294937323 (age 71.240s)   hex dump (first 32 bytes):     00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................     00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................   backtrace:     [<00000000db9a88a3>] __vmalloc_node_range+0x324/0x450     [<00000000ff8903a4>] __vmalloc_node+0x90/0xd0     [<000000001a06634f>] arm64_efi_rt_init+0x64/0xdc     [<0000000007826a8d>] do_one_initcall+0x178/0xac0     [<0000000054a87017>] do_initcalls+0x190/0x1d0     [<00000000308092d0>] kernel_init_freeable+0x2c0/0x2f0     [<000000003e7b99e0>] kernel_init+0x28/0x14c     [<000000002246af5b>] ret_from_fork+0x10/0x20 The memory object in this case is for efi_rt_stack_top and is allocated in an initcall. So this is certainly a false positive. Mark the object as not a leak to quash it. Signed-off-by: Waiman Long Signed-off-by: Ard Biesheuvel --- arch/arm64/kernel/efi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 4a92096db34e0b..712718aed5dd9e 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -213,6 +214,7 @@ l: if (!p) { return -ENOMEM; } + kmemleak_not_leak(p); efi_rt_stack_top = p + THREAD_SIZE; return 0; } From e1b4622efbe7ad09c9a902365a993f68c270c453 Mon Sep 17 00:00:00 2001 From: Tim Harvey Date: Wed, 22 May 2024 14:38:28 -0700 Subject: [PATCH 1000/1578] arm64: dts: freescale: imx8mp-venice-gw73xx-2x: fix BT shutdown GPIO Fix the invalid BT shutdown GPIO (gpio1_io3 not gpio4_io16) Fixes: 716ced308234 ("arm64: dts: freescale: Add imx8mp-venice-gw73xx-2x") Signed-off-by: Tim Harvey Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi index dec57fad682855..e2b5e7ac3e465f 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi @@ -219,7 +219,7 @@ bluetooth { compatible = "brcm,bcm4330-bt"; - shutdown-gpios = <&gpio4 16 GPIO_ACTIVE_HIGH>; + shutdown-gpios = <&gpio1 3 GPIO_ACTIVE_HIGH>; }; }; From 68e5afd8f440a2cc18deb53ae151aa74e2a8eced Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 28 May 2024 14:07:31 +0200 Subject: [PATCH 1001/1578] watchdog: lenovo_se10_wdt: add HAS_IOPORT dependency Once the inb()/outb() helpers become conditional, the newly added driver fails to link on targets without CONFIG_HAS_IOPORT: In file included from arch/arm64/include/asm/io.h:299, from include/linux/io.h:14, from drivers/watchdog/lenovo_se10_wdt.c:8: drivers/watchdog/lenovo_se10_wdt.c: In function 'set_bram': include/asm-generic/io.h:596:15: error: call to '_outb' declared with attribute error: outb() requires CONFIG_HAS_IOPORT 596 | #define _outb _outb include/asm-generic/io.h:655:14: note: in expansion of macro '_outb' 655 | #define outb _outb | ^~~~~ drivers/watchdog/lenovo_se10_wdt.c:67:9: note: in expansion of macro 'outb' 67 | outb(offset, bram_base); | ^~~~ Add the same dependency we added to the other such drivers. Fixes: 1f6602c8ed1e ("watchdog: lenovo_se10_wdt: Watchdog driver for Lenovo SE10 platform") Signed-off-by: Arnd Bergmann Reviewed-by Mark Pearson Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20240528120759.3491774-1-arnd@kernel.org Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index 85eea38dbdf4bb..2882944d23ccd5 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -257,6 +257,7 @@ config GPIO_WATCHDOG_ARCH_INITCALL config LENOVO_SE10_WDT tristate "Lenovo SE10 Watchdog" depends on (X86 && DMI) || COMPILE_TEST + depends on HAS_IOPORT select WATCHDOG_CORE help If you say yes here you get support for the watchdog From acf9e67a7625367b89440855572b29c5ec19dd20 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Fri, 7 Jun 2024 11:04:40 -0700 Subject: [PATCH 1002/1578] watchdog: add missing MODULE_DESCRIPTION() macros make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/watchdog/omap_wdt.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/watchdog/twl4030_wdt.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/watchdog/ts4800_wdt.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/watchdog/simatic-ipc-wdt.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/watchdog/menz69_wdt.o Add the missing invocations of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20240607-md-drivers-watchdog-v1-1-485c1c58301f@quicinc.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/menz69_wdt.c | 1 + drivers/watchdog/omap_wdt.c | 1 + drivers/watchdog/simatic-ipc-wdt.c | 1 + drivers/watchdog/ts4800_wdt.c | 1 + drivers/watchdog/twl4030_wdt.c | 1 + 5 files changed, 5 insertions(+) diff --git a/drivers/watchdog/menz69_wdt.c b/drivers/watchdog/menz69_wdt.c index c7de30270043d8..0508a65acfa656 100644 --- a/drivers/watchdog/menz69_wdt.c +++ b/drivers/watchdog/menz69_wdt.c @@ -161,6 +161,7 @@ static struct mcb_driver men_z069_driver = { module_mcb_driver(men_z069_driver); MODULE_AUTHOR("Johannes Thumshirn "); +MODULE_DESCRIPTION("Watchdog driver for the MEN z069 IP-Core"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS("mcb:16z069"); MODULE_IMPORT_NS(MCB); diff --git a/drivers/watchdog/omap_wdt.c b/drivers/watchdog/omap_wdt.c index a7a12f2fe9de98..b6e0236509bbd6 100644 --- a/drivers/watchdog/omap_wdt.c +++ b/drivers/watchdog/omap_wdt.c @@ -370,5 +370,6 @@ static struct platform_driver omap_wdt_driver = { module_platform_driver(omap_wdt_driver); MODULE_AUTHOR("George G. Davis"); +MODULE_DESCRIPTION("Driver for the TI OMAP 16xx/24xx/34xx 32KHz (non-secure) watchdog"); MODULE_LICENSE("GPL"); MODULE_ALIAS("platform:omap_wdt"); diff --git a/drivers/watchdog/simatic-ipc-wdt.c b/drivers/watchdog/simatic-ipc-wdt.c index cdc1a2e1518051..1e91f0a560ffce 100644 --- a/drivers/watchdog/simatic-ipc-wdt.c +++ b/drivers/watchdog/simatic-ipc-wdt.c @@ -227,6 +227,7 @@ static struct platform_driver simatic_ipc_wdt_driver = { module_platform_driver(simatic_ipc_wdt_driver); +MODULE_DESCRIPTION("Siemens SIMATIC IPC driver for Watchdogs"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS("platform:" KBUILD_MODNAME); MODULE_AUTHOR("Gerd Haeussler "); diff --git a/drivers/watchdog/ts4800_wdt.c b/drivers/watchdog/ts4800_wdt.c index 0099403f49922f..24b1ad52102e81 100644 --- a/drivers/watchdog/ts4800_wdt.c +++ b/drivers/watchdog/ts4800_wdt.c @@ -200,5 +200,6 @@ static struct platform_driver ts4800_wdt_driver = { module_platform_driver(ts4800_wdt_driver); MODULE_AUTHOR("Damien Riegel "); +MODULE_DESCRIPTION("Watchdog driver for TS-4800 based boards"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS("platform:ts4800_wdt"); diff --git a/drivers/watchdog/twl4030_wdt.c b/drivers/watchdog/twl4030_wdt.c index 09d17e20f4a76a..8c80d04811e4ef 100644 --- a/drivers/watchdog/twl4030_wdt.c +++ b/drivers/watchdog/twl4030_wdt.c @@ -118,6 +118,7 @@ static struct platform_driver twl4030_wdt_driver = { module_platform_driver(twl4030_wdt_driver); MODULE_AUTHOR("Nokia Corporation"); +MODULE_DESCRIPTION("TWL4030 Watchdog"); MODULE_LICENSE("GPL"); MODULE_ALIAS("platform:twl4030_wdt"); From 58f7e1e2c9e72c7974054c64c3abeac81c11f822 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Thu, 30 May 2024 19:06:29 +0800 Subject: [PATCH 1003/1578] ocfs2: fix NULL pointer dereference in ocfs2_journal_dirty() bdev->bd_super has been removed and commit 8887b94d9322 change the usage from bdev->bd_super to b_assoc_map->host->i_sb. This introduces the following NULL pointer dereference in ocfs2_journal_dirty() since b_assoc_map is still not initialized. This can be easily reproduced by running xfstests generic/186, which simulate no more credits. [ 134.351592] BUG: kernel NULL pointer dereference, address: 0000000000000000 ... [ 134.355341] RIP: 0010:ocfs2_journal_dirty+0x14f/0x160 [ocfs2] ... [ 134.365071] Call Trace: [ 134.365312] [ 134.365524] ? __die_body+0x1e/0x60 [ 134.365868] ? page_fault_oops+0x13d/0x4f0 [ 134.366265] ? __pfx_bit_wait_io+0x10/0x10 [ 134.366659] ? schedule+0x27/0xb0 [ 134.366981] ? exc_page_fault+0x6a/0x140 [ 134.367356] ? asm_exc_page_fault+0x26/0x30 [ 134.367762] ? ocfs2_journal_dirty+0x14f/0x160 [ocfs2] [ 134.368305] ? ocfs2_journal_dirty+0x13d/0x160 [ocfs2] [ 134.368837] ocfs2_create_new_meta_bhs.isra.51+0x139/0x2e0 [ocfs2] [ 134.369454] ocfs2_grow_tree+0x688/0x8a0 [ocfs2] [ 134.369927] ocfs2_split_and_insert.isra.67+0x35c/0x4a0 [ocfs2] [ 134.370521] ocfs2_split_extent+0x314/0x4d0 [ocfs2] [ 134.371019] ocfs2_change_extent_flag+0x174/0x410 [ocfs2] [ 134.371566] ocfs2_add_refcount_flag+0x3fa/0x630 [ocfs2] [ 134.372117] ocfs2_reflink_remap_extent+0x21b/0x4c0 [ocfs2] [ 134.372994] ? inode_update_timestamps+0x4a/0x120 [ 134.373692] ? __pfx_ocfs2_journal_access_di+0x10/0x10 [ocfs2] [ 134.374545] ? __pfx_ocfs2_journal_access_di+0x10/0x10 [ocfs2] [ 134.375393] ocfs2_reflink_remap_blocks+0xe4/0x4e0 [ocfs2] [ 134.376197] ocfs2_remap_file_range+0x1de/0x390 [ocfs2] [ 134.376971] ? security_file_permission+0x29/0x50 [ 134.377644] vfs_clone_file_range+0xfe/0x320 [ 134.378268] ioctl_file_clone+0x45/0xa0 [ 134.378853] do_vfs_ioctl+0x457/0x990 [ 134.379422] __x64_sys_ioctl+0x6e/0xd0 [ 134.379987] do_syscall_64+0x5d/0x170 [ 134.380550] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 134.381231] RIP: 0033:0x7fa4926397cb [ 134.381786] Code: 73 01 c3 48 8b 0d bd 56 38 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 8d 56 38 00 f7 d8 64 89 01 48 [ 134.383930] RSP: 002b:00007ffc2b39f7b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [ 134.384854] RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007fa4926397cb [ 134.385734] RDX: 00007ffc2b39f7f0 RSI: 000000004020940d RDI: 0000000000000003 [ 134.386606] RBP: 0000000000000000 R08: 00111a82a4f015bb R09: 00007fa494221000 [ 134.387476] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 [ 134.388342] R13: 0000000000f10000 R14: 0000558e844e2ac8 R15: 0000000000f10000 [ 134.389207] Fix it by only aborting transaction and journal in ocfs2_journal_dirty() now, and leave ocfs2_abort() later when detecting an aborted handle, e.g. start next transaction. Also log the handle details in this case. Link: https://lkml.kernel.org/r/20240530110630.3933832-1-joseph.qi@linux.alibaba.com Fixes: 8887b94d9322 ("ocfs2: stop using bdev->bd_super for journal error logging") Signed-off-by: Joseph Qi Reviewed-by: Heming Zhao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Cc: [6.6+] Signed-off-by: Andrew Morton --- fs/ocfs2/journal.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 604fea3a26ff04..27c7683c7d3fa0 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -778,13 +778,15 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh) if (!is_handle_aborted(handle)) { journal_t *journal = handle->h_transaction->t_journal; - mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed. " - "Aborting transaction and journal.\n"); + mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed: " + "handle type %u started at line %u, credits %u/%u " + "errcode %d. Aborting transaction and journal.\n", + handle->h_type, handle->h_line_no, + handle->h_requested_credits, + jbd2_handle_buffer_credits(handle), status); handle->h_err = status; jbd2_journal_abort_handle(handle); jbd2_journal_abort(journal, status); - ocfs2_abort(bh->b_assoc_map->host->i_sb, - "Journal already aborted.\n"); } } } From 685d03c3795378fca6a1b3d43581f7f1a3fc095f Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Thu, 30 May 2024 19:06:30 +0800 Subject: [PATCH 1004/1578] ocfs2: fix NULL pointer dereference in ocfs2_abort_trigger() bdev->bd_super has been removed and commit 8887b94d9322 change the usage from bdev->bd_super to b_assoc_map->host->i_sb. Since ocfs2 hasn't set bh->b_assoc_map, it will trigger NULL pointer dereference when calling into ocfs2_abort_trigger(). Actually this was pointed out in history, see commit 74e364ad1b13. But I've made a mistake when reviewing commit 8887b94d9322 and then re-introduce this regression. Since we cannot revive bdev in buffer head, so fix this issue by initializing all types of ocfs2 triggers when fill super, and then get the specific ocfs2 trigger from ocfs2_caching_info when access journal. [joseph.qi@linux.alibaba.com: v2] Link: https://lkml.kernel.org/r/20240602112045.1112708-1-joseph.qi@linux.alibaba.com Link: https://lkml.kernel.org/r/20240530110630.3933832-2-joseph.qi@linux.alibaba.com Fixes: 8887b94d9322 ("ocfs2: stop using bdev->bd_super for journal error logging") Signed-off-by: Joseph Qi Reviewed-by: Heming Zhao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Cc: [6.6+] Signed-off-by: Andrew Morton --- fs/ocfs2/journal.c | 182 +++++++++++++++++++++++++-------------------- fs/ocfs2/ocfs2.h | 27 +++++++ fs/ocfs2/super.c | 4 +- 3 files changed, 131 insertions(+), 82 deletions(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 27c7683c7d3fa0..86807086b2dfd4 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -479,12 +479,6 @@ int ocfs2_allocate_extend_trans(handle_t *handle, int thresh) return status; } - -struct ocfs2_triggers { - struct jbd2_buffer_trigger_type ot_triggers; - int ot_offset; -}; - static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger_type *triggers) { return container_of(triggers, struct ocfs2_triggers, ot_triggers); @@ -548,85 +542,76 @@ static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh) { + struct ocfs2_triggers *ot = to_ocfs2_trigger(triggers); + mlog(ML_ERROR, "ocfs2_abort_trigger called by JBD2. bh = 0x%lx, " "bh->b_blocknr = %llu\n", (unsigned long)bh, (unsigned long long)bh->b_blocknr); - ocfs2_error(bh->b_assoc_map->host->i_sb, + ocfs2_error(ot->sb, "JBD2 has aborted our journal, ocfs2 cannot continue\n"); } -static struct ocfs2_triggers di_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_dinode, i_check), -}; - -static struct ocfs2_triggers eb_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_extent_block, h_check), -}; - -static struct ocfs2_triggers rb_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check), -}; - -static struct ocfs2_triggers gd_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_group_desc, bg_check), -}; - -static struct ocfs2_triggers db_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_db_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, -}; +static void ocfs2_setup_csum_triggers(struct super_block *sb, + enum ocfs2_journal_trigger_type type, + struct ocfs2_triggers *ot) +{ + BUG_ON(type >= OCFS2_JOURNAL_TRIGGER_COUNT); -static struct ocfs2_triggers xb_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check), -}; + switch (type) { + case OCFS2_JTR_DI: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_dinode, i_check); + break; + case OCFS2_JTR_EB: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_extent_block, h_check); + break; + case OCFS2_JTR_RB: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_refcount_block, rf_check); + break; + case OCFS2_JTR_GD: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_group_desc, bg_check); + break; + case OCFS2_JTR_DB: + ot->ot_triggers.t_frozen = ocfs2_db_frozen_trigger; + break; + case OCFS2_JTR_XB: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_xattr_block, xb_check); + break; + case OCFS2_JTR_DQ: + ot->ot_triggers.t_frozen = ocfs2_dq_frozen_trigger; + break; + case OCFS2_JTR_DR: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check); + break; + case OCFS2_JTR_DL: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check); + break; + case OCFS2_JTR_NONE: + /* To make compiler happy... */ + return; + } -static struct ocfs2_triggers dq_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_dq_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, -}; + ot->ot_triggers.t_abort = ocfs2_abort_trigger; + ot->sb = sb; +} -static struct ocfs2_triggers dr_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check), -}; +void ocfs2_initialize_journal_triggers(struct super_block *sb, + struct ocfs2_triggers triggers[]) +{ + enum ocfs2_journal_trigger_type type; -static struct ocfs2_triggers dl_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check), -}; + for (type = OCFS2_JTR_DI; type < OCFS2_JOURNAL_TRIGGER_COUNT; type++) + ocfs2_setup_csum_triggers(sb, type, &triggers[type]); +} static int __ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci, @@ -708,56 +693,91 @@ static int __ocfs2_journal_access(handle_t *handle, int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &di_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_DI], + type); } int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &eb_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_EB], + type); } int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &rb_triggers, + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_RB], type); } int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &gd_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_GD], + type); } int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &db_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_DB], + type); } int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &xb_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_XB], + type); } int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &dq_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_DQ], + type); } int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &dr_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_DR], + type); } int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &dl_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_DL], + type); } int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci, diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index a503c553bab21b..8fe826143d7bf4 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -284,6 +284,30 @@ enum ocfs2_mount_options #define OCFS2_OSB_ERROR_FS 0x0004 #define OCFS2_DEFAULT_ATIME_QUANTUM 60 +struct ocfs2_triggers { + struct jbd2_buffer_trigger_type ot_triggers; + int ot_offset; + struct super_block *sb; +}; + +enum ocfs2_journal_trigger_type { + OCFS2_JTR_DI, + OCFS2_JTR_EB, + OCFS2_JTR_RB, + OCFS2_JTR_GD, + OCFS2_JTR_DB, + OCFS2_JTR_XB, + OCFS2_JTR_DQ, + OCFS2_JTR_DR, + OCFS2_JTR_DL, + OCFS2_JTR_NONE /* This must be the last entry */ +}; + +#define OCFS2_JOURNAL_TRIGGER_COUNT OCFS2_JTR_NONE + +void ocfs2_initialize_journal_triggers(struct super_block *sb, + struct ocfs2_triggers triggers[]); + struct ocfs2_journal; struct ocfs2_slot_info; struct ocfs2_recovery_map; @@ -351,6 +375,9 @@ struct ocfs2_super struct ocfs2_journal *journal; unsigned long osb_commit_interval; + /* Journal triggers for checksum */ + struct ocfs2_triggers s_journal_triggers[OCFS2_JOURNAL_TRIGGER_COUNT]; + struct delayed_work la_enable_wq; /* diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 8aabaed2c1cb94..afee70125ae3bf 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1075,9 +1075,11 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) debugfs_create_file("fs_state", S_IFREG|S_IRUSR, osb->osb_debug_root, osb, &ocfs2_osb_debug_fops); - if (ocfs2_meta_ecc(osb)) + if (ocfs2_meta_ecc(osb)) { + ocfs2_initialize_journal_triggers(sb, osb->s_journal_triggers); ocfs2_blockcheck_stats_debugfs_install( &osb->osb_ecc_stats, osb->osb_debug_root); + } status = ocfs2_mount_volume(sb); if (status < 0) From 8e5bd4eadd01ea0c47f3a1c798815849f813a700 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Wed, 22 May 2024 15:58:30 -0700 Subject: [PATCH 1005/1578] gcc: disable '-Warray-bounds' for gcc-9 '-Warray-bounds' is already disabled for gcc-10+. Now that we've merged bitmap_{read,write), I see the following error when building the kernel with gcc-9.4 (Ubuntu 20.04.4 LTS) for x86_64 allmodconfig: drivers/pinctrl/pinctrl-cy8c95x0.c: In function `cy8c95x0_read_regs_mask.isra.0': include/linux/bitmap.h:756:18: error: array subscript [1, 288230376151711744] is outside array bounds of `long unsigned int[1]' [-Werror=array-bounds] 756 | value_high = map[index + 1] & BITMAP_LAST_WORD_MASK(start + nbits); | ~~~^~~~~~~~~~~ The immediate reason is that the commit b44759705f7d ("bitmap: make bitmap_{get,set}_value8() use bitmap_{read,write}()") switched the bitmap_get_value8() to an alias of bitmap_read(); the same for 'set'. Now; the code that triggers Warray-bounds, calls the function like this: #define MAX_BANK 8 #define BANK_SZ 8 #define MAX_LINE (MAX_BANK * BANK_SZ) DECLARE_BITMAP(tval, MAX_LINE); // 64-bit map: unsigned long tval[1] read_val |= bitmap_get_value8(tval, i * BANK_SZ) & ~bits; bitmap_read() is implemented such that it may conditionally dereference a pointer beyond the boundary like this: unsigned long offset = start % BITS_PER_LONG; unsigned long space = BITS_PER_LONG - offset; if (space >= nbits) return (map[index] >> offset) & BITMAP_LAST_WORD_MASK(nbits); value_low = map[index] & BITMAP_FIRST_WORD_MASK(start); value_high = map[index + 1] & BITMAP_LAST_WORD_MASK(start + nbits); return (value_low >> offset) | (value_high << space); In case of bitmap_get_value8(), it's impossible to violate the boundary because 'space >= nbits' is never the true for byte-aligned 8-bit access. So, this is clearly a false-positive. The same type of false-positives break my allmodconfig build in many places. gcc-8, is clear, however. Link: https://lkml.kernel.org/r/20240522225830.1201778-1-yury.norov@gmail.com Fixes: b44759705f7d ("bitmap: make bitmap_{get,set}_value8() use bitmap_{read,write}()") Signed-off-by: Yury Norov Cc: Alexander Lobakin Cc: David S. Miller Cc: Gustavo A. R. Silva Cc: Masahiro Yamada Cc: Nhat Pham Cc: Petr Mladek Cc: Randy Dunlap Cc: Vincent Guittot Cc: Yoann Congal Cc: Arnd Bergmann Signed-off-by: Andrew Morton --- init/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index 72404c1f21577a..febdea2afc3be3 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -883,7 +883,7 @@ config GCC10_NO_ARRAY_BOUNDS config CC_NO_ARRAY_BOUNDS bool - default y if CC_IS_GCC && GCC_VERSION >= 100000 && GCC10_NO_ARRAY_BOUNDS + default y if CC_IS_GCC && GCC_VERSION >= 90000 && GCC10_NO_ARRAY_BOUNDS # Currently, disable -Wstringop-overflow for GCC globally. config GCC_NO_STRINGOP_OVERFLOW From 8bb592c2eca8fd2bc06db7d80b38da18da4a2f43 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 5 Jun 2024 17:21:46 -0400 Subject: [PATCH 1006/1578] mm/page_table_check: fix crash on ZONE_DEVICE Not all pages may apply to pgtable check. One example is ZONE_DEVICE pages: they map PFNs directly, and they don't allocate page_ext at all even if there's struct page around. One may reference devm_memremap_pages(). When both ZONE_DEVICE and page-table-check enabled, then try to map some dax memories, one can trigger kernel bug constantly now when the kernel was trying to inject some pfn maps on the dax device: kernel BUG at mm/page_table_check.c:55! While it's pretty legal to use set_pxx_at() for ZONE_DEVICE pages for page fault resolutions, skip all the checks if page_ext doesn't even exist in pgtable checker, which applies to ZONE_DEVICE but maybe more. Link: https://lkml.kernel.org/r/20240605212146.994486-1-peterx@redhat.com Fixes: df4e817b7108 ("mm: page table check") Signed-off-by: Peter Xu Reviewed-by: Pasha Tatashin Reviewed-by: Dan Williams Reviewed-by: Alistair Popple Cc: Signed-off-by: Andrew Morton --- mm/page_table_check.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 4169576bed7298..509c6ef8de400e 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -73,6 +73,9 @@ static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt) page = pfn_to_page(pfn); page_ext = page_ext_get(page); + if (!page_ext) + return; + BUG_ON(PageSlab(page)); anon = PageAnon(page); @@ -110,6 +113,9 @@ static void page_table_check_set(unsigned long pfn, unsigned long pgcnt, page = pfn_to_page(pfn); page_ext = page_ext_get(page); + if (!page_ext) + return; + BUG_ON(PageSlab(page)); anon = PageAnon(page); @@ -140,7 +146,10 @@ void __page_table_check_zero(struct page *page, unsigned int order) BUG_ON(PageSlab(page)); page_ext = page_ext_get(page); - BUG_ON(!page_ext); + + if (!page_ext) + return; + for (i = 0; i < (1ul << order); i++) { struct page_table_check *ptc = get_page_table_check(page_ext); From 384a746bb55960aa5ffb3a67de08f11fc2f51042 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 5 Jun 2024 11:17:10 +0200 Subject: [PATCH 1007/1578] Revert "mm: init_mlocked_on_free_v3" There was insufficient review and no agreement that this is the right approach. There are serious flaws with the implementation that make processes using mlock() not even work with simple fork() [1] and we get reliable crashes when rebooting. Further, simply because we might be unmapping a single PTE of a large mlocked folio, we shouldn't zero out the whole folio. ... especially because the code can also *corrupt* urelated memory because kernel_init_pages(page, folio_nr_pages(folio)); Could end up writing outside of the actual folio if we work with a tail page. Let's revert it. Once there is agreement that this is the right approach, the issues were fixed and there was reasonable review and proper testing, we can consider it again. [1] https://lkml.kernel.org/r/4da9da2f-73e4-45fd-b62f-a8a513314057@redhat.com Link: https://lkml.kernel.org/r/20240605091710.38961-1-david@redhat.com Fixes: ba42b524a040 ("mm: init_mlocked_on_free_v3") Signed-off-by: David Hildenbrand Reported-by: David Wang <00107082@163.com> Closes: https://lore.kernel.org/lkml/20240528151340.4282-1-00107082@163.com/ Reported-by: Lance Yang Closes: https://lkml.kernel.org/r/20240601140917.43562-1-ioworker0@gmail.com Acked-by: Lance Yang Cc: York Jasper Niebuhr Cc: Matthew Wilcox (Oracle) Cc: Kees Cook Signed-off-by: Andrew Morton --- .../admin-guide/kernel-parameters.txt | 6 --- include/linux/mm.h | 9 +--- mm/internal.h | 1 - mm/memory.c | 6 --- mm/mm_init.c | 43 +++---------------- mm/page_alloc.c | 2 +- security/Kconfig.hardening | 15 ------- 7 files changed, 9 insertions(+), 73 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index b600df82669db0..11e57ba2985ccd 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2192,12 +2192,6 @@ Format: 0 | 1 Default set by CONFIG_INIT_ON_FREE_DEFAULT_ON. - init_mlocked_on_free= [MM] Fill freed userspace memory with zeroes if - it was mlock'ed and not explicitly munlock'ed - afterwards. - Format: 0 | 1 - Default set by CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON - init_pkru= [X86] Specify the default memory protection keys rights register contents for all processes. 0x55555554 by default (disallow access to all but pkey 0). Can diff --git a/include/linux/mm.h b/include/linux/mm.h index 9849dfda44d43c..9a5652c5fadd51 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3776,14 +3776,7 @@ DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); static inline bool want_init_on_free(void) { return static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON, - &init_on_free); -} - -DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON, init_mlocked_on_free); -static inline bool want_init_mlocked_on_free(void) -{ - return static_branch_maybe(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON, - &init_mlocked_on_free); + &init_on_free); } extern bool _debug_pagealloc_enabled_early; diff --git a/mm/internal.h b/mm/internal.h index b2c75b12014e7f..c72c306761a48a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -588,7 +588,6 @@ extern void __putback_isolated_page(struct page *page, unsigned int order, extern void memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order); extern void __free_pages_core(struct page *page, unsigned int order); -extern void kernel_init_pages(struct page *page, int numpages); /* * This will have no effect, other than possibly generating a warning, if the diff --git a/mm/memory.c b/mm/memory.c index 0f47a533014e43..2bc8032a30a2ff 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1507,12 +1507,6 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, if (unlikely(folio_mapcount(folio) < 0)) print_bad_pte(vma, addr, ptent, page); } - - if (want_init_mlocked_on_free() && folio_test_mlocked(folio) && - !delay_rmap && folio_test_anon(folio)) { - kernel_init_pages(page, folio_nr_pages(folio)); - } - if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) { *force_flush = true; *force_break = true; diff --git a/mm/mm_init.c b/mm/mm_init.c index f72b852bd5b8e3..3ec04933f7fd8a 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2523,9 +2523,6 @@ EXPORT_SYMBOL(init_on_alloc); DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); EXPORT_SYMBOL(init_on_free); -DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON, init_mlocked_on_free); -EXPORT_SYMBOL(init_mlocked_on_free); - static bool _init_on_alloc_enabled_early __read_mostly = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON); static int __init early_init_on_alloc(char *buf) @@ -2543,14 +2540,6 @@ static int __init early_init_on_free(char *buf) } early_param("init_on_free", early_init_on_free); -static bool _init_mlocked_on_free_enabled_early __read_mostly - = IS_ENABLED(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON); -static int __init early_init_mlocked_on_free(char *buf) -{ - return kstrtobool(buf, &_init_mlocked_on_free_enabled_early); -} -early_param("init_mlocked_on_free", early_init_mlocked_on_free); - DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled); /* @@ -2578,21 +2567,12 @@ static void __init mem_debugging_and_hardening_init(void) } #endif - if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early || - _init_mlocked_on_free_enabled_early) && + if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) && page_poisoning_requested) { pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " - "will take precedence over init_on_alloc, init_on_free " - "and init_mlocked_on_free\n"); + "will take precedence over init_on_alloc and init_on_free\n"); _init_on_alloc_enabled_early = false; _init_on_free_enabled_early = false; - _init_mlocked_on_free_enabled_early = false; - } - - if (_init_mlocked_on_free_enabled_early && _init_on_free_enabled_early) { - pr_info("mem auto-init: init_on_free is on, " - "will take precedence over init_mlocked_on_free\n"); - _init_mlocked_on_free_enabled_early = false; } if (_init_on_alloc_enabled_early) { @@ -2609,17 +2589,9 @@ static void __init mem_debugging_and_hardening_init(void) static_branch_disable(&init_on_free); } - if (_init_mlocked_on_free_enabled_early) { - want_check_pages = true; - static_branch_enable(&init_mlocked_on_free); - } else { - static_branch_disable(&init_mlocked_on_free); - } - - if (IS_ENABLED(CONFIG_KMSAN) && (_init_on_alloc_enabled_early || - _init_on_free_enabled_early || _init_mlocked_on_free_enabled_early)) - pr_info("mem auto-init: please make sure init_on_alloc, init_on_free and " - "init_mlocked_on_free are disabled when running KMSAN\n"); + if (IS_ENABLED(CONFIG_KMSAN) && + (_init_on_alloc_enabled_early || _init_on_free_enabled_early)) + pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n"); #ifdef CONFIG_DEBUG_PAGEALLOC if (debug_pagealloc_enabled()) { @@ -2658,10 +2630,9 @@ static void __init report_meminit(void) else stack = "off"; - pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s, mlocked free:%s\n", + pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s\n", stack, want_init_on_alloc(GFP_KERNEL) ? "on" : "off", - want_init_on_free() ? "on" : "off", - want_init_mlocked_on_free() ? "on" : "off"); + want_init_on_free() ? "on" : "off"); if (want_init_on_free()) pr_info("mem auto-init: clearing system memory may take some time...\n"); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 222299b5c0e6af..7300aa9f14b0b1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1016,7 +1016,7 @@ static inline bool should_skip_kasan_poison(struct page *page) return page_kasan_tag(page) == KASAN_TAG_KERNEL; } -void kernel_init_pages(struct page *page, int numpages) +static void kernel_init_pages(struct page *page, int numpages) { int i; diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening index effbf5982be10f..2cff851ebfd7e1 100644 --- a/security/Kconfig.hardening +++ b/security/Kconfig.hardening @@ -255,21 +255,6 @@ config INIT_ON_FREE_DEFAULT_ON touching "cold" memory areas. Most cases see 3-5% impact. Some synthetic workloads have measured as high as 8%. -config INIT_MLOCKED_ON_FREE_DEFAULT_ON - bool "Enable mlocked memory zeroing on free" - depends on !KMSAN - help - This config has the effect of setting "init_mlocked_on_free=1" - on the kernel command line. If it is enabled, all mlocked process - memory is zeroed when freed. This restriction to mlocked memory - improves performance over "init_on_free" but can still be used to - protect confidential data like key material from content exposures - to other processes, as well as live forensics and cold boot attacks. - Any non-mlocked memory is not cleared before it is reassigned. This - configuration can be overwritten by setting "init_mlocked_on_free=0" - on the command line. The "init_on_free" boot option takes - precedence over "init_mlocked_on_free". - config CC_HAS_ZERO_CALL_USED_REGS def_bool $(cc-option,-fzero-call-used-regs=used-gpr) # https://github.com/ClangBuiltLinux/linux/issues/1766 From 3ab85f4046c12fb773084c2974cd7bbe8a3e2e68 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sun, 2 Jun 2024 21:55:10 +0100 Subject: [PATCH 1008/1578] MAINTAINERS: remove Lorenzo as vmalloc reviewer I haven't had the bandwidth to review vmalloc patches recently and I suspect I won't be able to do so consistently moving forwards, so I think it's best if I remove myself as reviewer for the time being. Link: https://lkml.kernel.org/r/20240602205510.108807-1-lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Cc: Baoquan He Cc: Christoph Hellwig Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index aacccb376c28a1..3620c4e6b46908 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23974,7 +23974,6 @@ VMALLOC M: Andrew Morton R: Uladzislau Rezki R: Christoph Hellwig -R: Lorenzo Stoakes L: linux-mm@kvack.org S: Maintained W: http://www.linux-mm.org From c944bf60c16a65ae812a59fd1b66f6c9e18c91c9 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Sat, 1 Jun 2024 16:38:31 -0700 Subject: [PATCH 1009/1578] lib/alloc_tag: do not register sysctl interface when CONFIG_SYSCTL=n Memory allocation profiling is trying to register sysctl interface even when CONFIG_SYSCTL=n, resulting in proc_do_static_key() being undefined. Prevent that by skipping sysctl registration for such configurations. Link: https://lkml.kernel.org/r/20240601233831.617124-1-surenb@google.com Fixes: 22d407b164ff ("lib: add allocation tagging support for memory allocation profiling") Signed-off-by: Suren Baghdasaryan Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202405280616.wcOGWJEj-lkp@intel.com/ Acked-by: Vlastimil Babka Cc: Kent Overstreet Cc: Kees Cook Cc: Pasha Tatashin Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/alloc_tag.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 11ed973ac359df..c347b8b72d78dc 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -227,6 +227,7 @@ struct page_ext_operations page_alloc_tagging_ops = { }; EXPORT_SYMBOL(page_alloc_tagging_ops); +#ifdef CONFIG_SYSCTL static struct ctl_table memory_allocation_profiling_sysctls[] = { { .procname = "mem_profiling", @@ -241,6 +242,17 @@ static struct ctl_table memory_allocation_profiling_sysctls[] = { { } }; +static void __init sysctl_init(void) +{ + if (!mem_profiling_support) + memory_allocation_profiling_sysctls[0].mode = 0444; + + register_sysctl_init("vm", memory_allocation_profiling_sysctls); +} +#else /* CONFIG_SYSCTL */ +static inline void sysctl_init(void) {} +#endif /* CONFIG_SYSCTL */ + static int __init alloc_tag_init(void) { const struct codetag_type_desc desc = { @@ -253,9 +265,7 @@ static int __init alloc_tag_init(void) if (IS_ERR(alloc_tag_cttype)) return PTR_ERR(alloc_tag_cttype); - if (!mem_profiling_support) - memory_allocation_profiling_sysctls[0].mode = 0444; - register_sysctl_init("vm", memory_allocation_profiling_sysctls); + sysctl_init(); procfs_init(); return 0; From a273559e9eb68cb58c57803d76a1622b8324a878 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Sat, 1 Jun 2024 16:38:40 -0700 Subject: [PATCH 1010/1578] lib/alloc_tag: fix RCU imbalance in pgalloc_tag_get() put_page_tag_ref() should be called only when get_page_tag_ref() returns a valid reference because only in that case get_page_tag_ref() enters RCU read section while put_page_tag_ref() will call rcu_read_unlock() even if the provided reference is NULL. Fix pgalloc_tag_get() which does not follow this rule causing RCU imbalance. Add a warning in put_page_tag_ref() to catch any future mistakes. Link: https://lkml.kernel.org/r/20240601233840.617458-1-surenb@google.com Fixes: cc92eba1c88b ("mm: fix non-compound multi-order memory accounting in __free_pages") Signed-off-by: Suren Baghdasaryan Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202405271029.6d2f9c4c-lkp@intel.com Acked-by: Vlastimil Babka Cc: Kent Overstreet Cc: Kees Cook Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- include/linux/pgalloc_tag.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 86ba5d33e43bdf..9cacadbd61f8c1 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -37,6 +37,9 @@ static inline union codetag_ref *get_page_tag_ref(struct page *page) static inline void put_page_tag_ref(union codetag_ref *ref) { + if (WARN_ON(!ref)) + return; + page_ext_put(page_ext_from_codetag_ref(ref)); } @@ -102,9 +105,11 @@ static inline struct alloc_tag *pgalloc_tag_get(struct page *page) union codetag_ref *ref = get_page_tag_ref(page); alloc_tag_sub_check(ref); - if (ref && ref->ct) - tag = ct_to_alloc_tag(ref->ct); - put_page_tag_ref(ref); + if (ref) { + if (ref->ct) + tag = ct_to_alloc_tag(ref->ct); + put_page_tag_ref(ref); + } } return tag; From 6a50c9b512f7734bc356f4bd47885a6f7c98491a Mon Sep 17 00:00:00 2001 From: Ran Xiaokai Date: Fri, 7 Jun 2024 17:40:48 +0800 Subject: [PATCH 1011/1578] mm: huge_memory: fix misused mapping_large_folio_support() for anon folios When I did a large folios split test, a WARNING "[ 5059.122759][ T166] Cannot split file folio to non-0 order" was triggered. But the test cases are only for anonmous folios. while mapping_large_folio_support() is only reasonable for page cache folios. In split_huge_page_to_list_to_order(), the folio passed to mapping_large_folio_support() maybe anonmous folio. The folio_test_anon() check is missing. So the split of the anonmous THP is failed. This is also the same for shmem_mapping(). We'd better add a check for both. But the shmem_mapping() in __split_huge_page() is not involved, as for anonmous folios, the end parameter is set to -1, so (head[i].index >= end) is always false. shmem_mapping() is not called. Also add a VM_WARN_ON_ONCE() in mapping_large_folio_support() for anon mapping, So we can detect the wrong use more easily. THP folios maybe exist in the pagecache even the file system doesn't support large folio, it is because when CONFIG_TRANSPARENT_HUGEPAGE is enabled, khugepaged will try to collapse read-only file-backed pages to THP. But the mapping does not actually support multi order large folios properly. Using /sys/kernel/debug/split_huge_pages to verify this, with this patch, large anon THP is successfully split and the warning is ceased. Link: https://lkml.kernel.org/r/202406071740485174hcFl7jRxncsHDtI-Pz-o@zte.com.cn Fixes: c010d47f107f ("mm: thp: split huge page to any lower order pages") Reviewed-by: Barry Song Reviewed-by: Zi Yan Acked-by: David Hildenbrand Signed-off-by: Ran Xiaokai Cc: Michal Hocko Cc: xu xin Cc: Yang Yang Cc: Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 4 ++++ mm/huge_memory.c | 28 +++++++++++++++++----------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index ee633712bba0b9..59f1df0cde5a0c 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -381,6 +381,10 @@ static inline void mapping_set_large_folios(struct address_space *mapping) */ static inline bool mapping_large_folio_support(struct address_space *mapping) { + /* AS_LARGE_FOLIO_SUPPORT is only reasonable for pagecache folios */ + VM_WARN_ONCE((unsigned long)mapping & PAGE_MAPPING_ANON, + "Anonymous mapping always supports large folio"); + return IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 89932fd0f62e86..db7946a0a28c4b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3009,30 +3009,36 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, if (new_order >= folio_order(folio)) return -EINVAL; - /* Cannot split anonymous THP to order-1 */ - if (new_order == 1 && folio_test_anon(folio)) { - VM_WARN_ONCE(1, "Cannot split to order-1 folio"); - return -EINVAL; - } - - if (new_order) { - /* Only swapping a whole PMD-mapped folio is supported */ - if (folio_test_swapcache(folio)) + if (folio_test_anon(folio)) { + /* order-1 is not supported for anonymous THP. */ + if (new_order == 1) { + VM_WARN_ONCE(1, "Cannot split to order-1 folio"); return -EINVAL; + } + } else if (new_order) { /* Split shmem folio to non-zero order not supported */ if (shmem_mapping(folio->mapping)) { VM_WARN_ONCE(1, "Cannot split shmem folio to non-0 order"); return -EINVAL; } - /* No split if the file system does not support large folio */ - if (!mapping_large_folio_support(folio->mapping)) { + /* + * No split if the file system does not support large folio. + * Note that we might still have THPs in such mappings due to + * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping + * does not actually support large folios properly. + */ + if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && + !mapping_large_folio_support(folio->mapping)) { VM_WARN_ONCE(1, "Cannot split file folio to non-0 order"); return -EINVAL; } } + /* Only swapping a whole PMD-mapped folio is supported */ + if (folio_test_swapcache(folio) && new_order) + return -EINVAL; is_hzp = is_huge_zero_folio(folio); if (is_hzp) { From 7fea700e04bd3f424c2d836e98425782f97b494e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 8 Jun 2024 14:06:16 +0200 Subject: [PATCH 1012/1578] zap_pid_ns_processes: clear TIF_NOTIFY_SIGNAL along with TIF_SIGPENDING kernel_wait4() doesn't sleep and returns -EINTR if there is no eligible child and signal_pending() is true. That is why zap_pid_ns_processes() clears TIF_SIGPENDING but this is not enough, it should also clear TIF_NOTIFY_SIGNAL to make signal_pending() return false and avoid a busy-wait loop. Link: https://lkml.kernel.org/r/20240608120616.GB7947@redhat.com Fixes: 12db8b690010 ("entry: Add support for TIF_NOTIFY_SIGNAL") Signed-off-by: Oleg Nesterov Reported-by: Rachel Menge Closes: https://lore.kernel.org/all/1386cd49-36d0-4a5c-85e9-bc42056a5a38@linux.microsoft.com/ Reviewed-by: Boqun Feng Tested-by: Wei Fu Reviewed-by: Jens Axboe Cc: Allen Pais Cc: Christian Brauner Cc: Frederic Weisbecker Cc: Joel Fernandes (Google) Cc: Joel Granados Cc: Josh Triplett Cc: Lai Jiangshan Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Mike Christie Cc: Neeraj Upadhyay Cc: Paul E. McKenney Cc: Steven Rostedt (Google) Cc: Zqiang Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- kernel/pid_namespace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index dc48fecfa1dced..25f3cf679b358d 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -218,6 +218,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) */ do { clear_thread_flag(TIF_SIGPENDING); + clear_thread_flag(TIF_NOTIFY_SIGNAL); rc = kernel_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); From c1558bc57b8e5b4da5d821537cd30e2e660861d8 Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Mon, 10 Jun 2024 11:27:43 +0200 Subject: [PATCH 1013/1578] gcov: add support for GCC 14 Using gcov on kernels compiled with GCC 14 results in truncated 16-byte long .gcda files with no usable data. To fix this, update GCOV_COUNTERS to match the value defined by GCC 14. Tested with GCC versions 14.1.0 and 13.2.0. Link: https://lkml.kernel.org/r/20240610092743.1609845-1-oberpar@linux.ibm.com Signed-off-by: Peter Oberparleiter Reported-by: Allison Henderson Reported-by: Chuck Lever III Tested-by: Chuck Lever Cc: Signed-off-by: Andrew Morton --- kernel/gcov/gcc_4_7.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 74a4ef1da9ad77..fd75b4a484d76a 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -18,7 +18,9 @@ #include #include "gcov.h" -#if (__GNUC__ >= 10) +#if (__GNUC__ >= 14) +#define GCOV_COUNTERS 9 +#elif (__GNUC__ >= 10) #define GCOV_COUNTERS 8 #elif (__GNUC__ >= 7) #define GCOV_COUNTERS 9 From 3afb76a66b5559a7b595155803ce23801558a7a9 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Thu, 6 Jun 2024 14:06:22 -0400 Subject: [PATCH 1014/1578] mm: mmap: allow for the maximum number of bits for randomizing mmap_base by default An ASLR regression was noticed [1] and tracked down to file-mapped areas being backed by THP in recent kernels. The 21-bit alignment constraint for such mappings reduces the entropy for randomizing the placement of 64-bit library mappings and breaks ASLR completely for 32-bit libraries. The reported issue is easily addressed by increasing vm.mmap_rnd_bits and vm.mmap_rnd_compat_bits. This patch just provides a simple way to set ARCH_MMAP_RND_BITS and ARCH_MMAP_RND_COMPAT_BITS to their maximum values allowed by the architecture at build time. [1] https://zolutal.github.io/aslrnt/ [akpm@linux-foundation.org: default to `y' if 32-bit, per Rafael] Link: https://lkml.kernel.org/r/20240606180622.102099-1-aquini@redhat.com Fixes: 1854bc6e2420 ("mm/readahead: Align file mappings for non-DAX") Signed-off-by: Rafael Aquini Cc: Arnd Bergmann Cc: Heiko Carstens Cc: Mike Rapoport (IBM) Cc: Paul E. McKenney Cc: Petr Mladek Cc: Samuel Holland Cc: Signed-off-by: Andrew Morton --- arch/Kconfig | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/Kconfig b/arch/Kconfig index 975dd22a2dbd22..3e2a63772b3db1 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1046,10 +1046,21 @@ config ARCH_MMAP_RND_BITS_MAX config ARCH_MMAP_RND_BITS_DEFAULT int +config FORCE_MAX_MMAP_RND_BITS + bool "Force maximum number of bits to use for ASLR of mmap base address" + default y if !64BIT + help + ARCH_MMAP_RND_BITS and ARCH_MMAP_RND_COMPAT_BITS represent the number + of bits to use for ASLR and if no custom value is assigned (EXPERT) + then the architecture's lower bound (minimum) value is assumed. + This toggle changes that default assumption to assume the arch upper + bound (maximum) value instead. + config ARCH_MMAP_RND_BITS int "Number of bits to use for ASLR of mmap base address" if EXPERT range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT + default ARCH_MMAP_RND_BITS_MAX if FORCE_MAX_MMAP_RND_BITS default ARCH_MMAP_RND_BITS_MIN depends on HAVE_ARCH_MMAP_RND_BITS help @@ -1084,6 +1095,7 @@ config ARCH_MMAP_RND_COMPAT_BITS int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT + default ARCH_MMAP_RND_COMPAT_BITS_MAX if FORCE_MAX_MMAP_RND_BITS default ARCH_MMAP_RND_COMPAT_BITS_MIN depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS help From 653c5c75115c1e23b8393c1cb1ad2d6f6712742f Mon Sep 17 00:00:00 2001 From: Jeff Xu Date: Fri, 7 Jun 2024 20:35:41 +0000 Subject: [PATCH 1015/1578] mm/memfd: add documentation for MFD_NOEXEC_SEAL MFD_EXEC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When MFD_NOEXEC_SEAL was introduced, there was one big mistake: it didn't have proper documentation. This led to a lot of confusion, especially about whether or not memfd created with the MFD_NOEXEC_SEAL flag is sealable. Before MFD_NOEXEC_SEAL, memfd had to explicitly set MFD_ALLOW_SEALING to be sealable, so it's a fair question. As one might have noticed, unlike other flags in memfd_create, MFD_NOEXEC_SEAL is actually a combination of multiple flags. The idea is to make it easier to use memfd in the most common way, which is NOEXEC + F_SEAL_EXEC + MFD_ALLOW_SEALING. This works with sysctl vm.noexec to help existing applications move to a more secure way of using memfd. Proposals have been made to put MFD_NOEXEC_SEAL non-sealable, unless MFD_ALLOW_SEALING is set, to be consistent with other flags [1], Those are based on the viewpoint that each flag is an atomic unit, which is a reasonable assumption. However, MFD_NOEXEC_SEAL was designed with the intent of promoting the most secure method of using memfd, therefore a combination of multiple functionalities into one bit. Furthermore, the MFD_NOEXEC_SEAL has been added for more than one year, and multiple applications and distributions have backported and utilized it. Altering ABI now presents a degree of risk and may lead to disruption. MFD_NOEXEC_SEAL is a new flag, and applications must change their code to use it. There is no backward compatibility problem. When sysctl vm.noexec == 1 or 2, applications that don't set MFD_NOEXEC_SEAL or MFD_EXEC will get MFD_NOEXEC_SEAL memfd. And old-application might break, that is by-design, in such a system vm.noexec = 0 shall be used. Also no backward compatibility problem. I propose to include this documentation patch to assist in clarifying the semantics of MFD_NOEXEC_SEAL, thereby preventing any potential future confusion. Finally, I would like to express my gratitude to David Rheinsberg and Barnabás Pőcze for initiating the discussion on the topic of sealability. [1] https://lore.kernel.org/lkml/20230714114753.170814-1-david@readahead.eu/ [jeffxu@chromium.org: updates per Randy] Link: https://lkml.kernel.org/r/20240611034903.3456796-2-jeffxu@chromium.org [jeffxu@chromium.org: v3] Link: https://lkml.kernel.org/r/20240611231409.3899809-2-jeffxu@chromium.org Link: https://lkml.kernel.org/r/20240607203543.2151433-2-jeffxu@google.com Signed-off-by: Jeff Xu Reviewed-by: Randy Dunlap Cc: Aleksa Sarai Cc: Barnabás Pőcze Cc: Daniel Verkamp Cc: David Rheinsberg Cc: Dmitry Torokhov Cc: Hugh Dickins Cc: Jorge Lucangeli Obes Cc: Kees Cook Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/userspace-api/index.rst | 1 + Documentation/userspace-api/mfd_noexec.rst | 86 ++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 Documentation/userspace-api/mfd_noexec.rst diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst index 5926115ec0ed86..8a251d71fa6e14 100644 --- a/Documentation/userspace-api/index.rst +++ b/Documentation/userspace-api/index.rst @@ -32,6 +32,7 @@ Security-related interfaces seccomp_filter landlock lsm + mfd_noexec spec_ctrl tee diff --git a/Documentation/userspace-api/mfd_noexec.rst b/Documentation/userspace-api/mfd_noexec.rst new file mode 100644 index 00000000000000..7afcc480e38f00 --- /dev/null +++ b/Documentation/userspace-api/mfd_noexec.rst @@ -0,0 +1,86 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================== +Introduction of non-executable mfd +================================== +:Author: + Daniel Verkamp + Jeff Xu + +:Contributor: + Aleksa Sarai + +Since Linux introduced the memfd feature, memfds have always had their +execute bit set, and the memfd_create() syscall doesn't allow setting +it differently. + +However, in a secure-by-default system, such as ChromeOS, (where all +executables should come from the rootfs, which is protected by verified +boot), this executable nature of memfd opens a door for NoExec bypass +and enables “confused deputy attack”. E.g, in VRP bug [1]: cros_vm +process created a memfd to share the content with an external process, +however the memfd is overwritten and used for executing arbitrary code +and root escalation. [2] lists more VRP of this kind. + +On the other hand, executable memfd has its legit use: runc uses memfd’s +seal and executable feature to copy the contents of the binary then +execute them. For such a system, we need a solution to differentiate runc's +use of executable memfds and an attacker's [3]. + +To address those above: + - Let memfd_create() set X bit at creation time. + - Let memfd be sealed for modifying X bit when NX is set. + - Add a new pid namespace sysctl: vm.memfd_noexec to help applications in + migrating and enforcing non-executable MFD. + +User API +======== +``int memfd_create(const char *name, unsigned int flags)`` + +``MFD_NOEXEC_SEAL`` + When MFD_NOEXEC_SEAL bit is set in the ``flags``, memfd is created + with NX. F_SEAL_EXEC is set and the memfd can't be modified to + add X later. MFD_ALLOW_SEALING is also implied. + This is the most common case for the application to use memfd. + +``MFD_EXEC`` + When MFD_EXEC bit is set in the ``flags``, memfd is created with X. + +Note: + ``MFD_NOEXEC_SEAL`` implies ``MFD_ALLOW_SEALING``. In case that + an app doesn't want sealing, it can add F_SEAL_SEAL after creation. + + +Sysctl: +======== +``pid namespaced sysctl vm.memfd_noexec`` + +The new pid namespaced sysctl vm.memfd_noexec has 3 values: + + - 0: MEMFD_NOEXEC_SCOPE_EXEC + memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL acts like + MFD_EXEC was set. + + - 1: MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL + memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL acts like + MFD_NOEXEC_SEAL was set. + + - 2: MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED + memfd_create() without MFD_NOEXEC_SEAL will be rejected. + +The sysctl allows finer control of memfd_create for old software that +doesn't set the executable bit; for example, a container with +vm.memfd_noexec=1 means the old software will create non-executable memfd +by default while new software can create executable memfd by setting +MFD_EXEC. + +The value of vm.memfd_noexec is passed to child namespace at creation +time. In addition, the setting is hierarchical, i.e. during memfd_create, +we will search from current ns to root ns and use the most restrictive +setting. + +[1] https://crbug.com/1305267 + +[2] https://bugs.chromium.org/p/chromium/issues/list?q=type%3Dbug-security%20memfd%20escalation&can=1 + +[3] https://lwn.net/Articles/781013/ From e7d2a28bd0b27e43bff3f516ee0607d776b019f4 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 5 Jun 2024 23:36:12 +0100 Subject: [PATCH 1016/1578] selftests: mm: make map_fixed_noreplace test names stable KTAP parsers interpret the output of ksft_test_result_*() as being the name of the test. The map_fixed_noreplace test uses a dynamically allocated base address for the mmap()s that it tests and currently includes this in the test names that it logs so the test names that are logged are not stable between runs. It also uses multiples of PAGE_SIZE which mean that runs for kernels with different PAGE_SIZE configurations can't be directly compared. Both these factors cause issues for CI systems when interpreting and displaying results. Fix this by replacing the current test names with fixed strings describing the intent of the mappings that are logged, the existing messages with the actual addresses and sizes are retained as diagnostic prints to aid in debugging. Link: https://lkml.kernel.org/r/20240605-kselftest-mm-fixed-noreplace-v1-1-a235db8b9be9@kernel.org Fixes: 4838cf70e539 ("selftests/mm: map_fixed_noreplace: conform test to TAP format output") Signed-off-by: Mark Brown Reviewed-by: Ryan Roberts Cc: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../selftests/mm/map_fixed_noreplace.c | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/mm/map_fixed_noreplace.c b/tools/testing/selftests/mm/map_fixed_noreplace.c index b74813fdc95143..d53de2486080ee 100644 --- a/tools/testing/selftests/mm/map_fixed_noreplace.c +++ b/tools/testing/selftests/mm/map_fixed_noreplace.c @@ -67,7 +67,8 @@ int main(void) dump_maps(); ksft_exit_fail_msg("Error: munmap failed!?\n"); } - ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_test_result_pass("mmap() 5*PAGE_SIZE at base\n"); addr = base_addr + page_size; size = 3 * page_size; @@ -76,7 +77,8 @@ int main(void) dump_maps(); ksft_exit_fail_msg("Error: first mmap() failed unexpectedly\n"); } - ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_test_result_pass("mmap() 3*PAGE_SIZE at base+PAGE_SIZE\n"); /* * Exact same mapping again: @@ -93,7 +95,8 @@ int main(void) dump_maps(); ksft_exit_fail_msg("Error:1: mmap() succeeded when it shouldn't have\n"); } - ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_test_result_pass("mmap() 5*PAGE_SIZE at base\n"); /* * Second mapping contained within first: @@ -111,7 +114,8 @@ int main(void) dump_maps(); ksft_exit_fail_msg("Error:2: mmap() succeeded when it shouldn't have\n"); } - ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_test_result_pass("mmap() 2*PAGE_SIZE at base+PAGE_SIZE\n"); /* * Overlap end of existing mapping: @@ -128,7 +132,8 @@ int main(void) dump_maps(); ksft_exit_fail_msg("Error:3: mmap() succeeded when it shouldn't have\n"); } - ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_test_result_pass("mmap() 2*PAGE_SIZE at base+(3*PAGE_SIZE)\n"); /* * Overlap start of existing mapping: @@ -145,7 +150,8 @@ int main(void) dump_maps(); ksft_exit_fail_msg("Error:4: mmap() succeeded when it shouldn't have\n"); } - ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_test_result_pass("mmap() 2*PAGE_SIZE bytes at base\n"); /* * Adjacent to start of existing mapping: @@ -162,7 +168,8 @@ int main(void) dump_maps(); ksft_exit_fail_msg("Error:5: mmap() failed when it shouldn't have\n"); } - ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_test_result_pass("mmap() PAGE_SIZE at base\n"); /* * Adjacent to end of existing mapping: @@ -179,7 +186,8 @@ int main(void) dump_maps(); ksft_exit_fail_msg("Error:6: mmap() failed when it shouldn't have\n"); } - ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_test_result_pass("mmap() PAGE_SIZE at base+(4*PAGE_SIZE)\n"); addr = base_addr; size = 5 * page_size; From 8e279f970b5cb0628f856b6735e2e47b4da9f76e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jun 2024 22:06:20 -0700 Subject: [PATCH 1017/1578] mm/migrate: fix kernel BUG at mm/compaction.c:2761! I hit the VM_BUG_ON(!list_empty(&cc->migratepages)) in compact_zone(); and if DEBUG_VM were off, then pages would be lost on a local list. Our convention is that if migrate_pages() reports complete success (0), then the migratepages list will be empty; but if it reports an error or some pages remaining, then its caller must putback_movable_pages(). There's a new case in which migrate_pages() has been reporting complete success, but returning with pages left on the migratepages list: when migrate_pages_batch() successfully split a folio on the deferred list, but then the "Failure isn't counted" call does not dispose of them all. Since that block is expecting the large folio to have been counted as 1 failure already, and since the return code is later adjusted to success whenever the returned list is found empty, the simple way to fix this safely is to count splitting the deferred folio as "a failure". Link: https://lkml.kernel.org/r/46c948b4-4dd8-6e03-4c7b-ce4e81cfa536@google.com Fixes: 7262f208ca68 ("mm/migrate: split source folio if it is on deferred split list") Signed-off-by: Hugh Dickins Cc: Baolin Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/migrate.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index dd04f578c19c3e..2cc5a68f684352 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1654,7 +1654,12 @@ static int migrate_pages_batch(struct list_head *from, /* * The rare folio on the deferred split list should - * be split now. It should not count as a failure. + * be split now. It should not count as a failure: + * but increment nr_failed because, without doing so, + * migrate_pages() may report success with (split but + * unmigrated) pages still on its fromlist; whereas it + * always reports success when its fromlist is empty. + * * Only check it without removing it from the list. * Since the folio can be on deferred_split_scan() * local list and removing it can cause the local list @@ -1669,6 +1674,7 @@ static int migrate_pages_batch(struct list_head *from, if (nr_pages > 2 && !list_empty(&folio->_deferred_list)) { if (try_split_folio(folio, split_folios) == 0) { + nr_failed++; stats->nr_thp_split += is_thp; stats->nr_split++; continue; From cfdd12b48202398a879e8bc4e7fa023f4d473f62 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 12 Jun 2024 20:28:22 +0800 Subject: [PATCH 1018/1578] mm: fix possible OOB in numa_rebuild_large_mapping() The large folio is mapped with folio size(not greater PMD_SIZE) aligned virtual address during the pagefault, ie, 'addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE)' in do_anonymous_page(). But after the mremap(), the virtual address only requires PAGE_SIZE alignment. Also pte is moved to new in move_page_tables(), then traversal of the new pte in the numa_rebuild_large_mapping() could hit the following issue, Unable to handle kernel paging request at virtual address 00000a80c021a788 Mem abort info: ESR = 0x0000000096000004 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x04: level 0 translation fault Data abort info: ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000 CM = 0, WnR = 0, TnD = 0, TagAccess = 0 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 user pgtable: 4k pages, 48-bit VAs, pgdp=00002040341a6000 [00000a80c021a788] pgd=0000000000000000, p4d=0000000000000000 Internal error: Oops: 0000000096000004 [#1] SMP ... CPU: 76 PID: 15187 Comm: git Kdump: loaded Tainted: G W 6.10.0-rc2+ #209 Hardware name: Huawei TaiShan 2280 V2/BC82AMDD, BIOS 1.79 08/21/2021 pstate: 60400009 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : numa_rebuild_large_mapping+0x338/0x638 lr : numa_rebuild_large_mapping+0x320/0x638 sp : ffff8000b41c3b00 x29: ffff8000b41c3b30 x28: ffff8000812a0000 x27: 00000000000a8000 x26: 00000000000000a8 x25: 0010000000000001 x24: ffff20401c7170f0 x23: 0000ffff33a1e000 x22: 0000ffff33a76000 x21: ffff20400869eca0 x20: 0000ffff33976000 x19: 00000000000000a8 x18: ffffffffffffffff x17: 0000000000000000 x16: 0000000000000020 x15: ffff8000b41c36a8 x14: 0000000000000000 x13: 205d373831353154 x12: 5b5d333331363732 x11: 000000000011ff78 x10: 000000000011ff10 x9 : ffff800080273f30 x8 : 000000320400869e x7 : c0000000ffffd87f x6 : 00000000001e6ba8 x5 : ffff206f3fb5af88 x4 : 0000000000000000 x3 : 0000000000000000 x2 : 0000000000000000 x1 : fffffdffc0000000 x0 : 00000a80c021a780 Call trace: numa_rebuild_large_mapping+0x338/0x638 do_numa_page+0x3e4/0x4e0 handle_pte_fault+0x1bc/0x238 __handle_mm_fault+0x20c/0x400 handle_mm_fault+0xa8/0x288 do_page_fault+0x124/0x498 do_translation_fault+0x54/0x80 do_mem_abort+0x4c/0xa8 el0_da+0x40/0x110 el0t_64_sync_handler+0xe4/0x158 el0t_64_sync+0x188/0x190 Fix it by making the start and end not only within the vma range, but also within the page table range. Link: https://lkml.kernel.org/r/20240612122822.4033433-1-wangkefeng.wang@huawei.com Fixes: d2136d749d76 ("mm: support multi-size THP numa balancing") Signed-off-by: Kefeng Wang Acked-by: David Hildenbrand Reviewed-by: Baolin Wang Cc: "Huang, Ying" Cc: John Hubbard Cc: Liu Shixin Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/memory.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 2bc8032a30a2ff..25a77c4fe4a03a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5100,10 +5100,16 @@ static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_stru bool ignore_writable, bool pte_write_upgrade) { int nr = pte_pfn(fault_pte) - folio_pfn(folio); - unsigned long start = max(vmf->address - nr * PAGE_SIZE, vma->vm_start); - unsigned long end = min(vmf->address + (folio_nr_pages(folio) - nr) * PAGE_SIZE, vma->vm_end); - pte_t *start_ptep = vmf->pte - (vmf->address - start) / PAGE_SIZE; - unsigned long addr; + unsigned long start, end, addr = vmf->address; + unsigned long addr_start = addr - (nr << PAGE_SHIFT); + unsigned long pt_start = ALIGN_DOWN(addr, PMD_SIZE); + pte_t *start_ptep; + + /* Stay within the VMA and within the page table. */ + start = max3(addr_start, pt_start, vma->vm_start); + end = min3(addr_start + folio_size(folio), pt_start + PMD_SIZE, + vma->vm_end); + start_ptep = vmf->pte - ((addr - start) >> PAGE_SHIFT); /* Restore all PTEs' mapping of the large folio */ for (addr = start; addr != end; start_ptep++, addr += PAGE_SIZE) { From 0b1ef4fde7a24909ff2afacffd0d6afa28b73652 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 23 May 2024 09:21:39 -0400 Subject: [PATCH 1019/1578] mm/debug_vm_pgtable: drop RANDOM_ORVALUE trick Macro RANDOM_ORVALUE was used to make sure the pgtable entry will be populated with !none data in clear tests. The RANDOM_ORVALUE tried to cover mostly all the bits in a pgtable entry, even if there's no discussion on whether all the bits will be vaild. Both S390 and PPC64 have their own masks to avoid touching some bits. Now it's the turn for x86_64. The issue is there's a recent report from Mikhail Gavrilov showing that this can cause a warning with the newly added pte set check in commit 8430557fc5 on writable v.s. userfaultfd-wp bit, even though the check itself was valid, the random pte is not. We can choose to mask more bits out. However the need to have such random bits setup is questionable, as now it's already guaranteed to be true on below: - For pte level, the pgtable entry will be installed with value from pfn_pte(), where pfn points to a valid page. Hence the pte will be !none already if populated with pfn_pte(). - For upper-than-pte level, the pgtable entry should contain a directory entry always, which is also !none. All the cases look like good enough to test a pxx_clear() helper. Instead of extending the bitmask, drop the "set random bits" trick completely. Add some warning guards to make sure the entries will be !none before clear(). Link: https://lkml.kernel.org/r/20240523132139.289719-1-peterx@redhat.com Fixes: 8430557fc584 ("mm/page_table_check: support userfault wr-protect entries") Signed-off-by: Peter Xu Reported-by: Mikhail Gavrilov Link: https://lore.kernel.org/r/CABXGCsMB9A8-X+Np_Q+fWLURYL_0t3Y-MdoNabDM-Lzk58-DGA@mail.gmail.com Tested-by: Mikhail Gavrilov Reviewed-by: Pasha Tatashin Acked-by: David Hildenbrand Cc: Aneesh Kumar K.V Cc: Gavin Shan Cc: Anshuman Khandual Signed-off-by: Andrew Morton --- mm/debug_vm_pgtable.c | 31 +++++-------------------------- 1 file changed, 5 insertions(+), 26 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index b104a353b532bf..e4969fb54da346 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -40,22 +40,7 @@ * Please refer Documentation/mm/arch_pgtable_helpers.rst for the semantics * expectations that are being validated here. All future changes in here * or the documentation need to be in sync. - * - * On s390 platform, the lower 4 bits are used to identify given page table - * entry type. But these bits might affect the ability to clear entries with - * pxx_clear() because of how dynamic page table folding works on s390. So - * while loading up the entries do not change the lower 4 bits. It does not - * have affect any other platform. Also avoid the 62nd bit on ppc64 that is - * used to mark a pte entry. */ -#define S390_SKIP_MASK GENMASK(3, 0) -#if __BITS_PER_LONG == 64 -#define PPC64_SKIP_MASK GENMASK(62, 62) -#else -#define PPC64_SKIP_MASK 0x0 -#endif -#define ARCH_SKIP_MASK (S390_SKIP_MASK | PPC64_SKIP_MASK) -#define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK) #define RANDOM_NZVALUE GENMASK(7, 0) struct pgtable_debug_args { @@ -511,8 +496,7 @@ static void __init pud_clear_tests(struct pgtable_debug_args *args) return; pr_debug("Validating PUD clear\n"); - pud = __pud(pud_val(pud) | RANDOM_ORVALUE); - WRITE_ONCE(*args->pudp, pud); + WARN_ON(pud_none(pud)); pud_clear(args->pudp); pud = READ_ONCE(*args->pudp); WARN_ON(!pud_none(pud)); @@ -548,8 +532,7 @@ static void __init p4d_clear_tests(struct pgtable_debug_args *args) return; pr_debug("Validating P4D clear\n"); - p4d = __p4d(p4d_val(p4d) | RANDOM_ORVALUE); - WRITE_ONCE(*args->p4dp, p4d); + WARN_ON(p4d_none(p4d)); p4d_clear(args->p4dp); p4d = READ_ONCE(*args->p4dp); WARN_ON(!p4d_none(p4d)); @@ -582,8 +565,7 @@ static void __init pgd_clear_tests(struct pgtable_debug_args *args) return; pr_debug("Validating PGD clear\n"); - pgd = __pgd(pgd_val(pgd) | RANDOM_ORVALUE); - WRITE_ONCE(*args->pgdp, pgd); + WARN_ON(pgd_none(pgd)); pgd_clear(args->pgdp); pgd = READ_ONCE(*args->pgdp); WARN_ON(!pgd_none(pgd)); @@ -634,10 +616,8 @@ static void __init pte_clear_tests(struct pgtable_debug_args *args) if (WARN_ON(!args->ptep)) return; -#ifndef CONFIG_RISCV - pte = __pte(pte_val(pte) | RANDOM_ORVALUE); -#endif set_pte_at(args->mm, args->vaddr, args->ptep, pte); + WARN_ON(pte_none(pte)); flush_dcache_page(page); barrier(); ptep_clear(args->mm, args->vaddr, args->ptep); @@ -650,8 +630,7 @@ static void __init pmd_clear_tests(struct pgtable_debug_args *args) pmd_t pmd = READ_ONCE(*args->pmdp); pr_debug("Validating PMD clear\n"); - pmd = __pmd(pmd_val(pmd) | RANDOM_ORVALUE); - WRITE_ONCE(*args->pmdp, pmd); + WARN_ON(pmd_none(pmd)); pmd_clear(args->pmdp); pmd = READ_ONCE(*args->pmdp); WARN_ON(!pmd_none(pmd)); From 9094b4a1c76cfe84b906cc152bab34d4ba26fa5c Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 13 Jun 2024 16:21:19 +0800 Subject: [PATCH 1020/1578] mm: shmem: fix getting incorrect lruvec when replacing a shmem folio When testing shmem swapin, I encountered the warning below on my machine. The reason is that replacing an old shmem folio with a new one causes mem_cgroup_migrate() to clear the old folio's memcg data. As a result, the old folio cannot get the correct memcg's lruvec needed to remove itself from the LRU list when it is being freed. This could lead to possible serious problems, such as LRU list crashes due to holding the wrong LRU lock, and incorrect LRU statistics. To fix this issue, we can fallback to use the mem_cgroup_replace_folio() to replace the old shmem folio. [ 5241.100311] page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x5d9960 [ 5241.100317] head: order:4 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 [ 5241.100319] flags: 0x17fffe0000040068(uptodate|lru|head|swapbacked|node=0|zone=2|lastcpupid=0x3ffff) [ 5241.100323] raw: 17fffe0000040068 fffffdffd6687948 fffffdffd69ae008 0000000000000000 [ 5241.100325] raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 [ 5241.100326] head: 17fffe0000040068 fffffdffd6687948 fffffdffd69ae008 0000000000000000 [ 5241.100327] head: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 [ 5241.100328] head: 17fffe0000000204 fffffdffd6665801 ffffffffffffffff 0000000000000000 [ 5241.100329] head: 0000000a00000010 0000000000000000 00000000ffffffff 0000000000000000 [ 5241.100330] page dumped because: VM_WARN_ON_ONCE_FOLIO(!memcg && !mem_cgroup_disabled()) [ 5241.100338] ------------[ cut here ]------------ [ 5241.100339] WARNING: CPU: 19 PID: 78402 at include/linux/memcontrol.h:775 folio_lruvec_lock_irqsave+0x140/0x150 [...] [ 5241.100374] pc : folio_lruvec_lock_irqsave+0x140/0x150 [ 5241.100375] lr : folio_lruvec_lock_irqsave+0x138/0x150 [ 5241.100376] sp : ffff80008b38b930 [...] [ 5241.100398] Call trace: [ 5241.100399] folio_lruvec_lock_irqsave+0x140/0x150 [ 5241.100401] __page_cache_release+0x90/0x300 [ 5241.100404] __folio_put+0x50/0x108 [ 5241.100406] shmem_replace_folio+0x1b4/0x240 [ 5241.100409] shmem_swapin_folio+0x314/0x528 [ 5241.100411] shmem_get_folio_gfp+0x3b4/0x930 [ 5241.100412] shmem_fault+0x74/0x160 [ 5241.100414] __do_fault+0x40/0x218 [ 5241.100417] do_shared_fault+0x34/0x1b0 [ 5241.100419] do_fault+0x40/0x168 [ 5241.100420] handle_pte_fault+0x80/0x228 [ 5241.100422] __handle_mm_fault+0x1c4/0x440 [ 5241.100424] handle_mm_fault+0x60/0x1f0 [ 5241.100426] do_page_fault+0x120/0x488 [ 5241.100429] do_translation_fault+0x4c/0x68 [ 5241.100431] do_mem_abort+0x48/0xa0 [ 5241.100434] el0_da+0x38/0xc0 [ 5241.100436] el0t_64_sync_handler+0x68/0xc0 [ 5241.100437] el0t_64_sync+0x14c/0x150 [ 5241.100439] ---[ end trace 0000000000000000 ]--- [baolin.wang@linux.alibaba.com: remove less helpful comments, per Matthew] Link: https://lkml.kernel.org/r/ccad3fe1375b468ebca3227b6b729f3eaf9d8046.1718423197.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/3c11000dd6c1df83015a8321a859e9775ebbc23e.1718266112.git.baolin.wang@linux.alibaba.com Fixes: 85ce2c517ade ("memcontrol: only transfer the memcg data for migration") Signed-off-by: Baolin Wang Reviewed-by: Shakeel Butt Cc: Matthew Wilcox (Oracle) Cc: Hugh Dickins Cc: Johannes Weiner Cc: Nhat Pham Cc: Michal Hocko Cc: Roman Gushchin Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- mm/memcontrol.c | 3 +-- mm/shmem.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 36793e509f4708..71fe2a95b8bd33 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7745,8 +7745,7 @@ void __mem_cgroup_uncharge_folios(struct folio_batch *folios) * @new: Replacement folio. * * Charge @new as a replacement folio for @old. @old will - * be uncharged upon free. This is only used by the page cache - * (in replace_page_cache_folio()). + * be uncharged upon free. * * Both folios must be locked, @new->mapping must be set up. */ diff --git a/mm/shmem.c b/mm/shmem.c index f5d60436b604af..a8b181a6340296 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1786,7 +1786,7 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, xa_lock_irq(&swap_mapping->i_pages); error = shmem_replace_entry(swap_mapping, swap_index, old, new); if (!error) { - mem_cgroup_migrate(old, new); + mem_cgroup_replace_folio(old, new); __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1); __lruvec_stat_mod_folio(new, NR_SHMEM, 1); __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1); From 01c8f9806bde438ca1c8cbbc439f0a14a6694f6c Mon Sep 17 00:00:00 2001 From: Aleksandr Nogikh Date: Tue, 11 Jun 2024 15:32:29 +0200 Subject: [PATCH 1021/1578] kcov: don't lose track of remote references during softirqs In kcov_remote_start()/kcov_remote_stop(), we swap the previous KCOV metadata of the current task into a per-CPU variable. However, the kcov_mode_enabled(mode) check is not sufficient in the case of remote KCOV coverage: current->kcov_mode always remains KCOV_MODE_DISABLED for remote KCOV objects. If the original task that has invoked the KCOV_REMOTE_ENABLE ioctl happens to get interrupted and kcov_remote_start() is called, it ultimately leads to kcov_remote_stop() NOT restoring the original KCOV reference. So when the task exits, all registered remote KCOV handles remain active forever. The most uncomfortable effect (at least for syzkaller) is that the bug prevents the reuse of the same /sys/kernel/debug/kcov descriptor. If we obtain it in the parent process and then e.g. drop some capabilities and continuously fork to execute individual programs, at some point current->kcov of the forked process is lost, kcov_task_exit() takes no action, and all KCOV_REMOTE_ENABLE ioctls calls from subsequent forks fail. And, yes, the efficiency is also affected if we keep on losing remote kcov objects. a) kcov_remote_map keeps on growing forever. b) (If I'm not mistaken), we're also not freeing the memory referenced by kcov->area. Fix it by introducing a special kcov_mode that is assigned to the task that owns a KCOV remote object. It makes kcov_mode_enabled() return true and yet does not trigger coverage collection in __sanitizer_cov_trace_pc() and write_comp_data(). [nogikh@google.com: replace WRITE_ONCE() with an ordinary assignment] Link: https://lkml.kernel.org/r/20240614171221.2837584-1-nogikh@google.com Link: https://lkml.kernel.org/r/20240611133229.527822-1-nogikh@google.com Fixes: 5ff3b30ab57d ("kcov: collect coverage from interrupts") Signed-off-by: Aleksandr Nogikh Reviewed-by: Dmitry Vyukov Reviewed-by: Andrey Konovalov Tested-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Arnd Bergmann Cc: Marco Elver Cc: Signed-off-by: Andrew Morton --- include/linux/kcov.h | 2 ++ kernel/kcov.c | 1 + 2 files changed, 3 insertions(+) diff --git a/include/linux/kcov.h b/include/linux/kcov.h index b851ba415e03fd..3b479a3d235a97 100644 --- a/include/linux/kcov.h +++ b/include/linux/kcov.h @@ -21,6 +21,8 @@ enum kcov_mode { KCOV_MODE_TRACE_PC = 2, /* Collecting comparison operands mode. */ KCOV_MODE_TRACE_CMP = 3, + /* The process owns a KCOV remote reference. */ + KCOV_MODE_REMOTE = 4, }; #define KCOV_IN_CTXSW (1 << 30) diff --git a/kernel/kcov.c b/kernel/kcov.c index c3124f6d5536b7..f0a69d402066e1 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -632,6 +632,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, return -EINVAL; kcov->mode = mode; t->kcov = kcov; + t->kcov_mode = KCOV_MODE_REMOTE; kcov->t = t; kcov->remote = true; kcov->remote_size = remote_arg->area_size; From a986fa57fd81a1430e00b3c6cf8a325d6f894a63 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 14 Jun 2024 22:29:10 +1000 Subject: [PATCH 1022/1578] KVM: PPC: Book3S HV: Prevent UAF in kvm_spapr_tce_attach_iommu_group() Al reported a possible use-after-free (UAF) in kvm_spapr_tce_attach_iommu_group(). It looks up `stt` from tablefd, but then continues to use it after doing fdput() on the returned fd. After the fdput() the tablefd is free to be closed by another thread. The close calls kvm_spapr_tce_release() and then release_spapr_tce_table() (via call_rcu()) which frees `stt`. Although there are calls to rcu_read_lock() in kvm_spapr_tce_attach_iommu_group() they are not sufficient to prevent the UAF, because `stt` is used outside the locked regions. With an artifcial delay after the fdput() and a userspace program which triggers the race, KASAN detects the UAF: BUG: KASAN: slab-use-after-free in kvm_spapr_tce_attach_iommu_group+0x298/0x720 [kvm] Read of size 4 at addr c000200027552c30 by task kvm-vfio/2505 CPU: 54 PID: 2505 Comm: kvm-vfio Not tainted 6.10.0-rc3-next-20240612-dirty #1 Hardware name: 8335-GTH POWER9 0x4e1202 opal:skiboot-v6.5.3-35-g1851b2a06 PowerNV Call Trace: dump_stack_lvl+0xb4/0x108 (unreliable) print_report+0x2b4/0x6ec kasan_report+0x118/0x2b0 __asan_load4+0xb8/0xd0 kvm_spapr_tce_attach_iommu_group+0x298/0x720 [kvm] kvm_vfio_set_attr+0x524/0xac0 [kvm] kvm_device_ioctl+0x144/0x240 [kvm] sys_ioctl+0x62c/0x1810 system_call_exception+0x190/0x440 system_call_vectored_common+0x15c/0x2ec ... Freed by task 0: ... kfree+0xec/0x3e0 release_spapr_tce_table+0xd4/0x11c [kvm] rcu_core+0x568/0x16a0 handle_softirqs+0x23c/0x920 do_softirq_own_stack+0x6c/0x90 do_softirq_own_stack+0x58/0x90 __irq_exit_rcu+0x218/0x2d0 irq_exit+0x30/0x80 arch_local_irq_restore+0x128/0x230 arch_local_irq_enable+0x1c/0x30 cpuidle_enter_state+0x134/0x5cc cpuidle_enter+0x6c/0xb0 call_cpuidle+0x7c/0x100 do_idle+0x394/0x410 cpu_startup_entry+0x60/0x70 start_secondary+0x3fc/0x410 start_secondary_prolog+0x10/0x14 Fix it by delaying the fdput() until `stt` is no longer in use, which is effectively the entire function. To keep the patch minimal add a call to fdput() at each of the existing return paths. Future work can convert the function to goto or __cleanup style cleanup. With the fix in place the test case no longer triggers the UAF. Reported-by: Al Viro Closes: https://lore.kernel.org/all/20240610024437.GA1464458@ZenIV/ Signed-off-by: Michael Ellerman Link: https://msgid.link/20240614122910.3499489-1-mpe@ellerman.id.au --- arch/powerpc/kvm/book3s_64_vio.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index b569ebaa590e2d..3ff3de9a52acf2 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -130,14 +130,16 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, } rcu_read_unlock(); - fdput(f); - - if (!found) + if (!found) { + fdput(f); return -EINVAL; + } table_group = iommu_group_get_iommudata(grp); - if (WARN_ON(!table_group)) + if (WARN_ON(!table_group)) { + fdput(f); return -EFAULT; + } for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { struct iommu_table *tbltmp = table_group->tables[i]; @@ -158,8 +160,10 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, break; } } - if (!tbl) + if (!tbl) { + fdput(f); return -EINVAL; + } rcu_read_lock(); list_for_each_entry_rcu(stit, &stt->iommu_tables, next) { @@ -170,6 +174,7 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, /* stit is being destroyed */ iommu_tce_table_put(tbl); rcu_read_unlock(); + fdput(f); return -ENOTTY; } /* @@ -177,6 +182,7 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, * its KVM reference counter and can return. */ rcu_read_unlock(); + fdput(f); return 0; } rcu_read_unlock(); @@ -184,6 +190,7 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, stit = kzalloc(sizeof(*stit), GFP_KERNEL); if (!stit) { iommu_tce_table_put(tbl); + fdput(f); return -ENOMEM; } @@ -192,6 +199,7 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, list_add_rcu(&stit->next, &stt->iommu_tables); + fdput(f); return 0; } From e21d12c7cd5cef8a6c5367f96aaab01249216ded Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 11 Jun 2024 11:36:36 +0900 Subject: [PATCH 1023/1578] block: Improve checks on zone resource limits Make sure that the zone resource limits of a zoned block device are correct by checking that: (a) If the device has a max active zones limit, make sure that the max open zones limit is lower than the max active zones limit. (b) If the device has zone resource limits, check that the limits values are lower than the number of sequential zones of the device. If it is not, assume that the zoned device has no limits by setting the limits to 0. For (a), a check is added to blk_validate_zoned_limits() and an error returned if the max open zones limit exceeds the value of the max active zone limit (if there is one). For (b), given that we need the number of sequential zones of the zoned device, this check is added to disk_update_zone_resources(). This is safe to do as that function is executed with the disk queue frozen and the check executed after queue_limits_start_update() which takes the queue limits lock. Of note is that the early return in this function for zoned devices that do not use zone write plugging (e.g. DM devices using native zone append) is moved to after the new check and adjustment of the zone resource limits so that the check applies to any zoned device. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Niklas Cassel Reviewed-by: Benjamin Marzinski Link: https://lore.kernel.org/r/20240611023639.89277-2-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-settings.c | 8 ++++++++ block/blk-zoned.c | 20 ++++++++++++++++---- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index f11c8676eb4c67..f574181105f8a5 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -80,6 +80,14 @@ static int blk_validate_zoned_limits(struct queue_limits *lim) if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED))) return -EINVAL; + /* + * Given that active zones include open zones, the maximum number of + * open zones cannot be larger than the maximum number of active zones. + */ + if (lim->max_active_zones && + lim->max_open_zones > lim->max_active_zones) + return -EINVAL; + if (lim->zone_write_granularity < lim->logical_block_size) lim->zone_write_granularity = lim->logical_block_size; diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 52abebf56027a8..8f89705f5e1c50 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -1647,8 +1647,22 @@ static int disk_update_zone_resources(struct gendisk *disk, return -ENODEV; } + lim = queue_limits_start_update(q); + + /* + * Some devices can advertize zone resource limits that are larger than + * the number of sequential zones of the zoned block device, e.g. a + * small ZNS namespace. For such case, assume that the zoned device has + * no zone resource limits. + */ + nr_seq_zones = disk->nr_zones - nr_conv_zones; + if (lim.max_open_zones >= nr_seq_zones) + lim.max_open_zones = 0; + if (lim.max_active_zones >= nr_seq_zones) + lim.max_active_zones = 0; + if (!disk->zone_wplugs_pool) - return 0; + goto commit; /* * If the device has no limit on the maximum number of open and active @@ -1657,9 +1671,6 @@ static int disk_update_zone_resources(struct gendisk *disk, * dynamic zone write plug allocation when simultaneously writing to * more zones than the size of the mempool. */ - lim = queue_limits_start_update(q); - - nr_seq_zones = disk->nr_zones - nr_conv_zones; pool_size = max(lim.max_open_zones, lim.max_active_zones); if (!pool_size) pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones); @@ -1673,6 +1684,7 @@ static int disk_update_zone_resources(struct gendisk *disk, lim.max_open_zones = 0; } +commit: return queue_limits_commit_update(q, &lim); } From 7f91ccd8a608dbe39b97a6e43d635378d493f77e Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 11 Jun 2024 11:36:37 +0900 Subject: [PATCH 1024/1578] dm: Call dm_revalidate_zones() after setting the queue limits dm_revalidate_zones() is called from dm_set_zone_restrictions() when the mapped device queue limits are not yet set. However, dm_revalidate_zones() calls blk_revalidate_disk_zones() and this function consults and modifies the mapped device queue limits. Thus, currently, blk_revalidate_disk_zones() operates on limits that are not yet initialized. Fix this by moving the call to dm_revalidate_zones() out of dm_set_zone_restrictions() and into dm_table_set_restrictions() after executing queue_limits_set(). To further cleanup dm_set_zones_restrictions(), the message about the type of zone append (native or emulated) is also moved inside dm_revalidate_zones(). Fixes: 1c0e720228ad ("dm: use queue_limits_set") Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Benjamin Marzinski Reviewed-by: Niklas Cassel Link: https://lore.kernel.org/r/20240611023639.89277-3-dlemoal@kernel.org Signed-off-by: Jens Axboe --- drivers/md/dm-table.c | 15 +++++++++++---- drivers/md/dm-zone.c | 25 ++++++++++--------------- drivers/md/dm.h | 1 + 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index fd789eeb62d943..2805cd2b7c8bf2 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1921,10 +1921,7 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, dm_table_any_dev_attr(t, device_is_not_random, NULL)) blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q); - /* - * For a zoned target, setup the zones related queue attributes - * and resources necessary for zone append emulation if necessary. - */ + /* For a zoned table, setup the zone related queue attributes. */ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && limits->zoned) { r = dm_set_zones_restrictions(t, q, limits); if (r) @@ -1935,6 +1932,16 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (r) return r; + /* + * Now that the limits are set, check the zones mapped by the table + * and setup the resources for zone append emulation if necessary. + */ + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && limits->zoned) { + r = dm_revalidate_zones(t, q); + if (r) + return r; + } + dm_update_crypto_profile(q, t); /* diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 5d66d916730efa..75d0019a0649d4 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -166,14 +166,22 @@ static int dm_check_zoned_cb(struct blk_zone *zone, unsigned int idx, * blk_revalidate_disk_zones() function here as the mapped device is suspended * (this is called from __bind() context). */ -static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t) +int dm_revalidate_zones(struct dm_table *t, struct request_queue *q) { + struct mapped_device *md = t->md; struct gendisk *disk = md->disk; int ret; + if (!get_capacity(disk)) + return 0; + /* Revalidate only if something changed. */ - if (!disk->nr_zones || disk->nr_zones != md->nr_zones) + if (!disk->nr_zones || disk->nr_zones != md->nr_zones) { + DMINFO("%s using %s zone append", + disk->disk_name, + queue_emulates_zone_append(q) ? "emulated" : "native"); md->nr_zones = 0; + } if (md->nr_zones) return 0; @@ -240,9 +248,6 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, lim->max_zone_append_sectors = 0; } - if (!get_capacity(md->disk)) - return 0; - /* * Count conventional zones to check that the mapped device will indeed * have sequential write required zones. @@ -269,16 +274,6 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, return 0; } - if (!md->disk->nr_zones) { - DMINFO("%s using %s zone append", - md->disk->disk_name, - queue_emulates_zone_append(q) ? "emulated" : "native"); - } - - ret = dm_revalidate_zones(md, t); - if (ret < 0) - return ret; - if (!static_key_enabled(&zoned_enabled.key)) static_branch_enable(&zoned_enabled); return 0; diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 53ef8207fe2c15..c984ecb64b1e89 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -103,6 +103,7 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); */ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *lim); +int dm_revalidate_zones(struct dm_table *t, struct request_queue *q); void dm_zone_endio(struct dm_io *io, struct bio *clone); #ifdef CONFIG_BLK_DEV_ZONED int dm_blk_report_zones(struct gendisk *disk, sector_t sector, From 73a74af0c72b7cfd843cbd93e088fc5c51471a84 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 11 Jun 2024 11:36:38 +0900 Subject: [PATCH 1025/1578] dm: Improve zone resource limits handling The generic stacking of limits implemented in the block layer cannot correctly handle stacking of zone resource limits (max open zones and max active zones) because these limits are for an entire device but the stacking may be for a portion of that device (e.g. a dm-linear target that does not cover an entire block device). As a result, when DM devices are created on top of zoned block devices, the DM device never has any zone resource limits advertized, which is only correct if all underlying target devices also have no zone resource limits. If at least one target device has resource limits, the user may see either performance issues (if the max open zone limit of the device is exceeded) or write I/O errors if the max active zone limit of one of the underlying target devices is exceeded. While it is very difficult to correctly and reliably stack zone resource limits in general, cases where targets are not sharing zone resources of the same device can be dealt with relatively easily. Such situation happens when a target maps all sequential zones of a zoned block device: for such mapping, other targets mapping other parts of the same zoned block device can only contain conventional zones and thus will not require any zone resource to correctly handle write operations. For a mapped device constructed with such targets, which includes mapped devices constructed with targets mapping entire zoned block devices, the zone resource limits can be reliably determined using the non-zero minimum of the zone resource limits of all targets. For mapped devices that include targets partially mapping the set of sequential write required zones of zoned block devices, instead of advertizing no zone resource limits, it is also better to set the mapped device limits to the non-zero minimum of the limits of all targets. In this case the limits for a target depend on the number of sequential zones being mapped: if this number of zone is larger than the limits, then the limits of the device apply and can be used. If on the other hand the target maps a number of zones smaller than the limits, then no limits is needed and we can assume that the target has no limits (limits set to 0). This commit improves zone resource limits handling as described above by modifying dm_set_zones_restrictions() to iterate the targets of a mapped device to evaluate the max open and max active zone limits. This relies on an internal "stacking" of the limits of the target devices combined with a direct counting of the number of sequential zones mapped by the targets. 1) For a target mapping an entire zoned block device, the limits for the target are set to the limits of the device. 2) For a target partially mapping a zoned block device, the number of mapped sequential zones is used to determine the limits: if the target maps more sequential write required zones than the device limits, then the limits of the device are used as-is. If the number of mapped sequential zones is lower than the limits, then we assume that the target has no limits (limits set to 0). As this evaluation is done for each target, the zone resource limits for the mapped device are evaluated as the non-zero minimum of the limits of all the targets. For configurations resulting in unreliable limits, i.e. a table containing a target partially mapping a zoned device, a warning message is issued. The counting of mapped sequential zones for the target is done using the new function dm_device_count_zones() which performs a report zones on the entire block device with the callback dm_device_count_zones_cb(). This count of mapped sequential zones is also used to determine if the mapped device contains only conventional zones. This allows simplifying dm_set_zones_restrictions() to not do a report zones just for this. For mapped devices mapping only conventional zones, as before, the mapped device is changed to a regular device by setting its zoned limit to false and clearing all its zone related limits. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Benjamin Marzinski Reviewed-by: Niklas Cassel Link: https://lore.kernel.org/r/20240611023639.89277-4-dlemoal@kernel.org Signed-off-by: Jens Axboe --- drivers/md/dm-zone.c | 180 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 150 insertions(+), 30 deletions(-) diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 75d0019a0649d4..ac9f1f82108bf4 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -145,21 +145,6 @@ bool dm_is_zone_write(struct mapped_device *md, struct bio *bio) } } -/* - * Count conventional zones of a mapped zoned device. If the device - * only has conventional zones, do not expose it as zoned. - */ -static int dm_check_zoned_cb(struct blk_zone *zone, unsigned int idx, - void *data) -{ - unsigned int *nr_conv_zones = data; - - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) - (*nr_conv_zones)++; - - return 0; -} - /* * Revalidate the zones of a mapped device to initialize resource necessary * for zone append emulation. Note that we cannot simply use the block layer @@ -228,13 +213,127 @@ static bool dm_table_supports_zone_append(struct dm_table *t) return true; } +struct dm_device_zone_count { + sector_t start; + sector_t len; + unsigned int total_nr_seq_zones; + unsigned int target_nr_seq_zones; +}; + +/* + * Count the total number of and the number of mapped sequential zones of a + * target zoned device. + */ +static int dm_device_count_zones_cb(struct blk_zone *zone, + unsigned int idx, void *data) +{ + struct dm_device_zone_count *zc = data; + + if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) { + zc->total_nr_seq_zones++; + if (zone->start >= zc->start && + zone->start < zc->start + zc->len) + zc->target_nr_seq_zones++; + } + + return 0; +} + +static int dm_device_count_zones(struct dm_dev *dev, + struct dm_device_zone_count *zc) +{ + int ret; + + ret = blkdev_report_zones(dev->bdev, 0, BLK_ALL_ZONES, + dm_device_count_zones_cb, zc); + if (ret < 0) + return ret; + if (!ret) + return -EIO; + return 0; +} + +struct dm_zone_resource_limits { + unsigned int mapped_nr_seq_zones; + struct queue_limits *lim; + bool reliable_limits; +}; + +static int device_get_zone_resource_limits(struct dm_target *ti, + struct dm_dev *dev, sector_t start, + sector_t len, void *data) +{ + struct dm_zone_resource_limits *zlim = data; + struct gendisk *disk = dev->bdev->bd_disk; + unsigned int max_open_zones, max_active_zones; + int ret; + struct dm_device_zone_count zc = { + .start = start, + .len = len, + }; + + /* + * If the target is not the whole device, the device zone resources may + * be shared between different targets. Check this by counting the + * number of mapped sequential zones: if this number is smaller than the + * total number of sequential zones of the target device, then resource + * sharing may happen and the zone limits will not be reliable. + */ + ret = dm_device_count_zones(dev, &zc); + if (ret) { + DMERR("Count %s zones failed %d", disk->disk_name, ret); + return ret; + } + + /* + * If the target does not map any sequential zones, then we do not need + * any zone resource limits. + */ + if (!zc.target_nr_seq_zones) + return 0; + + /* + * If the target does not map all sequential zones, the limits + * will not be reliable. + */ + if (zc.target_nr_seq_zones < zc.total_nr_seq_zones) + zlim->reliable_limits = false; + + /* + * If the target maps less sequential zones than the limit values, then + * we do not have limits for this target. + */ + max_active_zones = disk->queue->limits.max_active_zones; + if (max_active_zones >= zc.target_nr_seq_zones) + max_active_zones = 0; + zlim->lim->max_active_zones = + min_not_zero(max_active_zones, zlim->lim->max_active_zones); + + max_open_zones = disk->queue->limits.max_open_zones; + if (max_open_zones >= zc.target_nr_seq_zones) + max_open_zones = 0; + zlim->lim->max_open_zones = + min_not_zero(max_open_zones, zlim->lim->max_open_zones); + + /* + * Also count the total number of sequential zones for the mapped + * device so that when we are done inspecting all its targets, we are + * able to check if the mapped device actually has any sequential zones. + */ + zlim->mapped_nr_seq_zones += zc.target_nr_seq_zones; + + return 0; +} + int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *lim) { struct mapped_device *md = t->md; struct gendisk *disk = md->disk; - unsigned int nr_conv_zones = 0; - int ret; + struct dm_zone_resource_limits zlim = { + .reliable_limits = true, + .lim = lim, + }; /* * Check if zone append is natively supported, and if not, set the @@ -249,32 +348,53 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, } /* - * Count conventional zones to check that the mapped device will indeed - * have sequential write required zones. + * Determine the max open and max active zone limits for the mapped + * device by inspecting the zone resource limits and the zones mapped + * by each target. */ - md->zone_revalidate_map = t; - ret = dm_blk_report_zones(disk, 0, UINT_MAX, - dm_check_zoned_cb, &nr_conv_zones); - md->zone_revalidate_map = NULL; - if (ret < 0) { - DMERR("Check zoned failed %d", ret); - return ret; + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (!ti->type->iterate_devices || + ti->type->iterate_devices(ti, + device_get_zone_resource_limits, &zlim)) { + DMERR("Could not determine %s zone resource limits", + disk->disk_name); + return -ENODEV; + } } /* - * If we only have conventional zones, expose the mapped device as - * a regular device. + * If we only have conventional zones mapped, expose the mapped device + + as a regular device. */ - if (nr_conv_zones >= ret) { + if (!zlim.mapped_nr_seq_zones) { lim->max_open_zones = 0; lim->max_active_zones = 0; + lim->max_zone_append_sectors = 0; + lim->zone_write_granularity = 0; + lim->chunk_sectors = 0; lim->zoned = false; clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + md->nr_zones = 0; disk->nr_zones = 0; return 0; } - if (!static_key_enabled(&zoned_enabled.key)) + /* + * Warn once (when the capacity is not yet set) if the mapped device is + * partially using zone resources of the target devices as that leads to + * unreliable limits, i.e. if another mapped device uses the same + * underlying devices, we cannot enforce zone limits to guarantee that + * writing will not lead to errors. Note that we really should return + * an error for such case but there is no easy way to find out if + * another mapped device uses the same underlying zoned devices. + */ + if (!get_capacity(disk) && !zlim.reliable_limits) + DMWARN("%s zone resource limits may be unreliable", + disk->disk_name); + + if (lim->zoned && !static_key_enabled(&zoned_enabled.key)) static_branch_enable(&zoned_enabled); return 0; } From eaa3706fedc6a4142c251b2d4005d850caeabe50 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 11 Jun 2024 11:36:39 +0900 Subject: [PATCH 1026/1578] dm: Remove unused macro DM_ZONE_INVALID_WP_OFST With the switch to using the zone append emulation of the block layer zone write plugging, the macro DM_ZONE_INVALID_WP_OFST is no longer used in dm-zone.c. Remove its definition. Fixes: f211268ed1f9 ("dm: Use the block layer zone append emulation") Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Benjamin Marzinski Reviewed-by: Niklas Cassel Link: https://lore.kernel.org/r/20240611023639.89277-5-dlemoal@kernel.org Signed-off-by: Jens Axboe --- drivers/md/dm-zone.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index ac9f1f82108bf4..70719bf32a2e5e 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -13,8 +13,6 @@ #define DM_MSG_PREFIX "zone" -#define DM_ZONE_INVALID_WP_OFST UINT_MAX - /* * For internal zone reports bypassing the top BIO submission path. */ From a5d400b6439ac734a5c0dbb641e26a38736abc17 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Wed, 29 May 2024 00:48:54 -0300 Subject: [PATCH 1027/1578] arm64: dts: imx93-11x11-evk: Remove the 'no-sdio' property The usdhc2 port is connected to the microSD slot. The presence of the 'no-sdio' property prevents Wifi SDIO cards, such as CMP9010-X-EVB [1] to be detected. Remove the 'no-sdio' property so that SDIO cards could also work. [1] https://www.nxp.com/products/wireless-connectivity/wi-fi-plus-bluetooth-plus-802-15-4/cmp9010-x-evb-iw416-usd-interface-evaluation-board:CMP9010-X-EVB Fixes: e37907bd8294 ("arm64: dts: freescale: add i.MX93 11x11 EVK basic support") Signed-off-by: Fabio Estevam Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx93-11x11-evk.dts | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx93-11x11-evk.dts b/arch/arm64/boot/dts/freescale/imx93-11x11-evk.dts index d400d85f42a922..bd98eff4d685f1 100644 --- a/arch/arm64/boot/dts/freescale/imx93-11x11-evk.dts +++ b/arch/arm64/boot/dts/freescale/imx93-11x11-evk.dts @@ -296,7 +296,6 @@ vmmc-supply = <®_usdhc2_vmmc>; bus-width = <4>; status = "okay"; - no-sdio; no-mmc; }; From 8043832e2a123fd9372007a29192f2f3ba328cd6 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Fri, 14 Jun 2024 11:05:43 +0300 Subject: [PATCH 1028/1578] memblock: use numa_valid_node() helper to check for invalid node ID Introduce numa_valid_node(nid) that verifies that nid is a valid node ID and use that instead of comparing nid parameter with either NUMA_NO_NODE or MAX_NUMNODES. This makes the checks for valid node IDs consistent and more robust and allows to get rid of multiple WARNings. Suggested-by: Linus Torvalds Signed-off-by: Mike Rapoport (IBM) --- include/linux/numa.h | 5 +++++ mm/memblock.c | 28 +++++++--------------------- 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/include/linux/numa.h b/include/linux/numa.h index 1d43371fafd2fc..eb19503604fe36 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -15,6 +15,11 @@ #define NUMA_NO_NODE (-1) #define NUMA_NO_MEMBLK (-1) +static inline bool numa_valid_node(int nid) +{ + return nid >= 0 && nid < MAX_NUMNODES; +} + /* optionally keep NUMA memory info available post init */ #ifdef CONFIG_NUMA_KEEP_MEMINFO #define __initdata_or_meminfo diff --git a/mm/memblock.c b/mm/memblock.c index 08e9806b1cf91b..e81fb68f7f8887 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -754,7 +754,7 @@ bool __init_memblock memblock_validate_numa_coverage(unsigned long threshold_byt /* calculate lose page */ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { - if (nid == NUMA_NO_NODE) + if (!numa_valid_node(nid)) nr_pages += end_pfn - start_pfn; } @@ -1061,7 +1061,7 @@ static bool should_skip_region(struct memblock_type *type, return false; /* only memory regions are associated with nodes, check it */ - if (nid != NUMA_NO_NODE && nid != m_nid) + if (numa_valid_node(nid) && nid != m_nid) return true; /* skip hotpluggable memory regions if needed */ @@ -1118,10 +1118,6 @@ void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags, int idx_a = *idx & 0xffffffff; int idx_b = *idx >> 32; - if (WARN_ONCE(nid == MAX_NUMNODES, - "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) - nid = NUMA_NO_NODE; - for (; idx_a < type_a->cnt; idx_a++) { struct memblock_region *m = &type_a->regions[idx_a]; @@ -1215,9 +1211,6 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, int idx_a = *idx & 0xffffffff; int idx_b = *idx >> 32; - if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) - nid = NUMA_NO_NODE; - if (*idx == (u64)ULLONG_MAX) { idx_a = type_a->cnt - 1; if (type_b != NULL) @@ -1303,7 +1296,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size)) continue; - if (nid == MAX_NUMNODES || nid == r_nid) + if (!numa_valid_node(nid) || nid == r_nid) break; } if (*idx >= type->cnt) { @@ -1339,10 +1332,6 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, int start_rgn, end_rgn; int i, ret; - if (WARN_ONCE(nid == MAX_NUMNODES, - "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) - nid = NUMA_NO_NODE; - ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); if (ret) return ret; @@ -1452,9 +1441,6 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, enum memblock_flags flags = choose_memblock_flags(); phys_addr_t found; - if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) - nid = NUMA_NO_NODE; - if (!align) { /* Can't use WARNs this early in boot on powerpc */ dump_stack(); @@ -1467,7 +1453,7 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, if (found && !memblock_reserve(found, size)) goto done; - if (nid != NUMA_NO_NODE && !exact_nid) { + if (numa_valid_node(nid) && !exact_nid) { found = memblock_find_in_range_node(size, align, start, end, NUMA_NO_NODE, flags); @@ -1987,7 +1973,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type) end = base + size - 1; flags = rgn->flags; #ifdef CONFIG_NUMA - if (memblock_get_region_node(rgn) != MAX_NUMNODES) + if (numa_valid_node(memblock_get_region_node(rgn))) snprintf(nid_buf, sizeof(nid_buf), " on node %d", memblock_get_region_node(rgn)); #endif @@ -2181,7 +2167,7 @@ static void __init memmap_init_reserved_pages(void) start = region->base; end = start + region->size; - if (nid == NUMA_NO_NODE || nid >= MAX_NUMNODES) + if (!numa_valid_node(nid)) nid = early_pfn_to_nid(PFN_DOWN(start)); reserve_bootmem_region(start, end, nid); @@ -2272,7 +2258,7 @@ static int memblock_debug_show(struct seq_file *m, void *private) seq_printf(m, "%4d: ", i); seq_printf(m, "%pa..%pa ", ®->base, &end); - if (nid != MAX_NUMNODES) + if (numa_valid_node(nid)) seq_printf(m, "%4d ", nid); else seq_printf(m, "%4c ", 'x'); From b1fd0d1285b1eae8b99af36fb26ed2512b809af6 Mon Sep 17 00:00:00 2001 From: Ajrat Makhmutov Date: Sat, 15 Jun 2024 15:54:57 +0300 Subject: [PATCH 1029/1578] ALSA: hda/realtek: Enable headset mic on IdeaPad 330-17IKB 81DM Headset microphone do not work out of the box with this laptop. This quirk fixes it. Zihao Wang specified the wrong subsystem id in his patch. Link: https://lore.kernel.org/all/20220424084120.74125-1-wzhd@ustc.edu/ Fixes: 3b79954fd00d ("ALSA: hda/realtek: Add quirk for Yoga Duet 7 13ITL6 speakers") Signed-off-by: Ajrat Makhmutov Link: https://lore.kernel.org/r/20240615125457.167844-1-rauty@altlinux.org Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 79736c8782a31c..a7594d46055d70 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10506,7 +10506,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x3813, "Legion 7i 15IMHG05", ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3818, "Lenovo C940 / Yoga Duet 7", ALC298_FIXUP_LENOVO_C940_DUET7), SND_PCI_QUIRK(0x17aa, 0x3819, "Lenovo 13s Gen2 ITL", ALC287_FIXUP_13S_GEN2_SPEAKERS), - SND_PCI_QUIRK(0x17aa, 0x3820, "Yoga Duet 7 13ITL6", ALC287_FIXUP_YOGA7_14ITL_SPEAKERS), + SND_PCI_QUIRK(0x17aa, 0x3820, "IdeaPad 330-17IKB 81DM", ALC269_FIXUP_ASPIRE_HEADSET_MIC), SND_PCI_QUIRK(0x17aa, 0x3824, "Legion Y9000X 2020", ALC285_FIXUP_LEGION_Y9000X_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3827, "Ideapad S740", ALC285_FIXUP_IDEAPAD_S740_COEF), SND_PCI_QUIRK(0x17aa, 0x3834, "Lenovo IdeaPad Slim 9i 14ITL5", ALC287_FIXUP_YOGA7_14ITL_SPEAKERS), From ba437905b4fbf0ee1686c175069239a1cc292558 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 7 Jun 2024 16:33:00 -0500 Subject: [PATCH 1030/1578] RAS/AMD/ATL: Use system settings for MI300 DRAM to normalized address translation The currently used normalized address format is not applicable to all MI300 systems. This leads to incorrect results during address translation. Drop the fixed layout and construct the normalized address from system settings. Fixes: 87a612375307 ("RAS/AMD/ATL: Add MI300 DRAM to normalized address translation support") Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Cc: Link: https://lore.kernel.org/r/20240607-mi300-dram-xl-fix-v1-2-2f11547a178c@amd.com --- drivers/ras/amd/atl/internal.h | 2 +- drivers/ras/amd/atl/system.c | 2 +- drivers/ras/amd/atl/umc.c | 151 ++++++++++++++++++++++++--------- 3 files changed, 114 insertions(+), 41 deletions(-) diff --git a/drivers/ras/amd/atl/internal.h b/drivers/ras/amd/atl/internal.h index 5de69e0bb0f999..196c1c8b578ced 100644 --- a/drivers/ras/amd/atl/internal.h +++ b/drivers/ras/amd/atl/internal.h @@ -224,7 +224,7 @@ int df_indirect_read_broadcast(u16 node, u8 func, u16 reg, u32 *lo); int get_df_system_info(void); int determine_node_id(struct addr_ctx *ctx, u8 socket_num, u8 die_num); -int get_addr_hash_mi300(void); +int get_umc_info_mi300(void); int get_address_map(struct addr_ctx *ctx); diff --git a/drivers/ras/amd/atl/system.c b/drivers/ras/amd/atl/system.c index 701349e849428c..6979fa3d4fe25c 100644 --- a/drivers/ras/amd/atl/system.c +++ b/drivers/ras/amd/atl/system.c @@ -127,7 +127,7 @@ static int df4_determine_df_rev(u32 reg) if (reg == DF_FUNC0_ID_MI300) { df_cfg.flags.heterogeneous = 1; - if (get_addr_hash_mi300()) + if (get_umc_info_mi300()) return -EINVAL; } diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c index 5cb92330dc6773..a1b4accf7b9659 100644 --- a/drivers/ras/amd/atl/umc.c +++ b/drivers/ras/amd/atl/umc.c @@ -68,6 +68,8 @@ struct xor_bits { }; #define NUM_BANK_BITS 4 +#define NUM_COL_BITS 5 +#define NUM_SID_BITS 2 static struct { /* UMC::CH::AddrHashBank */ @@ -80,7 +82,22 @@ static struct { u8 bank_xor; } addr_hash; +static struct { + u8 bank[NUM_BANK_BITS]; + u8 col[NUM_COL_BITS]; + u8 sid[NUM_SID_BITS]; + u8 num_row_lo; + u8 num_row_hi; + u8 row_lo; + u8 row_hi; + u8 pc; +} bit_shifts; + #define MI300_UMC_CH_BASE 0x90000 +#define MI300_ADDR_CFG (MI300_UMC_CH_BASE + 0x30) +#define MI300_ADDR_SEL (MI300_UMC_CH_BASE + 0x40) +#define MI300_COL_SEL_LO (MI300_UMC_CH_BASE + 0x50) +#define MI300_ADDR_SEL_2 (MI300_UMC_CH_BASE + 0xA4) #define MI300_ADDR_HASH_BANK0 (MI300_UMC_CH_BASE + 0xC8) #define MI300_ADDR_HASH_PC (MI300_UMC_CH_BASE + 0xE0) #define MI300_ADDR_HASH_PC2 (MI300_UMC_CH_BASE + 0xE4) @@ -90,17 +107,42 @@ static struct { #define ADDR_HASH_ROW_XOR GENMASK(31, 14) #define ADDR_HASH_BANK_XOR GENMASK(5, 0) +#define ADDR_CFG_NUM_ROW_LO GENMASK(11, 8) +#define ADDR_CFG_NUM_ROW_HI GENMASK(15, 12) + +#define ADDR_SEL_BANK0 GENMASK(3, 0) +#define ADDR_SEL_BANK1 GENMASK(7, 4) +#define ADDR_SEL_BANK2 GENMASK(11, 8) +#define ADDR_SEL_BANK3 GENMASK(15, 12) +#define ADDR_SEL_BANK4 GENMASK(20, 16) +#define ADDR_SEL_ROW_LO GENMASK(27, 24) +#define ADDR_SEL_ROW_HI GENMASK(31, 28) + +#define COL_SEL_LO_COL0 GENMASK(3, 0) +#define COL_SEL_LO_COL1 GENMASK(7, 4) +#define COL_SEL_LO_COL2 GENMASK(11, 8) +#define COL_SEL_LO_COL3 GENMASK(15, 12) +#define COL_SEL_LO_COL4 GENMASK(19, 16) + +#define ADDR_SEL_2_BANK5 GENMASK(4, 0) +#define ADDR_SEL_2_CHAN GENMASK(15, 12) + /* * Read UMC::CH::AddrHash{Bank,PC,PC2} registers to get XOR bits used - * for hashing. Do this during module init, since the values will not - * change during run time. + * for hashing. + * + * Also, read UMC::CH::Addr{Cfg,Sel,Sel2} and UMC::CH:ColSelLo registers to + * get the values needed to reconstruct the normalized address. Apply additional + * offsets to the raw register values, as needed. + * + * Do this during module init, since the values will not change during run time. * * These registers are instantiated for each UMC across each AMD Node. * However, they should be identically programmed due to the fixed hardware * design of MI300 systems. So read the values from Node 0 UMC 0 and keep a * single global structure for simplicity. */ -int get_addr_hash_mi300(void) +int get_umc_info_mi300(void) { u32 temp; int ret; @@ -130,6 +172,44 @@ int get_addr_hash_mi300(void) addr_hash.bank_xor = FIELD_GET(ADDR_HASH_BANK_XOR, temp); + ret = amd_smn_read(0, MI300_ADDR_CFG, &temp); + if (ret) + return ret; + + bit_shifts.num_row_hi = FIELD_GET(ADDR_CFG_NUM_ROW_HI, temp); + bit_shifts.num_row_lo = 10 + FIELD_GET(ADDR_CFG_NUM_ROW_LO, temp); + + ret = amd_smn_read(0, MI300_ADDR_SEL, &temp); + if (ret) + return ret; + + bit_shifts.bank[0] = 5 + FIELD_GET(ADDR_SEL_BANK0, temp); + bit_shifts.bank[1] = 5 + FIELD_GET(ADDR_SEL_BANK1, temp); + bit_shifts.bank[2] = 5 + FIELD_GET(ADDR_SEL_BANK2, temp); + bit_shifts.bank[3] = 5 + FIELD_GET(ADDR_SEL_BANK3, temp); + /* Use BankBit4 for the SID0 position. */ + bit_shifts.sid[0] = 5 + FIELD_GET(ADDR_SEL_BANK4, temp); + bit_shifts.row_lo = 12 + FIELD_GET(ADDR_SEL_ROW_LO, temp); + bit_shifts.row_hi = 24 + FIELD_GET(ADDR_SEL_ROW_HI, temp); + + ret = amd_smn_read(0, MI300_COL_SEL_LO, &temp); + if (ret) + return ret; + + bit_shifts.col[0] = 2 + FIELD_GET(COL_SEL_LO_COL0, temp); + bit_shifts.col[1] = 2 + FIELD_GET(COL_SEL_LO_COL1, temp); + bit_shifts.col[2] = 2 + FIELD_GET(COL_SEL_LO_COL2, temp); + bit_shifts.col[3] = 2 + FIELD_GET(COL_SEL_LO_COL3, temp); + bit_shifts.col[4] = 2 + FIELD_GET(COL_SEL_LO_COL4, temp); + + ret = amd_smn_read(0, MI300_ADDR_SEL_2, &temp); + if (ret) + return ret; + + /* Use BankBit5 for the SID1 position. */ + bit_shifts.sid[1] = 5 + FIELD_GET(ADDR_SEL_2_BANK5, temp); + bit_shifts.pc = 5 + FIELD_GET(ADDR_SEL_2_CHAN, temp); + return 0; } @@ -146,9 +226,6 @@ int get_addr_hash_mi300(void) * The MCA address format is as follows: * MCA_ADDR[27:0] = {S[1:0], P[0], R[14:0], B[3:0], C[4:0], Z[0]} * - * The normalized address format is fixed in hardware and is as follows: - * NA[30:0] = {S[1:0], R[13:0], C4, B[1:0], B[3:2], C[3:2], P, C[1:0], Z[4:0]} - * * Additionally, the PC and Bank bits may be hashed. This must be accounted for before * reconstructing the normalized address. */ @@ -158,18 +235,10 @@ int get_addr_hash_mi300(void) #define MI300_UMC_MCA_PC BIT(25) #define MI300_UMC_MCA_SID GENMASK(27, 26) -#define MI300_NA_COL_1_0 GENMASK(6, 5) -#define MI300_NA_PC BIT(7) -#define MI300_NA_COL_3_2 GENMASK(9, 8) -#define MI300_NA_BANK_3_2 GENMASK(11, 10) -#define MI300_NA_BANK_1_0 GENMASK(13, 12) -#define MI300_NA_COL_4 BIT(14) -#define MI300_NA_ROW GENMASK(28, 15) -#define MI300_NA_SID GENMASK(30, 29) - static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr) { - u16 i, col, row, bank, pc, sid, temp; + u16 i, col, row, bank, pc, sid; + u32 temp; col = FIELD_GET(MI300_UMC_MCA_COL, addr); bank = FIELD_GET(MI300_UMC_MCA_BANK, addr); @@ -199,34 +268,38 @@ static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr) /* Reconstruct the normalized address starting with NA[4:0] = 0 */ addr = 0; - /* NA[6:5] = Column[1:0] */ - temp = col & 0x3; - addr |= FIELD_PREP(MI300_NA_COL_1_0, temp); - - /* NA[7] = PC */ - addr |= FIELD_PREP(MI300_NA_PC, pc); - - /* NA[9:8] = Column[3:2] */ - temp = (col >> 2) & 0x3; - addr |= FIELD_PREP(MI300_NA_COL_3_2, temp); + /* Column bits */ + for (i = 0; i < NUM_COL_BITS; i++) { + temp = (col >> i) & 0x1; + addr |= temp << bit_shifts.col[i]; + } - /* NA[11:10] = Bank[3:2] */ - temp = (bank >> 2) & 0x3; - addr |= FIELD_PREP(MI300_NA_BANK_3_2, temp); + /* Bank bits */ + for (i = 0; i < NUM_BANK_BITS; i++) { + temp = (bank >> i) & 0x1; + addr |= temp << bit_shifts.bank[i]; + } - /* NA[13:12] = Bank[1:0] */ - temp = bank & 0x3; - addr |= FIELD_PREP(MI300_NA_BANK_1_0, temp); + /* Row lo bits */ + for (i = 0; i < bit_shifts.num_row_lo; i++) { + temp = (row >> i) & 0x1; + addr |= temp << (i + bit_shifts.row_lo); + } - /* NA[14] = Column[4] */ - temp = (col >> 4) & 0x1; - addr |= FIELD_PREP(MI300_NA_COL_4, temp); + /* Row hi bits */ + for (i = 0; i < bit_shifts.num_row_hi; i++) { + temp = (row >> (i + bit_shifts.num_row_lo)) & 0x1; + addr |= temp << (i + bit_shifts.row_hi); + } - /* NA[28:15] = Row[13:0] */ - addr |= FIELD_PREP(MI300_NA_ROW, row); + /* PC bit */ + addr |= pc << bit_shifts.pc; - /* NA[30:29] = SID[1:0] */ - addr |= FIELD_PREP(MI300_NA_SID, sid); + /* SID bits */ + for (i = 0; i < NUM_SID_BITS; i++) { + temp = (sid >> i) & 0x1; + addr |= temp << bit_shifts.sid[i]; + } pr_debug("Addr=0x%016lx", addr); pr_debug("Bank=%u Row=%u Column=%u PC=%u SID=%u", bank, row, col, pc, sid); From 1a49509885cd984b6f63c91da612e258b8a89fc8 Mon Sep 17 00:00:00 2001 From: Gergely Meszaros Date: Sun, 16 Jun 2024 10:52:33 +0200 Subject: [PATCH 1031/1578] ALSA: hda/realtek: Add quirk for Lenovo Yoga Pro 7 14ARP8 Similarly to other Lenovo laptops these also have a dual speaker setup with a shared amplifier. The model also seems to have a conflicting PCI SSID with the codec SSID for the Legion Y9000X 2022 IAH7. Only tested on the Yoga Pro 7, as I don't have access to the other laptop. Signed-off-by: Gergely Meszaros Link: https://lore.kernel.org/r/20240616085233.16922-1-meszaros.gergely97@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index a7594d46055d70..fd337fdd30f69a 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -7520,6 +7520,7 @@ enum { ALC285_FIXUP_ASUS_GU605_SPI_SPEAKER2_TO_DAC1, ALC287_FIXUP_LENOVO_THKPAD_WH_ALC1318, ALC256_FIXUP_CHROME_BOOK, + ALC287_FIXUP_LENOVO_14ARP8_LEGION_IAH7, }; /* A special fixup for Lenovo C940 and Yoga Duet 7; @@ -7559,6 +7560,21 @@ static void alc287_fixup_lenovo_14irp8_duetitl(struct hda_codec *codec, __snd_hda_apply_fixup(codec, id, action, 0); } +/* Similar to above the Lenovo Yoga Pro 7 14ARP8 PCI SSID matches the codec SSID of the + Legion Y9000X 2022 IAH7.*/ +static void alc287_fixup_lenovo_14arp8_legion_iah7(struct hda_codec *codec, + const struct hda_fixup *fix, + int action) +{ + int id; + + if (codec->core.subsystem_id == 0x17aa386e) + id = ALC287_FIXUP_CS35L41_I2C_2; /* Legion Y9000X 2022 IAH7 */ + else + id = ALC285_FIXUP_SPEAKER2_TO_DAC1; /* Yoga Pro 7 14ARP8 */ + __snd_hda_apply_fixup(codec, id, action, 0); +} + /* Another hilarious PCI SSID conflict with Lenovo Legion Pro 7 16ARX8H (with * TAS2781 codec) and Legion 7i 16IAX7 (with CS35L41 codec); * we apply a corresponding fixup depending on the codec SSID instead @@ -9658,6 +9674,10 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK, }, + [ALC287_FIXUP_LENOVO_14ARP8_LEGION_IAH7] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc287_fixup_lenovo_14arp8_legion_iah7, + }, [ALC287_FIXUP_YOGA9_14IMH9_BASS_SPK_PIN] = { .type = HDA_FIXUP_FUNC, .v.func = alc287_fixup_yoga9_14iap7_bass_spk_pin, @@ -10520,7 +10540,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x3865, "Lenovo 13X", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x17aa, 0x3866, "Lenovo 13X", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x17aa, 0x3869, "Lenovo Yoga7 14IAL7", ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK_PIN), - SND_PCI_QUIRK(0x17aa, 0x386e, "Legion Y9000X 2022 IAH7", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x17aa, 0x386e, "Legion Y9000X 2022 IAH7 / Yoga Pro 7 14ARP8", ALC287_FIXUP_LENOVO_14ARP8_LEGION_IAH7), SND_PCI_QUIRK(0x17aa, 0x386f, "Legion Pro 7/7i", ALC287_FIXUP_LENOVO_LEGION_7), SND_PCI_QUIRK(0x17aa, 0x3870, "Lenovo Yoga 7 14ARB7", ALC287_FIXUP_YOGA7_14ARB7_I2C), SND_PCI_QUIRK(0x17aa, 0x3877, "Lenovo Legion 7 Slim 16ARHA7", ALC287_FIXUP_CS35L41_I2C_2), From 67cc6125fb39902169707cb6277f010e56d4a40a Mon Sep 17 00:00:00 2001 From: Max Krummenacher Date: Mon, 3 Jun 2024 16:00:45 +0200 Subject: [PATCH 1032/1578] arm64: dts: freescale: imx8mm-verdin: enable hysteresis on slow input pin SODIMM 17 can be used as an edge triggered interrupt supplied from an off board source. Enable hysteresis on the pinmuxing to increase immunity against noise on the signal. Fixes: 60f01b5b5c7d ("arm64: dts: imx8mm-verdin: update iomux configuration") Signed-off-by: Max Krummenacher Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi index 0d9abca5882182..98544741ce1768 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi @@ -936,7 +936,7 @@ /* Verdin GPIO_9_DSI (pulled-up as active-low) */ pinctrl_gpio_9_dsi: gpio9dsigrp { fsl,pins = - ; /* SODIMM 17 */ + ; /* SODIMM 17 */ }; /* Verdin GPIO_10_DSI (pulled-up as active-low) */ From fcf2a9970ef587d8f358560c381ee6115a9108aa Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 7 Jun 2024 12:18:47 +0200 Subject: [PATCH 1033/1578] leds: class: Revert: "If no default trigger is given, make hw_control trigger the default trigger" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 66601a29bb23 ("leds: class: If no default trigger is given, make hw_control trigger the default trigger") causes ledtrig-netdev to get set as default trigger on various network LEDs. This causes users to hit a pre-existing AB-BA deadlock issue in ledtrig-netdev between the LED-trigger locks and the rtnl mutex, resulting in hung tasks in kernels >= 6.9. Solving the deadlock is non trivial, so for now revert the change to set the hw_control trigger as default trigger, so that ledtrig-netdev no longer gets activated automatically for various network LEDs. The netdev trigger is not needed because the network LEDs are usually under hw-control and the netdev trigger tries to leave things that way so setting it as the active trigger for the LED class device is a no-op. Fixes: 66601a29bb23 ("leds: class: If no default trigger is given, make hw_control trigger the default trigger") Reported-by: Genes Lists Closes: https://lore.kernel.org/all/9d189ec329cfe68ed68699f314e191a10d4b5eda.camel@sapience.com/ Reported-by: Johannes Wüller Closes: https://lore.kernel.org/lkml/e441605c-eaf2-4c2d-872b-d8e541f4cf60@gmail.com/ Cc: stable@vger.kernel.org Signed-off-by: Hans de Goede Reviewed-by: Andrew Lunn Acked-by: Lee Jones Signed-off-by: Linus Torvalds --- drivers/leds/led-class.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c index 24fcff682b24ad..ba1be15cfd8ea3 100644 --- a/drivers/leds/led-class.c +++ b/drivers/leds/led-class.c @@ -552,12 +552,6 @@ int led_classdev_register_ext(struct device *parent, led_init_core(led_cdev); #ifdef CONFIG_LEDS_TRIGGERS - /* - * If no default trigger was given and hw_control_trigger is set, - * make it the default trigger. - */ - if (!led_cdev->default_trigger && led_cdev->hw_control_trigger) - led_cdev->default_trigger = led_cdev->hw_control_trigger; led_trigger_set_default(led_cdev); #endif From 6ba59ff4227927d3a8530fc2973b80e94b54d58f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 16 Jun 2024 13:40:16 -0700 Subject: [PATCH 1034/1578] Linux 6.10-rc4 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 925a75b8ba7def..14427547dc1eb5 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 10 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Baby Opossum Posse # *DOCUMENTATION* From 81cc927d9c5eefd4a1b08e16b0ab2263f36d03f7 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Thu, 23 May 2024 17:45:17 -0400 Subject: [PATCH 1035/1578] io_uring: Drop per-ctx dummy_ubuf Commit 19a63c402170 ("io_uring/rsrc: keep one global dummy_ubuf") replaced it with a global static object but this stayed behind. Fixes: 19a63c402170 ("io_uring/rsrc: keep one global dummy_ubuf") Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20240523214517.31803-1-krisman@suse.de Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index b48570eaa4497f..93c9044ec3fef3 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -373,7 +373,6 @@ struct io_ring_ctx { struct io_restriction restrictions; /* slow path rsrc auxilary data, used by update/register */ - struct io_mapped_ubuf *dummy_ubuf; struct io_rsrc_data *file_data; struct io_rsrc_data *buf_data; From f4eaf8eda89e1ae5d8274297094687245293deff Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Thu, 23 May 2024 17:45:35 -0400 Subject: [PATCH 1036/1578] io_uring/rsrc: Drop io_copy_iov in favor of iovec API Instead of open coding an io_uring function to copy iovs from userspace, rely on the existing iovec_from_user function. While there, avoid repeatedly zeroing the iov in the !arg case for io_sqe_buffer_register. tested with liburing testsuite. Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20240523214535.31890-1-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 60 +++++++++++++++++-------------------------------- 1 file changed, 21 insertions(+), 39 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index edb9c5baf2e290..e89c5e2326a257 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -85,31 +85,6 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) return 0; } -static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, - void __user *arg, unsigned index) -{ - struct iovec __user *src; - -#ifdef CONFIG_COMPAT - if (ctx->compat) { - struct compat_iovec __user *ciovs; - struct compat_iovec ciov; - - ciovs = (struct compat_iovec __user *) arg; - if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) - return -EFAULT; - - dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base); - dst->iov_len = ciov.iov_len; - return 0; - } -#endif - src = (struct iovec __user *) arg; - if (copy_from_user(dst, &src[index], sizeof(*dst))) - return -EFAULT; - return 0; -} - static int io_buffer_validate(struct iovec *iov) { unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); @@ -420,8 +395,9 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update2 *up, unsigned int nr_args) { + struct iovec __user *uvec = u64_to_user_ptr(up->data); u64 __user *tags = u64_to_user_ptr(up->tags); - struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); + struct iovec fast_iov, *iov; struct page *last_hpage = NULL; __u32 done; int i, err; @@ -435,21 +411,23 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu; u64 tag = 0; - err = io_copy_iov(ctx, &iov, iovs, done); - if (err) + iov = iovec_from_user(&uvec[done], 1, 1, &fast_iov, ctx->compat); + if (IS_ERR(iov)) { + err = PTR_ERR(iov); break; + } if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { err = -EFAULT; break; } - err = io_buffer_validate(&iov); + err = io_buffer_validate(iov); if (err) break; - if (!iov.iov_base && tag) { + if (!iov->iov_base && tag) { err = -EINVAL; break; } - err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage); + err = io_sqe_buffer_register(ctx, iov, &imu, &last_hpage); if (err) break; @@ -971,8 +949,9 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, { struct page *last_hpage = NULL; struct io_rsrc_data *data; + struct iovec fast_iov, *iov = &fast_iov; + const struct iovec __user *uvec = (struct iovec * __user) arg; int i, ret; - struct iovec iov; BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); @@ -989,24 +968,27 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } + if (!arg) + memset(iov, 0, sizeof(*iov)); + for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { if (arg) { - ret = io_copy_iov(ctx, &iov, arg, i); - if (ret) + iov = iovec_from_user(&uvec[i], 1, 1, &fast_iov, ctx->compat); + if (IS_ERR(iov)) { + ret = PTR_ERR(iov); break; - ret = io_buffer_validate(&iov); + } + ret = io_buffer_validate(iov); if (ret) break; - } else { - memset(&iov, 0, sizeof(iov)); } - if (!iov.iov_base && *io_get_tag_slot(data, i)) { + if (!iov->iov_base && *io_get_tag_slot(data, i)) { ret = -EINVAL; break; } - ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i], + ret = io_sqe_buffer_register(ctx, iov, &ctx->user_bufs[i], &last_hpage); if (ret) break; From 60b6c075e8eb8bd23c106e2ab13370a146a94a5b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 3 Jun 2024 11:19:10 -0600 Subject: [PATCH 1037/1578] io_uring/eventfd: move to more idiomatic RCU free usage In some ways, it just "happens to work" currently with using the ops field for both the free and signaling bit. But it depends on ordering of operations in terms of freeing and signaling. Clean it up and use the usual refs == 0 under RCU read side lock to determine if the ev_fd is still valid, and use the reference to gate the freeing as well. Fixes: 21a091b970cd ("io_uring: signal registered eventfd to process deferred task work") Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 49 ++++++++++++++++++++++++--------------------- io_uring/io_uring.h | 4 ++-- io_uring/register.c | 6 +++--- 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 154b25b8a613b1..0a24feec27f7b9 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -541,29 +541,33 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx) } } -void io_eventfd_ops(struct rcu_head *rcu) +void io_eventfd_free(struct rcu_head *rcu) { struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); - int ops = atomic_xchg(&ev_fd->ops, 0); - if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT)) - eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); + eventfd_ctx_put(ev_fd->cq_ev_fd); + kfree(ev_fd); +} - /* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback - * ordering in a race but if references are 0 we know we have to free - * it regardless. - */ - if (atomic_dec_and_test(&ev_fd->refs)) { - eventfd_ctx_put(ev_fd->cq_ev_fd); - kfree(ev_fd); - } +void io_eventfd_do_signal(struct rcu_head *rcu) +{ + struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); + + eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); + + if (atomic_dec_and_test(&ev_fd->refs)) + io_eventfd_free(rcu); } static void io_eventfd_signal(struct io_ring_ctx *ctx) { struct io_ev_fd *ev_fd = NULL; - rcu_read_lock(); + if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) + return; + + guard(rcu)(); + /* * rcu_dereference ctx->io_ev_fd once and use it for both for checking * and eventfd_signal @@ -576,24 +580,23 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx) * the function and rcu_read_lock. */ if (unlikely(!ev_fd)) - goto out; - if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) - goto out; + return; + if (!atomic_inc_not_zero(&ev_fd->refs)) + return; if (ev_fd->eventfd_async && !io_wq_current_is_worker()) goto out; if (likely(eventfd_signal_allowed())) { eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); } else { - atomic_inc(&ev_fd->refs); - if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) - call_rcu_hurry(&ev_fd->rcu, io_eventfd_ops); - else - atomic_dec(&ev_fd->refs); + if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) { + call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal); + return; + } } - out: - rcu_read_unlock(); + if (atomic_dec_and_test(&ev_fd->refs)) + call_rcu(&ev_fd->rcu, io_eventfd_free); } static void io_eventfd_flush_signal(struct io_ring_ctx *ctx) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 726e6367af4d37..2b08b402b716d8 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -106,10 +106,10 @@ bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, enum { IO_EVENTFD_OP_SIGNAL_BIT, - IO_EVENTFD_OP_FREE_BIT, }; -void io_eventfd_ops(struct rcu_head *rcu); +void io_eventfd_do_signal(struct rcu_head *rcu); +void io_eventfd_free(struct rcu_head *rcu); void io_activate_pollwq(struct io_ring_ctx *ctx); static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) diff --git a/io_uring/register.c b/io_uring/register.c index c0010a66a6f2c2..212711e9bc8a67 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -63,9 +63,9 @@ static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, ev_fd->eventfd_async = eventfd_async; ctx->has_evfd = true; - rcu_assign_pointer(ctx->io_ev_fd, ev_fd); atomic_set(&ev_fd->refs, 1); atomic_set(&ev_fd->ops, 0); + rcu_assign_pointer(ctx->io_ev_fd, ev_fd); return 0; } @@ -78,8 +78,8 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx) if (ev_fd) { ctx->has_evfd = false; rcu_assign_pointer(ctx->io_ev_fd, NULL); - if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops)) - call_rcu(&ev_fd->rcu, io_eventfd_ops); + if (atomic_dec_and_test(&ev_fd->refs)) + call_rcu(&ev_fd->rcu, io_eventfd_free); return 0; } From 200f3abd14db55f9aadcb74f4e7a678f1c469ba1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 3 Jun 2024 11:51:19 -0600 Subject: [PATCH 1038/1578] io_uring/eventfd: move eventfd handling to separate file This is pretty nicely abstracted already, but let's move it to a separate file rather than have it in the main io_uring file. With that, we can also move the io_ev_fd struct and enum out of global scope. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 8 -- io_uring/Makefile | 6 +- io_uring/eventfd.c | 160 +++++++++++++++++++++++++++++++++ io_uring/eventfd.h | 8 ++ io_uring/io_uring.c | 82 +---------------- io_uring/io_uring.h | 6 -- io_uring/register.c | 56 +----------- 7 files changed, 173 insertions(+), 153 deletions(-) create mode 100644 io_uring/eventfd.c create mode 100644 io_uring/eventfd.h diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 93c9044ec3fef3..850e30be932256 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -211,14 +211,6 @@ struct io_submit_state { struct blk_plug plug; }; -struct io_ev_fd { - struct eventfd_ctx *cq_ev_fd; - unsigned int eventfd_async: 1; - struct rcu_head rcu; - atomic_t refs; - atomic_t ops; -}; - struct io_alloc_cache { void **entries; unsigned int nr_cached; diff --git a/io_uring/Makefile b/io_uring/Makefile index fc1b23c524e83b..61923e11c76772 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -4,9 +4,9 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ tctx.o filetable.o rw.o net.o poll.o \ - uring_cmd.o openclose.o sqpoll.o \ - xattr.o nop.o fs.o splice.o sync.o \ - msg_ring.o advise.o openclose.o \ + eventfd.o uring_cmd.o openclose.o \ + sqpoll.o xattr.o nop.o fs.o splice.o \ + sync.o msg_ring.o advise.o openclose.o \ epoll.o statx.o timeout.o fdinfo.o \ cancel.o waitid.o register.o \ truncate.o memmap.o diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c new file mode 100644 index 00000000000000..b9384503a2b752 --- /dev/null +++ b/io_uring/eventfd.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +#include "io-wq.h" +#include "eventfd.h" + +struct io_ev_fd { + struct eventfd_ctx *cq_ev_fd; + unsigned int eventfd_async: 1; + struct rcu_head rcu; + atomic_t refs; + atomic_t ops; +}; + +enum { + IO_EVENTFD_OP_SIGNAL_BIT, +}; + +static void io_eventfd_free(struct rcu_head *rcu) +{ + struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); + + eventfd_ctx_put(ev_fd->cq_ev_fd); + kfree(ev_fd); +} + +static void io_eventfd_do_signal(struct rcu_head *rcu) +{ + struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); + + eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); + + if (atomic_dec_and_test(&ev_fd->refs)) + io_eventfd_free(rcu); +} + +void io_eventfd_signal(struct io_ring_ctx *ctx) +{ + struct io_ev_fd *ev_fd = NULL; + + if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) + return; + + guard(rcu)(); + + /* + * rcu_dereference ctx->io_ev_fd once and use it for both for checking + * and eventfd_signal + */ + ev_fd = rcu_dereference(ctx->io_ev_fd); + + /* + * Check again if ev_fd exists incase an io_eventfd_unregister call + * completed between the NULL check of ctx->io_ev_fd at the start of + * the function and rcu_read_lock. + */ + if (unlikely(!ev_fd)) + return; + if (!atomic_inc_not_zero(&ev_fd->refs)) + return; + if (ev_fd->eventfd_async && !io_wq_current_is_worker()) + goto out; + + if (likely(eventfd_signal_allowed())) { + eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); + } else { + if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) { + call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal); + return; + } + } +out: + if (atomic_dec_and_test(&ev_fd->refs)) + call_rcu(&ev_fd->rcu, io_eventfd_free); +} + +void io_eventfd_flush_signal(struct io_ring_ctx *ctx) +{ + bool skip; + + spin_lock(&ctx->completion_lock); + + /* + * Eventfd should only get triggered when at least one event has been + * posted. Some applications rely on the eventfd notification count + * only changing IFF a new CQE has been added to the CQ ring. There's + * no depedency on 1:1 relationship between how many times this + * function is called (and hence the eventfd count) and number of CQEs + * posted to the CQ ring. + */ + skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail; + ctx->evfd_last_cq_tail = ctx->cached_cq_tail; + spin_unlock(&ctx->completion_lock); + if (skip) + return; + + io_eventfd_signal(ctx); +} + +int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned int eventfd_async) +{ + struct io_ev_fd *ev_fd; + __s32 __user *fds = arg; + int fd; + + ev_fd = rcu_dereference_protected(ctx->io_ev_fd, + lockdep_is_held(&ctx->uring_lock)); + if (ev_fd) + return -EBUSY; + + if (copy_from_user(&fd, fds, sizeof(*fds))) + return -EFAULT; + + ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); + if (!ev_fd) + return -ENOMEM; + + ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); + if (IS_ERR(ev_fd->cq_ev_fd)) { + int ret = PTR_ERR(ev_fd->cq_ev_fd); + kfree(ev_fd); + return ret; + } + + spin_lock(&ctx->completion_lock); + ctx->evfd_last_cq_tail = ctx->cached_cq_tail; + spin_unlock(&ctx->completion_lock); + + ev_fd->eventfd_async = eventfd_async; + ctx->has_evfd = true; + atomic_set(&ev_fd->refs, 1); + atomic_set(&ev_fd->ops, 0); + rcu_assign_pointer(ctx->io_ev_fd, ev_fd); + return 0; +} + +int io_eventfd_unregister(struct io_ring_ctx *ctx) +{ + struct io_ev_fd *ev_fd; + + ev_fd = rcu_dereference_protected(ctx->io_ev_fd, + lockdep_is_held(&ctx->uring_lock)); + if (ev_fd) { + ctx->has_evfd = false; + rcu_assign_pointer(ctx->io_ev_fd, NULL); + if (atomic_dec_and_test(&ev_fd->refs)) + call_rcu(&ev_fd->rcu, io_eventfd_free); + return 0; + } + + return -ENXIO; +} diff --git a/io_uring/eventfd.h b/io_uring/eventfd.h new file mode 100644 index 00000000000000..d394f49c632105 --- /dev/null +++ b/io_uring/eventfd.h @@ -0,0 +1,8 @@ + +struct io_ring_ctx; +int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned int eventfd_async); +int io_eventfd_unregister(struct io_ring_ctx *ctx); + +void io_eventfd_flush_signal(struct io_ring_ctx *ctx); +void io_eventfd_signal(struct io_ring_ctx *ctx); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0a24feec27f7b9..d10678b9d519cf 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -101,6 +101,7 @@ #include "poll.h" #include "rw.h" #include "alloc_cache.h" +#include "eventfd.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -541,87 +542,6 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx) } } -void io_eventfd_free(struct rcu_head *rcu) -{ - struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); - - eventfd_ctx_put(ev_fd->cq_ev_fd); - kfree(ev_fd); -} - -void io_eventfd_do_signal(struct rcu_head *rcu) -{ - struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); - - eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); - - if (atomic_dec_and_test(&ev_fd->refs)) - io_eventfd_free(rcu); -} - -static void io_eventfd_signal(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd = NULL; - - if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) - return; - - guard(rcu)(); - - /* - * rcu_dereference ctx->io_ev_fd once and use it for both for checking - * and eventfd_signal - */ - ev_fd = rcu_dereference(ctx->io_ev_fd); - - /* - * Check again if ev_fd exists incase an io_eventfd_unregister call - * completed between the NULL check of ctx->io_ev_fd at the start of - * the function and rcu_read_lock. - */ - if (unlikely(!ev_fd)) - return; - if (!atomic_inc_not_zero(&ev_fd->refs)) - return; - if (ev_fd->eventfd_async && !io_wq_current_is_worker()) - goto out; - - if (likely(eventfd_signal_allowed())) { - eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); - } else { - if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) { - call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal); - return; - } - } -out: - if (atomic_dec_and_test(&ev_fd->refs)) - call_rcu(&ev_fd->rcu, io_eventfd_free); -} - -static void io_eventfd_flush_signal(struct io_ring_ctx *ctx) -{ - bool skip; - - spin_lock(&ctx->completion_lock); - - /* - * Eventfd should only get triggered when at least one event has been - * posted. Some applications rely on the eventfd notification count - * only changing IFF a new CQE has been added to the CQ ring. There's - * no depedency on 1:1 relationship between how many times this - * function is called (and hence the eventfd count) and number of CQEs - * posted to the CQ ring. - */ - skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail; - ctx->evfd_last_cq_tail = ctx->cached_cq_tail; - spin_unlock(&ctx->completion_lock); - if (skip) - return; - - io_eventfd_signal(ctx); -} - void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { if (ctx->poll_activated) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 2b08b402b716d8..cd43924eed04e1 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -104,12 +104,6 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx); bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all); -enum { - IO_EVENTFD_OP_SIGNAL_BIT, -}; - -void io_eventfd_do_signal(struct rcu_head *rcu); -void io_eventfd_free(struct rcu_head *rcu); void io_activate_pollwq(struct io_ring_ctx *ctx); static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) diff --git a/io_uring/register.c b/io_uring/register.c index 212711e9bc8a67..f121e02f5e10e6 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -27,65 +27,11 @@ #include "cancel.h" #include "kbuf.h" #include "napi.h" +#include "eventfd.h" #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) -static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, - unsigned int eventfd_async) -{ - struct io_ev_fd *ev_fd; - __s32 __user *fds = arg; - int fd; - - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, - lockdep_is_held(&ctx->uring_lock)); - if (ev_fd) - return -EBUSY; - - if (copy_from_user(&fd, fds, sizeof(*fds))) - return -EFAULT; - - ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); - if (!ev_fd) - return -ENOMEM; - - ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); - if (IS_ERR(ev_fd->cq_ev_fd)) { - int ret = PTR_ERR(ev_fd->cq_ev_fd); - kfree(ev_fd); - return ret; - } - - spin_lock(&ctx->completion_lock); - ctx->evfd_last_cq_tail = ctx->cached_cq_tail; - spin_unlock(&ctx->completion_lock); - - ev_fd->eventfd_async = eventfd_async; - ctx->has_evfd = true; - atomic_set(&ev_fd->refs, 1); - atomic_set(&ev_fd->ops, 0); - rcu_assign_pointer(ctx->io_ev_fd, ev_fd); - return 0; -} - -int io_eventfd_unregister(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd; - - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, - lockdep_is_held(&ctx->uring_lock)); - if (ev_fd) { - ctx->has_evfd = false; - rcu_assign_pointer(ctx->io_ev_fd, NULL); - if (atomic_dec_and_test(&ev_fd->refs)) - call_rcu(&ev_fd->rcu, io_eventfd_free); - return 0; - } - - return -ENXIO; -} - static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { From f2a93294edce87c909d61e18b506404127394891 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 14 Jun 2024 10:57:03 -0600 Subject: [PATCH 1039/1578] io_uring: use 'state' consistently __io_submit_flush_completions() assigns ctx->submit_state to a local variable and uses it in all but one spot, switch that forgotten statement to using 'state' as well. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d10678b9d519cf..57382e523b3367 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1390,7 +1390,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) } __io_cq_unlock_post(ctx); - if (!wq_list_empty(&ctx->submit_state.compl_reqs)) { + if (!wq_list_empty(&state->compl_reqs)) { io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); } From 3474d1b93f897ab33ce160e759afd47d5f412de4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 13 Jun 2024 19:28:27 +0000 Subject: [PATCH 1040/1578] io_uring/io-wq: make io_wq_work flags atomic The work flags can be set/accessed from different tasks, both the originator of the request, and the io-wq workers. While modifications aren't concurrent, it still makes KMSAN unhappy. There's no real downside to just making the flag reading/manipulation use proper atomics here. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- io_uring/io-wq.c | 19 ++++++++++--------- io_uring/io-wq.h | 2 +- io_uring/io_uring.c | 12 ++++++------ 4 files changed, 18 insertions(+), 17 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 850e30be932256..1052a68fd68df4 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -50,7 +50,7 @@ struct io_wq_work_list { struct io_wq_work { struct io_wq_work_node list; - unsigned flags; + atomic_t flags; /* place it here instead of io_kiocb as it fills padding and saves 4B */ int cancel_seq; }; diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 7d3316fe9bfc46..913c92249522e8 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -159,7 +159,7 @@ static inline struct io_wq_acct *io_get_acct(struct io_wq *wq, bool bound) static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq, struct io_wq_work *work) { - return io_get_acct(wq, !(work->flags & IO_WQ_WORK_UNBOUND)); + return io_get_acct(wq, !(atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND)); } static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker) @@ -451,7 +451,7 @@ static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker) static inline unsigned int io_get_work_hash(struct io_wq_work *work) { - return work->flags >> IO_WQ_HASH_SHIFT; + return atomic_read(&work->flags) >> IO_WQ_HASH_SHIFT; } static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash) @@ -592,8 +592,9 @@ static void io_worker_handle_work(struct io_wq_acct *acct, next_hashed = wq_next_work(work); - if (unlikely(do_kill) && (work->flags & IO_WQ_WORK_UNBOUND)) - work->flags |= IO_WQ_WORK_CANCEL; + if (do_kill && + (atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND)) + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); wq->do_work(work); io_assign_current_work(worker, NULL); @@ -891,7 +892,7 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data) static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq) { do { - work->flags |= IO_WQ_WORK_CANCEL; + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); wq->do_work(work); work = wq->free_work(work); } while (work); @@ -926,7 +927,7 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data) void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) { struct io_wq_acct *acct = io_work_get_acct(wq, work); - unsigned long work_flags = work->flags; + unsigned int work_flags = atomic_read(&work->flags); struct io_cb_cancel_data match = { .fn = io_wq_work_match_item, .data = work, @@ -939,7 +940,7 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) * been marked as one that should not get executed, cancel it here. */ if (test_bit(IO_WQ_BIT_EXIT, &wq->state) || - (work->flags & IO_WQ_WORK_CANCEL)) { + (work_flags & IO_WQ_WORK_CANCEL)) { io_run_cancel(work, wq); return; } @@ -982,7 +983,7 @@ void io_wq_hash_work(struct io_wq_work *work, void *val) unsigned int bit; bit = hash_ptr(val, IO_WQ_HASH_ORDER); - work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); + atomic_or(IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT), &work->flags); } static bool __io_wq_worker_cancel(struct io_worker *worker, @@ -990,7 +991,7 @@ static bool __io_wq_worker_cancel(struct io_worker *worker, struct io_wq_work *work) { if (work && match->fn(work, match->data)) { - work->flags |= IO_WQ_WORK_CANCEL; + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); __set_notify_signal(worker->task); return true; } diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h index 2b2a6406dd8ee8..b3b004a7b62528 100644 --- a/io_uring/io-wq.h +++ b/io_uring/io-wq.h @@ -56,7 +56,7 @@ bool io_wq_worker_stopped(void); static inline bool io_wq_is_hashed(struct io_wq_work *work) { - return work->flags & IO_WQ_WORK_HASHED; + return atomic_read(&work->flags) & IO_WQ_WORK_HASHED; } typedef bool (work_cancel_fn)(struct io_wq_work *, void *); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 57382e523b3367..438c44ca3abd63 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -462,9 +462,9 @@ static void io_prep_async_work(struct io_kiocb *req) } req->work.list.next = NULL; - req->work.flags = 0; + atomic_set(&req->work.flags, 0); if (req->flags & REQ_F_FORCE_ASYNC) - req->work.flags |= IO_WQ_WORK_CONCURRENT; + atomic_or(IO_WQ_WORK_CONCURRENT, &req->work.flags); if (req->file && !(req->flags & REQ_F_FIXED_FILE)) req->flags |= io_file_get_flags(req->file); @@ -480,7 +480,7 @@ static void io_prep_async_work(struct io_kiocb *req) io_wq_hash_work(&req->work, file_inode(req->file)); } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { if (def->unbound_nonreg_file) - req->work.flags |= IO_WQ_WORK_UNBOUND; + atomic_or(IO_WQ_WORK_UNBOUND, &req->work.flags); } } @@ -520,7 +520,7 @@ static void io_queue_iowq(struct io_kiocb *req) * worker for it). */ if (WARN_ON_ONCE(!same_thread_group(req->task, current))) - req->work.flags |= IO_WQ_WORK_CANCEL; + atomic_or(IO_WQ_WORK_CANCEL, &req->work.flags); trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work)); io_wq_enqueue(tctx->io_wq, &req->work); @@ -1736,14 +1736,14 @@ void io_wq_submit_work(struct io_wq_work *work) io_arm_ltimeout(req); /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ - if (work->flags & IO_WQ_WORK_CANCEL) { + if (atomic_read(&work->flags) & IO_WQ_WORK_CANCEL) { fail: io_req_task_queue_fail(req, err); return; } if (!io_assign_file(req, def, issue_flags)) { err = -EBADF; - work->flags |= IO_WQ_WORK_CANCEL; + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); goto fail; } From 11d194669271642a5d1bfff6c8011478309e7849 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 14 Jun 2024 18:34:50 -0600 Subject: [PATCH 1041/1578] io_uring/rsrc: remove redundant __set_current_state() post schedule() We're guaranteed to be in a TASK_RUNNING state post schedule, so we never need to set the state after that. While in there, remove the other __set_current_state() as well, and just call finish_wait() when we now we're going to break anyway. This is easier to grok than manual __set_current_state() calls. Reported-by: Linus Torvalds Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index e89c5e2326a257..60c00144471a61 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -224,7 +224,7 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, ret = io_run_task_work_sig(ctx); if (ret < 0) { - __set_current_state(TASK_RUNNING); + finish_wait(&ctx->rsrc_quiesce_wq, &we); mutex_lock(&ctx->uring_lock); if (list_empty(&ctx->rsrc_ref_list)) ret = 0; @@ -232,7 +232,6 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, } schedule(); - __set_current_state(TASK_RUNNING); mutex_lock(&ctx->uring_lock); ret = 0; } while (!list_empty(&ctx->rsrc_ref_list)); From 3b87184f7eff27fef7d7ee18b65f173152e1bb81 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 15 Jun 2024 20:47:14 -0600 Subject: [PATCH 1042/1578] io_uring/advise: support 64-bit lengths The existing fadvise/madvise support only supports 32-bit lengths. Add support for 64-bit lengths, enabled by the application setting sqe->off rather than sqe->len for the length. If sqe->len is set, then that is used as the 32-bit length. If sqe->len is zero, then sqe->off is read for full 64-bit support. Older kernels will return -EINVAL if 64-bit support isn't available. Fixes: 4840e418c2fc ("io_uring: add IORING_OP_FADVISE") Fixes: c1ca757bd6f4 ("io_uring: add IORING_OP_MADVISE") Reported-by: Stefan Signed-off-by: Jens Axboe --- io_uring/advise.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/io_uring/advise.c b/io_uring/advise.c index 7085804c513c09..cb7b881665e57f 100644 --- a/io_uring/advise.c +++ b/io_uring/advise.c @@ -17,14 +17,14 @@ struct io_fadvise { struct file *file; u64 offset; - u32 len; + u64 len; u32 advice; }; struct io_madvise { struct file *file; u64 addr; - u32 len; + u64 len; u32 advice; }; @@ -33,11 +33,13 @@ int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise); - if (sqe->buf_index || sqe->off || sqe->splice_fd_in) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; ma->addr = READ_ONCE(sqe->addr); - ma->len = READ_ONCE(sqe->len); + ma->len = READ_ONCE(sqe->off); + if (!ma->len) + ma->len = READ_ONCE(sqe->len); ma->advice = READ_ONCE(sqe->fadvise_advice); req->flags |= REQ_F_FORCE_ASYNC; return 0; @@ -78,11 +80,13 @@ int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise); - if (sqe->buf_index || sqe->addr || sqe->splice_fd_in) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; fa->offset = READ_ONCE(sqe->off); - fa->len = READ_ONCE(sqe->len); + fa->len = READ_ONCE(sqe->addr); + if (!fa->len) + fa->len = READ_ONCE(sqe->len); fa->advice = READ_ONCE(sqe->fadvise_advice); if (io_fadvise_force_async(fa)) req->flags |= REQ_F_FORCE_ASYNC; From d9c2332199d073c5edd7163d64fbdee6224d8c08 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 14 Jun 2024 09:03:43 +0000 Subject: [PATCH 1043/1578] bdev: make blockdev_mnt static The blockdev_mnt are not used outside the file bdev.c, so the modification is defined as static. block/bdev.c:377:17: warning: symbol 'blockdev_mnt' was not declared. Should it be static? Reported-by: Abaci Robot jpg: Remove closes bugzilla link Reviewed-by: Christoph Hellwig Signed-off-by: Jiapeng Chong Signed-off-by: John Garry Reviewed-by: Bart Van Assche Fixes: 8f3a608827d1 ("bdev: open block device as files") Tested-by: John Garry Link: https://lore.kernel.org/r/20240614090345.655716-2-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- block/bdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bdev.c b/block/bdev.c index 353677ac49b3b9..ced4ac990ec88f 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -385,7 +385,7 @@ static struct file_system_type bd_type = { }; struct super_block *blockdev_superblock __ro_after_init; -struct vfsmount *blockdev_mnt __ro_after_init; +static struct vfsmount *blockdev_mnt __ro_after_init; EXPORT_SYMBOL_GPL(blockdev_superblock); void __init bdev_cache_init(void) From c3042a5403ef2be622023fcc3b11fc1aa08ba7fa Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 14 Jun 2024 09:03:44 +0000 Subject: [PATCH 1044/1578] block: Drop locking annotation for limits_lock Currently compiling block/blk-settings.c with C=1 gives the following warning: block/blk-settings.c:262:9: warning: context imbalance in 'queue_limits_commit_update' - wrong count at exit request_queue.limits_lock is a mutex. Sparse locking annotation for mutexes are currently not supported - see [0] - so drop that locking annotation. [0] https://lore.kernel.org/lkml/cover.1579893447.git.jbi.octave@gmail.com/T/#mbb8bda6c0a7ca7ce19f46df976a8e3b489745488 Fixes: d690cb8ae14bd ("block: add an API to atomically update queue limits") Signed-off-by: John Garry Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240614090345.655716-3-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- block/blk-settings.c | 1 - include/linux/blkdev.h | 1 - 2 files changed, 2 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index f574181105f8a5..377a86fe0fcc41 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -303,7 +303,6 @@ int blk_set_default_limits(struct queue_limits *lim) */ int queue_limits_commit_update(struct request_queue *q, struct queue_limits *lim) - __releases(q->limits_lock) { int error; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0c247a71688561..83af6ac5aa4819 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -893,7 +893,6 @@ static inline unsigned int blk_chunk_sectors_left(sector_t offset, */ static inline struct queue_limits queue_limits_start_update(struct request_queue *q) - __acquires(q->limits_lock) { mutex_lock(&q->limits_lock); return q->limits; From 66088084fdabb6e5075cd19e8ffe15b8bc7e3708 Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 14 Jun 2024 09:03:45 +0000 Subject: [PATCH 1045/1578] block: BFQ: Refactor bfq_exit_icq() to silence sparse warning Currently building for C=1 generates the following warning: block/bfq-iosched.c:5498:9: warning: context imbalance in 'bfq_exit_icq' - different lock contexts for basic block Refactor bfq_exit_icq() into a core part which loops for the actuators, and only lock calling this routine when necessary. Signed-off-by: John Garry Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20240614090345.655716-4-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 4b88a54a9b76cb..36a4998c4b378b 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5463,40 +5463,42 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync, } } +static void _bfq_exit_icq(struct bfq_io_cq *bic, unsigned int num_actuators) +{ + struct bfq_iocq_bfqq_data *bfqq_data = bic->bfqq_data; + unsigned int act_idx; + + for (act_idx = 0; act_idx < num_actuators; act_idx++) { + if (bfqq_data[act_idx].stable_merge_bfqq) + bfq_put_stable_ref(bfqq_data[act_idx].stable_merge_bfqq); + + bfq_exit_icq_bfqq(bic, true, act_idx); + bfq_exit_icq_bfqq(bic, false, act_idx); + } +} + static void bfq_exit_icq(struct io_cq *icq) { struct bfq_io_cq *bic = icq_to_bic(icq); struct bfq_data *bfqd = bic_to_bfqd(bic); unsigned long flags; - unsigned int act_idx; + /* * If bfqd and thus bfqd->num_actuators is not available any * longer, then cycle over all possible per-actuator bfqqs in * next loop. We rely on bic being zeroed on creation, and * therefore on its unused per-actuator fields being NULL. - */ - unsigned int num_actuators = BFQ_MAX_ACTUATORS; - struct bfq_iocq_bfqq_data *bfqq_data = bic->bfqq_data; - - /* + * * bfqd is NULL if scheduler already exited, and in that case * this is the last time these queues are accessed. */ if (bfqd) { spin_lock_irqsave(&bfqd->lock, flags); - num_actuators = bfqd->num_actuators; - } - - for (act_idx = 0; act_idx < num_actuators; act_idx++) { - if (bfqq_data[act_idx].stable_merge_bfqq) - bfq_put_stable_ref(bfqq_data[act_idx].stable_merge_bfqq); - - bfq_exit_icq_bfqq(bic, true, act_idx); - bfq_exit_icq_bfqq(bic, false, act_idx); - } - - if (bfqd) + _bfq_exit_icq(bic, bfqd->num_actuators); spin_unlock_irqrestore(&bfqd->lock, flags); + } else { + _bfq_exit_icq(bic, BFQ_MAX_ACTUATORS); + } } /* From dfd239a039b3581ca25f932e66b6e2c2bf77c798 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Fri, 14 Jun 2024 11:06:32 -0400 Subject: [PATCH 1046/1578] arm64: dts: imx8qm-mek: fix gpio number for reg_usdhc2_vmmc The gpio in "reg_usdhc2_vmmc" should be 7 instead of 19. Cc: stable@vger.kernel.org Fixes: 307fd14d4b14 ("arm64: dts: imx: add imx8qm mek support") Reviewed-by: Peng Fan Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8qm-mek.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx8qm-mek.dts b/arch/arm64/boot/dts/freescale/imx8qm-mek.dts index 5c6b39c6933fc4..6e05361c1ffb23 100644 --- a/arch/arm64/boot/dts/freescale/imx8qm-mek.dts +++ b/arch/arm64/boot/dts/freescale/imx8qm-mek.dts @@ -36,7 +36,7 @@ regulator-name = "SD1_SPWR"; regulator-min-microvolt = <3000000>; regulator-max-microvolt = <3000000>; - gpio = <&lsio_gpio4 19 GPIO_ACTIVE_HIGH>; + gpio = <&lsio_gpio4 7 GPIO_ACTIVE_HIGH>; enable-active-high; }; From 348a1983cf4cf5099fc398438a968443af4c9f65 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 13 Jun 2024 08:51:48 +1000 Subject: [PATCH 1047/1578] xfs: fix unlink vs cluster buffer instantiation race Luis has been reporting an assert failure when freeing an inode cluster during inode inactivation for a while. The assert looks like: XFS: Assertion failed: bp->b_flags & XBF_DONE, file: fs/xfs/xfs_trans_buf.c, line: 241 ------------[ cut here ]------------ kernel BUG at fs/xfs/xfs_message.c:102! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI CPU: 4 PID: 73 Comm: kworker/4:1 Not tainted 6.10.0-rc1 #4 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 Workqueue: xfs-inodegc/loop5 xfs_inodegc_worker [xfs] RIP: 0010:assfail (fs/xfs/xfs_message.c:102) xfs RSP: 0018:ffff88810188f7f0 EFLAGS: 00010202 RAX: 0000000000000000 RBX: ffff88816e748250 RCX: 1ffffffff844b0e7 RDX: 0000000000000004 RSI: ffff88810188f558 RDI: ffffffffc2431fa0 RBP: 1ffff11020311f01 R08: 0000000042431f9f R09: ffffed1020311e9b R10: ffff88810188f4df R11: ffffffffac725d70 R12: ffff88817a3f4000 R13: ffff88812182f000 R14: ffff88810188f998 R15: ffffffffc2423f80 FS: 0000000000000000(0000) GS:ffff8881c8400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055fe9d0f109c CR3: 000000014426c002 CR4: 0000000000770ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: xfs_trans_read_buf_map (fs/xfs/xfs_trans_buf.c:241 (discriminator 1)) xfs xfs_imap_to_bp (fs/xfs/xfs_trans.h:210 fs/xfs/libxfs/xfs_inode_buf.c:138) xfs xfs_inode_item_precommit (fs/xfs/xfs_inode_item.c:145) xfs xfs_trans_run_precommits (fs/xfs/xfs_trans.c:931) xfs __xfs_trans_commit (fs/xfs/xfs_trans.c:966) xfs xfs_inactive_ifree (fs/xfs/xfs_inode.c:1811) xfs xfs_inactive (fs/xfs/xfs_inode.c:2013) xfs xfs_inodegc_worker (fs/xfs/xfs_icache.c:1841 fs/xfs/xfs_icache.c:1886) xfs process_one_work (kernel/workqueue.c:3231) worker_thread (kernel/workqueue.c:3306 (discriminator 2) kernel/workqueue.c:3393 (discriminator 2)) kthread (kernel/kthread.c:389) ret_from_fork (arch/x86/kernel/process.c:147) ret_from_fork_asm (arch/x86/entry/entry_64.S:257) And occurs when the the inode precommit handlers is attempt to look up the inode cluster buffer to attach the inode for writeback. The trail of logic that I can reconstruct is as follows. 1. the inode is clean when inodegc runs, so it is not attached to a cluster buffer when precommit runs. 2. #1 implies the inode cluster buffer may be clean and not pinned by dirty inodes when inodegc runs. 3. #2 implies that the inode cluster buffer can be reclaimed by memory pressure at any time. 4. The assert failure implies that the cluster buffer was attached to the transaction, but not marked done. It had been accessed earlier in the transaction, but not marked done. 5. #4 implies the cluster buffer has been invalidated (i.e. marked stale). 6. #5 implies that the inode cluster buffer was instantiated uninitialised in the transaction in xfs_ifree_cluster(), which only instantiates the buffers to invalidate them and never marks them as done. Given factors 1-3, this issue is highly dependent on timing and environmental factors. Hence the issue can be very difficult to reproduce in some situations, but highly reliable in others. Luis has an environment where it can be reproduced easily by g/531 but, OTOH, I've reproduced it only once in ~2000 cycles of g/531. I think the fix is to have xfs_ifree_cluster() set the XBF_DONE flag on the cluster buffers, even though they may not be initialised. The reasons why I think this is safe are: 1. A buffer cache lookup hit on a XBF_STALE buffer will clear the XBF_DONE flag. Hence all future users of the buffer know they have to re-initialise the contents before use and mark it done themselves. 2. xfs_trans_binval() sets the XFS_BLI_STALE flag, which means the buffer remains locked until the journal commit completes and the buffer is unpinned. Hence once marked XBF_STALE/XFS_BLI_STALE by xfs_ifree_cluster(), the only context that can access the freed buffer is the currently running transaction. 3. #2 implies that future buffer lookups in the currently running transaction will hit the transaction match code and not the buffer cache. Hence XBF_STALE and XFS_BLI_STALE will not be cleared unless the transaction initialises and logs the buffer with valid contents again. At which point, the buffer will be marked marked XBF_DONE again, so having XBF_DONE already set on the stale buffer is a moot point. 4. #2 also implies that any concurrent access to that cluster buffer will block waiting on the buffer lock until the inode cluster has been fully freed and is no longer an active inode cluster buffer. 5. #4 + #1 means that any future user of the disk range of that buffer will always see the range of disk blocks covered by the cluster buffer as not done, and hence must initialise the contents themselves. 6. Setting XBF_DONE in xfs_ifree_cluster() then means the unlinked inode precommit code will see a XBF_DONE buffer from the transaction match as it expects. It can then attach the stale but newly dirtied inode to the stale but newly dirtied cluster buffer without unexpected failures. The stale buffer will then sail through the journal and do the right thing with the attached stale inode during unpin. Hence the fix is just one line of extra code. The explanation of why we have to set XBF_DONE in xfs_ifree_cluster, OTOH, is long and complex.... Fixes: 82842fee6e59 ("xfs: fix AGF vs inode cluster buffer deadlock") Signed-off-by: Dave Chinner Tested-by: Luis Chamberlain Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/xfs_inode.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 58fb7a5062e1e6..f36091e1e7f50b 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2548,11 +2548,26 @@ xfs_ifree_cluster( * This buffer may not have been correctly initialised as we * didn't read it from disk. That's not important because we are * only using to mark the buffer as stale in the log, and to - * attach stale cached inodes on it. That means it will never be - * dispatched for IO. If it is, we want to know about it, and we - * want it to fail. We can acheive this by adding a write - * verifier to the buffer. + * attach stale cached inodes on it. + * + * For the inode that triggered the cluster freeing, this + * attachment may occur in xfs_inode_item_precommit() after we + * have marked this buffer stale. If this buffer was not in + * memory before xfs_ifree_cluster() started, it will not be + * marked XBF_DONE and this will cause problems later in + * xfs_inode_item_precommit() when we trip over a (stale, !done) + * buffer to attached to the transaction. + * + * Hence we have to mark the buffer as XFS_DONE here. This is + * safe because we are also marking the buffer as XBF_STALE and + * XFS_BLI_STALE. That means it will never be dispatched for + * IO and it won't be unlocked until the cluster freeing has + * been committed to the journal and the buffer unpinned. If it + * is written, we want to know about it, and we want it to + * fail. We can acheive this by adding a write verifier to the + * buffer. */ + bp->b_flags |= XBF_DONE; bp->b_ops = &xfs_inode_buf_ops; /* From 8da86499d4cd125a9561f9cd1de7fba99b0aecbf Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 29 May 2024 18:29:52 +0200 Subject: [PATCH 1048/1578] pinctrl: qcom: spmi-gpio: drop broken pm8008 support The SPMI GPIO driver assumes that the parent device is an SPMI device and accesses random data when backcasting the parent struct device pointer for non-SPMI devices. Fortunately this does not seem to cause any issues currently when the parent device is an I2C client like the PM8008, but this could change if the structures are reorganised (e.g. using structure randomisation). Notably the interrupt implementation is also broken for non-SPMI devices. Also note that the two GPIO pins on PM8008 are used for interrupts and reset so their practical use should be limited. Drop the broken GPIO support for PM8008 for now. Fixes: ea119e5a482a ("pinctrl: qcom-pmic-gpio: Add support for pm8008") Cc: stable@vger.kernel.org # 5.13 Reviewed-by: Bryan O'Donoghue Reviewed-by: Stephen Boyd Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20240529162958.18081-9-johan+linaro@kernel.org Signed-off-by: Linus Walleij --- drivers/pinctrl/qcom/pinctrl-spmi-gpio.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c index 4e80c7204e5fb4..4abd6f18bbefe5 100644 --- a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c +++ b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c @@ -1207,7 +1207,6 @@ static const struct of_device_id pmic_gpio_of_match[] = { { .compatible = "qcom,pm7325-gpio", .data = (void *) 10 }, { .compatible = "qcom,pm7550ba-gpio", .data = (void *) 8}, { .compatible = "qcom,pm8005-gpio", .data = (void *) 4 }, - { .compatible = "qcom,pm8008-gpio", .data = (void *) 2 }, { .compatible = "qcom,pm8019-gpio", .data = (void *) 6 }, /* pm8150 has 10 GPIOs with holes on 2, 5, 7 and 8 */ { .compatible = "qcom,pm8150-gpio", .data = (void *) 10 }, From 550dec8593cd0b32ffc90bb88edb190c04efbb48 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 29 May 2024 18:29:53 +0200 Subject: [PATCH 1049/1578] dt-bindings: pinctrl: qcom,pmic-gpio: drop pm8008 The binding for PM8008 is being reworked so that internal details like interrupts and register offsets are no longer described. This specifically also involves dropping the gpio child node and its compatible string which is no longer needed. Note that there are currently no users of the upstream binding and driver. Reviewed-by: Stephen Boyd Signed-off-by: Johan Hovold Reviewed-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20240529162958.18081-10-johan+linaro@kernel.org Signed-off-by: Linus Walleij --- Documentation/devicetree/bindings/pinctrl/qcom,pmic-gpio.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,pmic-gpio.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,pmic-gpio.yaml index 50846a2d09c887..0bf2d9f093b5c0 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,pmic-gpio.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,pmic-gpio.yaml @@ -29,7 +29,6 @@ properties: - qcom,pm7325-gpio - qcom,pm7550ba-gpio - qcom,pm8005-gpio - - qcom,pm8008-gpio - qcom,pm8018-gpio - qcom,pm8019-gpio - qcom,pm8038-gpio @@ -126,7 +125,6 @@ allOf: compatible: contains: enum: - - qcom,pm8008-gpio - qcom,pmi8950-gpio - qcom,pmr735d-gpio then: @@ -448,7 +446,6 @@ $defs: - gpio1-gpio10 for pm7325 - gpio1-gpio8 for pm7550ba - gpio1-gpio4 for pm8005 - - gpio1-gpio2 for pm8008 - gpio1-gpio6 for pm8018 - gpio1-gpio12 for pm8038 - gpio1-gpio40 for pm8058 From 03ecbe4bb61886cc7fff5b0efc3784e86ac16214 Mon Sep 17 00:00:00 2001 From: Thomas Richard Date: Mon, 3 Jun 2024 10:21:10 +0200 Subject: [PATCH 1050/1578] pinctrl: tps6594: add missing support for LP8764 PMIC Add missing support for LP8764 PMIC in the probe(). Issue detected with v6.10-rc1 (and reproduced with 6.10-rc2) using a TI J7200 EVM board. tps6594-pinctrl tps6594-pinctrl.8.auto: error -EINVAL: Couldn't register gpio_regmap driver tps6594-pinctrl tps6594-pinctrl.8.auto: probe with driver tps6594-pinctrl failed with error -22 Fixes: 208829715917 (pinctrl: pinctrl-tps6594: Add TPS65224 PMIC pinctrl and GPIO) Signed-off-by: Thomas Richard Link: https://lore.kernel.org/r/20240603082110.2104977-1-thomas.richard@bootlin.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-tps6594.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pinctrl/pinctrl-tps6594.c b/drivers/pinctrl/pinctrl-tps6594.c index 08504732085324..5e7c7cf93445f4 100644 --- a/drivers/pinctrl/pinctrl-tps6594.c +++ b/drivers/pinctrl/pinctrl-tps6594.c @@ -486,6 +486,7 @@ static int tps6594_pinctrl_probe(struct platform_device *pdev) break; case TPS6593: case TPS6594: + case LP8764: pctrl_desc->pins = tps6594_pins; pctrl_desc->npins = ARRAY_SIZE(tps6594_pins); From 61fef29ad120d3077aeb0ef598d76822acb8c4cb Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Mon, 3 Jun 2024 20:19:37 +0200 Subject: [PATCH 1051/1578] pinctrl: bcm2835: Fix permissions of persist_gpio_outputs The commit 8ff05989b44e ("pinctrl: bcm2835: Make pin freeing behavior configurable") unintentionally made the module parameter persist_gpio_outputs changeable at runtime. So drop the write permission in order to make the freeing behavior predictable for user applications. Fixes: 8ff05989b44e ("pinctrl: bcm2835: Make pin freeing behavior configurable") Reported-by: Andy Shevchenko Closes: https://lore.kernel.org/linux-gpio/Zjk-C0nLmlynqLAE@surfacebook.localdomain/ Signed-off-by: Stefan Wahren Acked-by: Florian Fainelli Link: https://lore.kernel.org/r/20240603181938.76047-2-wahrenst@gmx.net Signed-off-by: Linus Walleij --- drivers/pinctrl/bcm/pinctrl-bcm2835.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/bcm/pinctrl-bcm2835.c b/drivers/pinctrl/bcm/pinctrl-bcm2835.c index 7178a38475cccb..27fd547957915c 100644 --- a/drivers/pinctrl/bcm/pinctrl-bcm2835.c +++ b/drivers/pinctrl/bcm/pinctrl-bcm2835.c @@ -245,7 +245,7 @@ static const char * const irq_type_names[] = { }; static bool persist_gpio_outputs; -module_param(persist_gpio_outputs, bool, 0644); +module_param(persist_gpio_outputs, bool, 0444); MODULE_PARM_DESC(persist_gpio_outputs, "Enable GPIO_OUT persistence when pin is freed"); static inline u32 bcm2835_gpio_rd(struct bcm2835_pinctrl *pc, unsigned reg) From adec57ff8e66aee632f3dd1f93787c13d112b7a1 Mon Sep 17 00:00:00 2001 From: Hagar Hemdan Date: Tue, 4 Jun 2024 08:58:38 +0000 Subject: [PATCH 1052/1578] pinctrl: fix deadlock in create_pinctrl() when handling -EPROBE_DEFER In create_pinctrl(), pinctrl_maps_mutex is acquired before calling add_setting(). If add_setting() returns -EPROBE_DEFER, create_pinctrl() calls pinctrl_free(). However, pinctrl_free() attempts to acquire pinctrl_maps_mutex, which is already held by create_pinctrl(), leading to a potential deadlock. This patch resolves the issue by releasing pinctrl_maps_mutex before calling pinctrl_free(), preventing the deadlock. This bug was discovered and resolved using Coverity Static Analysis Security Testing (SAST) by Synopsys, Inc. Fixes: 42fed7ba44e4 ("pinctrl: move subsystem mutex to pinctrl_dev struct") Suggested-by: Maximilian Heyne Signed-off-by: Hagar Hemdan Link: https://lore.kernel.org/r/20240604085838.3344-1-hagarhem@amazon.com Signed-off-by: Linus Walleij --- drivers/pinctrl/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c index cffeb869130dda..f424a57f001362 100644 --- a/drivers/pinctrl/core.c +++ b/drivers/pinctrl/core.c @@ -1106,8 +1106,8 @@ static struct pinctrl *create_pinctrl(struct device *dev, * an -EPROBE_DEFER later, as that is the worst case. */ if (ret == -EPROBE_DEFER) { - pinctrl_free(p, false); mutex_unlock(&pinctrl_maps_mutex); + pinctrl_free(p, false); return ERR_PTR(ret); } } From e8448a6c817c2aa6c6af785b1d45678bd5977e8d Mon Sep 17 00:00:00 2001 From: Huang-Huang Bao Date: Thu, 6 Jun 2024 20:57:52 +0800 Subject: [PATCH 1053/1578] pinctrl: rockchip: fix pinmux bits for RK3328 GPIO2-B pins The pinmux bits for GPIO2-B0 to GPIO2-B6 actually have 2 bits width, correct the bank flag for GPIO2-B. The pinmux bits for GPIO2-B7 is recalculated so it remain unchanged. The pinmux bits for those pins are not explicitly specified in RK3328 TRM, however we can get hint from pad name and its correspinding IOMUX setting for pins in interface descriptions. The correspinding IOMIX settings for GPIO2-B0 to GPIO2-B6 can be found in the same row next to occurrences of following pad names in RK3328 TRM. GPIO2-B0: IO_SPIclkm0_GPIO2B0vccio5 GPIO2-B1: IO_SPItxdm0_GPIO2B1vccio5 GPIO2-B2: IO_SPIrxdm0_GPIO2B2vccio5 GPIO2-B3: IO_SPIcsn0m0_GPIO2B3vccio5 GPIO2-B4: IO_SPIcsn1m0_FLASHvol_sel_GPIO2B4vccio5 GPIO2-B5: IO_ I2C2sda_TSADCshut_GPIO2B5vccio5 GPIO2-B6: IO_ I2C2scl_GPIO2B6vccio5 This fix has been tested on NanoPi R2S for fixing confliting pinmux bits between GPIO2-B7 with GPIO2-B5. Signed-off-by: Huang-Huang Bao Reviewed-by: Heiko Stuebner Fixes: 3818e4a7678e ("pinctrl: rockchip: Add rk3328 pinctrl support") Link: https://lore.kernel.org/r/20240606125755.53778-2-i@eh5.me Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-rockchip.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/pinctrl/pinctrl-rockchip.c b/drivers/pinctrl/pinctrl-rockchip.c index 3bedf36a0019df..78dcf4daccde8d 100644 --- a/drivers/pinctrl/pinctrl-rockchip.c +++ b/drivers/pinctrl/pinctrl-rockchip.c @@ -634,12 +634,6 @@ static struct rockchip_mux_recalced_data rk3308_mux_recalced_data[] = { static struct rockchip_mux_recalced_data rk3328_mux_recalced_data[] = { { - .num = 2, - .pin = 12, - .reg = 0x24, - .bit = 8, - .mask = 0x3 - }, { .num = 2, .pin = 15, .reg = 0x28, @@ -3763,7 +3757,7 @@ static struct rockchip_pin_bank rk3328_pin_banks[] = { PIN_BANK_IOMUX_FLAGS(0, 32, "gpio0", 0, 0, 0, 0), PIN_BANK_IOMUX_FLAGS(1, 32, "gpio1", 0, 0, 0, 0), PIN_BANK_IOMUX_FLAGS(2, 32, "gpio2", 0, - IOMUX_WIDTH_3BIT, + 0, IOMUX_WIDTH_3BIT, 0), PIN_BANK_IOMUX_FLAGS(3, 32, "gpio3", From 5ef6914e0bf578357b4c906ffe6b26e7eedb8ccf Mon Sep 17 00:00:00 2001 From: Huang-Huang Bao Date: Thu, 6 Jun 2024 20:57:53 +0800 Subject: [PATCH 1054/1578] pinctrl: rockchip: fix pinmux bits for RK3328 GPIO3-B pins The pinmux bits for GPIO3-B1 to GPIO3-B6 pins are not explicitly specified in RK3328 TRM, however we can get hint from pad name and its correspinding IOMUX setting for pins in interface descriptions. The correspinding IOMIX settings for these pins can be found in the same row next to occurrences of following pad names in RK3328 TRM. GPIO3-B1: IO_TSPd5m0_CIFdata5m0_GPIO3B1vccio6 GPIO3-B2: IO_TSPd6m0_CIFdata6m0_GPIO3B2vccio6 GPIO3-B3: IO_TSPd7m0_CIFdata7m0_GPIO3B3vccio6 GPIO3-B4: IO_CARDclkm0_GPIO3B4vccio6 GPIO3-B5: IO_CARDrstm0_GPIO3B5vccio6 GPIO3-B6: IO_CARDdetm0_GPIO3B6vccio6 Add pinmux data to rk3328_mux_recalced_data as mux register offset for these pins does not follow rockchip convention. Signed-off-by: Huang-Huang Bao Reviewed-by: Heiko Stuebner Fixes: 3818e4a7678e ("pinctrl: rockchip: Add rk3328 pinctrl support") Link: https://lore.kernel.org/r/20240606125755.53778-3-i@eh5.me Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-rockchip.c | 51 ++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/drivers/pinctrl/pinctrl-rockchip.c b/drivers/pinctrl/pinctrl-rockchip.c index 78dcf4daccde8d..23531ea0d088f8 100644 --- a/drivers/pinctrl/pinctrl-rockchip.c +++ b/drivers/pinctrl/pinctrl-rockchip.c @@ -634,17 +634,68 @@ static struct rockchip_mux_recalced_data rk3308_mux_recalced_data[] = { static struct rockchip_mux_recalced_data rk3328_mux_recalced_data[] = { { + /* gpio2_b7_sel */ .num = 2, .pin = 15, .reg = 0x28, .bit = 0, .mask = 0x7 }, { + /* gpio2_c7_sel */ .num = 2, .pin = 23, .reg = 0x30, .bit = 14, .mask = 0x3 + }, { + /* gpio3_b1_sel */ + .num = 3, + .pin = 9, + .reg = 0x44, + .bit = 2, + .mask = 0x3 + }, { + /* gpio3_b2_sel */ + .num = 3, + .pin = 10, + .reg = 0x44, + .bit = 4, + .mask = 0x3 + }, { + /* gpio3_b3_sel */ + .num = 3, + .pin = 11, + .reg = 0x44, + .bit = 6, + .mask = 0x3 + }, { + /* gpio3_b4_sel */ + .num = 3, + .pin = 12, + .reg = 0x44, + .bit = 8, + .mask = 0x3 + }, { + /* gpio3_b5_sel */ + .num = 3, + .pin = 13, + .reg = 0x44, + .bit = 10, + .mask = 0x3 + }, { + /* gpio3_b6_sel */ + .num = 3, + .pin = 14, + .reg = 0x44, + .bit = 12, + .mask = 0x3 + }, { + /* gpio3_b7_sel */ + .num = 3, + .pin = 15, + .reg = 0x44, + .bit = 14, + .mask = 0x3 }, }; From 01b4b1d1cec48ef4c26616c2fc4600b2c9fec05a Mon Sep 17 00:00:00 2001 From: Huang-Huang Bao Date: Thu, 6 Jun 2024 20:57:54 +0800 Subject: [PATCH 1055/1578] pinctrl: rockchip: use dedicated pinctrl type for RK3328 rk3328_pin_ctrl uses type of RK3288 which has a hack in rockchip_pinctrl_suspend and rockchip_pinctrl_resume to restore GPIO6-C6 at assume, the hack is not applicable to RK3328 as GPIO6 is not even exist in it. So use a dedicated pinctrl type to skip this hack. Fixes: 3818e4a7678e ("pinctrl: rockchip: Add rk3328 pinctrl support") Reviewed-by: Heiko Stuebner Signed-off-by: Huang-Huang Bao Link: https://lore.kernel.org/r/20240606125755.53778-4-i@eh5.me Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-rockchip.c | 5 ++++- drivers/pinctrl/pinctrl-rockchip.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/pinctrl/pinctrl-rockchip.c b/drivers/pinctrl/pinctrl-rockchip.c index 23531ea0d088f8..24ee88863ce399 100644 --- a/drivers/pinctrl/pinctrl-rockchip.c +++ b/drivers/pinctrl/pinctrl-rockchip.c @@ -2478,6 +2478,7 @@ static int rockchip_get_pull(struct rockchip_pin_bank *bank, int pin_num) case RK3188: case RK3288: case RK3308: + case RK3328: case RK3368: case RK3399: case RK3568: @@ -2536,6 +2537,7 @@ static int rockchip_set_pull(struct rockchip_pin_bank *bank, case RK3188: case RK3288: case RK3308: + case RK3328: case RK3368: case RK3399: case RK3568: @@ -2798,6 +2800,7 @@ static bool rockchip_pinconf_pull_valid(struct rockchip_pin_ctrl *ctrl, case RK3188: case RK3288: case RK3308: + case RK3328: case RK3368: case RK3399: case RK3568: @@ -3822,7 +3825,7 @@ static struct rockchip_pin_ctrl rk3328_pin_ctrl = { .pin_banks = rk3328_pin_banks, .nr_banks = ARRAY_SIZE(rk3328_pin_banks), .label = "RK3328-GPIO", - .type = RK3288, + .type = RK3328, .grf_mux_offset = 0x0, .iomux_recalced = rk3328_mux_recalced_data, .niomux_recalced = ARRAY_SIZE(rk3328_mux_recalced_data), diff --git a/drivers/pinctrl/pinctrl-rockchip.h b/drivers/pinctrl/pinctrl-rockchip.h index 4759f336941ef3..849266f8b19131 100644 --- a/drivers/pinctrl/pinctrl-rockchip.h +++ b/drivers/pinctrl/pinctrl-rockchip.h @@ -193,6 +193,7 @@ enum rockchip_pinctrl_type { RK3188, RK3288, RK3308, + RK3328, RK3368, RK3399, RK3568, From 4ea4d4808e342ddf89ba24b93ffa2057005aaced Mon Sep 17 00:00:00 2001 From: Huang-Huang Bao Date: Thu, 6 Jun 2024 20:57:55 +0800 Subject: [PATCH 1056/1578] pinctrl: rockchip: fix pinmux reset in rockchip_pmx_set rockchip_pmx_set reset all pinmuxs in group to 0 in the case of error, add missing bank data retrieval in that code to avoid setting mux on unexpected pins. Fixes: 14797189b35e ("pinctrl: rockchip: add return value to rockchip_set_mux") Reviewed-by: Heiko Stuebner Signed-off-by: Huang-Huang Bao Link: https://lore.kernel.org/r/20240606125755.53778-5-i@eh5.me Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-rockchip.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/pinctrl/pinctrl-rockchip.c b/drivers/pinctrl/pinctrl-rockchip.c index 24ee88863ce399..3f56991f5b8920 100644 --- a/drivers/pinctrl/pinctrl-rockchip.c +++ b/drivers/pinctrl/pinctrl-rockchip.c @@ -2751,8 +2751,10 @@ static int rockchip_pmx_set(struct pinctrl_dev *pctldev, unsigned selector, if (ret) { /* revert the already done pin settings */ - for (cnt--; cnt >= 0; cnt--) + for (cnt--; cnt >= 0; cnt--) { + bank = pin_to_bank(info, pins[cnt]); rockchip_set_mux(bank, pins[cnt] - bank->pin_base, 0); + } return ret; } From 49cc17967be95d64606d5684416ee51eec35e84a Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Fri, 14 Jun 2024 17:23:11 +0300 Subject: [PATCH 1057/1578] drm/i915/mso: using joiner is not possible with eDP MSO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's not possible to use the joiner at the same time with eDP MSO. When a panel needs MSO, it's not optional, so MSO trumps joiner. v3: Only change intel_dp_has_joiner(), leave debugfs alone (Ville) Fixes: bc71194e8897 ("drm/i915/edp: enable eDP MSO during link training") Cc: # v5.13+ Cc: Ville Syrjala Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/1668 Reviewed-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20240614142311.589089-1-jani.nikula@intel.com Signed-off-by: Jani Nikula (cherry picked from commit 8b5a92ca24eb96bb71e2a55e352687487d87687f) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_dp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index e05e25cd4a9400..5b3b6ae1e3d71d 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -442,6 +442,10 @@ bool intel_dp_has_bigjoiner(struct intel_dp *intel_dp) struct intel_encoder *encoder = &intel_dig_port->base; struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); + /* eDP MSO is not compatible with joiner */ + if (intel_dp->mso_link_count) + return false; + return DISPLAY_VER(dev_priv) >= 12 || (DISPLAY_VER(dev_priv) == 11 && encoder->port != PORT_A); From 8c4d6945fe5bd04ff847c3c788abd34ca354ecee Mon Sep 17 00:00:00 2001 From: Alexey Makhalov Date: Sat, 15 Jun 2024 18:25:10 -0700 Subject: [PATCH 1058/1578] drm/vmwgfx: Fix missing HYPERVISOR_GUEST dependency VMWARE_HYPERCALL alternative will not work as intended without VMware guest code initialization. [ bp: note that this doesn't reproduce with newer gccs so it must be something gcc-9-specific. ] Closes: https://lore.kernel.org/oe-kbuild-all/202406152104.FxakP1MB-lkp@intel.com/ Reported-by: kernel test robot Signed-off-by: Alexey Makhalov Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240616012511.198243-1-alexey.makhalov@broadcom.com --- drivers/gpu/drm/vmwgfx/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/vmwgfx/Kconfig b/drivers/gpu/drm/vmwgfx/Kconfig index faddae3d6ac2e0..6f1ac940cbae70 100644 --- a/drivers/gpu/drm/vmwgfx/Kconfig +++ b/drivers/gpu/drm/vmwgfx/Kconfig @@ -2,7 +2,7 @@ config DRM_VMWGFX tristate "DRM driver for VMware Virtual GPU" depends on DRM && PCI && MMU - depends on X86 || ARM64 + depends on (X86 && HYPERVISOR_GUEST) || ARM64 select DRM_TTM select DRM_TTM_HELPER select MAPPING_DIRTY_HELPERS From 0b9130247f3b6a1122478471ff0e014ea96bb735 Mon Sep 17 00:00:00 2001 From: Gavrilov Ilia Date: Thu, 13 Jun 2024 08:23:00 +0000 Subject: [PATCH 1059/1578] netrom: Fix a memory leak in nr_heartbeat_expiry() syzbot reported a memory leak in nr_create() [0]. Commit 409db27e3a2e ("netrom: Fix use-after-free of a listening socket.") added sock_hold() to the nr_heartbeat_expiry() function, where a) a socket has a SOCK_DESTROY flag or b) a listening socket has a SOCK_DEAD flag. But in the case "a," when the SOCK_DESTROY flag is set, the file descriptor has already been closed and the nr_release() function has been called. So it makes no sense to hold the reference count because no one will call another nr_destroy_socket() and put it as in the case "b." nr_connect nr_establish_data_link nr_start_heartbeat nr_release switch (nr->state) case NR_STATE_3 nr->state = NR_STATE_2 sock_set_flag(sk, SOCK_DESTROY); nr_rx_frame nr_process_rx_frame switch (nr->state) case NR_STATE_2 nr_state2_machine() nr_disconnect() nr_sk(sk)->state = NR_STATE_0 sock_set_flag(sk, SOCK_DEAD) nr_heartbeat_expiry switch (nr->state) case NR_STATE_0 if (sock_flag(sk, SOCK_DESTROY) || (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) sock_hold() // ( !!! ) nr_destroy_socket() To fix the memory leak, let's call sock_hold() only for a listening socket. Found by InfoTeCS on behalf of Linux Verification Center (linuxtesting.org) with Syzkaller. [0]: https://syzkaller.appspot.com/bug?extid=d327a1f3b12e1e206c16 Reported-by: syzbot+d327a1f3b12e1e206c16@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=d327a1f3b12e1e206c16 Fixes: 409db27e3a2e ("netrom: Fix use-after-free of a listening socket.") Signed-off-by: Gavrilov Ilia Signed-off-by: David S. Miller --- net/netrom/nr_timer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c index 4e7c968cde2dcf..5e3ca068f04e04 100644 --- a/net/netrom/nr_timer.c +++ b/net/netrom/nr_timer.c @@ -121,7 +121,8 @@ static void nr_heartbeat_expiry(struct timer_list *t) is accepted() it isn't 'dead' so doesn't get removed. */ if (sock_flag(sk, SOCK_DESTROY) || (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) { - sock_hold(sk); + if (sk->sk_state == TCP_LISTEN) + sock_hold(sk); bh_unlock_sock(sk); nr_destroy_socket(sk); goto out; From 932d8476399f622aa0767a4a0a9e78e5341dc0e1 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Wed, 15 May 2024 21:45:54 +0800 Subject: [PATCH 1060/1578] cpu/hotplug: Fix dynstate assignment in __cpuhp_setup_state_cpuslocked() Commit 4205e4786d0b ("cpu/hotplug: Provide dynamic range for prepare stage") added a dynamic range for the prepare states, but did not handle the assignment of the dynstate variable in __cpuhp_setup_state_cpuslocked(). This causes the corresponding startup callback not to be invoked when calling __cpuhp_setup_state_cpuslocked() with the CPUHP_BP_PREPARE_DYN parameter, even though it should be. Currently, the users of __cpuhp_setup_state_cpuslocked(), for one reason or another, have not triggered this bug. Fixes: 4205e4786d0b ("cpu/hotplug: Provide dynamic range for prepare stage") Signed-off-by: Yuntao Wang Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240515134554.427071-1-ytcoode@gmail.com --- kernel/cpu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 563877d6c28b65..74cfdb66a9bdd2 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2446,7 +2446,7 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); * The caller needs to hold cpus read locked while calling this function. * Return: * On success: - * Positive state number if @state is CPUHP_AP_ONLINE_DYN; + * Positive state number if @state is CPUHP_AP_ONLINE_DYN or CPUHP_BP_PREPARE_DYN; * 0 for all other states * On failure: proper (negative) error code */ @@ -2469,7 +2469,7 @@ int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state, ret = cpuhp_store_callbacks(state, name, startup, teardown, multi_instance); - dynstate = state == CPUHP_AP_ONLINE_DYN; + dynstate = state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN; if (ret > 0 && dynstate) { state = ret; ret = 0; @@ -2500,8 +2500,8 @@ int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state, out: mutex_unlock(&cpuhp_state_mutex); /* - * If the requested state is CPUHP_AP_ONLINE_DYN, return the - * dynamically allocated state in case of success. + * If the requested state is CPUHP_AP_ONLINE_DYN or CPUHP_BP_PREPARE_DYN, + * return the dynamically allocated state in case of success. */ if (!ret && dynstate) return state; From 8e948c365d9c10b685d1deb946bd833d6a9b43e0 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 17 Jun 2024 07:54:08 -0400 Subject: [PATCH 1061/1578] nfsd: fix oops when reading pool_stats before server is started Sourbh reported an oops that is triggerable by trying to read the pool_stats procfile before nfsd had been started. Move the check for a NULL serv in svc_pool_stats_start above the mutex acquisition, and fix the stop routine not to unlock the mutex if there is no serv yet. Fixes: 7b207ccd9833 ("svc: don't hold reference for poolstats, only mutex.") Reported-by: Sourabh Jain Signed-off-by: Jeff Layton Tested-by: Sourabh Jain Signed-off-by: Chuck Lever --- net/sunrpc/svc_xprt.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index dd86d7f1e97e92..49a3bea33f9d5f 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1421,12 +1421,13 @@ static void *svc_pool_stats_start(struct seq_file *m, loff_t *pos) dprintk("svc_pool_stats_start, *pidx=%u\n", pidx); + if (!si->serv) + return NULL; + mutex_lock(si->mutex); if (!pidx) return SEQ_START_TOKEN; - if (!si->serv) - return NULL; return pidx > si->serv->sv_nrpools ? NULL : &si->serv->sv_pools[pidx - 1]; } @@ -1458,7 +1459,8 @@ static void svc_pool_stats_stop(struct seq_file *m, void *p) { struct svc_info *si = m->private; - mutex_unlock(si->mutex); + if (si->serv) + mutex_unlock(si->mutex); } static int svc_pool_stats_show(struct seq_file *m, void *p) From b83bd486b43d2b7f10595a9d7a52d41023eaa9c1 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Mon, 17 Jun 2024 10:19:18 +0530 Subject: [PATCH 1062/1578] block: cleanup flag_{show,store} Remove a superfluous argument that flag_show and flag_store currently take. Signed-off-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240617044918.374608-1-joshi.k@samsung.com Signed-off-by: Jens Axboe --- block/blk-integrity.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 05a48689a424b2..010decc892eaa0 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -186,8 +186,8 @@ const char *blk_integrity_profile_name(struct blk_integrity *bi) } EXPORT_SYMBOL_GPL(blk_integrity_profile_name); -static ssize_t flag_store(struct device *dev, struct device_attribute *attr, - const char *page, size_t count, unsigned char flag) +static ssize_t flag_store(struct device *dev, const char *page, size_t count, + unsigned char flag) { struct request_queue *q = dev_to_disk(dev)->queue; struct queue_limits lim; @@ -213,8 +213,7 @@ static ssize_t flag_store(struct device *dev, struct device_attribute *attr, return count; } -static ssize_t flag_show(struct device *dev, struct device_attribute *attr, - char *page, unsigned char flag) +static ssize_t flag_show(struct device *dev, char *page, unsigned char flag) { struct blk_integrity *bi = dev_to_bi(dev); @@ -253,26 +252,26 @@ static ssize_t read_verify_store(struct device *dev, struct device_attribute *attr, const char *page, size_t count) { - return flag_store(dev, attr, page, count, BLK_INTEGRITY_NOVERIFY); + return flag_store(dev, page, count, BLK_INTEGRITY_NOVERIFY); } static ssize_t read_verify_show(struct device *dev, struct device_attribute *attr, char *page) { - return flag_show(dev, attr, page, BLK_INTEGRITY_NOVERIFY); + return flag_show(dev, page, BLK_INTEGRITY_NOVERIFY); } static ssize_t write_generate_store(struct device *dev, struct device_attribute *attr, const char *page, size_t count) { - return flag_store(dev, attr, page, count, BLK_INTEGRITY_NOGENERATE); + return flag_store(dev, page, count, BLK_INTEGRITY_NOGENERATE); } static ssize_t write_generate_show(struct device *dev, struct device_attribute *attr, char *page) { - return flag_show(dev, attr, page, BLK_INTEGRITY_NOGENERATE); + return flag_show(dev, page, BLK_INTEGRITY_NOGENERATE); } static ssize_t device_is_integrity_capable_show(struct device *dev, From da2c8fef130ec7197e2f91c7ed70a8c5bede4bea Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 17 Jun 2024 16:26:26 +0200 Subject: [PATCH 1063/1578] NFSD: grab nfsd_mutex in nfsd_nl_rpc_status_get_dumpit() Grab nfsd_mutex lock in nfsd_nl_rpc_status_get_dumpit routine and remove nfsd_nl_rpc_status_get_start() and nfsd_nl_rpc_status_get_done(). This patch fix the syzbot log reported below: INFO: task syz-executor.1:17770 blocked for more than 143 seconds. Not tainted 6.10.0-rc3-syzkaller-00022-gcea2a26553ac #0 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:syz-executor.1 state:D stack:23800 pid:17770 tgid:17767 ppid:11381 flags:0x00000006 Call Trace: context_switch kernel/sched/core.c:5408 [inline] __schedule+0x17e8/0x4a20 kernel/sched/core.c:6745 __schedule_loop kernel/sched/core.c:6822 [inline] schedule+0x14b/0x320 kernel/sched/core.c:6837 schedule_preempt_disabled+0x13/0x30 kernel/sched/core.c:6894 __mutex_lock_common kernel/locking/mutex.c:684 [inline] __mutex_lock+0x6a4/0xd70 kernel/locking/mutex.c:752 nfsd_nl_listener_get_doit+0x115/0x5d0 fs/nfsd/nfsctl.c:2124 genl_family_rcv_msg_doit net/netlink/genetlink.c:1115 [inline] genl_family_rcv_msg net/netlink/genetlink.c:1195 [inline] genl_rcv_msg+0xb16/0xec0 net/netlink/genetlink.c:1210 netlink_rcv_skb+0x1e5/0x430 net/netlink/af_netlink.c:2564 genl_rcv+0x28/0x40 net/netlink/genetlink.c:1219 netlink_unicast_kernel net/netlink/af_netlink.c:1335 [inline] netlink_unicast+0x7ec/0x980 net/netlink/af_netlink.c:1361 netlink_sendmsg+0x8db/0xcb0 net/netlink/af_netlink.c:1905 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x223/0x270 net/socket.c:745 ____sys_sendmsg+0x525/0x7d0 net/socket.c:2585 ___sys_sendmsg net/socket.c:2639 [inline] __sys_sendmsg+0x2b0/0x3a0 net/socket.c:2668 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f24ed27cea9 RSP: 002b:00007f24ee0080c8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f24ed3b3f80 RCX: 00007f24ed27cea9 RDX: 0000000000000000 RSI: 0000000020000100 RDI: 0000000000000005 RBP: 00007f24ed2ebff4 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 Fixes: 1bd773b4f0c9 ("nfsd: hold nfsd_mutex across entire netlink operation") Fixes: bd9d6a3efa97 ("NFSD: add rpc_status netlink support") Signed-off-by: Lorenzo Bianconi Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- Documentation/netlink/specs/nfsd.yaml | 2 -- fs/nfsd/netlink.c | 2 -- fs/nfsd/netlink.h | 3 -- fs/nfsd/nfsctl.c | 48 ++++++--------------------- 4 files changed, 11 insertions(+), 44 deletions(-) diff --git a/Documentation/netlink/specs/nfsd.yaml b/Documentation/netlink/specs/nfsd.yaml index d2123409716738..6bda7a46730184 100644 --- a/Documentation/netlink/specs/nfsd.yaml +++ b/Documentation/netlink/specs/nfsd.yaml @@ -123,8 +123,6 @@ operations: doc: dump pending nfsd rpc attribute-set: rpc-status dump: - pre: nfsd-nl-rpc-status-get-start - post: nfsd-nl-rpc-status-get-done reply: attributes: - xid diff --git a/fs/nfsd/netlink.c b/fs/nfsd/netlink.c index 62d2586d990259..529a75ecf22e8d 100644 --- a/fs/nfsd/netlink.c +++ b/fs/nfsd/netlink.c @@ -44,9 +44,7 @@ static const struct nla_policy nfsd_listener_set_nl_policy[NFSD_A_SERVER_SOCK_AD static const struct genl_split_ops nfsd_nl_ops[] = { { .cmd = NFSD_CMD_RPC_STATUS_GET, - .start = nfsd_nl_rpc_status_get_start, .dumpit = nfsd_nl_rpc_status_get_dumpit, - .done = nfsd_nl_rpc_status_get_done, .flags = GENL_CMD_CAP_DUMP, }, { diff --git a/fs/nfsd/netlink.h b/fs/nfsd/netlink.h index e3724637d64d5c..2e132ef328f8d4 100644 --- a/fs/nfsd/netlink.h +++ b/fs/nfsd/netlink.h @@ -15,9 +15,6 @@ extern const struct nla_policy nfsd_sock_nl_policy[NFSD_A_SOCK_TRANSPORT_NAME + 1]; extern const struct nla_policy nfsd_version_nl_policy[NFSD_A_VERSION_ENABLED + 1]; -int nfsd_nl_rpc_status_get_start(struct netlink_callback *cb); -int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb); - int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 202140df8f82e4..533b65057e18ef 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1460,28 +1460,6 @@ static int create_proc_exports_entry(void) unsigned int nfsd_net_id; -/** - * nfsd_nl_rpc_status_get_start - Prepare rpc_status_get dumpit - * @cb: netlink metadata and command arguments - * - * Return values: - * %0: The rpc_status_get command may proceed - * %-ENODEV: There is no NFSD running in this namespace - */ -int nfsd_nl_rpc_status_get_start(struct netlink_callback *cb) -{ - struct nfsd_net *nn = net_generic(sock_net(cb->skb->sk), nfsd_net_id); - int ret = -ENODEV; - - mutex_lock(&nfsd_mutex); - if (nn->nfsd_serv) - ret = 0; - else - mutex_unlock(&nfsd_mutex); - - return ret; -} - static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb, struct netlink_callback *cb, struct nfsd_genl_rqstp *rqstp) @@ -1558,8 +1536,16 @@ static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb, int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { - struct nfsd_net *nn = net_generic(sock_net(skb->sk), nfsd_net_id); int i, ret, rqstp_index = 0; + struct nfsd_net *nn; + + mutex_lock(&nfsd_mutex); + + nn = net_generic(sock_net(skb->sk), nfsd_net_id); + if (!nn->nfsd_serv) { + ret = -ENODEV; + goto out_unlock; + } rcu_read_lock(); @@ -1636,22 +1622,10 @@ int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb, ret = skb->len; out: rcu_read_unlock(); - - return ret; -} - -/** - * nfsd_nl_rpc_status_get_done - rpc_status_get dumpit post-processing - * @cb: netlink metadata and command arguments - * - * Return values: - * %0: Success - */ -int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb) -{ +out_unlock: mutex_unlock(&nfsd_mutex); - return 0; + return ret; } /** From 380d5f89a4815ff88461a45de2fb6f28533df708 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 15 Jun 2024 10:46:26 -0700 Subject: [PATCH 1064/1578] bpf: Add missed var_off setting in set_sext32_default_val() Zac reported a verification failure and Alexei reproduced the issue with a simple reproducer ([1]). The verification failure is due to missed setting for var_off. The following is the reproducer in [1]: 0: R1=ctx() R10=fp0 0: (71) r3 = *(u8 *)(r10 -387) ; R3_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) R10=fp0 1: (bc) w7 = (s8)w3 ; R3_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) R7_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=127,var_off=(0x0; 0x7f)) 2: (36) if w7 >= 0x2533823b goto pc-3 mark_precise: frame0: last_idx 2 first_idx 0 subseq_idx -1 mark_precise: frame0: regs=r7 stack= before 1: (bc) w7 = (s8)w3 mark_precise: frame0: regs=r3 stack= before 0: (71) r3 = *(u8 *)(r10 -387) 2: R7_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=127,var_off=(0x0; 0x7f)) 3: (b4) w0 = 0 ; R0_w=0 4: (95) exit Note that after insn 1, the var_off for R7 is (0x0; 0x7f). This is not correct since upper 24 bits of w7 could be 0 or 1. So correct var_off should be (0x0; 0xffffffff). Missing var_off setting in set_sext32_default_val() caused later incorrect analysis in zext_32_to_64(dst_reg) and reg_bounds_sync(dst_reg). To fix the issue, set var_off correctly in set_sext32_default_val(). The correct reg state after insn 1 becomes: 1: (bc) w7 = (s8)w3 ; R3_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) R7_w=scalar(smin=0,smax=umax=0xffffffff,smin32=-128,smax32=127,var_off=(0x0; 0xffffffff)) and at insn 2, the verifier correctly determines either branch is possible. [1] https://lore.kernel.org/bpf/CAADnVQLPU0Shz7dWV4bn2BgtGdxN3uFHPeobGBA72tpg5Xoykw@mail.gmail.com/ Fixes: 8100928c8814 ("bpf: Support new sign-extension mov insns") Reported-by: Zac Ecob Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240615174626.3994813-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 010cfee7ffe938..904ef5a03cf5d9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6236,6 +6236,7 @@ static void set_sext32_default_val(struct bpf_reg_state *reg, int size) } reg->u32_min_value = 0; reg->u32_max_value = U32_MAX; + reg->var_off = tnum_subreg(tnum_unknown); } static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size) From 44b7f7151dfc2e0947f39ed4b9bc4b0c2ccd46fc Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 15 Jun 2024 10:46:32 -0700 Subject: [PATCH 1065/1578] bpf: Add missed var_off setting in coerce_subreg_to_size_sx() In coerce_subreg_to_size_sx(), for the case where upper sign extension bits are the same for smax32 and smin32 values, we missed to setup properly. This is especially problematic if both smax32 and smin32's sign extension bits are 1. The following is a simple example illustrating the inconsistent verifier states due to missed var_off: 0: (85) call bpf_get_prandom_u32#7 ; R0_w=scalar() 1: (bf) r3 = r0 ; R0_w=scalar(id=1) R3_w=scalar(id=1) 2: (57) r3 &= 15 ; R3_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=15,var_off=(0x0; 0xf)) 3: (47) r3 |= 128 ; R3_w=scalar(smin=umin=smin32=umin32=128,smax=umax=smax32=umax32=143,var_off=(0x80; 0xf)) 4: (bc) w7 = (s8)w3 REG INVARIANTS VIOLATION (alu): range bounds violation u64=[0xffffff80, 0x8f] s64=[0xffffff80, 0x8f] u32=[0xffffff80, 0x8f] s32=[0x80, 0xffffff8f] var_off=(0x80, 0xf) The var_off=(0x80, 0xf) is not correct, and the correct one should be var_off=(0xffffff80; 0xf) since from insn 3, we know that at insn 4, the sign extension bits will be 1. This patch fixed this issue by setting var_off properly. Fixes: 8100928c8814 ("bpf: Support new sign-extension mov insns") Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240615174632.3995278-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 904ef5a03cf5d9..e0a398a97d32e4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6281,6 +6281,7 @@ static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size) reg->s32_max_value = s32_max; reg->u32_min_value = (u32)s32_min; reg->u32_max_value = (u32)s32_max; + reg->var_off = tnum_subreg(tnum_range(s32_min, s32_max)); return; } From a62293c33b058415237c55058a6d20de313a2e61 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 15 Jun 2024 10:46:37 -0700 Subject: [PATCH 1066/1578] selftests/bpf: Add a few tests to cover Add three unit tests in verifier_movsx.c to cover cases where missed var_off setting can cause unexpected verification success or failure. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240615174637.3995589-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_movsx.c | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_movsx.c b/tools/testing/selftests/bpf/progs/verifier_movsx.c index cbb9d6714f537c..028ec855587bea 100644 --- a/tools/testing/selftests/bpf/progs/verifier_movsx.c +++ b/tools/testing/selftests/bpf/progs/verifier_movsx.c @@ -224,6 +224,69 @@ l0_%=: \ : __clobber_all); } +SEC("socket") +__description("MOV32SX, S8, var_off u32_max") +__failure __msg("infinite loop detected") +__failure_unpriv __msg_unpriv("back-edge from insn 2 to 0") +__naked void mov64sx_s32_varoff_1(void) +{ + asm volatile (" \ +l0_%=: \ + r3 = *(u8 *)(r10 -387); \ + w7 = (s8)w3; \ + if w7 >= 0x2533823b goto l0_%=; \ + w0 = 0; \ + exit; \ +" : + : + : __clobber_all); +} + +SEC("socket") +__description("MOV32SX, S8, var_off not u32_max, positive after s8 extension") +__success __retval(0) +__failure_unpriv __msg_unpriv("frame pointer is read only") +__naked void mov64sx_s32_varoff_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r3 = r0; \ + r3 &= 0xf; \ + w7 = (s8)w3; \ + if w7 s>= 16 goto l0_%=; \ + w0 = 0; \ + exit; \ +l0_%=: \ + r10 = 1; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("MOV32SX, S8, var_off not u32_max, negative after s8 extension") +__success __retval(0) +__failure_unpriv __msg_unpriv("frame pointer is read only") +__naked void mov64sx_s32_varoff_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r3 = r0; \ + r3 &= 0xf; \ + r3 |= 0x80; \ + w7 = (s8)w3; \ + if w7 s>= -5 goto l0_%=; \ + w0 = 0; \ + exit; \ +l0_%=: \ + r10 = 1; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + #else SEC("socket") From 1ab1a422c0daedbd76f9f25c297eca48986ddea0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 17 Jun 2024 11:13:01 -0700 Subject: [PATCH 1067/1578] MAINTAINERS: Update entries for Kees Cook Update current email address for Kees Cook in the MAINTAINER file to match the change from commit 4e173c825b19 ("mailmap: update entry for Kees Cook"). Link: https://lore.kernel.org/r/20240617181257.work.206-kees@kernel.org Signed-off-by: Kees Cook --- MAINTAINERS | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 8754ac2c259dc9..f601a2fd1ebf1e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5296,7 +5296,7 @@ F: drivers/infiniband/hw/usnic/ CLANG CONTROL FLOW INTEGRITY SUPPORT M: Sami Tolvanen -M: Kees Cook +M: Kees Cook R: Nathan Chancellor L: llvm@lists.linux.dev S: Supported @@ -8212,7 +8212,7 @@ F: rust/kernel/net/phy.rs EXEC & BINFMT API, ELF R: Eric Biederman -R: Kees Cook +R: Kees Cook L: linux-mm@kvack.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/execve @@ -8613,7 +8613,7 @@ S: Maintained F: drivers/net/ethernet/nvidia/* FORTIFY_SOURCE -M: Kees Cook +M: Kees Cook L: linux-hardening@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/hardening @@ -9103,7 +9103,7 @@ F: include/linux/mfd/gsc.h F: include/linux/platform_data/gsc_hwmon.h GCC PLUGINS -M: Kees Cook +M: Kees Cook L: linux-hardening@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/hardening @@ -9237,7 +9237,7 @@ S: Maintained F: drivers/input/touchscreen/resistive-adc-touch.c GENERIC STRING LIBRARY -M: Kees Cook +M: Kees Cook R: Andy Shevchenko L: linux-hardening@vger.kernel.org S: Supported @@ -11951,7 +11951,7 @@ F: scripts/package/ F: usr/ KERNEL HARDENING (not covered by other areas) -M: Kees Cook +M: Kees Cook R: Gustavo A. R. Silva L: linux-hardening@vger.kernel.org S: Supported @@ -12479,7 +12479,7 @@ F: drivers/scsi/53c700* LEAKING_ADDRESSES M: Tycho Andersen -R: Kees Cook +R: Kees Cook L: linux-hardening@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/hardening @@ -12775,7 +12775,7 @@ F: arch/powerpc/platforms/8xx/ F: arch/powerpc/platforms/83xx/ LINUX KERNEL DUMP TEST MODULE (LKDTM) -M: Kees Cook +M: Kees Cook S: Maintained F: drivers/misc/lkdtm/* F: tools/testing/selftests/lkdtm/* @@ -12905,7 +12905,7 @@ Q: http://patchwork.linuxtv.org/project/linux-media/list/ F: drivers/media/usb/dvb-usb-v2/lmedm04* LOADPIN SECURITY MODULE -M: Kees Cook +M: Kees Cook S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/hardening F: Documentation/admin-guide/LSM/LoadPin.rst @@ -17998,7 +17998,7 @@ F: tools/testing/selftests/proc/ PROC SYSCTL M: Luis Chamberlain -M: Kees Cook +M: Kees Cook M: Joel Granados L: linux-kernel@vger.kernel.org L: linux-fsdevel@vger.kernel.org @@ -18054,7 +18054,7 @@ F: Documentation/devicetree/bindings/net/pse-pd/ F: drivers/net/pse-pd/ PSTORE FILESYSTEM -M: Kees Cook +M: Kees Cook R: Tony Luck R: Guilherme G. Piccoli L: linux-hardening@vger.kernel.org @@ -20060,7 +20060,7 @@ F: drivers/media/cec/platform/seco/seco-cec.c F: drivers/media/cec/platform/seco/seco-cec.h SECURE COMPUTING -M: Kees Cook +M: Kees Cook R: Andy Lutomirski R: Will Drewry S: Supported @@ -22974,7 +22974,7 @@ F: drivers/block/ublk_drv.c F: include/uapi/linux/ublk_cmd.h UBSAN -M: Kees Cook +M: Kees Cook R: Marco Elver R: Andrey Konovalov R: Andrey Ryabinin @@ -24812,7 +24812,7 @@ F: drivers/net/hamradio/yam* F: include/linux/yam.h YAMA SECURITY MODULE -M: Kees Cook +M: Kees Cook S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/hardening F: Documentation/admin-guide/LSM/Yama.rst From 9570a48847e3acfa1a741cef431c923325ddc637 Mon Sep 17 00:00:00 2001 From: Boyang Yu Date: Mon, 17 Jun 2024 21:11:44 +0800 Subject: [PATCH 1068/1578] nvme: fix NVME_NS_DEAC may incorrectly identifying the disk as EXT_LBA. The value of NVME_NS_DEAC is 3, which means NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS. Provide a unique value for this feature flag. Fixes 1b96f862eccc ("nvme: implement the DEAC bit for the Write Zeroes command") Signed-off-by: Boyang Yu Reviewed-by: Kanchan Joshi Signed-off-by: Keith Busch --- drivers/nvme/host/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index f3a41133ac3f97..68b400f9c42d51 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -502,7 +502,7 @@ static inline bool nvme_ns_head_multipath(struct nvme_ns_head *head) enum nvme_ns_features { NVME_NS_EXT_LBAS = 1 << 0, /* support extended LBA format */ NVME_NS_METADATA_SUPPORTED = 1 << 1, /* support getting generated md */ - NVME_NS_DEAC, /* DEAC bit in Write Zeores supported */ + NVME_NS_DEAC = 1 << 2, /* DEAC bit in Write Zeores supported */ }; struct nvme_ns { From f31e85a4d7c6ac4a3e014129c9cdc31592ea29f3 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 17 Jun 2024 09:27:26 +0200 Subject: [PATCH 1069/1578] nvmet: do not return 'reserved' for empty TSAS values The 'TSAS' value is only defined for TCP and RDMA, but returning 'reserved' for undefined values tricked nvmetcli to try to write 'reserved' when restoring from a config file. This caused an error and the configuration would not be applied. Fixes: 3f123494db72 ("nvmet: make TCP sectype settable via configfs") Signed-off-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/target/configfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index bd87dfd173a4cb..e60224356048a4 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -410,7 +410,7 @@ static ssize_t nvmet_addr_tsas_show(struct config_item *item, return sprintf(page, "%s\n", nvmet_addr_tsas_rdma[i].name); } } - return sprintf(page, "reserved\n"); + return sprintf(page, "\n"); } static ssize_t nvmet_addr_tsas_store(struct config_item *item, From a83e1385b780d41307433ddbc86e3c528db031f0 Mon Sep 17 00:00:00 2001 From: Raju Rangoju Date: Fri, 14 Jun 2024 19:31:49 +0530 Subject: [PATCH 1070/1578] ACPICA: Revert "ACPICA: avoid Info: mapping multiple BARs. Your kernel is fine." Undo the modifications made in commit d410ee5109a1 ("ACPICA: avoid "Info: mapping multiple BARs. Your kernel is fine.""). The initial purpose of this commit was to stop memory mappings for operation regions from overlapping page boundaries, as it can trigger warnings if different page attributes are present. However, it was found that when this situation arises, mapping continues until the boundary's end, but there is still an attempt to read/write the entire length of the map, leading to a NULL pointer deference. For example, if a four-byte mapping request is made but only one byte is mapped because it hits the current page boundary's end, a four-byte read/write attempt is still made, resulting in a NULL pointer deference. Instead, map the entire length, as the ACPI specification does not mandate that it must be within the same page boundary. It is permissible for it to be mapped across different regions. Link: https://github.com/acpica/acpica/pull/954 Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218849 Fixes: d410ee5109a1 ("ACPICA: avoid "Info: mapping multiple BARs. Your kernel is fine."") Co-developed-by: Sanath S Signed-off-by: Sanath S Signed-off-by: Raju Rangoju Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/exregion.c | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/drivers/acpi/acpica/exregion.c b/drivers/acpi/acpica/exregion.c index 8907b8bf42672a..c49b9f8de723d8 100644 --- a/drivers/acpi/acpica/exregion.c +++ b/drivers/acpi/acpica/exregion.c @@ -44,7 +44,6 @@ acpi_ex_system_memory_space_handler(u32 function, struct acpi_mem_mapping *mm = mem_info->cur_mm; u32 length; acpi_size map_length; - acpi_size page_boundary_map_length; #ifdef ACPI_MISALIGNMENT_NOT_SUPPORTED u32 remainder; #endif @@ -138,26 +137,8 @@ acpi_ex_system_memory_space_handler(u32 function, map_length = (acpi_size) ((mem_info->address + mem_info->length) - address); - /* - * If mapping the entire remaining portion of the region will cross - * a page boundary, just map up to the page boundary, do not cross. - * On some systems, crossing a page boundary while mapping regions - * can cause warnings if the pages have different attributes - * due to resource management. - * - * This has the added benefit of constraining a single mapping to - * one page, which is similar to the original code that used a 4k - * maximum window. - */ - page_boundary_map_length = (acpi_size) - (ACPI_ROUND_UP(address, ACPI_DEFAULT_PAGE_SIZE) - address); - if (page_boundary_map_length == 0) { - page_boundary_map_length = ACPI_DEFAULT_PAGE_SIZE; - } - - if (map_length > page_boundary_map_length) { - map_length = page_boundary_map_length; - } + if (map_length > ACPI_DEFAULT_PAGE_SIZE) + map_length = ACPI_DEFAULT_PAGE_SIZE; /* Create a new mapping starting at the address given */ From c7be64355fccfe7d4727681e32fce07113e40af1 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 12 Jun 2024 12:42:20 +0200 Subject: [PATCH 1071/1578] ACPI: scan: Ignore camera graph port nodes on all Dell Tiger, Alder and Raptor Lake models Dell laptops with IPU6 camera (the Tiger Lake, Alder Lake and Raptor Lake generations) have broken ACPI MIPI DISCO information (this results from an OEM attempt to make Linux work by supplying it with custom data in the ACPI tables which has never been supported in the mainline). Instead of adding a lot of DMI quirks for this, check for Dell platforms based on the processor generations in question and drop the ACPI graph port nodes, likely to be created with the help of invalid data, on all of them. Fixes: bd721b934323 ("ACPI: scan: Extract CSI-2 connection graph from _CRS") Signed-off-by: Hans de Goede [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/internal.h | 4 ++++ drivers/acpi/mipi-disco-img.c | 28 +++++++++++++++++++--------- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 2a0e9fc7b74c1a..601b670356e50e 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -302,6 +302,10 @@ void acpi_mipi_check_crs_csi2(acpi_handle handle); void acpi_mipi_scan_crs_csi2(void); void acpi_mipi_init_crs_csi2_swnodes(void); void acpi_mipi_crs_csi2_cleanup(void); +#ifdef CONFIG_X86 bool acpi_graph_ignore_port(acpi_handle handle); +#else +static inline bool acpi_graph_ignore_port(acpi_handle handle) { return false; } +#endif #endif /* _ACPI_INTERNAL_H_ */ diff --git a/drivers/acpi/mipi-disco-img.c b/drivers/acpi/mipi-disco-img.c index d05413a0672a99..0ab13751f0dbc1 100644 --- a/drivers/acpi/mipi-disco-img.c +++ b/drivers/acpi/mipi-disco-img.c @@ -725,14 +725,20 @@ void acpi_mipi_crs_csi2_cleanup(void) acpi_mipi_del_crs_csi2(csi2); } -static const struct dmi_system_id dmi_ignore_port_nodes[] = { - { - .matches = { - DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "XPS 9315"), - }, - }, - { } +#ifdef CONFIG_X86 +#include +#include + +/* CPU matches for Dell generations with broken ACPI MIPI DISCO info */ +static const struct x86_cpu_id dell_broken_mipi_disco_cpu_gens[] = { + X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, NULL), + X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, NULL), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, NULL), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, NULL), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, NULL), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, NULL), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, NULL), + {} }; static const char *strnext(const char *s1, const char *s2) @@ -761,7 +767,10 @@ bool acpi_graph_ignore_port(acpi_handle handle) static bool dmi_tested, ignore_port; if (!dmi_tested) { - ignore_port = dmi_first_match(dmi_ignore_port_nodes); + if (dmi_name_in_vendors("Dell Inc.") && + x86_match_cpu(dell_broken_mipi_disco_cpu_gens)) + ignore_port = true; + dmi_tested = true; } @@ -794,3 +803,4 @@ bool acpi_graph_ignore_port(acpi_handle handle) kfree(orig_path); return false; } +#endif From 0606c5c4ad05076dba39af055644d83e3f6ba3a4 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 12 Jun 2024 12:48:28 +0200 Subject: [PATCH 1072/1578] ACPI: mipi-disco-img: Switch to new Intel CPU model defines Switch over to using the new Intel CPU model defines, as the old ones are going away. Signed-off-by: Hans de Goede Signed-off-by: Rafael J. Wysocki --- drivers/acpi/mipi-disco-img.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/acpi/mipi-disco-img.c b/drivers/acpi/mipi-disco-img.c index 0ab13751f0dbc1..92b658f92dc0f1 100644 --- a/drivers/acpi/mipi-disco-img.c +++ b/drivers/acpi/mipi-disco-img.c @@ -731,13 +731,13 @@ void acpi_mipi_crs_csi2_cleanup(void) /* CPU matches for Dell generations with broken ACPI MIPI DISCO info */ static const struct x86_cpu_id dell_broken_mipi_disco_cpu_gens[] = { - X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, NULL), - X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, NULL), - X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, NULL), - X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, NULL), - X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, NULL), - X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, NULL), - X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, NULL), + X86_MATCH_VFM(INTEL_TIGERLAKE, NULL), + X86_MATCH_VFM(INTEL_TIGERLAKE_L, NULL), + X86_MATCH_VFM(INTEL_ALDERLAKE, NULL), + X86_MATCH_VFM(INTEL_ALDERLAKE_L, NULL), + X86_MATCH_VFM(INTEL_RAPTORLAKE, NULL), + X86_MATCH_VFM(INTEL_RAPTORLAKE_P, NULL), + X86_MATCH_VFM(INTEL_RAPTORLAKE_S, NULL), {} }; From 4181b51c38875de9f6f11248fa0bcf3246c19c82 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Tue, 11 Jun 2024 14:06:31 +0200 Subject: [PATCH 1073/1578] s390/pci: Add missing virt_to_phys() for directed DIBV In commit 4e4dc65ab578 ("s390/pci: use phys_to_virt() for AIBVs/DIBVs") the setting of dibv_addr was missed when adding virt_to_phys(). This only affects systems with directed interrupt delivery enabled which are not generally available. Fixes: 4e4dc65ab578 ("s390/pci: use phys_to_virt() for AIBVs/DIBVs") Reviewed-by: Heiko Carstens Signed-off-by: Niklas Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/pci/pci_irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c index ff8f24854c6462..0ef83b6ac0db74 100644 --- a/arch/s390/pci/pci_irq.c +++ b/arch/s390/pci/pci_irq.c @@ -410,7 +410,7 @@ static void __init cpu_enable_directed_irq(void *unused) union zpci_sic_iib iib = {{0}}; union zpci_sic_iib ziib = {{0}}; - iib.cdiib.dibv_addr = (u64) zpci_ibv[smp_processor_id()]->vector; + iib.cdiib.dibv_addr = virt_to_phys(zpci_ibv[smp_processor_id()]->vector); zpci_set_irq_ctrl(SIC_IRQ_MODE_SET_CPU, 0, &iib); zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC, &ziib); From d8354a1de2c4cc693812f6130fc922537a59217d Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Tue, 11 Jun 2024 23:47:16 +0200 Subject: [PATCH 1074/1578] s390/virtio_ccw: Fix config change notifications Commit e3e9bda38e6d ("s390/virtio_ccw: use DMA handle from DMA API") broke configuration change notifications for virtio-ccw by putting the DMA address of *indicatorp directly into ccw->cda disregarding the fact that if !!(vcdev->is_thinint) then the function virtio_ccw_register_adapter_ind() will overwrite that ccw->cda value with the address of the virtio_thinint_area so it can actually set up the adapter interrupts via CCW_CMD_SET_IND_ADAPTER. Thus we end up pointing to the wrong object for both CCW_CMD_SET_IND if setting up the adapter interrupts fails, and for CCW_CMD_SET_CONF_IND regardless whether it succeeds or fails. To fix this, let us save away the dma address of *indicatorp in a local variable, and copy it to ccw->cda after the "vcdev->is_thinint" branch. Fixes: e3e9bda38e6d ("s390/virtio_ccw: use DMA handle from DMA API") Reported-by: Boqiao Fu Reported-by: Sebastian Mitterle Closes: https://issues.redhat.com/browse/RHEL-39983 Tested-by: Thomas Huth Reviewed-by: Eric Farman Signed-off-by: Halil Pasic Link: https://lore.kernel.org/r/20240611214716.1002781-1-pasic@linux.ibm.com Signed-off-by: Vasily Gorbik --- drivers/s390/virtio/virtio_ccw.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c index d7569f3955591a..d6491fc84e8c54 100644 --- a/drivers/s390/virtio/virtio_ccw.c +++ b/drivers/s390/virtio/virtio_ccw.c @@ -698,6 +698,7 @@ static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs, dma64_t *indicatorp = NULL; int ret, i, queue_idx = 0; struct ccw1 *ccw; + dma32_t indicatorp_dma = 0; ccw = ccw_device_dma_zalloc(vcdev->cdev, sizeof(*ccw), NULL); if (!ccw) @@ -725,7 +726,7 @@ static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs, */ indicatorp = ccw_device_dma_zalloc(vcdev->cdev, sizeof(*indicatorp), - &ccw->cda); + &indicatorp_dma); if (!indicatorp) goto out; *indicatorp = indicators_dma(vcdev); @@ -735,6 +736,7 @@ static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs, /* no error, just fall back to legacy interrupts */ vcdev->is_thinint = false; } + ccw->cda = indicatorp_dma; if (!vcdev->is_thinint) { /* Register queue indicators with host. */ *indicators(vcdev) = 0; From 14d7c92f8df9c0964ae6f8b813c1b3ac38120825 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 17 Jun 2024 12:57:03 -0700 Subject: [PATCH 1075/1578] Revert "mm: mmap: allow for the maximum number of bits for randomizing mmap_base by default" This reverts commit 3afb76a66b5559a7b595155803ce23801558a7a9. This was a wrongheaded workaround for an issue that had already been fixed much better by commit 4ef9ad19e176 ("mm: huge_memory: don't force huge page alignment on 32 bit"). Asking users questions at kernel compile time that they can't make sense of is not a viable strategy. And the fact that even the kernel VM maintainers apparently didn't catch that this "fix" is not a fix any more pretty much proves the point that people can't be expected to understand the implications of the question. It may well be the case that we could improve things further, and that __thp_get_unmapped_area() should take the mapping randomization into account even for 64-bit kernels. Maybe we should not be so eager to use THP mappings. But in no case should this be a kernel config option. Cc: Rafael Aquini Cc: Andrew Morton Cc: Jiri Slaby Cc: Suren Baghdasaryan Cc: Matthew Wilcox (Oracle) Signed-off-by: Linus Torvalds --- arch/Kconfig | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 3e2a63772b3db1..975dd22a2dbd22 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1046,21 +1046,10 @@ config ARCH_MMAP_RND_BITS_MAX config ARCH_MMAP_RND_BITS_DEFAULT int -config FORCE_MAX_MMAP_RND_BITS - bool "Force maximum number of bits to use for ASLR of mmap base address" - default y if !64BIT - help - ARCH_MMAP_RND_BITS and ARCH_MMAP_RND_COMPAT_BITS represent the number - of bits to use for ASLR and if no custom value is assigned (EXPERT) - then the architecture's lower bound (minimum) value is assumed. - This toggle changes that default assumption to assume the arch upper - bound (maximum) value instead. - config ARCH_MMAP_RND_BITS int "Number of bits to use for ASLR of mmap base address" if EXPERT range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT - default ARCH_MMAP_RND_BITS_MAX if FORCE_MAX_MMAP_RND_BITS default ARCH_MMAP_RND_BITS_MIN depends on HAVE_ARCH_MMAP_RND_BITS help @@ -1095,7 +1084,6 @@ config ARCH_MMAP_RND_COMPAT_BITS int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT - default ARCH_MMAP_RND_COMPAT_BITS_MAX if FORCE_MAX_MMAP_RND_BITS default ARCH_MMAP_RND_COMPAT_BITS_MIN depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS help From 9e046bb111f13461d3f9331e24e974324245140e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 14 Jun 2024 13:06:15 +0000 Subject: [PATCH 1076/1578] tcp: clear tp->retrans_stamp in tcp_rcv_fastopen_synack() Some applications were reporting ETIMEDOUT errors on apparently good looking flows, according to packet dumps. We were able to root cause the issue to an accidental setting of tp->retrans_stamp in the following scenario: - client sends TFO SYN with data. - server has TFO disabled, ACKs only SYN but not payload. - client receives SYNACK covering only SYN. - tcp_ack() eats SYN and sets tp->retrans_stamp to 0. - tcp_rcv_fastopen_synack() calls tcp_xmit_retransmit_queue() to retransmit TFO payload w/o SYN, sets tp->retrans_stamp to "now", but we are not in any loss recovery state. - TFO payload is ACKed. - we are not in any loss recovery state, and don't see any dupacks, so we don't get to any code path that clears tp->retrans_stamp. - tp->retrans_stamp stays non-zero for the lifetime of the connection. - after first RTO, tcp_clamp_rto_to_user_timeout() clamps second RTO to 1 jiffy due to bogus tp->retrans_stamp. - on clamped RTO with non-zero icsk_retransmits, retransmits_timed_out() sets start_ts from tp->retrans_stamp from TFO payload retransmit hours/days ago, and computes bogus long elapsed time for loss recovery, and suffers ETIMEDOUT early. Fixes: a7abf3cd76e1 ("tcp: consider using standard rtx logic in tcp_rcv_fastopen_synack()") CC: stable@vger.kernel.org Co-developed-by: Neal Cardwell Signed-off-by: Neal Cardwell Co-developed-by: Yuchung Cheng Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20240614130615.396837-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9c04a9c8be9dfa..01d208e0eef31f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6296,6 +6296,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, skb_rbtree_walk_from(data) tcp_mark_skb_lost(sk, data); tcp_xmit_retransmit_queue(sk); + tp->retrans_stamp = 0; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); return true; From e874557fce1b6023efafd523aee0c347bf7f1694 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 14 Jun 2024 19:15:29 +0200 Subject: [PATCH 1077/1578] selftests: mptcp: userspace_pm: fixed subtest names It is important to have fixed (sub)test names in TAP, because these names are used to identify them. If they are not fixed, tracking cannot be done. Some subtests from the userspace_pm selftest were using random numbers in their names: the client and server address IDs from $RANDOM, and the client port number randomly picked by the kernel when creating the connection. These values have been replaced by 'client' and 'server' words: that's even more helpful than showing random numbers. Note that the addresses IDs are incremented and decremented in the test: +1 or -1 are then displayed in these cases. Not to loose info that can be useful for debugging in case of issues, these random numbers are now displayed at the beginning of the test. Fixes: f589234e1af0 ("selftests: mptcp: userspace_pm: format subtests results in TAP") Cc: stable@vger.kernel.org Signed-off-by: Matthieu Baerts (NGI0) Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240614-upstream-net-20240614-selftests-mptcp-uspace-pm-fixed-test-names-v1-1-460ad3edb429@kernel.org Signed-off-by: Jakub Kicinski --- .../selftests/net/mptcp/userspace_pm.sh | 46 +++++++++++-------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh index 9e2981f2d7f5ce..9cb05978269d15 100755 --- a/tools/testing/selftests/net/mptcp/userspace_pm.sh +++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh @@ -160,10 +160,12 @@ make_connection() local is_v6=$1 local app_port=$app4_port local connect_addr="10.0.1.1" + local client_addr="10.0.1.2" local listen_addr="0.0.0.0" if [ "$is_v6" = "v6" ] then connect_addr="dead:beef:1::1" + client_addr="dead:beef:1::2" listen_addr="::" app_port=$app6_port else @@ -206,6 +208,7 @@ make_connection() [ "$server_serverside" = 1 ] then test_pass + print_title "Connection info: ${client_addr}:${client_port} -> ${connect_addr}:${app_port}" else test_fail "Expected tokens (c:${client_token} - s:${server_token}) and server (c:${client_serverside} - s:${server_serverside})" mptcp_lib_result_print_all_tap @@ -297,7 +300,7 @@ test_announce() ip netns exec "$ns2"\ ./pm_nl_ctl ann 10.0.2.2 token "$client4_token" id $client_addr_id dev\ ns2eth1 - print_test "ADD_ADDR id:${client_addr_id} 10.0.2.2 (ns2) => ns1, reuse port" + print_test "ADD_ADDR id:client 10.0.2.2 (ns2) => ns1, reuse port" sleep 0.5 verify_announce_event $server_evts $ANNOUNCED $server4_token "10.0.2.2" $client_addr_id \ "$client4_port" @@ -306,7 +309,7 @@ test_announce() :>"$server_evts" ip netns exec "$ns2" ./pm_nl_ctl ann\ dead:beef:2::2 token "$client6_token" id $client_addr_id dev ns2eth1 - print_test "ADD_ADDR6 id:${client_addr_id} dead:beef:2::2 (ns2) => ns1, reuse port" + print_test "ADD_ADDR6 id:client dead:beef:2::2 (ns2) => ns1, reuse port" sleep 0.5 verify_announce_event "$server_evts" "$ANNOUNCED" "$server6_token" "dead:beef:2::2"\ "$client_addr_id" "$client6_port" "v6" @@ -316,7 +319,7 @@ test_announce() client_addr_id=$((client_addr_id+1)) ip netns exec "$ns2" ./pm_nl_ctl ann 10.0.2.2 token "$client4_token" id\ $client_addr_id dev ns2eth1 port $new4_port - print_test "ADD_ADDR id:${client_addr_id} 10.0.2.2 (ns2) => ns1, new port" + print_test "ADD_ADDR id:client+1 10.0.2.2 (ns2) => ns1, new port" sleep 0.5 verify_announce_event "$server_evts" "$ANNOUNCED" "$server4_token" "10.0.2.2"\ "$client_addr_id" "$new4_port" @@ -327,7 +330,7 @@ test_announce() # ADD_ADDR from the server to client machine reusing the subflow port ip netns exec "$ns1" ./pm_nl_ctl ann 10.0.2.1 token "$server4_token" id\ $server_addr_id dev ns1eth2 - print_test "ADD_ADDR id:${server_addr_id} 10.0.2.1 (ns1) => ns2, reuse port" + print_test "ADD_ADDR id:server 10.0.2.1 (ns1) => ns2, reuse port" sleep 0.5 verify_announce_event "$client_evts" "$ANNOUNCED" "$client4_token" "10.0.2.1"\ "$server_addr_id" "$app4_port" @@ -336,7 +339,7 @@ test_announce() :>"$client_evts" ip netns exec "$ns1" ./pm_nl_ctl ann dead:beef:2::1 token "$server6_token" id\ $server_addr_id dev ns1eth2 - print_test "ADD_ADDR6 id:${server_addr_id} dead:beef:2::1 (ns1) => ns2, reuse port" + print_test "ADD_ADDR6 id:server dead:beef:2::1 (ns1) => ns2, reuse port" sleep 0.5 verify_announce_event "$client_evts" "$ANNOUNCED" "$client6_token" "dead:beef:2::1"\ "$server_addr_id" "$app6_port" "v6" @@ -346,7 +349,7 @@ test_announce() server_addr_id=$((server_addr_id+1)) ip netns exec "$ns1" ./pm_nl_ctl ann 10.0.2.1 token "$server4_token" id\ $server_addr_id dev ns1eth2 port $new4_port - print_test "ADD_ADDR id:${server_addr_id} 10.0.2.1 (ns1) => ns2, new port" + print_test "ADD_ADDR id:server+1 10.0.2.1 (ns1) => ns2, new port" sleep 0.5 verify_announce_event "$client_evts" "$ANNOUNCED" "$client4_token" "10.0.2.1"\ "$server_addr_id" "$new4_port" @@ -380,7 +383,7 @@ test_remove() local invalid_token=$(( client4_token - 1 )) ip netns exec "$ns2" ./pm_nl_ctl rem token $invalid_token id\ $client_addr_id > /dev/null 2>&1 - print_test "RM_ADDR id:${client_addr_id} ns2 => ns1, invalid token" + print_test "RM_ADDR id:client ns2 => ns1, invalid token" local type type=$(mptcp_lib_evts_get_info type "$server_evts") if [ "$type" = "" ] @@ -394,7 +397,7 @@ test_remove() local invalid_id=$(( client_addr_id + 1 )) ip netns exec "$ns2" ./pm_nl_ctl rem token "$client4_token" id\ $invalid_id > /dev/null 2>&1 - print_test "RM_ADDR id:${invalid_id} ns2 => ns1, invalid id" + print_test "RM_ADDR id:client+1 ns2 => ns1, invalid id" type=$(mptcp_lib_evts_get_info type "$server_evts") if [ "$type" = "" ] then @@ -407,7 +410,7 @@ test_remove() :>"$server_evts" ip netns exec "$ns2" ./pm_nl_ctl rem token "$client4_token" id\ $client_addr_id - print_test "RM_ADDR id:${client_addr_id} ns2 => ns1" + print_test "RM_ADDR id:client ns2 => ns1" sleep 0.5 verify_remove_event "$server_evts" "$REMOVED" "$server4_token" "$client_addr_id" @@ -416,7 +419,7 @@ test_remove() client_addr_id=$(( client_addr_id - 1 )) ip netns exec "$ns2" ./pm_nl_ctl rem token "$client4_token" id\ $client_addr_id - print_test "RM_ADDR id:${client_addr_id} ns2 => ns1" + print_test "RM_ADDR id:client-1 ns2 => ns1" sleep 0.5 verify_remove_event "$server_evts" "$REMOVED" "$server4_token" "$client_addr_id" @@ -424,7 +427,7 @@ test_remove() :>"$server_evts" ip netns exec "$ns2" ./pm_nl_ctl rem token "$client6_token" id\ $client_addr_id - print_test "RM_ADDR6 id:${client_addr_id} ns2 => ns1" + print_test "RM_ADDR6 id:client-1 ns2 => ns1" sleep 0.5 verify_remove_event "$server_evts" "$REMOVED" "$server6_token" "$client_addr_id" @@ -434,7 +437,7 @@ test_remove() # RM_ADDR from the server to client machine ip netns exec "$ns1" ./pm_nl_ctl rem token "$server4_token" id\ $server_addr_id - print_test "RM_ADDR id:${server_addr_id} ns1 => ns2" + print_test "RM_ADDR id:server ns1 => ns2" sleep 0.5 verify_remove_event "$client_evts" "$REMOVED" "$client4_token" "$server_addr_id" @@ -443,7 +446,7 @@ test_remove() server_addr_id=$(( server_addr_id - 1 )) ip netns exec "$ns1" ./pm_nl_ctl rem token "$server4_token" id\ $server_addr_id - print_test "RM_ADDR id:${server_addr_id} ns1 => ns2" + print_test "RM_ADDR id:server-1 ns1 => ns2" sleep 0.5 verify_remove_event "$client_evts" "$REMOVED" "$client4_token" "$server_addr_id" @@ -451,7 +454,7 @@ test_remove() :>"$client_evts" ip netns exec "$ns1" ./pm_nl_ctl rem token "$server6_token" id\ $server_addr_id - print_test "RM_ADDR6 id:${server_addr_id} ns1 => ns2" + print_test "RM_ADDR6 id:server-1 ns1 => ns2" sleep 0.5 verify_remove_event "$client_evts" "$REMOVED" "$client6_token" "$server_addr_id" } @@ -479,8 +482,14 @@ verify_subflow_events() local locid local remid local info + local e_dport_txt - info="${e_saddr} (${e_from}) => ${e_daddr}:${e_dport} (${e_to})" + # only display the fixed ports + if [ "${e_dport}" -ge "${app4_port}" ] && [ "${e_dport}" -le "${app6_port}" ]; then + e_dport_txt=":${e_dport}" + fi + + info="${e_saddr} (${e_from}) => ${e_daddr}${e_dport_txt} (${e_to})" if [ "$e_type" = "$SUB_ESTABLISHED" ] then @@ -766,7 +775,7 @@ test_subflows_v4_v6_mix() :>"$client_evts" ip netns exec "$ns1" ./pm_nl_ctl ann 10.0.2.1 token "$server6_token" id\ $server_addr_id dev ns1eth2 - print_test "ADD_ADDR4 id:${server_addr_id} 10.0.2.1 (ns1) => ns2, reuse port" + print_test "ADD_ADDR4 id:server 10.0.2.1 (ns1) => ns2, reuse port" sleep 0.5 verify_announce_event "$client_evts" "$ANNOUNCED" "$client6_token" "10.0.2.1"\ "$server_addr_id" "$app6_port" @@ -861,7 +870,7 @@ test_listener() local listener_pid=$! sleep 0.5 - print_test "CREATE_LISTENER 10.0.2.2:$client4_port" + print_test "CREATE_LISTENER 10.0.2.2 (client port)" verify_listener_events $client_evts $LISTENER_CREATED $AF_INET 10.0.2.2 $client4_port # ADD_ADDR from client to server machine reusing the subflow port @@ -878,13 +887,14 @@ test_listener() mptcp_lib_kill_wait $listener_pid sleep 0.5 - print_test "CLOSE_LISTENER 10.0.2.2:$client4_port" + print_test "CLOSE_LISTENER 10.0.2.2 (client port)" verify_listener_events $client_evts $LISTENER_CLOSED $AF_INET 10.0.2.2 $client4_port } print_title "Make connections" make_connection make_connection "v6" +print_title "Will be using address IDs ${client_addr_id} (client) and ${server_addr_id} (server)" test_announce test_remove From 2eab4543a2204092c3a7af81d7d6c506e59a03a6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 14 Jun 2024 08:20:02 +0000 Subject: [PATCH 1078/1578] ipv6: prevent possible NULL deref in fib6_nh_init() syzbot reminds us that in6_dev_get() can return NULL. fib6_nh_init() ip6_validate_gw( &idev ) ip6_route_check_nh( idev ) *idev = in6_dev_get(dev); // can be NULL Oops: general protection fault, probably for non-canonical address 0xdffffc00000000bc: 0000 [#1] PREEMPT SMP KASAN PTI KASAN: null-ptr-deref in range [0x00000000000005e0-0x00000000000005e7] CPU: 0 PID: 11237 Comm: syz-executor.3 Not tainted 6.10.0-rc2-syzkaller-00249-gbe27b8965297 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 06/07/2024 RIP: 0010:fib6_nh_init+0x640/0x2160 net/ipv6/route.c:3606 Code: 00 00 fc ff df 4c 8b 64 24 58 48 8b 44 24 28 4c 8b 74 24 30 48 89 c1 48 89 44 24 28 48 8d 98 e0 05 00 00 48 89 d8 48 c1 e8 03 <42> 0f b6 04 38 84 c0 0f 85 b3 17 00 00 8b 1b 31 ff 89 de e8 b8 8b RSP: 0018:ffffc900032775a0 EFLAGS: 00010202 RAX: 00000000000000bc RBX: 00000000000005e0 RCX: 0000000000000000 RDX: 0000000000000010 RSI: ffffc90003277a54 RDI: ffff88802b3a08d8 RBP: ffffc900032778b0 R08: 00000000000002fc R09: 0000000000000000 R10: 00000000000002fc R11: 0000000000000000 R12: ffff88802b3a08b8 R13: 1ffff9200064eec8 R14: ffffc90003277a00 R15: dffffc0000000000 FS: 00007f940feb06c0(0000) GS:ffff8880b9400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000000245e8000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ip6_route_info_create+0x99e/0x12b0 net/ipv6/route.c:3809 ip6_route_add+0x28/0x160 net/ipv6/route.c:3853 ipv6_route_ioctl+0x588/0x870 net/ipv6/route.c:4483 inet6_ioctl+0x21a/0x280 net/ipv6/af_inet6.c:579 sock_do_ioctl+0x158/0x460 net/socket.c:1222 sock_ioctl+0x629/0x8e0 net/socket.c:1341 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:907 [inline] __se_sys_ioctl+0xfc/0x170 fs/ioctl.c:893 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f940f07cea9 Fixes: 428604fb118f ("ipv6: do not set routes if disable_ipv6 has been enabled") Reported-by: syzbot Signed-off-by: Eric Dumazet Acked-by: Lorenzo Bianconi Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240614082002.26407-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 952c2bf1170942..28788ffde5854f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3603,7 +3603,7 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, if (!dev) goto out; - if (idev->cnf.disable_ipv6) { + if (!idev || idev->cnf.disable_ipv6) { NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); err = -EACCES; goto out; From b86762dbe19a62e785c189f313cda5b989931f37 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 15 Jun 2024 15:14:54 +0000 Subject: [PATCH 1079/1578] ipv6: prevent possible NULL dereference in rt6_probe() syzbot caught a NULL dereference in rt6_probe() [1] Bail out if __in6_dev_get() returns NULL. [1] Oops: general protection fault, probably for non-canonical address 0xdffffc00000000cb: 0000 [#1] PREEMPT SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000658-0x000000000000065f] CPU: 1 PID: 22444 Comm: syz-executor.0 Not tainted 6.10.0-rc2-syzkaller-00383-gb8481381d4e2 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/02/2024 RIP: 0010:rt6_probe net/ipv6/route.c:656 [inline] RIP: 0010:find_match+0x8c4/0xf50 net/ipv6/route.c:758 Code: 14 fd f7 48 8b 85 38 ff ff ff 48 c7 45 b0 00 00 00 00 48 8d b8 5c 06 00 00 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 19 RSP: 0018:ffffc900034af070 EFLAGS: 00010203 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffc90004521000 RDX: 00000000000000cb RSI: ffffffff8990d0cd RDI: 000000000000065c RBP: ffffc900034af150 R08: 0000000000000005 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000002 R12: 000000000000000a R13: 1ffff92000695e18 R14: ffff8880244a1d20 R15: 0000000000000000 FS: 00007f4844a5a6c0(0000) GS:ffff8880b9300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b31b27000 CR3: 000000002d42c000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: rt6_nh_find_match+0xfa/0x1a0 net/ipv6/route.c:784 nexthop_for_each_fib6_nh+0x26d/0x4a0 net/ipv4/nexthop.c:1496 __find_rr_leaf+0x6e7/0xe00 net/ipv6/route.c:825 find_rr_leaf net/ipv6/route.c:853 [inline] rt6_select net/ipv6/route.c:897 [inline] fib6_table_lookup+0x57e/0xa30 net/ipv6/route.c:2195 ip6_pol_route+0x1cd/0x1150 net/ipv6/route.c:2231 pol_lookup_func include/net/ip6_fib.h:616 [inline] fib6_rule_lookup+0x386/0x720 net/ipv6/fib6_rules.c:121 ip6_route_output_flags_noref net/ipv6/route.c:2639 [inline] ip6_route_output_flags+0x1d0/0x640 net/ipv6/route.c:2651 ip6_dst_lookup_tail.constprop.0+0x961/0x1760 net/ipv6/ip6_output.c:1147 ip6_dst_lookup_flow+0x99/0x1d0 net/ipv6/ip6_output.c:1250 rawv6_sendmsg+0xdab/0x4340 net/ipv6/raw.c:898 inet_sendmsg+0x119/0x140 net/ipv4/af_inet.c:853 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] sock_write_iter+0x4b8/0x5c0 net/socket.c:1160 new_sync_write fs/read_write.c:497 [inline] vfs_write+0x6b6/0x1140 fs/read_write.c:590 ksys_write+0x1f8/0x260 fs/read_write.c:643 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x250 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: 52e1635631b3 ("[IPV6]: ROUTE: Add router_probe_interval sysctl.") Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240615151454.166404-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 28788ffde5854f..8d72ca0b086d7d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -638,6 +638,8 @@ static void rt6_probe(struct fib6_nh *fib6_nh) rcu_read_lock(); last_probe = READ_ONCE(fib6_nh->last_probe); idev = __in6_dev_get(dev); + if (!idev) + goto out; neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); if (neigh) { if (READ_ONCE(neigh->nud_state) & NUD_VALID) From d46401052c2d5614da8efea5788532f0401cb164 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 15 Jun 2024 15:42:31 +0000 Subject: [PATCH 1080/1578] xfrm6: check ip6_dst_idev() return value in xfrm6_get_saddr() ip6_dst_idev() can return NULL, xfrm6_get_saddr() must act accordingly. syzbot reported: Oops: general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] PREEMPT SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] CPU: 1 PID: 12 Comm: kworker/u8:1 Not tainted 6.10.0-rc2-syzkaller-00383-gb8481381d4e2 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/02/2024 Workqueue: wg-kex-wg1 wg_packet_handshake_send_worker RIP: 0010:xfrm6_get_saddr+0x93/0x130 net/ipv6/xfrm6_policy.c:64 Code: df 48 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 97 00 00 00 4c 8b ab d8 00 00 00 48 b8 00 00 00 00 00 fc ff df 4c 89 ea 48 c1 ea 03 <80> 3c 02 00 0f 85 86 00 00 00 4d 8b 6d 00 e8 ca 13 47 01 48 b8 00 RSP: 0018:ffffc90000117378 EFLAGS: 00010246 RAX: dffffc0000000000 RBX: ffff88807b079dc0 RCX: ffffffff89a0d6d7 RDX: 0000000000000000 RSI: ffffffff89a0d6e9 RDI: ffff88807b079e98 RBP: ffff88807ad73248 R08: 0000000000000007 R09: fffffffffffff000 R10: ffff88807b079dc0 R11: 0000000000000007 R12: ffffc90000117480 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff8880b9300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f4586d00440 CR3: 0000000079042000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: xfrm_get_saddr net/xfrm/xfrm_policy.c:2452 [inline] xfrm_tmpl_resolve_one net/xfrm/xfrm_policy.c:2481 [inline] xfrm_tmpl_resolve+0xa26/0xf10 net/xfrm/xfrm_policy.c:2541 xfrm_resolve_and_create_bundle+0x140/0x2570 net/xfrm/xfrm_policy.c:2835 xfrm_bundle_lookup net/xfrm/xfrm_policy.c:3070 [inline] xfrm_lookup_with_ifid+0x4d1/0x1e60 net/xfrm/xfrm_policy.c:3201 xfrm_lookup net/xfrm/xfrm_policy.c:3298 [inline] xfrm_lookup_route+0x3b/0x200 net/xfrm/xfrm_policy.c:3309 ip6_dst_lookup_flow+0x15c/0x1d0 net/ipv6/ip6_output.c:1256 send6+0x611/0xd20 drivers/net/wireguard/socket.c:139 wg_socket_send_skb_to_peer+0xf9/0x220 drivers/net/wireguard/socket.c:178 wg_socket_send_buffer_to_peer+0x12b/0x190 drivers/net/wireguard/socket.c:200 wg_packet_send_handshake_initiation+0x227/0x360 drivers/net/wireguard/send.c:40 wg_packet_handshake_send_worker+0x1c/0x30 drivers/net/wireguard/send.c:51 process_one_work+0x9fb/0x1b60 kernel/workqueue.c:3231 process_scheduled_works kernel/workqueue.c:3312 [inline] worker_thread+0x6c8/0xf70 kernel/workqueue.c:3393 kthread+0x2c1/0x3a0 kernel/kthread.c:389 ret_from_fork+0x45/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240615154231.234442-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/xfrm6_policy.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index cc885d3aa9e594..2f1ea5f999a259 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -56,12 +56,18 @@ static int xfrm6_get_saddr(struct net *net, int oif, { struct dst_entry *dst; struct net_device *dev; + struct inet6_dev *idev; dst = xfrm6_dst_lookup(net, 0, oif, NULL, daddr, mark); if (IS_ERR(dst)) return -EHOSTUNREACH; - dev = ip6_dst_idev(dst)->dev; + idev = ip6_dst_idev(dst); + if (!idev) { + dst_release(dst); + return -EHOSTUNREACH; + } + dev = idev->dev; ipv6_dev_get_saddr(dev_net(dev), dev, &daddr->in6, 0, &saddr->in6); dst_release(dst); return 0; From ff960f9d3edbe08a736b5a224d91a305ccc946b0 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Fri, 14 Jun 2024 21:13:02 +0800 Subject: [PATCH 1081/1578] netns: Make get_net_ns() handle zero refcount net Syzkaller hit a warning: refcount_t: addition on 0; use-after-free. WARNING: CPU: 3 PID: 7890 at lib/refcount.c:25 refcount_warn_saturate+0xdf/0x1d0 Modules linked in: CPU: 3 PID: 7890 Comm: tun Not tainted 6.10.0-rc3-00100-gcaa4f9578aba-dirty #310 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 RIP: 0010:refcount_warn_saturate+0xdf/0x1d0 Code: 41 49 04 31 ff 89 de e8 9f 1e cd fe 84 db 75 9c e8 76 26 cd fe c6 05 b6 41 49 04 01 90 48 c7 c7 b8 8e 25 86 e8 d2 05 b5 fe 90 <0f> 0b 90 90 e9 79 ff ff ff e8 53 26 cd fe 0f b6 1 RSP: 0018:ffff8881067b7da0 EFLAGS: 00010286 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffff811c72ac RDX: ffff8881026a2140 RSI: ffffffff811c72b5 RDI: 0000000000000001 RBP: ffff8881067b7db0 R08: 0000000000000000 R09: 205b5d3730353139 R10: 0000000000000000 R11: 205d303938375420 R12: ffff8881086500c4 R13: ffff8881086500c4 R14: ffff8881086500b0 R15: ffff888108650040 FS: 00007f5b2961a4c0(0000) GS:ffff88823bd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055d7ed36fd18 CR3: 00000001482f6000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? show_regs+0xa3/0xc0 ? __warn+0xa5/0x1c0 ? refcount_warn_saturate+0xdf/0x1d0 ? report_bug+0x1fc/0x2d0 ? refcount_warn_saturate+0xdf/0x1d0 ? handle_bug+0xa1/0x110 ? exc_invalid_op+0x3c/0xb0 ? asm_exc_invalid_op+0x1f/0x30 ? __warn_printk+0xcc/0x140 ? __warn_printk+0xd5/0x140 ? refcount_warn_saturate+0xdf/0x1d0 get_net_ns+0xa4/0xc0 ? __pfx_get_net_ns+0x10/0x10 open_related_ns+0x5a/0x130 __tun_chr_ioctl+0x1616/0x2370 ? __sanitizer_cov_trace_switch+0x58/0xa0 ? __sanitizer_cov_trace_const_cmp2+0x1c/0x30 ? __pfx_tun_chr_ioctl+0x10/0x10 tun_chr_ioctl+0x2f/0x40 __x64_sys_ioctl+0x11b/0x160 x64_sys_call+0x1211/0x20d0 do_syscall_64+0x9e/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f5b28f165d7 Code: b3 66 90 48 8b 05 b1 48 2d 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 81 48 2d 00 8 RSP: 002b:00007ffc2b59c5e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f5b28f165d7 RDX: 0000000000000000 RSI: 00000000000054e3 RDI: 0000000000000003 RBP: 00007ffc2b59c650 R08: 00007f5b291ed8c0 R09: 00007f5b2961a4c0 R10: 0000000029690010 R11: 0000000000000246 R12: 0000000000400730 R13: 00007ffc2b59cf40 R14: 0000000000000000 R15: 0000000000000000 Kernel panic - not syncing: kernel: panic_on_warn set ... This is trigger as below: ns0 ns1 tun_set_iff() //dev is tun0 tun->dev = dev //ip link set tun0 netns ns1 put_net() //ref is 0 __tun_chr_ioctl() //TUNGETDEVNETNS net = dev_net(tun->dev); open_related_ns(&net->ns, get_net_ns); //ns1 get_net_ns() get_net() //addition on 0 Use maybe_get_net() in get_net_ns in case net's ref is zero to fix this Fixes: 0c3e0e3bb623 ("tun: Add ioctl() TUNGETDEVNETNS cmd to allow obtaining real net ns of tun device") Signed-off-by: Yue Haibing Link: https://lore.kernel.org/r/20240614131302.2698509-1-yuehaibing@huawei.com Signed-off-by: Paolo Abeni --- net/core/net_namespace.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 4f7a61688d189d..6a823ba906c658 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -693,11 +693,16 @@ EXPORT_SYMBOL_GPL(__put_net); * get_net_ns - increment the refcount of the network namespace * @ns: common namespace (net) * - * Returns the net's common namespace. + * Returns the net's common namespace or ERR_PTR() if ref is zero. */ struct ns_common *get_net_ns(struct ns_common *ns) { - return &get_net(container_of(ns, struct net, ns))->ns; + struct net *net; + + net = maybe_get_net(container_of(ns, struct net, ns)); + if (net) + return &net->ns; + return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(get_net_ns); From 2d7198278ece01818cd95a3beffbdf8b2a353fa0 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Fri, 14 Jun 2024 16:50:30 +0200 Subject: [PATCH 1082/1578] qca_spi: Make interrupt remembering atomic The whole mechanism to remember occurred SPI interrupts is not atomic, which could lead to unexpected behavior. So fix this by using atomic bit operations instead. Fixes: 291ab06ecf67 ("net: qualcomm: new Ethernet over SPI driver for QCA7000") Signed-off-by: Stefan Wahren Link: https://lore.kernel.org/r/20240614145030.7781-1-wahrenst@gmx.net Signed-off-by: Paolo Abeni --- drivers/net/ethernet/qualcomm/qca_debug.c | 6 ++---- drivers/net/ethernet/qualcomm/qca_spi.c | 16 ++++++++-------- drivers/net/ethernet/qualcomm/qca_spi.h | 3 +-- 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/qualcomm/qca_debug.c b/drivers/net/ethernet/qualcomm/qca_debug.c index ff3b89e9028e9b..ad06da0fdaa04a 100644 --- a/drivers/net/ethernet/qualcomm/qca_debug.c +++ b/drivers/net/ethernet/qualcomm/qca_debug.c @@ -98,10 +98,8 @@ qcaspi_info_show(struct seq_file *s, void *what) seq_printf(s, "IRQ : %d\n", qca->spi_dev->irq); - seq_printf(s, "INTR REQ : %u\n", - qca->intr_req); - seq_printf(s, "INTR SVC : %u\n", - qca->intr_svc); + seq_printf(s, "INTR : %lx\n", + qca->intr); seq_printf(s, "SPI max speed : %lu\n", (unsigned long)qca->spi_dev->max_speed_hz); diff --git a/drivers/net/ethernet/qualcomm/qca_spi.c b/drivers/net/ethernet/qualcomm/qca_spi.c index 5799ecc88a875b..8f7ce6b51a1c9b 100644 --- a/drivers/net/ethernet/qualcomm/qca_spi.c +++ b/drivers/net/ethernet/qualcomm/qca_spi.c @@ -35,6 +35,8 @@ #define MAX_DMA_BURST_LEN 5000 +#define SPI_INTR 0 + /* Modules parameters */ #define QCASPI_CLK_SPEED_MIN 1000000 #define QCASPI_CLK_SPEED_MAX 16000000 @@ -579,14 +581,14 @@ qcaspi_spi_thread(void *data) continue; } - if ((qca->intr_req == qca->intr_svc) && + if (!test_bit(SPI_INTR, &qca->intr) && !qca->txr.skb[qca->txr.head]) schedule(); set_current_state(TASK_RUNNING); - netdev_dbg(qca->net_dev, "have work to do. int: %d, tx_skb: %p\n", - qca->intr_req - qca->intr_svc, + netdev_dbg(qca->net_dev, "have work to do. int: %lu, tx_skb: %p\n", + qca->intr, qca->txr.skb[qca->txr.head]); qcaspi_qca7k_sync(qca, QCASPI_EVENT_UPDATE); @@ -600,8 +602,7 @@ qcaspi_spi_thread(void *data) msleep(QCASPI_QCA7K_REBOOT_TIME_MS); } - if (qca->intr_svc != qca->intr_req) { - qca->intr_svc = qca->intr_req; + if (test_and_clear_bit(SPI_INTR, &qca->intr)) { start_spi_intr_handling(qca, &intr_cause); if (intr_cause & SPI_INT_CPU_ON) { @@ -663,7 +664,7 @@ qcaspi_intr_handler(int irq, void *data) { struct qcaspi *qca = data; - qca->intr_req++; + set_bit(SPI_INTR, &qca->intr); if (qca->spi_thread) wake_up_process(qca->spi_thread); @@ -679,8 +680,7 @@ qcaspi_netdev_open(struct net_device *dev) if (!qca) return -EINVAL; - qca->intr_req = 1; - qca->intr_svc = 0; + set_bit(SPI_INTR, &qca->intr); qca->sync = QCASPI_SYNC_UNKNOWN; qcafrm_fsm_init_spi(&qca->frm_handle); diff --git a/drivers/net/ethernet/qualcomm/qca_spi.h b/drivers/net/ethernet/qualcomm/qca_spi.h index d59cb2352ceecc..8f4808695e8206 100644 --- a/drivers/net/ethernet/qualcomm/qca_spi.h +++ b/drivers/net/ethernet/qualcomm/qca_spi.h @@ -81,8 +81,7 @@ struct qcaspi { struct qcafrm_handle frm_handle; struct sk_buff *rx_skb; - unsigned int intr_req; - unsigned int intr_svc; + unsigned long intr; u16 reset_count; #ifdef CONFIG_DEBUG_FS From 8039156e23bc07a0039168266dbe68b30cddf7b2 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Mon, 17 Jun 2024 18:37:09 -0700 Subject: [PATCH 1083/1578] sound/oss/dmasound: add missing MODULE_DESCRIPTION() macro With ARCH=m68k, make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in sound/oss/dmasound/dmasound_core.o Add the missing invocation of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Reviewed-by: Geert Uytterhoeven Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/20240617-md-m68k-sound-oss-dmasound-v1-1-5c19306be930@quicinc.com --- sound/oss/dmasound/dmasound_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/oss/dmasound/dmasound_core.c b/sound/oss/dmasound/dmasound_core.c index 164335d3c20092..4b1baf4dd50eba 100644 --- a/sound/oss/dmasound/dmasound_core.c +++ b/sound/oss/dmasound/dmasound_core.c @@ -204,6 +204,7 @@ module_param(numWriteBufs, int, 0); static unsigned int writeBufSize = DEFAULT_BUFF_SIZE ; /* in bytes */ module_param(writeBufSize, int, 0); +MODULE_DESCRIPTION("Atari/Amiga/Q40 core DMA sound driver"); MODULE_LICENSE("GPL"); static int sq_unit = -1; From 70794b9563fe011988bcf6a081af9777e63e8d37 Mon Sep 17 00:00:00 2001 From: Kailang Yang Date: Tue, 18 Jun 2024 14:16:04 +0800 Subject: [PATCH 1084/1578] ALSA: hda/realtek: Add more codec ID to no shutup pins list If it enter to runtime D3 state, it didn't shutup Headset MIC pin. Signed-off-by: Kailang Yang Link: https://lore.kernel.org/r/8d86f61e7d6f4a03b311e4eb4e5caaef@realtek.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index fd337fdd30f69a..e2dbcf8f5bcfb3 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -583,10 +583,14 @@ static void alc_shutup_pins(struct hda_codec *codec) switch (codec->core.vendor_id) { case 0x10ec0236: case 0x10ec0256: + case 0x10ec0257: case 0x19e58326: case 0x10ec0283: + case 0x10ec0285: case 0x10ec0286: + case 0x10ec0287: case 0x10ec0288: + case 0x10ec0295: case 0x10ec0298: alc_headset_mic_no_shutup(codec); break; From 7725363936a88351b71495774c1e0e852ae4cdca Mon Sep 17 00:00:00 2001 From: Raju Lakkaraju Date: Fri, 14 Jun 2024 22:41:55 +0530 Subject: [PATCH 1085/1578] net: lan743x: disable WOL upon resume to restore full data path operation When Wake-on-LAN (WoL) is active and the system is in suspend mode, triggering a system event can wake the system from sleep, which may block the data path. To restore normal data path functionality after waking, disable all wake-up events. Furthermore, clear all Write 1 to Clear (W1C) status bits by writing 1's to them. Fixes: 4d94282afd95 ("lan743x: Add power management support") Reviewed-by: Wojciech Drewek Signed-off-by: Raju Lakkaraju Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microchip/lan743x_main.c | 30 ++++++++++++++++--- drivers/net/ethernet/microchip/lan743x_main.h | 24 +++++++++++++++ 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index 6be8a43c908a8d..48835bdc2e63ac 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -3575,7 +3575,7 @@ static void lan743x_pm_set_wol(struct lan743x_adapter *adapter) /* clear wake settings */ pmtctl = lan743x_csr_read(adapter, PMT_CTL); - pmtctl |= PMT_CTL_WUPS_MASK_; + pmtctl |= PMT_CTL_WUPS_MASK_ | PMT_CTL_RES_CLR_WKP_MASK_; pmtctl &= ~(PMT_CTL_GPIO_WAKEUP_EN_ | PMT_CTL_EEE_WAKEUP_EN_ | PMT_CTL_WOL_EN_ | PMT_CTL_MAC_D3_RX_CLK_OVR_ | PMT_CTL_RX_FCT_RFE_D3_CLK_OVR_ | PMT_CTL_ETH_PHY_WAKE_EN_); @@ -3710,6 +3710,7 @@ static int lan743x_pm_resume(struct device *dev) struct pci_dev *pdev = to_pci_dev(dev); struct net_device *netdev = pci_get_drvdata(pdev); struct lan743x_adapter *adapter = netdev_priv(netdev); + u32 data; int ret; pci_set_power_state(pdev, PCI_D0); @@ -3728,6 +3729,30 @@ static int lan743x_pm_resume(struct device *dev) return ret; } + ret = lan743x_csr_read(adapter, MAC_WK_SRC); + netif_dbg(adapter, drv, adapter->netdev, + "Wakeup source : 0x%08X\n", ret); + + /* Clear the wol configuration and status bits. Note that + * the status bits are "Write One to Clear (W1C)" + */ + data = MAC_WUCSR_EEE_TX_WAKE_ | MAC_WUCSR_EEE_RX_WAKE_ | + MAC_WUCSR_RFE_WAKE_FR_ | MAC_WUCSR_PFDA_FR_ | MAC_WUCSR_WUFR_ | + MAC_WUCSR_MPR_ | MAC_WUCSR_BCAST_FR_; + lan743x_csr_write(adapter, MAC_WUCSR, data); + + data = MAC_WUCSR2_NS_RCD_ | MAC_WUCSR2_ARP_RCD_ | + MAC_WUCSR2_IPV6_TCPSYN_RCD_ | MAC_WUCSR2_IPV4_TCPSYN_RCD_; + lan743x_csr_write(adapter, MAC_WUCSR2, data); + + data = MAC_WK_SRC_ETH_PHY_WK_ | MAC_WK_SRC_IPV6_TCPSYN_RCD_WK_ | + MAC_WK_SRC_IPV4_TCPSYN_RCD_WK_ | MAC_WK_SRC_EEE_TX_WK_ | + MAC_WK_SRC_EEE_RX_WK_ | MAC_WK_SRC_RFE_FR_WK_ | + MAC_WK_SRC_PFDA_FR_WK_ | MAC_WK_SRC_MP_FR_WK_ | + MAC_WK_SRC_BCAST_FR_WK_ | MAC_WK_SRC_WU_FR_WK_ | + MAC_WK_SRC_WK_FR_SAVED_; + lan743x_csr_write(adapter, MAC_WK_SRC, data); + /* open netdev when netdev is at running state while resume. * For instance, it is true when system wakesup after pm-suspend * However, it is false when system wakes up after suspend GUI menu @@ -3736,9 +3761,6 @@ static int lan743x_pm_resume(struct device *dev) lan743x_netdev_open(netdev); netif_device_attach(netdev); - ret = lan743x_csr_read(adapter, MAC_WK_SRC); - netif_info(adapter, drv, adapter->netdev, - "Wakeup source : 0x%08X\n", ret); return 0; } diff --git a/drivers/net/ethernet/microchip/lan743x_main.h b/drivers/net/ethernet/microchip/lan743x_main.h index 645bc048e52ef5..fac0f33d10b2e3 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.h +++ b/drivers/net/ethernet/microchip/lan743x_main.h @@ -61,6 +61,7 @@ #define PMT_CTL_RX_FCT_RFE_D3_CLK_OVR_ BIT(18) #define PMT_CTL_GPIO_WAKEUP_EN_ BIT(15) #define PMT_CTL_EEE_WAKEUP_EN_ BIT(13) +#define PMT_CTL_RES_CLR_WKP_MASK_ GENMASK(9, 8) #define PMT_CTL_READY_ BIT(7) #define PMT_CTL_ETH_PHY_RST_ BIT(4) #define PMT_CTL_WOL_EN_ BIT(3) @@ -227,12 +228,31 @@ #define MAC_WUCSR (0x140) #define MAC_MP_SO_EN_ BIT(21) #define MAC_WUCSR_RFE_WAKE_EN_ BIT(14) +#define MAC_WUCSR_EEE_TX_WAKE_ BIT(13) +#define MAC_WUCSR_EEE_RX_WAKE_ BIT(11) +#define MAC_WUCSR_RFE_WAKE_FR_ BIT(9) +#define MAC_WUCSR_PFDA_FR_ BIT(7) +#define MAC_WUCSR_WUFR_ BIT(6) +#define MAC_WUCSR_MPR_ BIT(5) +#define MAC_WUCSR_BCAST_FR_ BIT(4) #define MAC_WUCSR_PFDA_EN_ BIT(3) #define MAC_WUCSR_WAKE_EN_ BIT(2) #define MAC_WUCSR_MPEN_ BIT(1) #define MAC_WUCSR_BCST_EN_ BIT(0) #define MAC_WK_SRC (0x144) +#define MAC_WK_SRC_ETH_PHY_WK_ BIT(17) +#define MAC_WK_SRC_IPV6_TCPSYN_RCD_WK_ BIT(16) +#define MAC_WK_SRC_IPV4_TCPSYN_RCD_WK_ BIT(15) +#define MAC_WK_SRC_EEE_TX_WK_ BIT(14) +#define MAC_WK_SRC_EEE_RX_WK_ BIT(13) +#define MAC_WK_SRC_RFE_FR_WK_ BIT(12) +#define MAC_WK_SRC_PFDA_FR_WK_ BIT(11) +#define MAC_WK_SRC_MP_FR_WK_ BIT(10) +#define MAC_WK_SRC_BCAST_FR_WK_ BIT(9) +#define MAC_WK_SRC_WU_FR_WK_ BIT(8) +#define MAC_WK_SRC_WK_FR_SAVED_ BIT(7) + #define MAC_MP_SO_HI (0x148) #define MAC_MP_SO_LO (0x14C) @@ -295,6 +315,10 @@ #define RFE_INDX(index) (0x580 + (index << 2)) #define MAC_WUCSR2 (0x600) +#define MAC_WUCSR2_NS_RCD_ BIT(7) +#define MAC_WUCSR2_ARP_RCD_ BIT(6) +#define MAC_WUCSR2_IPV6_TCPSYN_RCD_ BIT(5) +#define MAC_WUCSR2_IPV4_TCPSYN_RCD_ BIT(4) #define SGMII_ACC (0x720) #define SGMII_ACC_SGMII_BZY_ BIT(31) From 8c248cd836014339498486f14f435c0e344183a7 Mon Sep 17 00:00:00 2001 From: Raju Lakkaraju Date: Fri, 14 Jun 2024 22:41:56 +0530 Subject: [PATCH 1086/1578] net: lan743x: Support WOL at both the PHY and MAC appropriately Prevent options not supported by the PHY from being requested to it by the MAC Whenever a WOL option is supported by both, the PHY is given priority since that usually leads to better power savings. Fixes: e9e13b6adc33 ("lan743x: fix for potential NULL pointer dereference with bare card") Reviewed-by: Wojciech Drewek Signed-off-by: Raju Lakkaraju Signed-off-by: Paolo Abeni --- .../net/ethernet/microchip/lan743x_ethtool.c | 44 +++++++++++++++++-- drivers/net/ethernet/microchip/lan743x_main.c | 18 ++++++-- drivers/net/ethernet/microchip/lan743x_main.h | 4 ++ 3 files changed, 58 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan743x_ethtool.c b/drivers/net/ethernet/microchip/lan743x_ethtool.c index d0f4ff4ee07594..0d1740d6467693 100644 --- a/drivers/net/ethernet/microchip/lan743x_ethtool.c +++ b/drivers/net/ethernet/microchip/lan743x_ethtool.c @@ -1127,8 +1127,12 @@ static void lan743x_ethtool_get_wol(struct net_device *netdev, if (netdev->phydev) phy_ethtool_get_wol(netdev->phydev, wol); - wol->supported |= WAKE_BCAST | WAKE_UCAST | WAKE_MCAST | - WAKE_MAGIC | WAKE_PHY | WAKE_ARP; + if (wol->supported != adapter->phy_wol_supported) + netif_warn(adapter, drv, adapter->netdev, + "PHY changed its supported WOL! old=%x, new=%x\n", + adapter->phy_wol_supported, wol->supported); + + wol->supported |= MAC_SUPPORTED_WAKES; if (adapter->is_pci11x1x) wol->supported |= WAKE_MAGICSECURE; @@ -1143,7 +1147,39 @@ static int lan743x_ethtool_set_wol(struct net_device *netdev, { struct lan743x_adapter *adapter = netdev_priv(netdev); + /* WAKE_MAGICSEGURE is a modifier of and only valid together with + * WAKE_MAGIC + */ + if ((wol->wolopts & WAKE_MAGICSECURE) && !(wol->wolopts & WAKE_MAGIC)) + return -EINVAL; + + if (netdev->phydev) { + struct ethtool_wolinfo phy_wol; + int ret; + + phy_wol.wolopts = wol->wolopts & adapter->phy_wol_supported; + + /* If WAKE_MAGICSECURE was requested, filter out WAKE_MAGIC + * for PHYs that do not support WAKE_MAGICSECURE + */ + if (wol->wolopts & WAKE_MAGICSECURE && + !(adapter->phy_wol_supported & WAKE_MAGICSECURE)) + phy_wol.wolopts &= ~WAKE_MAGIC; + + ret = phy_ethtool_set_wol(netdev->phydev, &phy_wol); + if (ret && (ret != -EOPNOTSUPP)) + return ret; + + if (ret == -EOPNOTSUPP) + adapter->phy_wolopts = 0; + else + adapter->phy_wolopts = phy_wol.wolopts; + } else { + adapter->phy_wolopts = 0; + } + adapter->wolopts = 0; + wol->wolopts &= ~adapter->phy_wolopts; if (wol->wolopts & WAKE_UCAST) adapter->wolopts |= WAKE_UCAST; if (wol->wolopts & WAKE_MCAST) @@ -1164,10 +1200,10 @@ static int lan743x_ethtool_set_wol(struct net_device *netdev, memset(adapter->sopass, 0, sizeof(u8) * SOPASS_MAX); } + wol->wolopts = adapter->wolopts | adapter->phy_wolopts; device_set_wakeup_enable(&adapter->pdev->dev, (bool)wol->wolopts); - return netdev->phydev ? phy_ethtool_set_wol(netdev->phydev, wol) - : -ENETDOWN; + return 0; } #endif /* CONFIG_PM */ diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index 48835bdc2e63ac..e418539565b18d 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -3118,6 +3118,17 @@ static int lan743x_netdev_open(struct net_device *netdev) if (ret) goto close_tx; } + +#ifdef CONFIG_PM + if (adapter->netdev->phydev) { + struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; + + phy_ethtool_get_wol(netdev->phydev, &wol); + adapter->phy_wol_supported = wol.supported; + adapter->phy_wolopts = wol.wolopts; + } +#endif + return 0; close_tx: @@ -3587,10 +3598,9 @@ static void lan743x_pm_set_wol(struct lan743x_adapter *adapter) pmtctl |= PMT_CTL_ETH_PHY_D3_COLD_OVR_ | PMT_CTL_ETH_PHY_D3_OVR_; - if (adapter->wolopts & WAKE_PHY) { - pmtctl |= PMT_CTL_ETH_PHY_EDPD_PLL_CTL_; + if (adapter->phy_wolopts) pmtctl |= PMT_CTL_ETH_PHY_WAKE_EN_; - } + if (adapter->wolopts & WAKE_MAGIC) { wucsr |= MAC_WUCSR_MPEN_; macrx |= MAC_RX_RXEN_; @@ -3686,7 +3696,7 @@ static int lan743x_pm_suspend(struct device *dev) lan743x_csr_write(adapter, MAC_WUCSR2, 0); lan743x_csr_write(adapter, MAC_WK_SRC, 0xFFFFFFFF); - if (adapter->wolopts) + if (adapter->wolopts || adapter->phy_wolopts) lan743x_pm_set_wol(adapter); if (adapter->is_pci11x1x) { diff --git a/drivers/net/ethernet/microchip/lan743x_main.h b/drivers/net/ethernet/microchip/lan743x_main.h index fac0f33d10b2e3..3b2585a384e2c5 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.h +++ b/drivers/net/ethernet/microchip/lan743x_main.h @@ -1042,6 +1042,8 @@ enum lan743x_sgmii_lsd { LINK_2500_SLAVE }; +#define MAC_SUPPORTED_WAKES (WAKE_BCAST | WAKE_UCAST | WAKE_MCAST | \ + WAKE_MAGIC | WAKE_ARP) struct lan743x_adapter { struct net_device *netdev; struct mii_bus *mdiobus; @@ -1049,6 +1051,8 @@ struct lan743x_adapter { #ifdef CONFIG_PM u32 wolopts; u8 sopass[SOPASS_MAX]; + u32 phy_wolopts; + u32 phy_wol_supported; #endif struct pci_dev *pdev; struct lan743x_csr csr; From c44d3ffd85db03ebcc3090e55589e10d5af9f3a9 Mon Sep 17 00:00:00 2001 From: Raju Lakkaraju Date: Fri, 14 Jun 2024 22:41:57 +0530 Subject: [PATCH 1087/1578] net: phy: mxl-gpy: Remove interrupt mask clearing from config_init When the system resumes from sleep, the phy_init_hw() function invokes config_init(), which clears all interrupt masks and causes wake events to be lost in subsequent wake sequences. Remove interrupt mask clearing from config_init() and preserve relevant masks in config_intr(). Fixes: 7d901a1e878a ("net: phy: add Maxlinear GPY115/21x/24x driver") Reviewed-by: Wojciech Drewek Signed-off-by: Raju Lakkaraju Signed-off-by: Paolo Abeni --- drivers/net/phy/mxl-gpy.c | 58 +++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 20 deletions(-) diff --git a/drivers/net/phy/mxl-gpy.c b/drivers/net/phy/mxl-gpy.c index b2d36a3a96f1e9..e5f8ac4b4604bd 100644 --- a/drivers/net/phy/mxl-gpy.c +++ b/drivers/net/phy/mxl-gpy.c @@ -107,6 +107,7 @@ struct gpy_priv { u8 fw_major; u8 fw_minor; + u32 wolopts; /* It takes 3 seconds to fully switch out of loopback mode before * it can safely re-enter loopback mode. Record the time when @@ -221,6 +222,15 @@ static int gpy_hwmon_register(struct phy_device *phydev) } #endif +static int gpy_ack_interrupt(struct phy_device *phydev) +{ + int ret; + + /* Clear all pending interrupts */ + ret = phy_read(phydev, PHY_ISTAT); + return ret < 0 ? ret : 0; +} + static int gpy_mbox_read(struct phy_device *phydev, u32 addr) { struct gpy_priv *priv = phydev->priv; @@ -262,16 +272,8 @@ static int gpy_mbox_read(struct phy_device *phydev, u32 addr) static int gpy_config_init(struct phy_device *phydev) { - int ret; - - /* Mask all interrupts */ - ret = phy_write(phydev, PHY_IMASK, 0); - if (ret) - return ret; - - /* Clear all pending interrupts */ - ret = phy_read(phydev, PHY_ISTAT); - return ret < 0 ? ret : 0; + /* Nothing to configure. Configuration Requirement Placeholder */ + return 0; } static int gpy21x_config_init(struct phy_device *phydev) @@ -627,11 +629,23 @@ static int gpy_read_status(struct phy_device *phydev) static int gpy_config_intr(struct phy_device *phydev) { + struct gpy_priv *priv = phydev->priv; u16 mask = 0; + int ret; + + ret = gpy_ack_interrupt(phydev); + if (ret) + return ret; if (phydev->interrupts == PHY_INTERRUPT_ENABLED) mask = PHY_IMASK_MASK; + if (priv->wolopts & WAKE_MAGIC) + mask |= PHY_IMASK_WOL; + + if (priv->wolopts & WAKE_PHY) + mask |= PHY_IMASK_LSTC; + return phy_write(phydev, PHY_IMASK, mask); } @@ -678,6 +692,7 @@ static int gpy_set_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol) { struct net_device *attach_dev = phydev->attached_dev; + struct gpy_priv *priv = phydev->priv; int ret; if (wol->wolopts & WAKE_MAGIC) { @@ -725,6 +740,8 @@ static int gpy_set_wol(struct phy_device *phydev, ret = phy_read(phydev, PHY_ISTAT); if (ret < 0) return ret; + + priv->wolopts |= WAKE_MAGIC; } else { /* Disable magic packet matching */ ret = phy_clear_bits_mmd(phydev, MDIO_MMD_VEND2, @@ -732,6 +749,13 @@ static int gpy_set_wol(struct phy_device *phydev, WOL_EN); if (ret < 0) return ret; + + /* Disable the WOL interrupt */ + ret = phy_clear_bits(phydev, PHY_IMASK, PHY_IMASK_WOL); + if (ret < 0) + return ret; + + priv->wolopts &= ~WAKE_MAGIC; } if (wol->wolopts & WAKE_PHY) { @@ -748,9 +772,11 @@ static int gpy_set_wol(struct phy_device *phydev, if (ret & (PHY_IMASK_MASK & ~PHY_IMASK_LSTC)) phy_trigger_machine(phydev); + priv->wolopts |= WAKE_PHY; return 0; } + priv->wolopts &= ~WAKE_PHY; /* Disable the link state change interrupt */ return phy_clear_bits(phydev, PHY_IMASK, PHY_IMASK_LSTC); } @@ -758,18 +784,10 @@ static int gpy_set_wol(struct phy_device *phydev, static void gpy_get_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol) { - int ret; + struct gpy_priv *priv = phydev->priv; wol->supported = WAKE_MAGIC | WAKE_PHY; - wol->wolopts = 0; - - ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, VPSPEC2_WOL_CTL); - if (ret & WOL_EN) - wol->wolopts |= WAKE_MAGIC; - - ret = phy_read(phydev, PHY_IMASK); - if (ret & PHY_IMASK_LSTC) - wol->wolopts |= WAKE_PHY; + wol->wolopts = priv->wolopts; } static int gpy_loopback(struct phy_device *phydev, bool enable) From d864319871b05fadd153e0aede4811ca7008f5d6 Mon Sep 17 00:00:00 2001 From: David Ruth Date: Fri, 14 Jun 2024 19:03:26 +0000 Subject: [PATCH 1088/1578] net/sched: act_api: fix possible infinite loop in tcf_idr_check_alloc() syzbot found hanging tasks waiting on rtnl_lock [1] A reproducer is available in the syzbot bug. When a request to add multiple actions with the same index is sent, the second request will block forever on the first request. This holds rtnl_lock, and causes tasks to hang. Return -EAGAIN to prevent infinite looping, while keeping documented behavior. [1] INFO: task kworker/1:0:5088 blocked for more than 143 seconds. Not tainted 6.9.0-rc4-syzkaller-00173-g3cdb45594619 #0 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:kworker/1:0 state:D stack:23744 pid:5088 tgid:5088 ppid:2 flags:0x00004000 Workqueue: events_power_efficient reg_check_chans_work Call Trace: context_switch kernel/sched/core.c:5409 [inline] __schedule+0xf15/0x5d00 kernel/sched/core.c:6746 __schedule_loop kernel/sched/core.c:6823 [inline] schedule+0xe7/0x350 kernel/sched/core.c:6838 schedule_preempt_disabled+0x13/0x30 kernel/sched/core.c:6895 __mutex_lock_common kernel/locking/mutex.c:684 [inline] __mutex_lock+0x5b8/0x9c0 kernel/locking/mutex.c:752 wiphy_lock include/net/cfg80211.h:5953 [inline] reg_leave_invalid_chans net/wireless/reg.c:2466 [inline] reg_check_chans_work+0x10a/0x10e0 net/wireless/reg.c:2481 Fixes: 0190c1d452a9 ("net: sched: atomically check-allocate action") Reported-by: syzbot+b87c222546179f4513a7@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=b87c222546179f4513a7 Signed-off-by: David Ruth Reviewed-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/20240614190326.1349786-1-druth@chromium.org Signed-off-by: Paolo Abeni --- net/sched/act_api.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 9ee622fb1160fe..2520708b06a12e 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -830,7 +830,6 @@ int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, u32 max; if (*index) { -again: rcu_read_lock(); p = idr_find(&idrinfo->action_idr, *index); @@ -839,7 +838,7 @@ int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, * index but did not assign the pointer yet. */ rcu_read_unlock(); - goto again; + return -EAGAIN; } if (!p) { From 2ebe8f840c7450ecbfca9d18ac92e9ce9155e269 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 15 Jun 2024 14:27:20 -0400 Subject: [PATCH 1089/1578] tipc: force a dst refcount before doing decryption As it says in commit 3bc07321ccc2 ("xfrm: Force a dst refcount before entering the xfrm type handlers"): "Crypto requests might return asynchronous. In this case we leave the rcu protected region, so force a refcount on the skb's destination entry before we enter the xfrm type input/output handlers." On TIPC decryption path it has the same problem, and skb_dst_force() should be called before doing decryption to avoid a possible crash. Shuang reported this issue when this warning is triggered: [] WARNING: include/net/dst.h:337 tipc_sk_rcv+0x1055/0x1ea0 [tipc] [] Kdump: loaded Tainted: G W --------- - - 4.18.0-496.el8.x86_64+debug [] Workqueue: crypto cryptd_queue_worker [] RIP: 0010:tipc_sk_rcv+0x1055/0x1ea0 [tipc] [] Call Trace: [] tipc_sk_mcast_rcv+0x548/0xea0 [tipc] [] tipc_rcv+0xcf5/0x1060 [tipc] [] tipc_aead_decrypt_done+0x215/0x2e0 [tipc] [] cryptd_aead_crypt+0xdb/0x190 [] cryptd_queue_worker+0xed/0x190 [] process_one_work+0x93d/0x17e0 Fixes: fc1b6d6de220 ("tipc: introduce TIPC encryption & authentication") Reported-by: Shuang Li Signed-off-by: Xin Long Link: https://lore.kernel.org/r/fbe3195fad6997a4eec62d9bf076b2ad03ac336b.1718476040.git.lucien.xin@gmail.com Signed-off-by: Paolo Abeni --- net/tipc/node.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/tipc/node.c b/net/tipc/node.c index c1e890a8243478..500320e5ca4791 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -2105,6 +2105,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) } else { n = tipc_node_find_by_id(net, ehdr->id); } + skb_dst_force(skb); tipc_crypto_rcv(net, (n) ? n->crypto_rx : NULL, &skb, b); if (!skb) return; From 88c67aeb14070bab61d3dd8be96c8b42ebcaf53a Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 15 Jun 2024 17:47:30 -0400 Subject: [PATCH 1090/1578] sched: act_ct: add netns into the key of tcf_ct_flow_table zones_ht is a global hashtable for flow_table with zone as key. However, it does not consider netns when getting a flow_table from zones_ht in tcf_ct_init(), and it means an act_ct action in netns A may get a flow_table that belongs to netns B if it has the same zone value. In Shuang's test with the TOPO: tcf2_c <---> tcf2_sw1 <---> tcf2_sw2 <---> tcf2_s tcf2_sw1 and tcf2_sw2 saw the same flow and used the same flow table, which caused their ct entries entering unexpected states and the TCP connection not able to end normally. This patch fixes the issue simply by adding netns into the key of tcf_ct_flow_table so that an act_ct action gets a flow_table that belongs to its own netns in tcf_ct_init(). Note that for easy coding we don't use tcf_ct_flow_table.nf_ft.net, as the ct_ft is initialized after inserting it to the hashtable in tcf_ct_flow_table_get() and also it requires to implement several functions in rhashtable_params including hashfn, obj_hashfn and obj_cmpfn. Fixes: 64ff70b80fd4 ("net/sched: act_ct: Offload established connections to flow table") Reported-by: Shuang Li Signed-off-by: Xin Long Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/1db5b6cc6902c5fc6f8c6cbd85494a2008087be5.1718488050.git.lucien.xin@gmail.com Signed-off-by: Paolo Abeni --- net/sched/act_ct.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index baac083fd8f109..2a96d9c1db65b9 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -41,21 +41,26 @@ static struct workqueue_struct *act_ct_wq; static struct rhashtable zones_ht; static DEFINE_MUTEX(zones_mutex); +struct zones_ht_key { + struct net *net; + u16 zone; +}; + struct tcf_ct_flow_table { struct rhash_head node; /* In zones tables */ struct rcu_work rwork; struct nf_flowtable nf_ft; refcount_t ref; - u16 zone; + struct zones_ht_key key; bool dying; }; static const struct rhashtable_params zones_params = { .head_offset = offsetof(struct tcf_ct_flow_table, node), - .key_offset = offsetof(struct tcf_ct_flow_table, zone), - .key_len = sizeof_field(struct tcf_ct_flow_table, zone), + .key_offset = offsetof(struct tcf_ct_flow_table, key), + .key_len = sizeof_field(struct tcf_ct_flow_table, key), .automatic_shrinking = true, }; @@ -316,11 +321,12 @@ static struct nf_flowtable_type flowtable_ct = { static int tcf_ct_flow_table_get(struct net *net, struct tcf_ct_params *params) { + struct zones_ht_key key = { .net = net, .zone = params->zone }; struct tcf_ct_flow_table *ct_ft; int err = -ENOMEM; mutex_lock(&zones_mutex); - ct_ft = rhashtable_lookup_fast(&zones_ht, ¶ms->zone, zones_params); + ct_ft = rhashtable_lookup_fast(&zones_ht, &key, zones_params); if (ct_ft && refcount_inc_not_zero(&ct_ft->ref)) goto out_unlock; @@ -329,7 +335,7 @@ static int tcf_ct_flow_table_get(struct net *net, struct tcf_ct_params *params) goto err_alloc; refcount_set(&ct_ft->ref, 1); - ct_ft->zone = params->zone; + ct_ft->key = key; err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params); if (err) goto err_insert; From c2bd0791c5f02e964402624dfff45ca8995f5397 Mon Sep 17 00:00:00 2001 From: Patrice Chotard Date: Tue, 18 Jun 2024 15:29:49 +0200 Subject: [PATCH 1091/1578] spi: stm32: qspi: Fix dual flash mode sanity test in stm32_qspi_setup() Misplaced parenthesis make test of mode wrong in case mode is equal to SPI_TX_OCTAL or SPI_RX_OCTAL. Simplify this sanity test, if one of this bit is set, property cs-gpio must be present in DT. Fixes: a557fca630cc ("spi: stm32_qspi: Add transfer_one_message() spi callback") Cc: stable@vger.kernel.org Signed-off-by: Patrice Chotard Link: https://msgid.link/r/20240618132951.2743935-2-patrice.chotard@foss.st.com Signed-off-by: Mark Brown --- drivers/spi/spi-stm32-qspi.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/spi/spi-stm32-qspi.c b/drivers/spi/spi-stm32-qspi.c index f1e922fd362ab9..6944e85d836796 100644 --- a/drivers/spi/spi-stm32-qspi.c +++ b/drivers/spi/spi-stm32-qspi.c @@ -653,9 +653,7 @@ static int stm32_qspi_setup(struct spi_device *spi) return -EINVAL; mode = spi->mode & (SPI_TX_OCTAL | SPI_RX_OCTAL); - if ((mode == SPI_TX_OCTAL || mode == SPI_RX_OCTAL) || - ((mode == (SPI_TX_OCTAL | SPI_RX_OCTAL)) && - gpiod_count(qspi->dev, "cs") == -ENOENT)) { + if (mode && gpiod_count(qspi->dev, "cs") == -ENOENT) { dev_err(qspi->dev, "spi-rx-bus-width\\/spi-tx-bus-width\\/cs-gpios\n"); dev_err(qspi->dev, "configuration not supported\n"); @@ -676,10 +674,10 @@ static int stm32_qspi_setup(struct spi_device *spi) qspi->cr_reg = CR_APMS | 3 << CR_FTHRES_SHIFT | CR_SSHIFT | CR_EN; /* - * Dual flash mode is only enable in case SPI_TX_OCTAL and SPI_TX_OCTAL - * are both set in spi->mode and "cs-gpios" properties is found in DT + * Dual flash mode is only enable in case SPI_TX_OCTAL or SPI_RX_OCTAL + * is set in spi->mode and "cs-gpios" properties is found in DT */ - if (mode == (SPI_TX_OCTAL | SPI_RX_OCTAL)) { + if (mode) { qspi->cr_reg |= CR_DFM; dev_dbg(qspi->dev, "Dual flash mode enable"); } From 63deee52811b2f84ed2da55ad47252f0e8145d62 Mon Sep 17 00:00:00 2001 From: Patrice Chotard Date: Tue, 18 Jun 2024 15:29:50 +0200 Subject: [PATCH 1092/1578] spi: stm32: qspi: Clamp stm32_qspi_get_mode() output to CCR_BUSWIDTH_4 In case usage of OCTAL mode, buswidth parameter can take the value 8. As return value of stm32_qspi_get_mode() is used to configure fields of CCR registers that are 2 bits only (fields IMODE, ADMODE, ADSIZE, DMODE), clamp return value of stm32_qspi_get_mode() to 4. Fixes: a557fca630cc ("spi: stm32_qspi: Add transfer_one_message() spi callback") Cc: stable@vger.kernel.org Signed-off-by: Patrice Chotard Link: https://msgid.link/r/20240618132951.2743935-3-patrice.chotard@foss.st.com Signed-off-by: Mark Brown --- drivers/spi/spi-stm32-qspi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-stm32-qspi.c b/drivers/spi/spi-stm32-qspi.c index 6944e85d836796..955c920c4b6390 100644 --- a/drivers/spi/spi-stm32-qspi.c +++ b/drivers/spi/spi-stm32-qspi.c @@ -349,7 +349,7 @@ static int stm32_qspi_wait_poll_status(struct stm32_qspi *qspi) static int stm32_qspi_get_mode(u8 buswidth) { - if (buswidth == 4) + if (buswidth >= 4) return CCR_BUSWIDTH_4; return buswidth; From d6a711a898672dd873aab3844f754a3ca40723a5 Mon Sep 17 00:00:00 2001 From: Patrice Chotard Date: Tue, 18 Jun 2024 15:29:51 +0200 Subject: [PATCH 1093/1578] spi: Fix OCTAL mode support Add OCTAL mode support. Issue detected using "--octal" spidev_test's option. Signed-off-by: Patrice Chotard Link: https://msgid.link/r/20240618132951.2743935-4-patrice.chotard@foss.st.com Signed-off-by: Mark Brown --- drivers/spi/spi.c | 6 ++++-- include/linux/spi/spi.h | 5 +++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 9bc9fd10d538d2..9da736d51a2ba4 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -4156,7 +4156,8 @@ static int __spi_validate(struct spi_device *spi, struct spi_message *message) return -EINVAL; if (xfer->tx_nbits != SPI_NBITS_SINGLE && xfer->tx_nbits != SPI_NBITS_DUAL && - xfer->tx_nbits != SPI_NBITS_QUAD) + xfer->tx_nbits != SPI_NBITS_QUAD && + xfer->tx_nbits != SPI_NBITS_OCTAL) return -EINVAL; if ((xfer->tx_nbits == SPI_NBITS_DUAL) && !(spi->mode & (SPI_TX_DUAL | SPI_TX_QUAD))) @@ -4171,7 +4172,8 @@ static int __spi_validate(struct spi_device *spi, struct spi_message *message) return -EINVAL; if (xfer->rx_nbits != SPI_NBITS_SINGLE && xfer->rx_nbits != SPI_NBITS_DUAL && - xfer->rx_nbits != SPI_NBITS_QUAD) + xfer->rx_nbits != SPI_NBITS_QUAD && + xfer->rx_nbits != SPI_NBITS_OCTAL) return -EINVAL; if ((xfer->rx_nbits == SPI_NBITS_DUAL) && !(spi->mode & (SPI_RX_DUAL | SPI_RX_QUAD))) diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index e8e1e798924f4c..98fdef6e28f2a7 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -1085,12 +1085,13 @@ struct spi_transfer { unsigned dummy_data:1; unsigned cs_off:1; unsigned cs_change:1; - unsigned tx_nbits:3; - unsigned rx_nbits:3; + unsigned tx_nbits:4; + unsigned rx_nbits:4; unsigned timestamped:1; #define SPI_NBITS_SINGLE 0x01 /* 1-bit transfer */ #define SPI_NBITS_DUAL 0x02 /* 2-bit transfer */ #define SPI_NBITS_QUAD 0x04 /* 4-bit transfer */ +#define SPI_NBITS_OCTAL 0x08 /* 8-bit transfer */ u8 bits_per_word; struct spi_delay delay; struct spi_delay cs_change_delay; From b90d77e5fd784ada62ddd714d15ee2400c28e1cf Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 17 Jun 2024 10:18:12 -0700 Subject: [PATCH 1094/1578] bpf: Fix remap of arena. The bpf arena logic didn't account for mremap operation. Add a refcnt for multiple mmap events to prevent use-after-free in arena_vm_close. Fixes: 317460317a02 ("bpf: Introduce bpf_arena.") Reported-by: Pengfei Xu Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Reviewed-by: Barret Rhoden Tested-by: Pengfei Xu Closes: https://lore.kernel.org/bpf/Zmuw29IhgyPNKnIM@xpf.sh.intel.com Link: https://lore.kernel.org/bpf/20240617171812.76634-1-alexei.starovoitov@gmail.com --- kernel/bpf/arena.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 583ee4fe48efd5..e52b3ad231b9c4 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -212,6 +212,7 @@ static u64 arena_map_mem_usage(const struct bpf_map *map) struct vma_list { struct vm_area_struct *vma; struct list_head head; + atomic_t mmap_count; }; static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma) @@ -221,20 +222,30 @@ static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma) vml = kmalloc(sizeof(*vml), GFP_KERNEL); if (!vml) return -ENOMEM; + atomic_set(&vml->mmap_count, 1); vma->vm_private_data = vml; vml->vma = vma; list_add(&vml->head, &arena->vma_list); return 0; } +static void arena_vm_open(struct vm_area_struct *vma) +{ + struct vma_list *vml = vma->vm_private_data; + + atomic_inc(&vml->mmap_count); +} + static void arena_vm_close(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; struct bpf_arena *arena = container_of(map, struct bpf_arena, map); - struct vma_list *vml; + struct vma_list *vml = vma->vm_private_data; + if (!atomic_dec_and_test(&vml->mmap_count)) + return; guard(mutex)(&arena->lock); - vml = vma->vm_private_data; + /* update link list under lock */ list_del(&vml->head); vma->vm_private_data = NULL; kfree(vml); @@ -287,6 +298,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) } static const struct vm_operations_struct arena_vm_ops = { + .open = arena_vm_open, .close = arena_vm_close, .fault = arena_vm_fault, }; From bb7e5a193d8becf3920e3848287f1b23c5fc9b24 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 18 Jun 2024 11:27:53 +0800 Subject: [PATCH 1095/1578] block, bfq: remove blkg_path() After commit 35fe6d763229 ("block: use standard blktrace API to output cgroup info for debug notes"), the field 'bfqg->blkg_path' is not used and hence can be removed, and therefor blkg_path() is not used anymore and can be removed. Signed-off-by: Yu Kuai Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20240618032753.3502528-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 51 --------------------------------------------- block/bfq-iosched.h | 3 --- block/blk-cgroup.h | 13 ------------ 3 files changed, 67 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index d442ee358fc257..b758693697c093 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -797,57 +797,6 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) */ bfq_link_bfqg(bfqd, bfqg); __bfq_bic_change_cgroup(bfqd, bic, bfqg); - /* - * Update blkg_path for bfq_log_* functions. We cache this - * path, and update it here, for the following - * reasons. Operations on blkg objects in blk-cgroup are - * protected with the request_queue lock, and not with the - * lock that protects the instances of this scheduler - * (bfqd->lock). This exposes BFQ to the following sort of - * race. - * - * The blkg_lookup performed in bfq_get_queue, protected - * through rcu, may happen to return the address of a copy of - * the original blkg. If this is the case, then the - * bfqg_and_blkg_get performed in bfq_get_queue, to pin down - * the blkg, is useless: it does not prevent blk-cgroup code - * from destroying both the original blkg and all objects - * directly or indirectly referred by the copy of the - * blkg. - * - * On the bright side, destroy operations on a blkg invoke, as - * a first step, hooks of the scheduler associated with the - * blkg. And these hooks are executed with bfqd->lock held for - * BFQ. As a consequence, for any blkg associated with the - * request queue this instance of the scheduler is attached - * to, we are guaranteed that such a blkg is not destroyed, and - * that all the pointers it contains are consistent, while we - * are holding bfqd->lock. A blkg_lookup performed with - * bfqd->lock held then returns a fully consistent blkg, which - * remains consistent until this lock is held. - * - * Thanks to the last fact, and to the fact that: (1) bfqg has - * been obtained through a blkg_lookup in the above - * assignment, and (2) bfqd->lock is being held, here we can - * safely use the policy data for the involved blkg (i.e., the - * field bfqg->pd) to get to the blkg associated with bfqg, - * and then we can safely use any field of blkg. After we - * release bfqd->lock, even just getting blkg through this - * bfqg may cause dangling references to be traversed, as - * bfqg->pd may not exist any more. - * - * In view of the above facts, here we cache, in the bfqg, any - * blkg data we may need for this bic, and for its associated - * bfq_queue. As of now, we need to cache only the path of the - * blkg, which is used in the bfq_log_* functions. - * - * Finally, note that bfqg itself needs to be protected from - * destruction on the blkg_free of the original blkg (which - * invokes bfq_pd_free). We use an additional private - * refcounter for bfqg, to let it disappear only after no - * bfq_queue refers to it any longer. - */ - blkg_path(bfqg_to_blkg(bfqg), bfqg->blkg_path, sizeof(bfqg->blkg_path)); bic->blkcg_serial_nr = serial_nr; } diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 467e8cfc41a249..08ddf2cfae5b1c 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -1003,9 +1003,6 @@ struct bfq_group { /* must be the first member */ struct blkg_policy_data pd; - /* cached path for this blkg (see comments in bfq_bic_update_cgroup) */ - char blkg_path[128]; - /* reference counter (see comments in bfq_bic_update_cgroup) */ refcount_t ref; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 90b3959d88cfa4..bd472a30bc61f6 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -300,19 +300,6 @@ static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) return cpd ? cpd->blkcg : NULL; } -/** - * blkg_path - format cgroup path of blkg - * @blkg: blkg of interest - * @buf: target buffer - * @buflen: target buffer length - * - * Format the path of the cgroup of @blkg into @buf. - */ -static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) -{ - return cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); -} - /** * blkg_get - get a blkg reference * @blkg: blkg to get From 66b5867150630e8f9c9a2b7430e55a3beaa83a5b Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Mon, 17 Jun 2024 13:49:10 +0000 Subject: [PATCH 1096/1578] bpf: Update BPF LSM maintainer list After catching up with KP recently, we discussed that I will be now be responsible for co-maintaining the BPF LSM. Adding myself as designated maintainer of the BPF LSM, and specifying more files in which the BPF LSM maintenance responsibilities should now extend out to. This is at the back of all the BPF kfuncs that have been added recently, which are fundamentally restricted to being used only from BPF LSM program types. Signed-off-by: Matt Bobrowski Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/ZnA-1qdtXS1TayD7@google.com --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 10ecbf192ebb07..9c621cb0c795d8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4083,12 +4083,13 @@ F: kernel/bpf/ringbuf.c BPF [SECURITY & LSM] (Security Audit and Enforcement using BPF) M: KP Singh -R: Matt Bobrowski +M: Matt Bobrowski L: bpf@vger.kernel.org S: Maintained F: Documentation/bpf/prog_lsm.rst F: include/linux/bpf_lsm.h F: kernel/bpf/bpf_lsm.c +F: kernel/trace/bpf_trace.c F: security/bpf/ BPF [SELFTESTS] (Test Runners & Infrastructure) From c3f3edf73a8f854f8766a69d2734198a58762e33 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Wed, 12 Jun 2024 09:41:51 -0500 Subject: [PATCH 1097/1578] KVM: Stop processing *all* memslots when "null" mmu_notifier handler is found Bail from outer address space loop, not just the inner memslot loop, when a "null" handler is encountered by __kvm_handle_hva_range(), which is the intended behavior. On x86, which has multiple address spaces thanks to SMM emulation, breaking from just the memslot loop results in undefined behavior due to assigning the non-existent return value from kvm_null_fn() to a bool. In practice, the bug is benign as kvm_mmu_notifier_invalidate_range_end() is the only caller that passes handler=kvm_null_fn, and it doesn't set flush_on_ret, i.e. assigning garbage to r.ret is ultimately ignored. And for most configuration the compiler elides the entire sequence, i.e. there is no undefined behavior at runtime. ------------[ cut here ]------------ UBSAN: invalid-load in arch/x86/kvm/../../../virt/kvm/kvm_main.c:655:10 load of value 160 is not a valid value for type '_Bool' CPU: 370 PID: 8246 Comm: CPU 0/KVM Not tainted 6.8.2-amdsos-build58-ubuntu-22.04+ #1 Hardware name: AMD Corporation Sh54p/Sh54p, BIOS WPC4429N 04/25/2024 Call Trace: dump_stack_lvl+0x48/0x60 ubsan_epilogue+0x5/0x30 __ubsan_handle_load_invalid_value+0x79/0x80 kvm_mmu_notifier_invalidate_range_end.cold+0x18/0x4f [kvm] __mmu_notifier_invalidate_range_end+0x63/0xe0 __split_huge_pmd+0x367/0xfc0 do_huge_pmd_wp_page+0x1cc/0x380 __handle_mm_fault+0x8ee/0xe50 handle_mm_fault+0xe4/0x4a0 __get_user_pages+0x190/0x840 get_user_pages_unlocked+0xe0/0x590 hva_to_pfn+0x114/0x550 [kvm] kvm_faultin_pfn+0xed/0x5b0 [kvm] kvm_tdp_page_fault+0x123/0x170 [kvm] kvm_mmu_page_fault+0x244/0xaa0 [kvm] vcpu_enter_guest+0x592/0x1070 [kvm] kvm_arch_vcpu_ioctl_run+0x145/0x8a0 [kvm] kvm_vcpu_ioctl+0x288/0x6d0 [kvm] __x64_sys_ioctl+0x8f/0xd0 do_syscall_64+0x77/0x120 entry_SYSCALL_64_after_hwframe+0x6e/0x76 ---[ end trace ]--- Fixes: 071064f14d87 ("KVM: Don't take mmu_lock for range invalidation unless necessary") Signed-off-by: Babu Moger Link: https://lore.kernel.org/r/b8723d39903b64c241c50f5513f804390c7b5eec.1718203311.git.babu.moger@amd.com [sean: massage changelog] Signed-off-by: Sean Christopherson --- virt/kvm/kvm_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 843aa68cbcd050..68be4be5d846e0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -651,7 +651,7 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm, range->on_lock(kvm); if (IS_KVM_NULL_FN(range->handler)) - break; + goto mmu_unlock; } r.ret |= range->handler(kvm, &gfn_range); } @@ -660,6 +660,7 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm, if (range->flush_on_ret && r.ret) kvm_flush_remote_tlbs(kvm); +mmu_unlock: if (r.found_memslot) KVM_MMU_UNLOCK(kvm); From 5d272dd1b3430bb31fa30042490fa081512424e4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 18 Jun 2024 09:00:04 -0700 Subject: [PATCH 1098/1578] cpumask: limit FORCE_NR_CPUS to just the UP case Hardcoding the number of CPUs at compile time does improve code generation, but if you get it wrong the result will be confusion. We already limited this earlier to only "experts" (see commit fe5759d5bfda "cpumask: limit visibility of FORCE_NR_CPUS"), but with distro kernel configs often having EXPERT enabled, that turns out to not be much of a limit. To quote the philosophers at Disney: "Everyone can be an expert. And when everyone's an expert, no one will be". There's a runtime warning if you then set nr_cpus to anything but the forced number, but apparently that can be ignored too [1] and by then it's pretty much too late anyway. If we had some real way to limit this to "embedded only", maybe it would be worth it, but let's see if anybody even notices that the option is gone. We need to simplify kernel configuration anyway. Link: https://lore.kernel.org/all/20240618105036.208a8860@rorschach.local.home/ [1] Reported-by: Steven Rostedt Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Paul McKenney Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Yury Norov Signed-off-by: Linus Torvalds --- lib/Kconfig | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/lib/Kconfig b/lib/Kconfig index d33a268bc256e9..b0a76dff5c1821 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -539,13 +539,7 @@ config CPUMASK_OFFSTACK stack overflow. config FORCE_NR_CPUS - bool "Set number of CPUs at compile time" - depends on SMP && EXPERT && !COMPILE_TEST - help - Say Yes if you have NR_CPUS set to an actual number of possible - CPUs in your system, not to a default value. This forces the core - code to rely on compile-time value and optimize kernel routines - better. + def_bool !SMP config CPU_RMAP bool From 60ff540a1d476c2d48b96f7bc8ac8581b820e878 Mon Sep 17 00:00:00 2001 From: Shuming Fan Date: Wed, 12 Jun 2024 15:57:40 +0800 Subject: [PATCH 1099/1578] ASoC: Intel: soc-acpi: mtl: fix speaker no sound on Dell SKU 0C64 Dell SKU 0C64 has a single rt1318 amplifier. The prefix name of control still needs to be set rt1318-1 corresponding to UCM config. Signed-off-by: Shuming Fan Reviewed-by: Bard Liao Link: https://msgid.link/r/20240612075740.1678082-1-shumingf@realtek.com Signed-off-by: Mark Brown --- sound/soc/intel/common/soc-acpi-intel-mtl-match.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/intel/common/soc-acpi-intel-mtl-match.c b/sound/soc/intel/common/soc-acpi-intel-mtl-match.c index 48252fa9e39e05..8e0ae3635a35d7 100644 --- a/sound/soc/intel/common/soc-acpi-intel-mtl-match.c +++ b/sound/soc/intel/common/soc-acpi-intel-mtl-match.c @@ -293,7 +293,7 @@ static const struct snd_soc_acpi_adr_device rt1318_1_single_adr[] = { .adr = 0x000130025D131801, .num_endpoints = 1, .endpoints = &single_endpoint, - .name_prefix = "rt1318" + .name_prefix = "rt1318-1" } }; From 2c1b7bbe253986619fa5623a13055316e730e746 Mon Sep 17 00:00:00 2001 From: Amit Kumar Mahapatra Date: Mon, 17 Jun 2024 21:00:52 +0530 Subject: [PATCH 1100/1578] spi: Fix SPI slave probe failure While adding a SPI device, the SPI core ensures that multiple logical CS doesn't map to the same physical CS. For example, spi->chip_select[0] != spi->chip_select[1] and so forth. However, unlike the SPI master, the SPI slave doesn't have the list of chip selects, this leads to probe failure when the SPI controller is configured as slave. Update the __spi_add_device() function to perform this check only if the SPI controller is configured as master. Fixes: 4d8ff6b0991d ("spi: Add multi-cs memories support in SPI core") Signed-off-by: Amit Kumar Mahapatra Link: https://msgid.link/r/20240617153052.26636-1-amit.kumar-mahapatra@amd.com Signed-off-by: Mark Brown --- drivers/spi/spi.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 9da736d51a2ba4..fc13fa19218955 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -689,10 +689,12 @@ static int __spi_add_device(struct spi_device *spi) * Make sure that multiple logical CS doesn't map to the same physical CS. * For example, spi->chip_select[0] != spi->chip_select[1] and so on. */ - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { - status = spi_dev_check_cs(dev, spi, idx, spi, idx + 1); - if (status) - return status; + if (!spi_controller_is_target(ctlr)) { + for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + status = spi_dev_check_cs(dev, spi, idx, spi, idx + 1); + if (status) + return status; + } } /* Set the bus ID string */ From 81d23d2a24012e448f651e007fac2cfd20a45ce0 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 17 Jun 2024 12:34:32 +0300 Subject: [PATCH 1101/1578] ptp: fix integer overflow in max_vclocks_store On 32bit systems, the "4 * max" multiply can overflow. Use kcalloc() to do the allocation to prevent this. Fixes: 44c494c8e30e ("ptp: track available ptp vclocks information") Signed-off-by: Dan Carpenter Reviewed-by: Wojciech Drewek Reviewed-by: Jiri Pirko Reviewed-by: Heng Qi Link: https://lore.kernel.org/r/ee8110ed-6619-4bd7-9024-28c1f2ac24f4@moroto.mountain Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_sysfs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/ptp/ptp_sysfs.c b/drivers/ptp/ptp_sysfs.c index a15460aaa03b3a..6b1b8f57cd9510 100644 --- a/drivers/ptp/ptp_sysfs.c +++ b/drivers/ptp/ptp_sysfs.c @@ -296,8 +296,7 @@ static ssize_t max_vclocks_store(struct device *dev, if (max < ptp->n_vclocks) goto out; - size = sizeof(int) * max; - vclock_index = kzalloc(size, GFP_KERNEL); + vclock_index = kcalloc(max, sizeof(int), GFP_KERNEL); if (!vclock_index) { err = -ENOMEM; goto out; From e2b447c9a1bba718f9c07513a1e8958209e862a1 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 17 Jun 2024 09:28:33 +0100 Subject: [PATCH 1102/1578] selftests: openvswitch: Use bash as interpreter openvswitch.sh makes use of substitutions of the form ${ns:0:1}, to obtain the first character of $ns. Empirically, this is works with bash but not dash. When run with dash these evaluate to an empty string and printing an error to stdout. # dash -c 'ns=client; echo "${ns:0:1}"' 2>error # cat error dash: 1: Bad substitution # bash -c 'ns=client; echo "${ns:0:1}"' 2>error c # cat error This leads to tests that neither pass nor fail. F.e. TEST: arp_ping [START] adding sandbox 'test_arp_ping' Adding DP/Bridge IF: sbx:test_arp_ping dp:arpping {, , } create namespaces ./openvswitch.sh: 282: eval: Bad substitution TEST: ct_connect_v4 [START] adding sandbox 'test_ct_connect_v4' Adding DP/Bridge IF: sbx:test_ct_connect_v4 dp:ct4 {, , } ./openvswitch.sh: 322: eval: Bad substitution create namespaces Resolve this by making openvswitch.sh a bash script. Fixes: 918423fda910 ("selftests: openvswitch: add an initial flow programming case") Signed-off-by: Simon Horman Reviewed-by: Przemek Kitszel Link: https://lore.kernel.org/r/20240617-ovs-selftest-bash-v1-1-7ae6ccd3617b@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/openvswitch/openvswitch.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/openvswitch/openvswitch.sh b/tools/testing/selftests/net/openvswitch/openvswitch.sh index 5cae5354384914..15bca07087179d 100755 --- a/tools/testing/selftests/net/openvswitch/openvswitch.sh +++ b/tools/testing/selftests/net/openvswitch/openvswitch.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0 # # OVS kernel module self tests From 890182bb3d00d49ccd70b0d651ad8342745a08e7 Mon Sep 17 00:00:00 2001 From: Haylen Chu Date: Tue, 18 Jun 2024 09:16:14 +0000 Subject: [PATCH 1103/1578] riscv: dts: sophgo: disable write-protection for milkv duo Milkv Duo does not have a write-protect pin, so disable write protect to prevent SDcards misdetected as read-only. Fixes: 89a7056ed4f7 ("riscv: dts: sophgo: add sdcard support for milkv duo") Signed-off-by: Haylen Chu Link: https://lore.kernel.org/r/SEYPR01MB4221943C7B101DD2318DA0D3D7CE2@SEYPR01MB4221.apcprd01.prod.exchangelabs.com Signed-off-by: Inochi Amaoto Signed-off-by: Chen Wang --- arch/riscv/boot/dts/sophgo/cv1800b-milkv-duo.dts | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/boot/dts/sophgo/cv1800b-milkv-duo.dts b/arch/riscv/boot/dts/sophgo/cv1800b-milkv-duo.dts index cd013588adc0e3..375ff2661b6e25 100644 --- a/arch/riscv/boot/dts/sophgo/cv1800b-milkv-duo.dts +++ b/arch/riscv/boot/dts/sophgo/cv1800b-milkv-duo.dts @@ -45,6 +45,7 @@ no-1-8-v; no-mmc; no-sdio; + disable-wp; }; &uart0 { From cd6f12e173df44a20c2ac2ac110007dc14968088 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 14 Jun 2024 11:45:15 +0200 Subject: [PATCH 1104/1578] net: phy: dp83tg720: wake up PHYs in managed mode In case this PHY is bootstrapped for managed mode, we need to manually wake it. Otherwise no link will be detected. Cc: stable@vger.kernel.org Fixes: cb80ee2f9bee1 ("net: phy: Add support for the DP83TG720S Ethernet PHY") Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/r/20240614094516.1481231-1-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/phy/dp83tg720.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/dp83tg720.c b/drivers/net/phy/dp83tg720.c index 326c9770a6dccc..1186dfc70fb3cb 100644 --- a/drivers/net/phy/dp83tg720.c +++ b/drivers/net/phy/dp83tg720.c @@ -17,6 +17,11 @@ #define DP83TG720S_PHY_RESET 0x1f #define DP83TG720S_HW_RESET BIT(15) +#define DP83TG720S_LPS_CFG3 0x18c +/* Power modes are documented as bit fields but used as values */ +/* Power Mode 0 is Normal mode */ +#define DP83TG720S_LPS_CFG3_PWR_MODE_0 BIT(0) + #define DP83TG720S_RGMII_DELAY_CTRL 0x602 /* In RGMII mode, Enable or disable the internal delay for RXD */ #define DP83TG720S_RGMII_RX_CLK_SEL BIT(1) @@ -154,10 +159,17 @@ static int dp83tg720_config_init(struct phy_device *phydev) */ usleep_range(1000, 2000); - if (phy_interface_is_rgmii(phydev)) - return dp83tg720_config_rgmii_delay(phydev); + if (phy_interface_is_rgmii(phydev)) { + ret = dp83tg720_config_rgmii_delay(phydev); + if (ret) + return ret; + } - return 0; + /* In case the PHY is bootstrapped in managed mode, we need to + * wake it. + */ + return phy_write_mmd(phydev, MDIO_MMD_VEND2, DP83TG720S_LPS_CFG3, + DP83TG720S_LPS_CFG3_PWR_MODE_0); } static struct phy_driver dp83tg720_driver[] = { From 40a64cc9679540ff7c46ecc51178b07d42abbb1c Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 14 Jun 2024 11:45:16 +0200 Subject: [PATCH 1105/1578] net: phy: dp83tg720: get master/slave configuration in link down state Get master/slave configuration for initial system start with the link in down state. This ensures ethtool shows current configuration. Also fixes link reconfiguration with ethtool while in down state, preventing ethtool from displaying outdated configuration. Even though dp83tg720_config_init() is executed periodically as long as the link is in admin up state but no carrier is detected, this is not sufficient for the link in admin down state where dp83tg720_read_status() is not periodically executed. To cover this case, we need an extra read role configuration in dp83tg720_config_aneg(). Fixes: cb80ee2f9bee1 ("net: phy: Add support for the DP83TG720S Ethernet PHY") Cc: stable@vger.kernel.org Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/r/20240614094516.1481231-2-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/phy/dp83tg720.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/dp83tg720.c b/drivers/net/phy/dp83tg720.c index 1186dfc70fb3cb..c706429b225a20 100644 --- a/drivers/net/phy/dp83tg720.c +++ b/drivers/net/phy/dp83tg720.c @@ -36,11 +36,20 @@ static int dp83tg720_config_aneg(struct phy_device *phydev) { + int ret; + /* Autoneg is not supported and this PHY supports only one speed. * We need to care only about master/slave configuration if it was * changed by user. */ - return genphy_c45_pma_baset1_setup_master_slave(phydev); + ret = genphy_c45_pma_baset1_setup_master_slave(phydev); + if (ret) + return ret; + + /* Re-read role configuration to make changes visible even if + * the link is in administrative down state. + */ + return genphy_c45_pma_baset1_read_master_slave(phydev); } static int dp83tg720_read_status(struct phy_device *phydev) @@ -69,6 +78,8 @@ static int dp83tg720_read_status(struct phy_device *phydev) return ret; /* After HW reset we need to restore master/slave configuration. + * genphy_c45_pma_baset1_read_master_slave() call will be done + * by the dp83tg720_config_aneg() function. */ ret = dp83tg720_config_aneg(phydev); if (ret) @@ -168,8 +179,15 @@ static int dp83tg720_config_init(struct phy_device *phydev) /* In case the PHY is bootstrapped in managed mode, we need to * wake it. */ - return phy_write_mmd(phydev, MDIO_MMD_VEND2, DP83TG720S_LPS_CFG3, - DP83TG720S_LPS_CFG3_PWR_MODE_0); + ret = phy_write_mmd(phydev, MDIO_MMD_VEND2, DP83TG720S_LPS_CFG3, + DP83TG720S_LPS_CFG3_PWR_MODE_0); + if (ret) + return ret; + + /* Make role configuration visible for ethtool on init and after + * rest. + */ + return genphy_c45_pma_baset1_read_master_slave(phydev); } static struct phy_driver dp83tg720_driver[] = { From b8c43360f6e424131fa81d3ba8792ad8ff25a09e Mon Sep 17 00:00:00 2001 From: Xiaolei Wang Date: Mon, 17 Jun 2024 09:39:22 +0800 Subject: [PATCH 1106/1578] net: stmmac: No need to calculate speed divider when offload is disabled commit be27b8965297 ("net: stmmac: replace priv->speed with the portTransmitRate from the tc-cbs parameters") introduced a problem. When deleting, it prompts "Invalid portTransmitRate 0 (idleSlope - sendSlope)" and exits. Add judgment on cbs.enable. Only when offload is enabled, speed divider needs to be calculated. Fixes: be27b8965297 ("net: stmmac: replace priv->speed with the portTransmitRate from the tc-cbs parameters") Signed-off-by: Xiaolei Wang Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240617013922.1035854-1-xiaolei.wang@windriver.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/stmicro/stmmac/stmmac_tc.c | 40 ++++++++++--------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c index 1562fbdd0a0400..996f2bcd07a243 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c @@ -358,24 +358,28 @@ static int tc_setup_cbs(struct stmmac_priv *priv, port_transmit_rate_kbps = qopt->idleslope - qopt->sendslope; - /* Port Transmit Rate and Speed Divider */ - switch (div_s64(port_transmit_rate_kbps, 1000)) { - case SPEED_10000: - case SPEED_5000: - ptr = 32; - break; - case SPEED_2500: - case SPEED_1000: - ptr = 8; - break; - case SPEED_100: - ptr = 4; - break; - default: - netdev_err(priv->dev, - "Invalid portTransmitRate %lld (idleSlope - sendSlope)\n", - port_transmit_rate_kbps); - return -EINVAL; + if (qopt->enable) { + /* Port Transmit Rate and Speed Divider */ + switch (div_s64(port_transmit_rate_kbps, 1000)) { + case SPEED_10000: + case SPEED_5000: + ptr = 32; + break; + case SPEED_2500: + case SPEED_1000: + ptr = 8; + break; + case SPEED_100: + ptr = 4; + break; + default: + netdev_err(priv->dev, + "Invalid portTransmitRate %lld (idleSlope - sendSlope)\n", + port_transmit_rate_kbps); + return -EINVAL; + } + } else { + ptr = 0; } mode_to_use = priv->plat->tx_queues_cfg[queue].mode_to_use; From 29433a17a79caa8680b9c0761f2b10502fda9ce3 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Tue, 18 Jun 2024 19:22:58 +1200 Subject: [PATCH 1107/1578] cifs: drop the incorrect assertion in cifs_swap_rw() Since commit 2282679fb20b ("mm: submit multipage write for SWP_FS_OPS swap-space"), we can plug multiple pages then unplug them all together. That means iov_iter_count(iter) could be way bigger than PAGE_SIZE, it actually equals the size of iov_iter_npages(iter, INT_MAX). Note this issue has nothing to do with large folios as we don't support THP_SWPOUT to non-block devices. Fixes: 2282679fb20b ("mm: submit multipage write for SWP_FS_OPS swap-space") Reported-by: Christoph Hellwig Closes: https://lore.kernel.org/linux-mm/20240614100329.1203579-1-hch@lst.de/ Cc: NeilBrown Cc: Anna Schumaker Cc: Steve French Cc: Trond Myklebust Cc: Chuanhua Han Cc: Ryan Roberts Cc: Chris Li Cc: "Huang, Ying" Cc: Jeff Layton Cc: Paulo Alcantara Cc: Ronnie Sahlberg Cc: Shyam Prasad N Cc: Tom Talpey Cc: Bharath SM Cc: Signed-off-by: Barry Song Signed-off-by: Steve French --- fs/smb/client/file.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 9d5c2440abfc8f..1e269e0bc75b35 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -3200,8 +3200,6 @@ static int cifs_swap_rw(struct kiocb *iocb, struct iov_iter *iter) { ssize_t ret; - WARN_ON_ONCE(iov_iter_count(iter) != PAGE_SIZE); - if (iov_iter_rw(iter) == READ) ret = netfs_unbuffered_read_iter_locked(iocb, iter); else From 739c9765793e5794578a64aab293c58607f1826a Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Tue, 18 Jun 2024 15:01:52 +0100 Subject: [PATCH 1108/1578] x86/resctrl: Don't try to free nonexistent RMIDs Commit 6791e0ea3071 ("x86/resctrl: Access per-rmid structures by index") adds logic to map individual monitoring groups into a global index space used for tracking allocated RMIDs. Attempts to free the default RMID are ignored in free_rmid(), and this works fine on x86. With arm64 MPAM, there is a latent bug here however: on platforms with no monitors exposed through resctrl, each control group still gets a different monitoring group ID as seen by the hardware, since the CLOSID always forms part of the monitoring group ID. This means that when removing a control group, the code may try to free this group's default monitoring group RMID for real. If there are no monitors however, the RMID tracking table rmid_ptrs[] would be a waste of memory and is never allocated, leading to a splat when free_rmid() tries to dereference the table. One option would be to treat RMID 0 as special for every CLOSID, but this would be ugly since bookkeeping still needs to be done for these monitoring group IDs when there are monitors present in the hardware. Instead, add a gating check of resctrl_arch_mon_capable() in free_rmid(), and just do nothing if the hardware doesn't have monitors. This fix mirrors the gating checks already present in mkdir_rdt_prepare_rmid_alloc() and elsewhere. No functional change on x86. [ bp: Massage commit message. ] Fixes: 6791e0ea3071 ("x86/resctrl: Access per-rmid structures by index") Signed-off-by: Dave Martin Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Reinette Chatre Link: https://lore.kernel.org/r/20240618140152.83154-1-Dave.Martin@arm.com --- arch/x86/kernel/cpu/resctrl/monitor.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 2345e6836593fc..366f496ca3ce20 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -519,7 +519,8 @@ void free_rmid(u32 closid, u32 rmid) * allows architectures that ignore the closid parameter to avoid an * unnecessary check. */ - if (idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, + if (!resctrl_arch_mon_capable() || + idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, RESCTRL_RESERVED_RMID)) return; From 7be4cb7189f747b4e5b6977d0e4387bde3204e62 Mon Sep 17 00:00:00 2001 From: Jose Ignacio Tornos Martinez Date: Mon, 17 Jun 2024 12:28:21 +0200 Subject: [PATCH 1109/1578] net: usb: ax88179_178a: improve reset check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After ecf848eb934b ("net: usb: ax88179_178a: fix link status when link is set to down/up") to not reset from usbnet_open after the reset from usbnet_probe at initialization stage to speed up this, some issues have been reported. It seems to happen that if the initialization is slower, and some time passes between the probe operation and the open operation, the second reset from open is necessary too to have the device working. The reason is that if there is no activity with the phy, this is "disconnected". In order to improve this, the solution is to detect when the phy is "disconnected", and we can use the phy status register for this. So we will only reset the device from reset operation in this situation, that is, only if necessary. The same bahavior is happening when the device is stopped (link set to down) and later is restarted (link set to up), so if the phy keeps working we only need to enable the mac again, but if enough time passes between the device stop and restart, reset is necessary, and we can detect the situation checking the phy status register too. cc: stable@vger.kernel.org # 6.6+ Fixes: ecf848eb934b ("net: usb: ax88179_178a: fix link status when link is set to down/up") Reported-by: Yongqin Liu Reported-by: Antje Miederhöfer Reported-by: Arne Fitzenreiter Tested-by: Yongqin Liu Tested-by: Antje Miederhöfer Signed-off-by: Jose Ignacio Tornos Martinez Signed-off-by: David S. Miller --- drivers/net/usb/ax88179_178a.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c index 51c295e1e823ad..c2fb736f78b26d 100644 --- a/drivers/net/usb/ax88179_178a.c +++ b/drivers/net/usb/ax88179_178a.c @@ -174,7 +174,6 @@ struct ax88179_data { u32 wol_supported; u32 wolopts; u8 disconnecting; - u8 initialized; }; struct ax88179_int_data { @@ -1678,12 +1677,21 @@ static int ax88179_reset(struct usbnet *dev) static int ax88179_net_reset(struct usbnet *dev) { - struct ax88179_data *ax179_data = dev->driver_priv; + u16 tmp16; - if (ax179_data->initialized) + ax88179_read_cmd(dev, AX_ACCESS_PHY, AX88179_PHY_ID, GMII_PHY_PHYSR, + 2, &tmp16); + if (tmp16) { + ax88179_read_cmd(dev, AX_ACCESS_MAC, AX_MEDIUM_STATUS_MODE, + 2, 2, &tmp16); + if (!(tmp16 & AX_MEDIUM_RECEIVE_EN)) { + tmp16 |= AX_MEDIUM_RECEIVE_EN; + ax88179_write_cmd(dev, AX_ACCESS_MAC, AX_MEDIUM_STATUS_MODE, + 2, 2, &tmp16); + } + } else { ax88179_reset(dev); - else - ax179_data->initialized = 1; + } return 0; } From 604141c036e1b636e2a71cf6e1aa09d1e45f40c2 Mon Sep 17 00:00:00 2001 From: Heng Qi Date: Mon, 17 Jun 2024 21:15:23 +0800 Subject: [PATCH 1110/1578] virtio_net: checksum offloading handling fix In virtio spec 0.95, VIRTIO_NET_F_GUEST_CSUM was designed to handle partially checksummed packets, and the validation of fully checksummed packets by the device is independent of VIRTIO_NET_F_GUEST_CSUM negotiation. However, the specification erroneously stated: "If VIRTIO_NET_F_GUEST_CSUM is not negotiated, the device MUST set flags to zero and SHOULD supply a fully checksummed packet to the driver." This statement is inaccurate because even without VIRTIO_NET_F_GUEST_CSUM negotiation, the device can still set the VIRTIO_NET_HDR_F_DATA_VALID flag. Essentially, the device can facilitate the validation of these packets' checksums - a process known as RX checksum offloading - removing the need for the driver to do so. This scenario is currently not implemented in the driver and requires correction. The necessary specification correction[1] has been made and approved in the virtio TC vote. [1] https://lists.oasis-open.org/archives/virtio-comment/202401/msg00011.html Fixes: 4f49129be6fa ("virtio-net: Set RXCSUM feature if GUEST_CSUM is available") Signed-off-by: Heng Qi Reviewed-by: Jiri Pirko Acked-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 61a57d134544f9..aa70a7ed8072c0 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -5666,8 +5666,16 @@ static int virtnet_probe(struct virtio_device *vdev) dev->features |= dev->hw_features & NETIF_F_ALL_TSO; /* (!csum && gso) case will be fixed by register_netdev() */ } - if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM)) - dev->features |= NETIF_F_RXCSUM; + + /* 1. With VIRTIO_NET_F_GUEST_CSUM negotiation, the driver doesn't + * need to calculate checksums for partially checksummed packets, + * as they're considered valid by the upper layer. + * 2. Without VIRTIO_NET_F_GUEST_CSUM negotiation, the driver only + * receives fully checksummed packets. The device may assist in + * validating these packets' checksums, so the driver won't have to. + */ + dev->features |= NETIF_F_RXCSUM; + if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) || virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6)) dev->features |= NETIF_F_GRO_HW; From 703eec1b242276f2d97d98f04790ddad319ddde4 Mon Sep 17 00:00:00 2001 From: Heng Qi Date: Mon, 17 Jun 2024 21:15:24 +0800 Subject: [PATCH 1111/1578] virtio_net: fixing XDP for fully checksummed packets handling The XDP program can't correctly handle partially checksummed packets, but works fine with fully checksummed packets. If the device has already validated fully checksummed packets, then the driver doesn't need to re-validate them, saving CPU resources. Additionally, the driver does not drop all partially checksummed packets when VIRTIO_NET_F_GUEST_CSUM is not negotiated. This is not a bug, as the driver has always done this. Fixes: 436c9453a1ac ("virtio-net: keep vnet header zeroed after processing XDP") Signed-off-by: Heng Qi Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index aa70a7ed8072c0..ea10db9a09fa21 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1360,6 +1360,10 @@ static struct sk_buff *receive_small_xdp(struct net_device *dev, if (unlikely(hdr->hdr.gso_type)) goto err_xdp; + /* Partially checksummed packets must be dropped. */ + if (unlikely(hdr->hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) + goto err_xdp; + buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); @@ -1677,6 +1681,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, if (unlikely(hdr->hdr.gso_type)) return NULL; + /* Partially checksummed packets must be dropped. */ + if (unlikely(hdr->hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) + return NULL; + /* Now XDP core assumes frag size is PAGE_SIZE, but buffers * with headroom may add hole in truesize, which * make their length exceed PAGE_SIZE. So we disabled the @@ -1943,6 +1951,7 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq, struct net_device *dev = vi->dev; struct sk_buff *skb; struct virtio_net_common_hdr *hdr; + u8 flags; if (unlikely(len < vi->hdr_len + ETH_HLEN)) { pr_debug("%s: short packet %i\n", dev->name, len); @@ -1951,6 +1960,15 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq, return; } + /* 1. Save the flags early, as the XDP program might overwrite them. + * These flags ensure packets marked as VIRTIO_NET_HDR_F_DATA_VALID + * stay valid after XDP processing. + * 2. XDP doesn't work with partially checksummed packets (refer to + * virtnet_xdp_set()), so packets marked as + * VIRTIO_NET_HDR_F_NEEDS_CSUM get dropped during XDP processing. + */ + flags = ((struct virtio_net_common_hdr *)buf)->hdr.flags; + if (vi->mergeable_rx_bufs) skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit, stats); @@ -1966,7 +1984,7 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq, if (dev->features & NETIF_F_RXHASH && vi->has_rss_hash_report) virtio_skb_set_hash(&hdr->hash_v1_hdr, skb); - if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID) + if (flags & VIRTIO_NET_HDR_F_DATA_VALID) skb->ip_summed = CHECKSUM_UNNECESSARY; if (virtio_net_hdr_to_skb(skb, &hdr->hdr, From b95a4afe2defd6f46891985f9436a568cd35a31c Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 17 Jun 2024 17:50:26 +0100 Subject: [PATCH 1112/1578] octeontx2-pf: Add error handling to VLAN unoffload handling otx2_sq_append_skb makes used of __vlan_hwaccel_push_inside() to unoffload VLANs - push them from skb meta data into skb data. However, it omitts a check for __vlan_hwaccel_push_inside() returning NULL. Found by inspection based on [1] and [2]. Compile tested only. [1] Re: [PATCH net-next v1] net: stmmac: Enable TSO on VLANs https://lore.kernel.org/all/ZmrN2W8Fye450TKs@shell.armlinux.org.uk/ [2] Re: [PATCH net-next v2] net: stmmac: Enable TSO on VLANs https://lore.kernel.org/all/CANn89i+11L5=tKsa7V7Aeyxaj6nYGRwy35PAbCRYJ73G+b25sg@mail.gmail.com/ Fixes: fd9d7859db6c ("octeontx2-pf: Implement ingress/egress VLAN offload") Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c index a16e9f244117bd..929b4eac25d974 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c @@ -1174,8 +1174,11 @@ bool otx2_sq_append_skb(struct net_device *netdev, struct otx2_snd_queue *sq, if (skb_shinfo(skb)->gso_size && !is_hw_tso_supported(pfvf, skb)) { /* Insert vlan tag before giving pkt to tso */ - if (skb_vlan_tag_present(skb)) + if (skb_vlan_tag_present(skb)) { skb = __vlan_hwaccel_push_inside(skb); + if (!skb) + return true; + } otx2_sq_append_tso(pfvf, sq, skb, qidx); return true; } From fa997b0576c9df635ee363406f5e014dba0f9264 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Tue, 18 Jun 2024 17:28:29 +0200 Subject: [PATCH 1113/1578] ata: ahci: Do not enable LPM if no LPM states are supported by the HBA LPM consists of HIPM (host initiated power management) and DIPM (device initiated power management). ata_eh_set_lpm() will only enable HIPM if both the HBA and the device supports it. However, DIPM will be enabled as long as the device supports it. The HBA will later reject the device's request to enter a power state that it does not support (Slumber/Partial/DevSleep) (DevSleep is never initiated by the device). For a HBA that doesn't support any LPM states, simply don't set a LPM policy such that all the HIPM/DIPM probing/enabling will be skipped. Not enabling HIPM or DIPM in the first place is safer than relying on the device following the AHCI specification and respecting the NAK. (There are comments in the code that some devices misbehave when receiving a NAK.) Performing this check in ahci_update_initial_lpm_policy() also has the advantage that a HBA that doesn't support any LPM states will take the exact same code paths as a port that is external/hot plug capable. Side note: the port in ata_port_dbg() has not been given a unique id yet, but this is not overly important as the debug print is disabled unless explicitly enabled using dynamic debug. A follow-up series will make sure that the unique id assignment will be done earlier. For now, the important thing is that the function returns before setting the LPM policy. Fixes: 7627a0edef54 ("ata: ahci: Drop low power policy board type") Cc: stable@vger.kernel.org Reviewed-by: Mario Limonciello Reviewed-by: Mika Westerberg Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20240618152828.2686771-2-cassel@kernel.org Signed-off-by: Niklas Cassel --- drivers/ata/ahci.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 07d66d2c5f0dd8..5eb38fbbbecdbe 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -1735,6 +1735,14 @@ static void ahci_update_initial_lpm_policy(struct ata_port *ap) if (ap->pflags & ATA_PFLAG_EXTERNAL) return; + /* If no LPM states are supported by the HBA, do not bother with LPM */ + if ((ap->host->flags & ATA_HOST_NO_PART) && + (ap->host->flags & ATA_HOST_NO_SSC) && + (ap->host->flags & ATA_HOST_NO_DEVSLP)) { + ata_port_dbg(ap, "no LPM states supported, not enabling LPM\n"); + return; + } + /* user modified policy via module param */ if (mobile_lpm_policy != -1) { policy = mobile_lpm_policy; From 1062d03827b78614259b3b4b992deb27ee6aa84d Mon Sep 17 00:00:00 2001 From: Geetha sowjanya Date: Tue, 18 Jun 2024 11:41:22 +0530 Subject: [PATCH 1114/1578] octeontx2-pf: Fix linking objects into multiple modules This patch fixes the below build warning messages that are caused due to linking same files to multiple modules by exporting the required symbols. "scripts/Makefile.build:244: drivers/net/ethernet/marvell/octeontx2/nic/Makefile: otx2_devlink.o is added to multiple modules: rvu_nicpf rvu_nicvf scripts/Makefile.build:244: drivers/net/ethernet/marvell/octeontx2/nic/Makefile: otx2_dcbnl.o is added to multiple modules: rvu_nicpf rvu_nicvf" Fixes: 8e67558177f8 ("octeontx2-pf: PFC config support with DCBx"). Signed-off-by: Geetha sowjanya Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/nic/Makefile | 3 +-- drivers/net/ethernet/marvell/octeontx2/nic/otx2_dcbnl.c | 7 +++++++ drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c | 2 ++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile index 5664f768cb0cd1..64a97a0a10ed6a 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile +++ b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile @@ -9,10 +9,9 @@ obj-$(CONFIG_OCTEONTX2_VF) += rvu_nicvf.o otx2_ptp.o rvu_nicpf-y := otx2_pf.o otx2_common.o otx2_txrx.o otx2_ethtool.o \ otx2_flows.o otx2_tc.o cn10k.o otx2_dmac_flt.o \ otx2_devlink.o qos_sq.o qos.o -rvu_nicvf-y := otx2_vf.o otx2_devlink.o +rvu_nicvf-y := otx2_vf.o rvu_nicpf-$(CONFIG_DCB) += otx2_dcbnl.o -rvu_nicvf-$(CONFIG_DCB) += otx2_dcbnl.o rvu_nicpf-$(CONFIG_MACSEC) += cn10k_macsec.o ccflags-y += -I$(srctree)/drivers/net/ethernet/marvell/octeontx2/af diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dcbnl.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dcbnl.c index 28fb643d2917f7..aa01110f04a339 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dcbnl.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dcbnl.c @@ -54,6 +54,7 @@ int otx2_pfc_txschq_config(struct otx2_nic *pfvf) return 0; } +EXPORT_SYMBOL(otx2_pfc_txschq_config); static int otx2_pfc_txschq_alloc_one(struct otx2_nic *pfvf, u8 prio) { @@ -122,6 +123,7 @@ int otx2_pfc_txschq_alloc(struct otx2_nic *pfvf) return 0; } +EXPORT_SYMBOL(otx2_pfc_txschq_alloc); static int otx2_pfc_txschq_stop_one(struct otx2_nic *pfvf, u8 prio) { @@ -260,6 +262,7 @@ int otx2_pfc_txschq_update(struct otx2_nic *pfvf) return 0; } +EXPORT_SYMBOL(otx2_pfc_txschq_update); int otx2_pfc_txschq_stop(struct otx2_nic *pfvf) { @@ -282,6 +285,7 @@ int otx2_pfc_txschq_stop(struct otx2_nic *pfvf) return 0; } +EXPORT_SYMBOL(otx2_pfc_txschq_stop); int otx2_config_priority_flow_ctrl(struct otx2_nic *pfvf) { @@ -321,6 +325,7 @@ int otx2_config_priority_flow_ctrl(struct otx2_nic *pfvf) mutex_unlock(&pfvf->mbox.lock); return err; } +EXPORT_SYMBOL(otx2_config_priority_flow_ctrl); void otx2_update_bpid_in_rqctx(struct otx2_nic *pfvf, int vlan_prio, int qidx, bool pfc_enable) @@ -385,6 +390,7 @@ void otx2_update_bpid_in_rqctx(struct otx2_nic *pfvf, int vlan_prio, int qidx, "Updating BPIDs in CQ and Aura contexts of RQ%d failed with err %d\n", qidx, err); } +EXPORT_SYMBOL(otx2_update_bpid_in_rqctx); static int otx2_dcbnl_ieee_getpfc(struct net_device *dev, struct ieee_pfc *pfc) { @@ -472,3 +478,4 @@ int otx2_dcbnl_set_ops(struct net_device *dev) return 0; } +EXPORT_SYMBOL(otx2_dcbnl_set_ops); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c index 99ddf31269d963..458d34a62e1895 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c @@ -113,6 +113,7 @@ int otx2_register_dl(struct otx2_nic *pfvf) devlink_free(dl); return err; } +EXPORT_SYMBOL(otx2_register_dl); void otx2_unregister_dl(struct otx2_nic *pfvf) { @@ -124,3 +125,4 @@ void otx2_unregister_dl(struct otx2_nic *pfvf) ARRAY_SIZE(otx2_dl_params)); devlink_free(dl); } +EXPORT_SYMBOL(otx2_unregister_dl); From a8763466669d21b570b26160d0a5e0a2ee529d22 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Tue, 18 Jun 2024 09:29:21 +0200 Subject: [PATCH 1115/1578] selftests: openvswitch: Set value to nla flags. Netlink flags, although they don't have payload at the netlink level, are represented as having "True" as value in pyroute2. Without it, trying to add a flow with a flag-type action (e.g: pop_vlan) fails with the following traceback: Traceback (most recent call last): File "[...]/ovs-dpctl.py", line 2498, in sys.exit(main(sys.argv)) ^^^^^^^^^^^^^^ File "[...]/ovs-dpctl.py", line 2487, in main ovsflow.add_flow(rep["dpifindex"], flow) File "[...]/ovs-dpctl.py", line 2136, in add_flow reply = self.nlm_request( ^^^^^^^^^^^^^^^^^ File "[...]/pyroute2/netlink/nlsocket.py", line 822, in nlm_request return tuple(self._genlm_request(*argv, **kwarg)) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "[...]/pyroute2/netlink/generic/__init__.py", line 126, in nlm_request return tuple(super().nlm_request(*argv, **kwarg)) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "[...]/pyroute2/netlink/nlsocket.py", line 1124, in nlm_request self.put(msg, msg_type, msg_flags, msg_seq=msg_seq) File "[...]/pyroute2/netlink/nlsocket.py", line 389, in put self.sendto_gate(msg, addr) File "[...]/pyroute2/netlink/nlsocket.py", line 1056, in sendto_gate msg.encode() File "[...]/pyroute2/netlink/__init__.py", line 1245, in encode offset = self.encode_nlas(offset) ^^^^^^^^^^^^^^^^^^^^^^^^ File "[...]/pyroute2/netlink/__init__.py", line 1560, in encode_nlas nla_instance.setvalue(cell[1]) File "[...]/pyroute2/netlink/__init__.py", line 1265, in setvalue nlv.setvalue(nla_tuple[1]) ~~~~~~~~~^^^ IndexError: list index out of range Signed-off-by: Adrian Moreno Acked-by: Aaron Conole Signed-off-by: David S. Miller --- tools/testing/selftests/net/openvswitch/ovs-dpctl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py index 1dd057afd3fbe2..9f8dec2f6539cc 100644 --- a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py +++ b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py @@ -531,7 +531,7 @@ def parse(self, actstr): for flat_act in parse_flat_map: if parse_starts_block(actstr, flat_act[0], False): actstr = actstr[len(flat_act[0]):] - self["attrs"].append([flat_act[1]]) + self["attrs"].append([flat_act[1], True]) actstr = actstr[strspn(actstr, ", ") :] parsed = True From 895a37028a4854962def6e2f2820a23c84062f66 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 19 Jun 2024 13:18:56 +0100 Subject: [PATCH 1116/1578] arm64: mm: Permit PTE SW bits to change in live mappings Previously pgattr_change_is_safe() was overly-strict and complained (e.g. "[ 116.262743] __check_safe_pte_update: unsafe attribute change: 0x0560000043768fc3 -> 0x0160000043768fc3") if it saw any SW bits change in a live PTE. There is no such restriction on SW bits in the Arm ARM. Until now, no SW bits have been updated in live mappings via the set_ptes() route. PTE_DIRTY would be updated live, but this is handled by ptep_set_access_flags() which does not call pgattr_change_is_safe(). However, with the introduction of uffd-wp for arm64, there is core-mm code that does ptep_get(); pte_clear_uffd_wp(); set_ptes(); which triggers this false warning. Silence this warning by masking out the SW bits during checks. The bug isn't technically in the highlighted commit below, but that's where bisecting would likely lead as its what made the bug user-visible. Signed-off-by: Ryan Roberts Fixes: 5b32510af77b ("arm64/mm: Add uffd write-protect support") Link: https://lore.kernel.org/r/20240619121859.4153966-1-ryan.roberts@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable-hwdef.h | 1 + arch/arm64/mm/mmu.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 9943ff0af4c96d..1f60aa1bc750cb 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -170,6 +170,7 @@ #define PTE_CONT (_AT(pteval_t, 1) << 52) /* Contiguous range */ #define PTE_PXN (_AT(pteval_t, 1) << 53) /* Privileged XN */ #define PTE_UXN (_AT(pteval_t, 1) << 54) /* User XN */ +#define PTE_SWBITS_MASK _AT(pteval_t, (BIT(63) | GENMASK(58, 55))) #define PTE_ADDR_LOW (((_AT(pteval_t, 1) << (50 - PAGE_SHIFT)) - 1) << PAGE_SHIFT) #ifdef CONFIG_ARM64_PA_BITS_52 diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index c927e9312f102e..353ea5dc32b850 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -124,7 +124,8 @@ bool pgattr_change_is_safe(u64 old, u64 new) * The following mapping attributes may be updated in live * kernel mappings without the need for break-before-make. */ - pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE | PTE_NG; + pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE | PTE_NG | + PTE_SWBITS_MASK; /* creating or taking down mappings is always safe */ if (!pte_valid(__pte(old)) || !pte_valid(__pte(new))) From df75470b317b46affbe1f5f8f006b34175be9789 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Tue, 18 Jun 2024 19:34:18 +0200 Subject: [PATCH 1117/1578] spi: spi-imx: imx51: revert burst length calculation back to bits_per_word The patch 15a6af94a277 ("spi: Increase imx51 ecspi burst length based on transfer length") increased the burst length calculation in mx51_ecspi_prepare_transfer() to be based on the transfer length. This breaks HW CS + SPI_CS_WORD support which was added in 6e95b23a5b2d ("spi: imx: Implement support for CS_WORD") and transfers with bits-per-word != 8, 16, 32. SPI_CS_WORD means the CS should be toggled after each word. The implementation in the imx-spi driver relies on the fact that the HW CS is toggled automatically by the controller after each burst length number of bits. Setting the burst length to the number of bits of the _whole_ message breaks this use case. Further the patch 15a6af94a277 ("spi: Increase imx51 ecspi burst length based on transfer length") claims to optimize the transfers. But even without this patch, on modern spi-imx controllers with "dynamic_burst = true" (imx51, imx6 and newer), the transfers are already optimized, i.e. the burst length is dynamically adjusted in spi_imx_push() to avoid the pause between the SPI bursts. This has been confirmed by a scope measurement on an imx6d. Subsequent Patches tried to fix these and other problems: - 5f66db08cbd3 ("spi: imx: Take in account bits per word instead of assuming 8-bits") - e9b220aeacf1 ("spi: spi-imx: correctly configure burst length when using dma") - c712c05e46c8 ("spi: imx: fix the burst length at DMA mode and CPU mode") - cf6d79a0f576 ("spi: spi-imx: fix off-by-one in mx51 CPU mode burst length") but the HW CS + SPI_CS_WORD use case is still broken. To fix the problems revert the burst size calculation in mx51_ecspi_prepare_transfer() back to the original form, before 15a6af94a277 ("spi: Increase imx51 ecspi burst length based on transfer length") was applied. Cc: Stefan Moring Cc: Stefan Bigler Cc: Clark Wang Cc: Carlos Song Cc: Sebastian Reichel Cc: Thorsten Scherer Fixes: 15a6af94a277 ("spi: Increase imx51 ecspi burst length based on transfer length") Fixes: 5f66db08cbd3 ("spi: imx: Take in account bits per word instead of assuming 8-bits") Fixes: e9b220aeacf1 ("spi: spi-imx: correctly configure burst length when using dma") Fixes: c712c05e46c8 ("spi: imx: fix the burst length at DMA mode and CPU mode") Fixes: cf6d79a0f576 ("spi: spi-imx: fix off-by-one in mx51 CPU mode burst length") Link: https://lore.kernel.org/all/20240618-oxpecker-of-ideal-mastery-db59f8-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde Tested-by: Thorsten Scherer Link: https://msgid.link/r/20240618-spi-imx-fix-bustlength-v1-1-2053dd5fdf87@pengutronix.de Signed-off-by: Mark Brown --- drivers/spi/spi-imx.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c index f4006c82f867a6..33164ebdb58311 100644 --- a/drivers/spi/spi-imx.c +++ b/drivers/spi/spi-imx.c @@ -660,18 +660,8 @@ static int mx51_ecspi_prepare_transfer(struct spi_imx_data *spi_imx, ctrl |= (spi_imx->target_burst * 8 - 1) << MX51_ECSPI_CTRL_BL_OFFSET; else { - if (spi_imx->usedma) { - ctrl |= (spi_imx->bits_per_word - 1) - << MX51_ECSPI_CTRL_BL_OFFSET; - } else { - if (spi_imx->count >= MX51_ECSPI_CTRL_MAX_BURST) - ctrl |= (MX51_ECSPI_CTRL_MAX_BURST * BITS_PER_BYTE - 1) - << MX51_ECSPI_CTRL_BL_OFFSET; - else - ctrl |= (spi_imx->count / DIV_ROUND_UP(spi_imx->bits_per_word, - BITS_PER_BYTE) * spi_imx->bits_per_word - 1) - << MX51_ECSPI_CTRL_BL_OFFSET; - } + ctrl |= (spi_imx->bits_per_word - 1) + << MX51_ECSPI_CTRL_BL_OFFSET; } /* set clock speed */ From 8ecd06277a7664f4ef018abae3abd3451d64e7a6 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Mon, 17 Jun 2024 11:18:15 +0200 Subject: [PATCH 1118/1578] netfilter: ipset: Fix suspicious rcu_dereference_protected() When destroying all sets, we are either in pernet exit phase or are executing a "destroy all sets command" from userspace. The latter was taken into account in ip_set_dereference() (nfnetlink mutex is held), but the former was not. The patch adds the required check to rcu_dereference_protected() in ip_set_dereference(). Fixes: 4e7aaa6b82d6 ("netfilter: ipset: Fix race between namespace cleanup and gc in the list:set type") Reported-by: syzbot+b62c37cdd58103293a5a@syzkaller.appspotmail.com Reported-by: syzbot+cfbe1da5fdfc39efc293@syzkaller.appspotmail.com Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202406141556.e0b6f17e-lkp@intel.com Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index c7ae4d9bf3d24c..61431690cbd5f1 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -53,12 +53,13 @@ MODULE_DESCRIPTION("core IP set support"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); /* When the nfnl mutex or ip_set_ref_lock is held: */ -#define ip_set_dereference(p) \ - rcu_dereference_protected(p, \ +#define ip_set_dereference(inst) \ + rcu_dereference_protected((inst)->ip_set_list, \ lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \ - lockdep_is_held(&ip_set_ref_lock)) + lockdep_is_held(&ip_set_ref_lock) || \ + (inst)->is_deleted) #define ip_set(inst, id) \ - ip_set_dereference((inst)->ip_set_list)[id] + ip_set_dereference(inst)[id] #define ip_set_ref_netlink(inst,id) \ rcu_dereference_raw((inst)->ip_set_list)[id] #define ip_set_dereference_nfnl(p) \ @@ -1133,7 +1134,7 @@ static int ip_set_create(struct sk_buff *skb, const struct nfnl_info *info, if (!list) goto cleanup; /* nfnl mutex is held, both lists are valid */ - tmp = ip_set_dereference(inst->ip_set_list); + tmp = ip_set_dereference(inst); memcpy(list, tmp, sizeof(struct ip_set *) * inst->ip_set_max); rcu_assign_pointer(inst->ip_set_list, list); /* Make sure all current packets have passed through */ From 28d8c13830cc530996157e22ecf22def90cb7f35 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Mon, 17 Jun 2024 18:13:32 -0700 Subject: [PATCH 1119/1578] amiflop: add missing MODULE_DESCRIPTION() macro With ARCH=m68k, make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/block/amiflop.o Add the missing invocation of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Reviewed-by: Geert Uytterhoeven Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20240617-md-m68k-drivers-block-v1-1-b200599a315e@quicinc.com Signed-off-by: Jens Axboe --- drivers/block/amiflop.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index a25414228e4741..310254c994aa88 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -232,6 +232,7 @@ static DEFINE_MUTEX(amiflop_mutex); static unsigned long int fd_def_df0 = FD_DD_3; /* default for df0 if it doesn't identify */ module_param(fd_def_df0, ulong, 0); +MODULE_DESCRIPTION("Amiga floppy driver"); MODULE_LICENSE("GPL"); /* From ba8df22e25e7e906254f4f490d7bcfbe637152aa Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Mon, 17 Jun 2024 18:13:33 -0700 Subject: [PATCH 1120/1578] ataflop: add missing MODULE_DESCRIPTION() macro With ARCH=m68k, make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/block/ataflop.o Add the missing invocation of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Reviewed-by: Geert Uytterhoeven Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20240617-md-m68k-drivers-block-v1-2-b200599a315e@quicinc.com Signed-off-by: Jens Axboe --- drivers/block/ataflop.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index cacc4ba942a814..b19884da43760c 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -2197,4 +2197,5 @@ static void __exit atari_floppy_exit(void) module_init(atari_floppy_init) module_exit(atari_floppy_exit) +MODULE_DESCRIPTION("Atari floppy driver"); MODULE_LICENSE("GPL"); From 465478bb00168a7620788990b1679c5067d421f2 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Mon, 17 Jun 2024 18:13:34 -0700 Subject: [PATCH 1121/1578] z2ram: add missing MODULE_DESCRIPTION() macro With ARCH=m68k, make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/block/z2ram.o Add the missing invocation of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Reviewed-by: Geert Uytterhoeven Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20240617-md-m68k-drivers-block-v1-3-b200599a315e@quicinc.com Signed-off-by: Jens Axboe --- drivers/block/z2ram.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index 7c5f4e4d9b5037..4b7219be1bb860 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -409,4 +409,5 @@ static void __exit z2_exit(void) module_init(z2_init); module_exit(z2_exit); +MODULE_DESCRIPTION("Amiga Zorro II ramdisk driver"); MODULE_LICENSE("GPL"); From dc2e77979412d289df9049d8c693761db8602867 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Fri, 14 Jun 2024 12:30:44 -0400 Subject: [PATCH 1122/1578] net: Split a __sys_bind helper for io_uring io_uring holds a reference to the file and maintains a sockaddr_storage address. Similarly to what was done to __sys_connect_file, split an internal helper for __sys_bind in preparation to supporting an io_uring bind command. Reviewed-by: Jens Axboe Signed-off-by: Gabriel Krisman Bertazi Reviewed-by: Kuniyuki Iwashima Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20240614163047.31581-1-krisman@suse.de Signed-off-by: Jens Axboe --- include/linux/socket.h | 2 ++ net/socket.c | 25 ++++++++++++++++--------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/include/linux/socket.h b/include/linux/socket.h index 89d16b90370bd4..b3000f49e9f59f 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -442,6 +442,8 @@ extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, extern int __sys_socket(int family, int type, int protocol); extern struct file *__sys_socket_file(int family, int type, int protocol); extern int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen); +extern int __sys_bind_socket(struct socket *sock, struct sockaddr_storage *address, + int addrlen); extern int __sys_connect_file(struct file *file, struct sockaddr_storage *addr, int addrlen, int file_flags); extern int __sys_connect(int fd, struct sockaddr __user *uservaddr, diff --git a/net/socket.c b/net/socket.c index e416920e9399ec..fd0714e10cedd1 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1822,6 +1822,20 @@ SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol, return __sys_socketpair(family, type, protocol, usockvec); } +int __sys_bind_socket(struct socket *sock, struct sockaddr_storage *address, + int addrlen) +{ + int err; + + err = security_socket_bind(sock, (struct sockaddr *)address, + addrlen); + if (!err) + err = READ_ONCE(sock->ops)->bind(sock, + (struct sockaddr *)address, + addrlen); + return err; +} + /* * Bind a name to a socket. Nothing much to do here since it's * the protocol's responsibility to handle the local address. @@ -1839,15 +1853,8 @@ int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen) sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock) { err = move_addr_to_kernel(umyaddr, addrlen, &address); - if (!err) { - err = security_socket_bind(sock, - (struct sockaddr *)&address, - addrlen); - if (!err) - err = READ_ONCE(sock->ops)->bind(sock, - (struct sockaddr *) - &address, addrlen); - } + if (!err) + err = __sys_bind_socket(sock, &address, addrlen); fput_light(sock->file, fput_needed); } return err; From bb6aaf736680f0f3c2e6281735c47c64e2042819 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Fri, 14 Jun 2024 12:30:45 -0400 Subject: [PATCH 1123/1578] net: Split a __sys_listen helper for io_uring io_uring holds a reference to the file and maintains a sockaddr_storage address. Similarly to what was done to __sys_connect_file, split an internal helper for __sys_listen in preparation to support an io_uring listen command. Reviewed-by: Jens Axboe Signed-off-by: Gabriel Krisman Bertazi Reviewed-by: Kuniyuki Iwashima Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20240614163047.31581-2-krisman@suse.de Signed-off-by: Jens Axboe --- include/linux/socket.h | 1 + net/socket.c | 23 ++++++++++++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/include/linux/socket.h b/include/linux/socket.h index b3000f49e9f59f..c1f16cdab67762 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -449,6 +449,7 @@ extern int __sys_connect_file(struct file *file, struct sockaddr_storage *addr, extern int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen); extern int __sys_listen(int fd, int backlog); +extern int __sys_listen_socket(struct socket *sock, int backlog); extern int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, int __user *usockaddr_len); extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, diff --git a/net/socket.c b/net/socket.c index fd0714e10cedd1..fcbdd5bc47ac2f 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1870,23 +1870,28 @@ SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen) * necessary for a listen, and if that works, we mark the socket as * ready for listening. */ +int __sys_listen_socket(struct socket *sock, int backlog) +{ + int somaxconn, err; + + somaxconn = READ_ONCE(sock_net(sock->sk)->core.sysctl_somaxconn); + if ((unsigned int)backlog > somaxconn) + backlog = somaxconn; + + err = security_socket_listen(sock, backlog); + if (!err) + err = READ_ONCE(sock->ops)->listen(sock, backlog); + return err; +} int __sys_listen(int fd, int backlog) { struct socket *sock; int err, fput_needed; - int somaxconn; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock) { - somaxconn = READ_ONCE(sock_net(sock->sk)->core.sysctl_somaxconn); - if ((unsigned int)backlog > somaxconn) - backlog = somaxconn; - - err = security_socket_listen(sock, backlog); - if (!err) - err = READ_ONCE(sock->ops)->listen(sock, backlog); - + err = __sys_listen_socket(sock, backlog); fput_light(sock->file, fput_needed); } return err; From 7481fd93fa0a851740e26026485f56a1305454ce Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Fri, 14 Jun 2024 12:30:46 -0400 Subject: [PATCH 1124/1578] io_uring: Introduce IORING_OP_BIND IORING_OP_BIND provides the semantic of bind(2) via io_uring. While this is an essentially synchronous system call, the main point is to enable a network path to execute fully with io_uring registered and descriptorless files. Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20240614163047.31581-3-krisman@suse.de Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + io_uring/net.c | 36 +++++++++++++++++++++++++++++++++++ io_uring/net.h | 3 +++ io_uring/opdef.c | 13 +++++++++++++ 4 files changed, 53 insertions(+) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 994bf7af0efe20..4ef153d95c875e 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -257,6 +257,7 @@ enum io_uring_op { IORING_OP_FUTEX_WAITV, IORING_OP_FIXED_FD_INSTALL, IORING_OP_FTRUNCATE, + IORING_OP_BIND, /* this goes last, obviously */ IORING_OP_LAST, diff --git a/io_uring/net.c b/io_uring/net.c index 7c98c4d5094633..8cc4cfc2fef312 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -51,6 +51,11 @@ struct io_connect { bool seen_econnaborted; }; +struct io_bind { + struct file *file; + int addr_len; +}; + struct io_sr_msg { struct file *file; union { @@ -1715,6 +1720,37 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } +int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind); + struct sockaddr __user *uaddr; + struct io_async_msghdr *io; + + if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) + return -EINVAL; + + uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr)); + bind->addr_len = READ_ONCE(sqe->addr2); + + io = io_msg_alloc_async(req); + if (unlikely(!io)) + return -ENOMEM; + return move_addr_to_kernel(uaddr, bind->addr_len, &io->addr); +} + +int io_bind(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind); + struct io_async_msghdr *io = req->async_data; + int ret; + + ret = __sys_bind_socket(sock_from_file(req->file), &io->addr, bind->addr_len); + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return 0; +} + void io_netmsg_cache_free(const void *entry) { struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry; diff --git a/io_uring/net.h b/io_uring/net.h index 0eb1c1920fc9a9..49f9a7bc111390 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -49,6 +49,9 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags); int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); void io_send_zc_cleanup(struct io_kiocb *req); +int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_bind(struct io_kiocb *req, unsigned int issue_flags); + void io_netmsg_cache_free(const void *entry); #else static inline void io_netmsg_cache_free(const void *entry) diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 2e3b7b16effb34..1424c2d9e9ef2d 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -495,6 +495,16 @@ const struct io_issue_def io_issue_defs[] = { .prep = io_ftruncate_prep, .issue = io_ftruncate, }, + [IORING_OP_BIND] = { +#if defined(CONFIG_NET) + .needs_file = 1, + .prep = io_bind_prep, + .issue = io_bind, + .async_size = sizeof(struct io_async_msghdr), +#else + .prep = io_eopnotsupp_prep, +#endif + }, }; const struct io_cold_def io_cold_defs[] = { @@ -716,6 +726,9 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_FTRUNCATE] = { .name = "FTRUNCATE", }, + [IORING_OP_BIND] = { + .name = "BIND", + }, }; const char *io_uring_get_opcode(u8 opcode) From ff140cc8628abfb1755691d16cfa8788d8820ef7 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Fri, 14 Jun 2024 12:30:47 -0400 Subject: [PATCH 1125/1578] io_uring: Introduce IORING_OP_LISTEN IORING_OP_LISTEN provides the semantic of listen(2) via io_uring. While this is an essentially synchronous system call, the main point is to enable a network path to execute fully with io_uring registered and descriptorless files. Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20240614163047.31581-4-krisman@suse.de Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + io_uring/net.c | 28 ++++++++++++++++++++++++++++ io_uring/net.h | 3 +++ io_uring/opdef.c | 13 +++++++++++++ 4 files changed, 45 insertions(+) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 4ef153d95c875e..2aaf7ee256ac42 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -258,6 +258,7 @@ enum io_uring_op { IORING_OP_FIXED_FD_INSTALL, IORING_OP_FTRUNCATE, IORING_OP_BIND, + IORING_OP_LISTEN, /* this goes last, obviously */ IORING_OP_LAST, diff --git a/io_uring/net.c b/io_uring/net.c index 8cc4cfc2fef312..db4a4a03ce3ab6 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -56,6 +56,11 @@ struct io_bind { int addr_len; }; +struct io_listen { + struct file *file; + int backlog; +}; + struct io_sr_msg { struct file *file; union { @@ -1751,6 +1756,29 @@ int io_bind(struct io_kiocb *req, unsigned int issue_flags) return 0; } +int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen); + + if (sqe->addr || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in || sqe->addr2) + return -EINVAL; + + listen->backlog = READ_ONCE(sqe->len); + return 0; +} + +int io_listen(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen); + int ret; + + ret = __sys_listen_socket(sock_from_file(req->file), listen->backlog); + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return 0; +} + void io_netmsg_cache_free(const void *entry) { struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry; diff --git a/io_uring/net.h b/io_uring/net.h index 49f9a7bc111390..52bfee05f06a14 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -52,6 +52,9 @@ void io_send_zc_cleanup(struct io_kiocb *req); int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_bind(struct io_kiocb *req, unsigned int issue_flags); +int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_listen(struct io_kiocb *req, unsigned int issue_flags); + void io_netmsg_cache_free(const void *entry); #else static inline void io_netmsg_cache_free(const void *entry) diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 1424c2d9e9ef2d..2dd49cf22f642d 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -503,6 +503,16 @@ const struct io_issue_def io_issue_defs[] = { .async_size = sizeof(struct io_async_msghdr), #else .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_LISTEN] = { +#if defined(CONFIG_NET) + .needs_file = 1, + .prep = io_listen_prep, + .issue = io_listen, + .async_size = sizeof(struct io_async_msghdr), +#else + .prep = io_eopnotsupp_prep, #endif }, }; @@ -729,6 +739,9 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_BIND] = { .name = "BIND", }, + [IORING_OP_LISTEN] = { + .name = "LISTEN", + }, }; const char *io_uring_get_opcode(u8 opcode) From dd9300e9eaeeb212f77ffeb72d1d8756107f1f1f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:28 +0200 Subject: [PATCH 1126/1578] xen-blkfront: don't disable cache flushes when they fail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit blkfront always had a robust negotiation protocol for detecting a write cache. Stop simply disabling cache flushes in the block layer as the flags handling is moving to the atomic queue limits API that needs user context to freeze the queue for that. Instead handle the case of the feature flags cleared inside of blkfront. This removes old debug code to check for such a mismatch which was previously impossible to hit, including the check for passthrough requests that blkfront never used to start with. Signed-off-by: Christoph Hellwig Acked-by: Roger Pau Monné Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-2-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/xen-blkfront.c | 44 +++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 9b4ec3e4908cce..851b03844edd13 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -788,6 +788,11 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri * A barrier request a superset of FUA, so we can * implement it the same way. (It's also a FLUSH+FUA, * since it is guaranteed ordered WRT previous writes.) + * + * Note that can end up here with a FUA write and the + * flags cleared. This happens when the flag was + * run-time disabled after a failing I/O, and we'll + * simplify submit it as a normal write. */ if (info->feature_flush && info->feature_fua) ring_req->operation = @@ -795,8 +800,6 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri else if (info->feature_flush) ring_req->operation = BLKIF_OP_FLUSH_DISKCACHE; - else - ring_req->operation = 0; } ring_req->u.rw.nr_segments = num_grant; if (unlikely(require_extra_req)) { @@ -887,16 +890,6 @@ static inline void flush_requests(struct blkfront_ring_info *rinfo) notify_remote_via_irq(rinfo->irq); } -static inline bool blkif_request_flush_invalid(struct request *req, - struct blkfront_info *info) -{ - return (blk_rq_is_passthrough(req) || - ((req_op(req) == REQ_OP_FLUSH) && - !info->feature_flush) || - ((req->cmd_flags & REQ_FUA) && - !info->feature_fua)); -} - static blk_status_t blkif_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *qd) { @@ -908,12 +901,22 @@ static blk_status_t blkif_queue_rq(struct blk_mq_hw_ctx *hctx, rinfo = get_rinfo(info, qid); blk_mq_start_request(qd->rq); spin_lock_irqsave(&rinfo->ring_lock, flags); - if (RING_FULL(&rinfo->ring)) - goto out_busy; - if (blkif_request_flush_invalid(qd->rq, rinfo->dev_info)) - goto out_err; + /* + * Check if the backend actually supports flushes. + * + * While the block layer won't send us flushes if we don't claim to + * support them, the Xen protocol allows the backend to revoke support + * at any time. That is of course a really bad idea and dangerous, but + * has been allowed for 10+ years. In that case we simply clear the + * flags, and directly return here for an empty flush and ignore the + * FUA flag later on. + */ + if (unlikely(req_op(qd->rq) == REQ_OP_FLUSH && !info->feature_flush)) + goto complete; + if (RING_FULL(&rinfo->ring)) + goto out_busy; if (blkif_queue_request(qd->rq, rinfo)) goto out_busy; @@ -921,14 +924,14 @@ static blk_status_t blkif_queue_rq(struct blk_mq_hw_ctx *hctx, spin_unlock_irqrestore(&rinfo->ring_lock, flags); return BLK_STS_OK; -out_err: - spin_unlock_irqrestore(&rinfo->ring_lock, flags); - return BLK_STS_IOERR; - out_busy: blk_mq_stop_hw_queue(hctx); spin_unlock_irqrestore(&rinfo->ring_lock, flags); return BLK_STS_DEV_RESOURCE; +complete: + spin_unlock_irqrestore(&rinfo->ring_lock, flags); + blk_mq_end_request(qd->rq, BLK_STS_OK); + return BLK_STS_OK; } static void blkif_complete_rq(struct request *rq) @@ -1627,7 +1630,6 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) blkif_req(req)->error = BLK_STS_OK; info->feature_fua = 0; info->feature_flush = 0; - xlvbd_flush(info); } fallthrough; case BLKIF_OP_READ: From be60e7700e6df1e16a2f60f45bece08e6140a46d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:29 +0200 Subject: [PATCH 1127/1578] sd: remove sd_is_zoned Since commit 7437bb73f087 ("block: remove support for the host aware zone model"), only ZBC devices expose a zoned access model. sd_is_zoned is used to check for that and thus return false for host aware devices. Replace the helper with the simple open coded TYPE_ZBC check to fix this. Fixes: 7437bb73f087 ("block: remove support for the host aware zone model") Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20240617060532.127975-3-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 6 +----- drivers/scsi/sd.h | 5 ----- drivers/scsi/sd_zbc.c | 13 ++++--------- 3 files changed, 5 insertions(+), 19 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index e01393ed42076b..664523048ce819 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -466,10 +466,6 @@ provisioning_mode_store(struct device *dev, struct device_attribute *attr, if (sdp->type != TYPE_DISK) return -EINVAL; - /* ignore the provisioning mode for ZBC devices */ - if (sd_is_zoned(sdkp)) - return count; - mode = sysfs_match_string(lbp_mode, buf); if (mode < 0) return -EINVAL; @@ -2288,7 +2284,7 @@ static int sd_done(struct scsi_cmnd *SCpnt) } out: - if (sd_is_zoned(sdkp)) + if (sdkp->device->type == TYPE_ZBC) good_bytes = sd_zbc_complete(SCpnt, good_bytes, &sshdr); SCSI_LOG_HLCOMPLETE(1, scmd_printk(KERN_INFO, SCpnt, diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 726f1613f6cb56..7603b3c67b233f 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -222,11 +222,6 @@ static inline sector_t sectors_to_logical(struct scsi_device *sdev, sector_t sec void sd_dif_config_host(struct scsi_disk *sdkp, struct queue_limits *lim); -static inline int sd_is_zoned(struct scsi_disk *sdkp) -{ - return sdkp->zoned == 1 || sdkp->device->type == TYPE_ZBC; -} - #ifdef CONFIG_BLK_DEV_ZONED int sd_zbc_read_zones(struct scsi_disk *sdkp, struct queue_limits *lim, diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index f685838d9ed214..8cc9c025017961 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -232,7 +232,7 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, int zone_idx = 0; int ret; - if (!sd_is_zoned(sdkp)) + if (sdkp->device->type != TYPE_ZBC) /* Not a zoned device */ return -EOPNOTSUPP; @@ -300,7 +300,7 @@ static blk_status_t sd_zbc_cmnd_checks(struct scsi_cmnd *cmd) struct scsi_disk *sdkp = scsi_disk(rq->q->disk); sector_t sector = blk_rq_pos(rq); - if (!sd_is_zoned(sdkp)) + if (sdkp->device->type != TYPE_ZBC) /* Not a zoned device */ return BLK_STS_IOERR; @@ -521,7 +521,7 @@ static int sd_zbc_check_capacity(struct scsi_disk *sdkp, unsigned char *buf, static void sd_zbc_print_zones(struct scsi_disk *sdkp) { - if (!sd_is_zoned(sdkp) || !sdkp->capacity) + if (sdkp->device->type != TYPE_ZBC || !sdkp->capacity) return; if (sdkp->capacity & (sdkp->zone_info.zone_blocks - 1)) @@ -598,13 +598,8 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, struct queue_limits *lim, u32 zone_blocks = 0; int ret; - if (!sd_is_zoned(sdkp)) { - /* - * Device managed or normal SCSI disk, no special handling - * required. - */ + if (sdkp->device->type != TYPE_ZBC) return 0; - } /* READ16/WRITE16/SYNC16 is mandatory for ZBC devices */ sdkp->device->use_16_for_rw = 1; From 308ad58af49d6c4c3b7a36b98972cc9db4d7b36a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:30 +0200 Subject: [PATCH 1128/1578] sd: move zone limits setup out of sd_read_block_characteristics Move a bit of code that sets up the zone flag and the write granularity into sd_zbc_read_zones to be with the rest of the zoned limits. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 21 +-------------------- drivers/scsi/sd_zbc.c | 9 +++++++++ 2 files changed, 10 insertions(+), 20 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 664523048ce819..66f7d1e3429c86 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3312,29 +3312,10 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp, blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q); } - -#ifdef CONFIG_BLK_DEV_ZONED /* sd_probe rejects ZBD devices early otherwise */ - if (sdkp->device->type == TYPE_ZBC) { - lim->zoned = true; - - /* - * Per ZBC and ZAC specifications, writes in sequential write - * required zones of host-managed devices must be aligned to - * the device physical block size. - */ - lim->zone_write_granularity = sdkp->physical_block_size; - } else { - /* - * Host-aware devices are treated as conventional. - */ - lim->zoned = false; - } -#endif /* CONFIG_BLK_DEV_ZONED */ - if (!sdkp->first_scan) return; - if (lim->zoned) + if (sdkp->device->type == TYPE_ZBC) sd_printk(KERN_NOTICE, sdkp, "Host-managed zoned block device\n"); else if (sdkp->zoned == 1) sd_printk(KERN_NOTICE, sdkp, "Host-aware SMR disk used as regular disk\n"); diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 8cc9c025017961..360ec980499529 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -601,6 +601,15 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, struct queue_limits *lim, if (sdkp->device->type != TYPE_ZBC) return 0; + lim->zoned = true; + + /* + * Per ZBC and ZAC specifications, writes in sequential write required + * zones of host-managed devices must be aligned to the device physical + * block size. + */ + lim->zone_write_granularity = sdkp->physical_block_size; + /* READ16/WRITE16/SYNC16 is mandatory for ZBC devices */ sdkp->device->use_16_for_rw = 1; sdkp->device->use_10_for_rw = 0; From c9055b44abe60da69aa4ee4fdcb78ee7fe733335 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:31 +0200 Subject: [PATCH 1129/1578] loop: stop using loop_reconfigure_limits in __loop_clr_fd __loop_clr_fd wants to clear all settings on the device. Prepare for moving more settings into the block limits by open coding loop_reconfigure_limits. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20240617060532.127975-5-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 93780f41646b75..fd671028fa8554 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1133,6 +1133,7 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, static void __loop_clr_fd(struct loop_device *lo, bool release) { + struct queue_limits lim; struct file *filp; gfp_t gfp = lo->old_gfp_mask; @@ -1156,7 +1157,14 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) lo->lo_offset = 0; lo->lo_sizelimit = 0; memset(lo->lo_file_name, 0, LO_NAME_SIZE); - loop_reconfigure_limits(lo, 512, false); + + /* reset the block size to the default */ + lim = queue_limits_start_update(lo->lo_queue); + lim.logical_block_size = SECTOR_SIZE; + lim.physical_block_size = SECTOR_SIZE; + lim.io_min = SECTOR_SIZE; + queue_limits_commit_update(lo->lo_queue, &lim); + invalidate_disk(lo->lo_disk); loop_sysfs_exit(lo); /* let user-space know about this change */ From ae0d40ff49642651f969883ef9fc79d69c1632d7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:32 +0200 Subject: [PATCH 1130/1578] loop: always update discard settings in loop_reconfigure_limits Simplify loop_reconfigure_limits by always updating the discard limits. This adds a little more work to loop_set_block_size, but doesn't change the outcome as the discard flag won't change. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20240617060532.127975-6-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index fd671028fa8554..ce197cbea5f434 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -975,8 +975,7 @@ loop_set_status_from_info(struct loop_device *lo, return 0; } -static int loop_reconfigure_limits(struct loop_device *lo, unsigned short bsize, - bool update_discard_settings) +static int loop_reconfigure_limits(struct loop_device *lo, unsigned short bsize) { struct queue_limits lim; @@ -984,8 +983,7 @@ static int loop_reconfigure_limits(struct loop_device *lo, unsigned short bsize, lim.logical_block_size = bsize; lim.physical_block_size = bsize; lim.io_min = bsize; - if (update_discard_settings) - loop_config_discard(lo, &lim); + loop_config_discard(lo, &lim); return queue_limits_commit_update(lo->lo_queue, &lim); } @@ -1086,7 +1084,7 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, else bsize = 512; - error = loop_reconfigure_limits(lo, bsize, true); + error = loop_reconfigure_limits(lo, bsize); if (WARN_ON_ONCE(error)) goto out_unlock; @@ -1496,7 +1494,7 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg) invalidate_bdev(lo->lo_device); blk_mq_freeze_queue(lo->lo_queue); - err = loop_reconfigure_limits(lo, arg, false); + err = loop_reconfigure_limits(lo, arg); loop_update_dio(lo); blk_mq_unfreeze_queue(lo->lo_queue); From a17ece76bcfe7b86327b19cae1652d7c62068a30 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:33 +0200 Subject: [PATCH 1131/1578] loop: regularize upgrading the block size for direct I/O The LOOP_CONFIGURE path automatically upgrades the block size to that of the underlying file for O_DIRECT file descriptors, but the LOOP_SET_BLOCK_SIZE path does not. Fix this by lifting the code to pick the block size into common code. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20240617060532.127975-7-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index ce197cbea5f434..eea3e4919e356e 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -975,10 +975,24 @@ loop_set_status_from_info(struct loop_device *lo, return 0; } +static unsigned short loop_default_blocksize(struct loop_device *lo, + struct block_device *backing_bdev) +{ + /* In case of direct I/O, match underlying block size */ + if ((lo->lo_backing_file->f_flags & O_DIRECT) && backing_bdev) + return bdev_logical_block_size(backing_bdev); + return SECTOR_SIZE; +} + static int loop_reconfigure_limits(struct loop_device *lo, unsigned short bsize) { + struct file *file = lo->lo_backing_file; + struct inode *inode = file->f_mapping->host; struct queue_limits lim; + if (!bsize) + bsize = loop_default_blocksize(lo, inode->i_sb->s_bdev); + lim = queue_limits_start_update(lo->lo_queue); lim.logical_block_size = bsize; lim.physical_block_size = bsize; @@ -997,7 +1011,6 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, int error; loff_t size; bool partscan; - unsigned short bsize; bool is_loop; if (!file) @@ -1076,15 +1089,7 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, if (!(lo->lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) blk_queue_write_cache(lo->lo_queue, true, false); - if (config->block_size) - bsize = config->block_size; - else if ((lo->lo_backing_file->f_flags & O_DIRECT) && inode->i_sb->s_bdev) - /* In case of direct I/O, match underlying block size */ - bsize = bdev_logical_block_size(inode->i_sb->s_bdev); - else - bsize = 512; - - error = loop_reconfigure_limits(lo, bsize); + error = loop_reconfigure_limits(lo, config->block_size); if (WARN_ON_ONCE(error)) goto out_unlock; From 4ce37fe0938b02b7b947029c40b72d76a22a3882 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:34 +0200 Subject: [PATCH 1132/1578] loop: also use the default block size from an underlying block device Fix the code in loop_reconfigure_limits to pick a default block size for O_DIRECT file descriptors to also work when the loop device sits on top of a block device and not just on a regular file on a block device based file system. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20240617060532.127975-8-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index eea3e4919e356e..6a4826708a3acf 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -988,10 +988,16 @@ static int loop_reconfigure_limits(struct loop_device *lo, unsigned short bsize) { struct file *file = lo->lo_backing_file; struct inode *inode = file->f_mapping->host; + struct block_device *backing_bdev = NULL; struct queue_limits lim; + if (S_ISBLK(inode->i_mode)) + backing_bdev = I_BDEV(inode); + else if (inode->i_sb->s_bdev) + backing_bdev = inode->i_sb->s_bdev; + if (!bsize) - bsize = loop_default_blocksize(lo, inode->i_sb->s_bdev); + bsize = loop_default_blocksize(lo, backing_bdev); lim = queue_limits_start_update(lo->lo_queue); lim.logical_block_size = bsize; From 97dd4a43d69b74a114be466d6887e257971adfe9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:35 +0200 Subject: [PATCH 1133/1578] loop: fold loop_update_rotational into loop_reconfigure_limits This prepares for moving the rotational flag into the queue_limits and also fixes it for the case where the loop device is backed by a block device. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20240617060532.127975-9-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 6a4826708a3acf..8991de8fb1bb0b 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -916,24 +916,6 @@ static void loop_free_idle_workers_timer(struct timer_list *timer) return loop_free_idle_workers(lo, false); } -static void loop_update_rotational(struct loop_device *lo) -{ - struct file *file = lo->lo_backing_file; - struct inode *file_inode = file->f_mapping->host; - struct block_device *file_bdev = file_inode->i_sb->s_bdev; - struct request_queue *q = lo->lo_queue; - bool nonrot = true; - - /* not all filesystems (e.g. tmpfs) have a sb->s_bdev */ - if (file_bdev) - nonrot = bdev_nonrot(file_bdev); - - if (nonrot) - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - else - blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); -} - /** * loop_set_status_from_info - configure device from loop_info * @lo: struct loop_device to configure @@ -1003,6 +985,10 @@ static int loop_reconfigure_limits(struct loop_device *lo, unsigned short bsize) lim.logical_block_size = bsize; lim.physical_block_size = bsize; lim.io_min = bsize; + if (!backing_bdev || bdev_nonrot(backing_bdev)) + blk_queue_flag_set(QUEUE_FLAG_NONROT, lo->lo_queue); + else + blk_queue_flag_clear(QUEUE_FLAG_NONROT, lo->lo_queue); loop_config_discard(lo, &lim); return queue_limits_commit_update(lo->lo_queue, &lim); } @@ -1099,7 +1085,6 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, if (WARN_ON_ONCE(error)) goto out_unlock; - loop_update_rotational(lo); loop_update_dio(lo); loop_sysfs_init(lo); From bbe5c84122b35c37f2706872fe34da66f0854b56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:36 +0200 Subject: [PATCH 1134/1578] virtio_blk: remove virtblk_update_cache_mode virtblk_update_cache_mode boils down to a single call to blk_queue_write_cache. Remove it in preparation for moving the cache control flags into the queue_limits. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Stefan Hajnoczi Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20240617060532.127975-10-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 2351f411fa4680..378b241911ca87 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -1089,14 +1089,6 @@ static int virtblk_get_cache_mode(struct virtio_device *vdev) return writeback; } -static void virtblk_update_cache_mode(struct virtio_device *vdev) -{ - u8 writeback = virtblk_get_cache_mode(vdev); - struct virtio_blk *vblk = vdev->priv; - - blk_queue_write_cache(vblk->disk->queue, writeback, false); -} - static const char *const virtblk_cache_types[] = { "write through", "write back" }; @@ -1116,7 +1108,7 @@ cache_type_store(struct device *dev, struct device_attribute *attr, return i; virtio_cwrite8(vdev, offsetof(struct virtio_blk_config, wce), i); - virtblk_update_cache_mode(vdev); + blk_queue_write_cache(disk->queue, virtblk_get_cache_mode(vdev), false); return count; } @@ -1528,7 +1520,8 @@ static int virtblk_probe(struct virtio_device *vdev) vblk->index = index; /* configure queue flush support */ - virtblk_update_cache_mode(vdev); + blk_queue_write_cache(vblk->disk->queue, virtblk_get_cache_mode(vdev), + false); /* If disk is read-only in the host, the guest should obey */ if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO)) From 6b377787a306253111404325aee98005b361e59a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:37 +0200 Subject: [PATCH 1135/1578] nbd: move setting the cache control flags to __nbd_set_size Move setting the cache control flags in nbd in preparation for moving these flags into the queue_limits structure. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Josef Bacik Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-11-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index ad887d614d5b3f..44b8c671921e5c 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -342,6 +342,12 @@ static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize, lim.max_hw_discard_sectors = UINT_MAX; else lim.max_hw_discard_sectors = 0; + if (!(nbd->config->flags & NBD_FLAG_SEND_FLUSH)) + blk_queue_write_cache(nbd->disk->queue, false, false); + else if (nbd->config->flags & NBD_FLAG_SEND_FUA) + blk_queue_write_cache(nbd->disk->queue, true, true); + else + blk_queue_write_cache(nbd->disk->queue, true, false); lim.logical_block_size = blksize; lim.physical_block_size = blksize; error = queue_limits_commit_update(nbd->disk->queue, &lim); @@ -1286,19 +1292,10 @@ static void nbd_bdev_reset(struct nbd_device *nbd) static void nbd_parse_flags(struct nbd_device *nbd) { - struct nbd_config *config = nbd->config; - if (config->flags & NBD_FLAG_READ_ONLY) + if (nbd->config->flags & NBD_FLAG_READ_ONLY) set_disk_ro(nbd->disk, true); else set_disk_ro(nbd->disk, false); - if (config->flags & NBD_FLAG_SEND_FLUSH) { - if (config->flags & NBD_FLAG_SEND_FUA) - blk_queue_write_cache(nbd->disk->queue, true, true); - else - blk_queue_write_cache(nbd->disk->queue, true, false); - } - else - blk_queue_write_cache(nbd->disk->queue, false, false); } static void send_disconnects(struct nbd_device *nbd) From af2814149883e2c1851866ea2afcd8eadc040f79 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:38 +0200 Subject: [PATCH 1136/1578] block: freeze the queue in queue_attr_store queue_attr_store updates attributes used to control generating I/O, and can cause malformed bios if changed with I/O in flight. Freeze the queue in common code instead of adding it to almost every attribute. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20240617060532.127975-12-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 5 +++-- block/blk-sysfs.c | 9 ++------- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 0d4cd39c3d25da..58b0d6c7cc34d6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4631,13 +4631,15 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) int ret; unsigned long i; + if (WARN_ON_ONCE(!q->mq_freeze_depth)) + return -EINVAL; + if (!set) return -EINVAL; if (q->nr_requests == nr) return 0; - blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); ret = 0; @@ -4671,7 +4673,6 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) } blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); return ret; } diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index f0f9314ab65c61..5c787965b7d09e 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -189,12 +189,9 @@ static ssize_t queue_discard_max_store(struct request_queue *q, if ((max_discard_bytes >> SECTOR_SHIFT) > UINT_MAX) return -EINVAL; - blk_mq_freeze_queue(q); lim = queue_limits_start_update(q); lim.max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT; err = queue_limits_commit_update(q, &lim); - blk_mq_unfreeze_queue(q); - if (err) return err; return ret; @@ -241,11 +238,9 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) if (ret < 0) return ret; - blk_mq_freeze_queue(q); lim = queue_limits_start_update(q); lim.max_user_sectors = max_sectors_kb << 1; err = queue_limits_commit_update(q, &lim); - blk_mq_unfreeze_queue(q); if (err) return err; return ret; @@ -585,13 +580,11 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, * ends up either enabling or disabling wbt completely. We can't * have IO inflight if that happens. */ - blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); wbt_set_min_lat(q, val); blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); return count; } @@ -722,9 +715,11 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, if (!entry->store) return -EIO; + blk_mq_freeze_queue(q); mutex_lock(&q->sysfs_lock); res = entry->store(q, page, length); mutex_unlock(&q->sysfs_lock); + blk_mq_unfreeze_queue(q); return res; } From 70905f8706b62113ae32c8df721384ff6ffb6c6a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:39 +0200 Subject: [PATCH 1137/1578] block: remove blk_flush_policy Fold blk_flush_policy into the only caller to prepare for pending changes to it. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20240617060532.127975-13-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-flush.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index c17cf8ed8113db..2234f8b3fc05f2 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -100,23 +100,6 @@ blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx) return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq; } -static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq) -{ - unsigned int policy = 0; - - if (blk_rq_sectors(rq)) - policy |= REQ_FSEQ_DATA; - - if (fflags & (1UL << QUEUE_FLAG_WC)) { - if (rq->cmd_flags & REQ_PREFLUSH) - policy |= REQ_FSEQ_PREFLUSH; - if (!(fflags & (1UL << QUEUE_FLAG_FUA)) && - (rq->cmd_flags & REQ_FUA)) - policy |= REQ_FSEQ_POSTFLUSH; - } - return policy; -} - static unsigned int blk_flush_cur_seq(struct request *rq) { return 1 << ffz(rq->flush.seq); @@ -399,12 +382,26 @@ bool blk_insert_flush(struct request *rq) { struct request_queue *q = rq->q; unsigned long fflags = q->queue_flags; /* may change, cache */ - unsigned int policy = blk_flush_policy(fflags, rq); struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); + unsigned int policy = 0; /* FLUSH/FUA request must never be merged */ WARN_ON_ONCE(rq->bio != rq->biotail); + if (blk_rq_sectors(rq)) + policy |= REQ_FSEQ_DATA; + + /* + * Check which flushes we need to sequence for this operation. + */ + if (fflags & (1UL << QUEUE_FLAG_WC)) { + if (rq->cmd_flags & REQ_PREFLUSH) + policy |= REQ_FSEQ_PREFLUSH; + if (!(fflags & (1UL << QUEUE_FLAG_FUA)) && + (rq->cmd_flags & REQ_FUA)) + policy |= REQ_FSEQ_POSTFLUSH; + } + /* * @policy now records what operations need to be done. Adjust * REQ_PREFLUSH and FUA for the driver. From 1122c0c1cc71f740fa4d5f14f239194e06a1d5e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:40 +0200 Subject: [PATCH 1138/1578] block: move cache control settings out of queue->flags Move the cache control settings into the queue_limits so that the flags can be set atomically with the device queue frozen. Add new features and flags field for the driver set flags, and internal (usually sysfs-controlled) flags in the block layer. Note that we'll eventually remove enough field from queue_limits to bring it back to the previous size. The disable flag is inverted compared to the previous meaning, which means it now survives a rescan, similar to the max_sectors and max_discard_sectors user limits. The FLUSH and FUA flags are now inherited by blk_stack_limits, which simplified the code in dm a lot, but also causes a slight behavior change in that dm-switch and dm-unstripe now advertise a write cache despite setting num_flush_bios to 0. The I/O path will handle this gracefully, but as far as I can tell the lack of num_flush_bios and thus flush support is a pre-existing data integrity bug in those targets that really needs fixing, after which a non-zero num_flush_bios should be required in dm for targets that map to underlying devices. Signed-off-by: Christoph Hellwig Acked-by: Ulf Hansson Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-14-hch@lst.de Signed-off-by: Jens Axboe --- .../block/writeback_cache_control.rst | 67 +++++++++++-------- arch/um/drivers/ubd_kern.c | 2 +- block/blk-core.c | 2 +- block/blk-flush.c | 9 ++- block/blk-mq-debugfs.c | 2 - block/blk-settings.c | 29 ++------ block/blk-sysfs.c | 29 +++++--- block/blk-wbt.c | 4 +- drivers/block/drbd/drbd_main.c | 2 +- drivers/block/loop.c | 9 +-- drivers/block/nbd.c | 14 ++-- drivers/block/null_blk/main.c | 12 ++-- drivers/block/ps3disk.c | 7 +- drivers/block/rnbd/rnbd-clt.c | 10 +-- drivers/block/ublk_drv.c | 8 ++- drivers/block/virtio_blk.c | 20 ++++-- drivers/block/xen-blkfront.c | 8 ++- drivers/md/bcache/super.c | 7 +- drivers/md/dm-table.c | 39 +++-------- drivers/md/md.c | 8 ++- drivers/mmc/core/block.c | 42 ++++++------ drivers/mmc/core/queue.c | 12 ++-- drivers/mmc/core/queue.h | 3 +- drivers/mtd/mtd_blkdevs.c | 5 +- drivers/nvdimm/pmem.c | 4 +- drivers/nvme/host/core.c | 7 +- drivers/nvme/host/multipath.c | 6 -- drivers/scsi/sd.c | 28 +++++--- include/linux/blkdev.h | 38 +++++++++-- 29 files changed, 227 insertions(+), 206 deletions(-) diff --git a/Documentation/block/writeback_cache_control.rst b/Documentation/block/writeback_cache_control.rst index b208488d0aae85..c575e08beda8e3 100644 --- a/Documentation/block/writeback_cache_control.rst +++ b/Documentation/block/writeback_cache_control.rst @@ -46,41 +46,50 @@ worry if the underlying devices need any explicit cache flushing and how the Forced Unit Access is implemented. The REQ_PREFLUSH and REQ_FUA flags may both be set on a single bio. +Feature settings for block drivers +---------------------------------- -Implementation details for bio based block drivers --------------------------------------------------------------- +For devices that do not support volatile write caches there is no driver +support required, the block layer completes empty REQ_PREFLUSH requests before +entering the driver and strips off the REQ_PREFLUSH and REQ_FUA bits from +requests that have a payload. -These drivers will always see the REQ_PREFLUSH and REQ_FUA bits as they sit -directly below the submit_bio interface. For remapping drivers the REQ_FUA -bits need to be propagated to underlying devices, and a global flush needs -to be implemented for bios with the REQ_PREFLUSH bit set. For real device -drivers that do not have a volatile cache the REQ_PREFLUSH and REQ_FUA bits -on non-empty bios can simply be ignored, and REQ_PREFLUSH requests without -data can be completed successfully without doing any work. Drivers for -devices with volatile caches need to implement the support for these -flags themselves without any help from the block layer. +For devices with volatile write caches the driver needs to tell the block layer +that it supports flushing caches by setting the + BLK_FEAT_WRITE_CACHE -Implementation details for request_fn based block drivers ---------------------------------------------------------- +flag in the queue_limits feature field. For devices that also support the FUA +bit the block layer needs to be told to pass on the REQ_FUA bit by also setting +the -For devices that do not support volatile write caches there is no driver -support required, the block layer completes empty REQ_PREFLUSH requests before -entering the driver and strips off the REQ_PREFLUSH and REQ_FUA bits from -requests that have a payload. For devices with volatile write caches the -driver needs to tell the block layer that it supports flushing caches by -doing:: + BLK_FEAT_FUA + +flag in the features field of the queue_limits structure. + +Implementation details for bio based block drivers +-------------------------------------------------- + +For bio based drivers the REQ_PREFLUSH and REQ_FUA bit are simplify passed on +to the driver if the drivers sets the BLK_FEAT_WRITE_CACHE flag and the drivers +needs to handle them. + +*NOTE*: The REQ_FUA bit also gets passed on when the BLK_FEAT_FUA flags is +_not_ set. Any bio based driver that sets BLK_FEAT_WRITE_CACHE also needs to +handle REQ_FUA. - blk_queue_write_cache(sdkp->disk->queue, true, false); +For remapping drivers the REQ_FUA bits need to be propagated to underlying +devices, and a global flush needs to be implemented for bios with the +REQ_PREFLUSH bit set. -and handle empty REQ_OP_FLUSH requests in its prep_fn/request_fn. Note that -REQ_PREFLUSH requests with a payload are automatically turned into a sequence -of an empty REQ_OP_FLUSH request followed by the actual write by the block -layer. For devices that also support the FUA bit the block layer needs -to be told to pass through the REQ_FUA bit using:: +Implementation details for blk-mq drivers +----------------------------------------- - blk_queue_write_cache(sdkp->disk->queue, true, true); +When the BLK_FEAT_WRITE_CACHE flag is set, REQ_OP_WRITE | REQ_PREFLUSH requests +with a payload are automatically turned into a sequence of a REQ_OP_FLUSH +request followed by the actual write by the block layer. -and the driver must handle write requests that have the REQ_FUA bit set -in prep_fn/request_fn. If the FUA bit is not natively supported the block -layer turns it into an empty REQ_OP_FLUSH request after the actual write. +When the BLK_FEAT_FUA flags is set, the REQ_FUA bit simplify passed on for the +REQ_OP_WRITE request, else a REQ_OP_FLUSH request is sent by the block layer +after the completion of the write request for bio submissions with the REQ_FUA +bit set. diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index cdcb75a68989dd..19e01691ea0ea7 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -835,6 +835,7 @@ static int ubd_add(int n, char **error_out) struct queue_limits lim = { .max_segments = MAX_SG, .seg_boundary_mask = PAGE_SIZE - 1, + .features = BLK_FEAT_WRITE_CACHE, }; struct gendisk *disk; int err = 0; @@ -882,7 +883,6 @@ static int ubd_add(int n, char **error_out) } blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); - blk_queue_write_cache(disk->queue, true, false); disk->major = UBD_MAJOR; disk->first_minor = n << UBD_SHIFT; disk->minors = 1 << UBD_SHIFT; diff --git a/block/blk-core.c b/block/blk-core.c index 82c3ae22d76d88..2b45a4df9a1aa1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -782,7 +782,7 @@ void submit_bio_noacct(struct bio *bio) if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE && bio_op(bio) != REQ_OP_ZONE_APPEND)) goto end_io; - if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { + if (!bdev_write_cache(bdev)) { bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); if (!bio_sectors(bio)) { status = BLK_STS_OK; diff --git a/block/blk-flush.c b/block/blk-flush.c index 2234f8b3fc05f2..30b9d5033a2b85 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -381,8 +381,8 @@ static void blk_rq_init_flush(struct request *rq) bool blk_insert_flush(struct request *rq) { struct request_queue *q = rq->q; - unsigned long fflags = q->queue_flags; /* may change, cache */ struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); + bool supports_fua = q->limits.features & BLK_FEAT_FUA; unsigned int policy = 0; /* FLUSH/FUA request must never be merged */ @@ -394,11 +394,10 @@ bool blk_insert_flush(struct request *rq) /* * Check which flushes we need to sequence for this operation. */ - if (fflags & (1UL << QUEUE_FLAG_WC)) { + if (blk_queue_write_cache(q)) { if (rq->cmd_flags & REQ_PREFLUSH) policy |= REQ_FSEQ_PREFLUSH; - if (!(fflags & (1UL << QUEUE_FLAG_FUA)) && - (rq->cmd_flags & REQ_FUA)) + if ((rq->cmd_flags & REQ_FUA) && !supports_fua) policy |= REQ_FSEQ_POSTFLUSH; } @@ -407,7 +406,7 @@ bool blk_insert_flush(struct request *rq) * REQ_PREFLUSH and FUA for the driver. */ rq->cmd_flags &= ~REQ_PREFLUSH; - if (!(fflags & (1UL << QUEUE_FLAG_FUA))) + if (!supports_fua) rq->cmd_flags &= ~REQ_FUA; /* diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 770c0c2b72faaa..e8b9db7c30c455 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -93,8 +93,6 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(INIT_DONE), QUEUE_FLAG_NAME(STABLE_WRITES), QUEUE_FLAG_NAME(POLL), - QUEUE_FLAG_NAME(WC), - QUEUE_FLAG_NAME(FUA), QUEUE_FLAG_NAME(DAX), QUEUE_FLAG_NAME(STATS), QUEUE_FLAG_NAME(REGISTERED), diff --git a/block/blk-settings.c b/block/blk-settings.c index f11c8676eb4c67..536ee202fcdccb 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -261,6 +261,9 @@ static int blk_validate_limits(struct queue_limits *lim) lim->misaligned = 0; } + if (!(lim->features & BLK_FEAT_WRITE_CACHE)) + lim->features &= ~BLK_FEAT_FUA; + err = blk_validate_integrity_limits(lim); if (err) return err; @@ -454,6 +457,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, { unsigned int top, bottom, alignment, ret = 0; + t->features |= (b->features & BLK_FEAT_INHERIT_MASK); + t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_user_sectors = min_not_zero(t->max_user_sectors, b->max_user_sectors); @@ -711,30 +716,6 @@ void blk_set_queue_depth(struct request_queue *q, unsigned int depth) } EXPORT_SYMBOL(blk_set_queue_depth); -/** - * blk_queue_write_cache - configure queue's write cache - * @q: the request queue for the device - * @wc: write back cache on or off - * @fua: device supports FUA writes, if true - * - * Tell the block layer about the write cache of @q. - */ -void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) -{ - if (wc) { - blk_queue_flag_set(QUEUE_FLAG_HW_WC, q); - blk_queue_flag_set(QUEUE_FLAG_WC, q); - } else { - blk_queue_flag_clear(QUEUE_FLAG_HW_WC, q); - blk_queue_flag_clear(QUEUE_FLAG_WC, q); - } - if (fua) - blk_queue_flag_set(QUEUE_FLAG_FUA, q); - else - blk_queue_flag_clear(QUEUE_FLAG_FUA, q); -} -EXPORT_SYMBOL_GPL(blk_queue_write_cache); - int bdev_alignment_offset(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 5c787965b7d09e..4f524c1d5e08bd 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -423,32 +423,41 @@ static ssize_t queue_io_timeout_store(struct request_queue *q, const char *page, static ssize_t queue_wc_show(struct request_queue *q, char *page) { - if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) - return sprintf(page, "write back\n"); - - return sprintf(page, "write through\n"); + if (q->limits.features & BLK_FLAGS_WRITE_CACHE_DISABLED) + return sprintf(page, "write through\n"); + return sprintf(page, "write back\n"); } static ssize_t queue_wc_store(struct request_queue *q, const char *page, size_t count) { + struct queue_limits lim; + bool disable; + int err; + if (!strncmp(page, "write back", 10)) { - if (!test_bit(QUEUE_FLAG_HW_WC, &q->queue_flags)) - return -EINVAL; - blk_queue_flag_set(QUEUE_FLAG_WC, q); + disable = false; } else if (!strncmp(page, "write through", 13) || - !strncmp(page, "none", 4)) { - blk_queue_flag_clear(QUEUE_FLAG_WC, q); + !strncmp(page, "none", 4)) { + disable = true; } else { return -EINVAL; } + lim = queue_limits_start_update(q); + if (disable) + lim.flags |= BLK_FLAGS_WRITE_CACHE_DISABLED; + else + lim.flags &= ~BLK_FLAGS_WRITE_CACHE_DISABLED; + err = queue_limits_commit_update(q, &lim); + if (err) + return err; return count; } static ssize_t queue_fua_show(struct request_queue *q, char *page) { - return sprintf(page, "%u\n", test_bit(QUEUE_FLAG_FUA, &q->queue_flags)); + return sprintf(page, "%u\n", !!(q->limits.features & BLK_FEAT_FUA)); } static ssize_t queue_dax_show(struct request_queue *q, char *page) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 64472134dd26df..1a5e4b049ecd1d 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -206,8 +206,8 @@ static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw, */ if (wb_acct & WBT_DISCARD) limit = rwb->wb_background; - else if (test_bit(QUEUE_FLAG_WC, &rwb->rqos.disk->queue->queue_flags) && - !wb_recent_wait(rwb)) + else if (blk_queue_write_cache(rwb->rqos.disk->queue) && + !wb_recent_wait(rwb)) limit = 0; else limit = rwb->wb_normal; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 113b441d4d3670..bf42a46781fa21 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2697,6 +2697,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig * connect. */ .max_hw_sectors = DRBD_MAX_BIO_SIZE_SAFE >> 8, + .features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA, }; device = minor_to_device(minor); @@ -2736,7 +2737,6 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig disk->private_data = device; blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); - blk_queue_write_cache(disk->queue, true, true); device->md_io.page = alloc_page(GFP_KERNEL); if (!device->md_io.page) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 8991de8fb1bb0b..08d0fc7f17b701 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -985,6 +985,9 @@ static int loop_reconfigure_limits(struct loop_device *lo, unsigned short bsize) lim.logical_block_size = bsize; lim.physical_block_size = bsize; lim.io_min = bsize; + lim.features &= ~BLK_FEAT_WRITE_CACHE; + if (file->f_op->fsync && !(lo->lo_flags & LO_FLAGS_READ_ONLY)) + lim.features |= BLK_FEAT_WRITE_CACHE; if (!backing_bdev || bdev_nonrot(backing_bdev)) blk_queue_flag_set(QUEUE_FLAG_NONROT, lo->lo_queue); else @@ -1078,9 +1081,6 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, lo->old_gfp_mask = mapping_gfp_mask(mapping); mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); - if (!(lo->lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) - blk_queue_write_cache(lo->lo_queue, true, false); - error = loop_reconfigure_limits(lo, config->block_size); if (WARN_ON_ONCE(error)) goto out_unlock; @@ -1131,9 +1131,6 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) struct file *filp; gfp_t gfp = lo->old_gfp_mask; - if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags)) - blk_queue_write_cache(lo->lo_queue, false, false); - /* * Freeze the request queue when unbinding on a live file descriptor and * thus an open device. When called from ->release we are guaranteed diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 44b8c671921e5c..cb1c86a6a3fb9d 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -342,12 +342,14 @@ static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize, lim.max_hw_discard_sectors = UINT_MAX; else lim.max_hw_discard_sectors = 0; - if (!(nbd->config->flags & NBD_FLAG_SEND_FLUSH)) - blk_queue_write_cache(nbd->disk->queue, false, false); - else if (nbd->config->flags & NBD_FLAG_SEND_FUA) - blk_queue_write_cache(nbd->disk->queue, true, true); - else - blk_queue_write_cache(nbd->disk->queue, true, false); + if (!(nbd->config->flags & NBD_FLAG_SEND_FLUSH)) { + lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA); + } else if (nbd->config->flags & NBD_FLAG_SEND_FUA) { + lim.features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA; + } else { + lim.features |= BLK_FEAT_WRITE_CACHE; + lim.features &= ~BLK_FEAT_FUA; + } lim.logical_block_size = blksize; lim.physical_block_size = blksize; error = queue_limits_commit_update(nbd->disk->queue, &lim); diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 75f189e42f885d..21f9d256e88402 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1928,6 +1928,13 @@ static int null_add_dev(struct nullb_device *dev) goto out_cleanup_tags; } + if (dev->cache_size > 0) { + set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); + lim.features |= BLK_FEAT_WRITE_CACHE; + if (dev->fua) + lim.features |= BLK_FEAT_FUA; + } + nullb->disk = blk_mq_alloc_disk(nullb->tag_set, &lim, nullb); if (IS_ERR(nullb->disk)) { rv = PTR_ERR(nullb->disk); @@ -1940,11 +1947,6 @@ static int null_add_dev(struct nullb_device *dev) nullb_setup_bwtimer(nullb); } - if (dev->cache_size > 0) { - set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); - blk_queue_write_cache(nullb->q, true, dev->fua); - } - nullb->q->queuedata = nullb; blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q); diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index b810ac0a5c4b97..8b73cf459b5937 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -388,9 +388,8 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) .max_segments = -1, .max_segment_size = dev->bounce_size, .dma_alignment = dev->blk_size - 1, + .features = BLK_FEAT_WRITE_CACHE, }; - - struct request_queue *queue; struct gendisk *gendisk; if (dev->blk_size < 512) { @@ -447,10 +446,6 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) goto fail_free_tag_set; } - queue = gendisk->queue; - - blk_queue_write_cache(queue, true, false); - priv->gendisk = gendisk; gendisk->major = ps3disk_major; gendisk->first_minor = devidx * PS3DISK_MINORS; diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index b7ffe03c61606d..02c4b173182719 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1389,6 +1389,12 @@ static int rnbd_client_setup_device(struct rnbd_clt_dev *dev, le32_to_cpu(rsp->max_discard_sectors); } + if (rsp->cache_policy & RNBD_WRITEBACK) { + lim.features |= BLK_FEAT_WRITE_CACHE; + if (rsp->cache_policy & RNBD_FUA) + lim.features |= BLK_FEAT_FUA; + } + dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, &lim, dev); if (IS_ERR(dev->gd)) return PTR_ERR(dev->gd); @@ -1397,10 +1403,6 @@ static int rnbd_client_setup_device(struct rnbd_clt_dev *dev, blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue); blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue); - blk_queue_write_cache(dev->queue, - !!(rsp->cache_policy & RNBD_WRITEBACK), - !!(rsp->cache_policy & RNBD_FUA)); - return rnbd_clt_setup_gen_disk(dev, rsp, idx); } diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 4e159948c912c2..e45c65c1848d31 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -487,8 +487,6 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub) struct request_queue *q = ub->ub_disk->queue; const struct ublk_param_basic *p = &ub->params.basic; - blk_queue_write_cache(q, p->attrs & UBLK_ATTR_VOLATILE_CACHE, - p->attrs & UBLK_ATTR_FUA); if (p->attrs & UBLK_ATTR_ROTATIONAL) blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); else @@ -2210,6 +2208,12 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) lim.max_zone_append_sectors = p->max_zone_append_sectors; } + if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) { + lim.features |= BLK_FEAT_WRITE_CACHE; + if (ub->params.basic.attrs & UBLK_ATTR_FUA) + lim.features |= BLK_FEAT_FUA; + } + if (wait_for_completion_interruptible(&ub->completion) != 0) return -EINTR; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 378b241911ca87..b1a3c293528519 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -1100,6 +1100,7 @@ cache_type_store(struct device *dev, struct device_attribute *attr, struct gendisk *disk = dev_to_disk(dev); struct virtio_blk *vblk = disk->private_data; struct virtio_device *vdev = vblk->vdev; + struct queue_limits lim; int i; BUG_ON(!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_CONFIG_WCE)); @@ -1108,7 +1109,17 @@ cache_type_store(struct device *dev, struct device_attribute *attr, return i; virtio_cwrite8(vdev, offsetof(struct virtio_blk_config, wce), i); - blk_queue_write_cache(disk->queue, virtblk_get_cache_mode(vdev), false); + + lim = queue_limits_start_update(disk->queue); + if (virtblk_get_cache_mode(vdev)) + lim.features |= BLK_FEAT_WRITE_CACHE; + else + lim.features &= ~BLK_FEAT_WRITE_CACHE; + blk_mq_freeze_queue(disk->queue); + i = queue_limits_commit_update(disk->queue, &lim); + blk_mq_unfreeze_queue(disk->queue); + if (i) + return i; return count; } @@ -1504,6 +1515,9 @@ static int virtblk_probe(struct virtio_device *vdev) if (err) goto out_free_tags; + if (virtblk_get_cache_mode(vdev)) + lim.features |= BLK_FEAT_WRITE_CACHE; + vblk->disk = blk_mq_alloc_disk(&vblk->tag_set, &lim, vblk); if (IS_ERR(vblk->disk)) { err = PTR_ERR(vblk->disk); @@ -1519,10 +1533,6 @@ static int virtblk_probe(struct virtio_device *vdev) vblk->disk->fops = &virtblk_fops; vblk->index = index; - /* configure queue flush support */ - blk_queue_write_cache(vblk->disk->queue, virtblk_get_cache_mode(vdev), - false); - /* If disk is read-only in the host, the guest should obey */ if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO)) set_disk_ro(vblk->disk, 1); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 851b03844edd13..9aafce3e5987bf 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -959,6 +959,12 @@ static void blkif_set_queue_limits(const struct blkfront_info *info, lim->max_secure_erase_sectors = UINT_MAX; } + if (info->feature_flush) { + lim->features |= BLK_FEAT_WRITE_CACHE; + if (info->feature_fua) + lim->features |= BLK_FEAT_FUA; + } + /* Hard sector size and max sectors impersonate the equiv. hardware. */ lim->logical_block_size = info->sector_size; lim->physical_block_size = info->physical_sector_size; @@ -987,8 +993,6 @@ static const char *flush_info(struct blkfront_info *info) static void xlvbd_flush(struct blkfront_info *info) { - blk_queue_write_cache(info->rq, info->feature_flush ? true : false, - info->feature_fua ? true : false); pr_info("blkfront: %s: %s %s %s %s %s %s %s\n", info->gd->disk_name, flush_info(info), "persistent grants:", info->feature_persistent ? diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 4d11fc664cb0b8..cb6595c8b5514e 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -897,7 +897,6 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, sector_t sectors, struct block_device *cached_bdev, const struct block_device_operations *ops) { - struct request_queue *q; const size_t max_stripes = min_t(size_t, INT_MAX, SIZE_MAX / sizeof(atomic_t)); struct queue_limits lim = { @@ -909,6 +908,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, .io_min = block_size, .logical_block_size = block_size, .physical_block_size = block_size, + .features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA, }; uint64_t n; int idx; @@ -975,12 +975,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, d->disk->fops = ops; d->disk->private_data = d; - q = d->disk->queue; - blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue); - - blk_queue_write_cache(q, true, true); - return 0; out_bioset_exit: diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index fd789eeb62d943..03abdae646829c 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1686,34 +1686,16 @@ int dm_calculate_queue_limits(struct dm_table *t, return validate_hardware_logical_block_alignment(t, limits); } -static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) -{ - unsigned long flush = (unsigned long) data; - struct request_queue *q = bdev_get_queue(dev->bdev); - - return (q->queue_flags & flush); -} - -static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) +/* + * Check if a target requires flush support even if none of the underlying + * devices need it (e.g. to persist target-specific metadata). + */ +static bool dm_table_supports_flush(struct dm_table *t) { - /* - * Require at least one underlying device to support flushes. - * t->devices includes internal dm devices such as mirror logs - * so we need to use iterate_devices here, which targets - * supporting flushes must provide. - */ for (unsigned int i = 0; i < t->num_targets; i++) { struct dm_target *ti = dm_table_get_target(t, i); - if (!ti->num_flush_bios) - continue; - - if (ti->flush_supported) - return true; - - if (ti->type->iterate_devices && - ti->type->iterate_devices(ti, device_flush_capable, (void *) flush)) + if (ti->num_flush_bios && ti->flush_supported) return true; } @@ -1855,7 +1837,6 @@ static int device_requires_stable_pages(struct dm_target *ti, int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *limits) { - bool wc = false, fua = false; int r; if (dm_table_supports_nowait(t)) @@ -1876,12 +1857,8 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (!dm_table_supports_secure_erase(t)) limits->max_secure_erase_sectors = 0; - if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) { - wc = true; - if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_FUA))) - fua = true; - } - blk_queue_write_cache(q, wc, fua); + if (dm_table_supports_flush(t)) + limits->features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA; if (dm_table_supports_dax(t, device_not_dax_capable)) { blk_queue_flag_set(QUEUE_FLAG_DAX, q); diff --git a/drivers/md/md.c b/drivers/md/md.c index 67ece2cd725f50..2f4c5d1755d857 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5785,7 +5785,10 @@ struct mddev *md_alloc(dev_t dev, char *name) int partitioned; int shift; int unit; - int error ; + int error; + struct queue_limits lim = { + .features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA, + }; /* * Wait for any previous instance of this device to be completely @@ -5825,7 +5828,7 @@ struct mddev *md_alloc(dev_t dev, char *name) */ mddev->hold_active = UNTIL_STOP; - disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + disk = blk_alloc_disk(&lim, NUMA_NO_NODE); if (IS_ERR(disk)) { error = PTR_ERR(disk); goto out_free_mddev; @@ -5843,7 +5846,6 @@ struct mddev *md_alloc(dev_t dev, char *name) disk->fops = &md_fops; disk->private_data = mddev; - blk_queue_write_cache(disk->queue, true, true); disk->events |= DISK_EVENT_MEDIA_CHANGE; mddev->gendisk = disk; error = add_disk(disk); diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 367509b5b6466c..2c9963248fcbd6 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -2466,8 +2466,7 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, struct mmc_blk_data *md; int devidx, ret; char cap_str[10]; - bool cache_enabled = false; - bool fua_enabled = false; + unsigned int features = 0; devidx = ida_alloc_max(&mmc_blk_ida, max_devices - 1, GFP_KERNEL); if (devidx < 0) { @@ -2499,7 +2498,24 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, */ md->read_only = mmc_blk_readonly(card); - md->disk = mmc_init_queue(&md->queue, card); + if (mmc_host_cmd23(card->host)) { + if ((mmc_card_mmc(card) && + card->csd.mmca_vsn >= CSD_SPEC_VER_3) || + (mmc_card_sd(card) && + card->scr.cmds & SD_SCR_CMD23_SUPPORT)) + md->flags |= MMC_BLK_CMD23; + } + + if (md->flags & MMC_BLK_CMD23 && + ((card->ext_csd.rel_param & EXT_CSD_WR_REL_PARAM_EN) || + card->ext_csd.rel_sectors)) { + md->flags |= MMC_BLK_REL_WR; + features |= (BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA); + } else if (mmc_cache_enabled(card->host)) { + features |= BLK_FEAT_WRITE_CACHE; + } + + md->disk = mmc_init_queue(&md->queue, card, features); if (IS_ERR(md->disk)) { ret = PTR_ERR(md->disk); goto err_kfree; @@ -2539,26 +2555,6 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, set_capacity(md->disk, size); - if (mmc_host_cmd23(card->host)) { - if ((mmc_card_mmc(card) && - card->csd.mmca_vsn >= CSD_SPEC_VER_3) || - (mmc_card_sd(card) && - card->scr.cmds & SD_SCR_CMD23_SUPPORT)) - md->flags |= MMC_BLK_CMD23; - } - - if (md->flags & MMC_BLK_CMD23 && - ((card->ext_csd.rel_param & EXT_CSD_WR_REL_PARAM_EN) || - card->ext_csd.rel_sectors)) { - md->flags |= MMC_BLK_REL_WR; - fua_enabled = true; - cache_enabled = true; - } - if (mmc_cache_enabled(card->host)) - cache_enabled = true; - - blk_queue_write_cache(md->queue.queue, cache_enabled, fua_enabled); - string_get_size((u64)size, 512, STRING_UNITS_2, cap_str, sizeof(cap_str)); pr_info("%s: %s %s %s%s\n", diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index 241cdc2b2a2a3b..97ff993d31570c 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -344,10 +344,12 @@ static const struct blk_mq_ops mmc_mq_ops = { }; static struct gendisk *mmc_alloc_disk(struct mmc_queue *mq, - struct mmc_card *card) + struct mmc_card *card, unsigned int features) { struct mmc_host *host = card->host; - struct queue_limits lim = { }; + struct queue_limits lim = { + .features = features, + }; struct gendisk *disk; if (mmc_can_erase(card)) @@ -413,10 +415,12 @@ static inline bool mmc_merge_capable(struct mmc_host *host) * mmc_init_queue - initialise a queue structure. * @mq: mmc queue * @card: mmc card to attach this queue + * @features: block layer features (BLK_FEAT_*) * * Initialise a MMC card request queue. */ -struct gendisk *mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card) +struct gendisk *mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, + unsigned int features) { struct mmc_host *host = card->host; struct gendisk *disk; @@ -460,7 +464,7 @@ struct gendisk *mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card) return ERR_PTR(ret); - disk = mmc_alloc_disk(mq, card); + disk = mmc_alloc_disk(mq, card, features); if (IS_ERR(disk)) blk_mq_free_tag_set(&mq->tag_set); return disk; diff --git a/drivers/mmc/core/queue.h b/drivers/mmc/core/queue.h index 9ade3bcbb714e4..1498840a4ea008 100644 --- a/drivers/mmc/core/queue.h +++ b/drivers/mmc/core/queue.h @@ -94,7 +94,8 @@ struct mmc_queue { struct work_struct complete_work; }; -struct gendisk *mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card); +struct gendisk *mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, + unsigned int features); extern void mmc_cleanup_queue(struct mmc_queue *); extern void mmc_queue_suspend(struct mmc_queue *); extern void mmc_queue_resume(struct mmc_queue *); diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 3caa0717d46c01..1b9f57f231e8be 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -336,6 +336,8 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) lim.logical_block_size = tr->blksize; if (tr->discard) lim.max_hw_discard_sectors = UINT_MAX; + if (tr->flush) + lim.features |= BLK_FEAT_WRITE_CACHE; /* Create gendisk */ gd = blk_mq_alloc_disk(new->tag_set, &lim, new); @@ -373,9 +375,6 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) spin_lock_init(&new->queue_lock); INIT_LIST_HEAD(&new->rq_list); - if (tr->flush) - blk_queue_write_cache(new->rq, true, false); - blk_queue_flag_set(QUEUE_FLAG_NONROT, new->rq); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, new->rq); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 598fe2e89bda45..aff818469c114c 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -455,6 +455,7 @@ static int pmem_attach_disk(struct device *dev, .logical_block_size = pmem_sector_size(ndns), .physical_block_size = PAGE_SIZE, .max_hw_sectors = UINT_MAX, + .features = BLK_FEAT_WRITE_CACHE, }; int nid = dev_to_node(dev), fua; struct resource *res = &nsio->res; @@ -495,6 +496,8 @@ static int pmem_attach_disk(struct device *dev, dev_warn(dev, "unable to guarantee persistence of writes\n"); fua = 0; } + if (fua) + lim.features |= BLK_FEAT_FUA; if (!devm_request_mem_region(dev, res->start, resource_size(res), dev_name(&ndns->dev))) { @@ -543,7 +546,6 @@ static int pmem_attach_disk(struct device *dev, } pmem->virt_addr = addr; - blk_queue_write_cache(q, true, fua); blk_queue_flag_set(QUEUE_FLAG_NONROT, q); blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, q); if (pmem->pfn_flags & PFN_MAP) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5a673fa5cb2612..9fc5e36fe2e55e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2056,7 +2056,6 @@ static int nvme_update_ns_info_generic(struct nvme_ns *ns, static int nvme_update_ns_info_block(struct nvme_ns *ns, struct nvme_ns_info *info) { - bool vwc = ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT; struct queue_limits lim; struct nvme_id_ns_nvm *nvm = NULL; struct nvme_zone_info zi = {}; @@ -2106,6 +2105,11 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, ns->head->ids.csi == NVME_CSI_ZNS) nvme_update_zone_info(ns, &lim, &zi); + if (ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT) + lim.features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA; + else + lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA); + /* * Register a metadata profile for PI, or the plain non-integrity NVMe * metadata masquerading as Type 0 if supported, otherwise reject block @@ -2132,7 +2136,6 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3))) ns->head->features |= NVME_NS_DEAC; set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info)); - blk_queue_write_cache(ns->disk->queue, vwc, vwc); set_bit(NVME_NS_READY, &ns->flags); blk_mq_unfreeze_queue(ns->disk->queue); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 12c59db02539e5..3d0e23a0a4ddd8 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -521,7 +521,6 @@ static void nvme_requeue_work(struct work_struct *work) int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) { struct queue_limits lim; - bool vwc = false; mutex_init(&head->lock); bio_list_init(&head->requeue_list); @@ -562,11 +561,6 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) if (ctrl->tagset->nr_maps > HCTX_TYPE_POLL && ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues) blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue); - - /* we need to propagate up the VMC settings */ - if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) - vwc = true; - blk_queue_write_cache(head->disk->queue, vwc, vwc); return 0; } diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 66f7d1e3429c86..d8ee4a4d4a6283 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -120,17 +120,18 @@ static const char *sd_cache_types[] = { "write back, no read (daft)" }; -static void sd_set_flush_flag(struct scsi_disk *sdkp) +static void sd_set_flush_flag(struct scsi_disk *sdkp, + struct queue_limits *lim) { - bool wc = false, fua = false; - if (sdkp->WCE) { - wc = true; + lim->features |= BLK_FEAT_WRITE_CACHE; if (sdkp->DPOFUA) - fua = true; + lim->features |= BLK_FEAT_FUA; + else + lim->features &= ~BLK_FEAT_FUA; + } else { + lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA); } - - blk_queue_write_cache(sdkp->disk->queue, wc, fua); } static ssize_t @@ -168,9 +169,18 @@ cache_type_store(struct device *dev, struct device_attribute *attr, wce = (ct & 0x02) && !sdkp->write_prot ? 1 : 0; if (sdkp->cache_override) { + struct queue_limits lim; + sdkp->WCE = wce; sdkp->RCD = rcd; - sd_set_flush_flag(sdkp); + + lim = queue_limits_start_update(sdkp->disk->queue); + sd_set_flush_flag(sdkp, &lim); + blk_mq_freeze_queue(sdkp->disk->queue); + ret = queue_limits_commit_update(sdkp->disk->queue, &lim); + blk_mq_unfreeze_queue(sdkp->disk->queue); + if (ret) + return ret; return count; } @@ -3663,7 +3673,7 @@ static int sd_revalidate_disk(struct gendisk *disk) * We now have all cache related info, determine how we deal * with flush requests. */ - sd_set_flush_flag(sdkp); + sd_set_flush_flag(sdkp, &lim); /* Initial block count limit based on CDB TRANSFER LENGTH field size. */ dev_max = sdp->use_16_for_rw ? SD_MAX_XFER_BLOCKS : SD_DEF_XFER_BLOCKS; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0c247a71688561..acdfe5122faa44 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -282,6 +282,28 @@ static inline bool blk_op_is_passthrough(blk_opf_t op) return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT; } +/* flags set by the driver in queue_limits.features */ +enum { + /* supports a volatile write cache */ + BLK_FEAT_WRITE_CACHE = (1u << 0), + + /* supports passing on the FUA bit */ + BLK_FEAT_FUA = (1u << 1), +}; + +/* + * Flags automatically inherited when stacking limits. + */ +#define BLK_FEAT_INHERIT_MASK \ + (BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA) + + +/* internal flags in queue_limits.flags */ +enum { + /* do not send FLUSH or FUA command despite advertised write cache */ + BLK_FLAGS_WRITE_CACHE_DISABLED = (1u << 31), +}; + /* * BLK_BOUNCE_NONE: never bounce (default) * BLK_BOUNCE_HIGH: bounce all highmem pages @@ -292,6 +314,8 @@ enum blk_bounce { }; struct queue_limits { + unsigned int features; + unsigned int flags; enum blk_bounce bounce; unsigned long seg_boundary_mask; unsigned long virt_boundary_mask; @@ -536,12 +560,9 @@ struct request_queue { #define QUEUE_FLAG_ADD_RANDOM 10 /* Contributes to random pool */ #define QUEUE_FLAG_SYNCHRONOUS 11 /* always completes in submit context */ #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ -#define QUEUE_FLAG_HW_WC 13 /* Write back caching supported */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ #define QUEUE_FLAG_STABLE_WRITES 15 /* don't modify blks until WB is done */ #define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */ -#define QUEUE_FLAG_WC 17 /* Write back caching */ -#define QUEUE_FLAG_FUA 18 /* device supports FUA writes */ #define QUEUE_FLAG_DAX 19 /* device supports DAX */ #define QUEUE_FLAG_STATS 20 /* track IO start and completion times */ #define QUEUE_FLAG_REGISTERED 22 /* queue has been registered to a disk */ @@ -951,7 +972,6 @@ void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, sector_t offset, const char *pfx); extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); -extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); struct blk_independent_access_ranges * disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges); @@ -1304,14 +1324,20 @@ static inline bool bdev_stable_writes(struct block_device *bdev) return test_bit(QUEUE_FLAG_STABLE_WRITES, &q->queue_flags); } +static inline bool blk_queue_write_cache(struct request_queue *q) +{ + return (q->limits.features & BLK_FEAT_WRITE_CACHE) && + !(q->limits.flags & BLK_FLAGS_WRITE_CACHE_DISABLED); +} + static inline bool bdev_write_cache(struct block_device *bdev) { - return test_bit(QUEUE_FLAG_WC, &bdev_get_queue(bdev)->queue_flags); + return blk_queue_write_cache(bdev_get_queue(bdev)); } static inline bool bdev_fua(struct block_device *bdev) { - return test_bit(QUEUE_FLAG_FUA, &bdev_get_queue(bdev)->queue_flags); + return bdev_get_queue(bdev)->limits.features & BLK_FEAT_FUA; } static inline bool bdev_nowait(struct block_device *bdev) From bd4a633b6f7c3c6b6ebc1a07317643270e751a94 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:41 +0200 Subject: [PATCH 1139/1578] block: move the nonrot flag to queue_limits Move the nonrot flag into the queue_limits feature field so that it can be set atomically with the queue frozen. Use the chance to switch to defaulting to non-rotational and require the driver to opt into rotational, which matches the polarity of the sysfs interface. For the z2ram, ps3vram, 2x memstick, ubiblock and dcssblk the new rotational flag is not set as they clearly are not rotational despite this being a behavior change. There are some other drivers that unconditionally set the rotational flag to keep the existing behavior as they arguably can be used on rotational devices even if that is probably not their main use today (e.g. virtio_blk and drbd). The flag is automatically inherited in blk_stack_limits matching the existing behavior in dm and md. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-15-hch@lst.de Signed-off-by: Jens Axboe --- arch/m68k/emu/nfblock.c | 1 + arch/um/drivers/ubd_kern.c | 1 - arch/xtensa/platforms/iss/simdisk.c | 5 +++- block/blk-mq-debugfs.c | 1 - block/blk-sysfs.c | 39 ++++++++++++++++++++++++++--- drivers/block/amiflop.c | 5 +++- drivers/block/aoe/aoeblk.c | 1 + drivers/block/ataflop.c | 5 +++- drivers/block/brd.c | 2 -- drivers/block/drbd/drbd_main.c | 3 ++- drivers/block/floppy.c | 3 ++- drivers/block/loop.c | 8 +++--- drivers/block/mtip32xx/mtip32xx.c | 1 - drivers/block/n64cart.c | 2 -- drivers/block/nbd.c | 5 ---- drivers/block/null_blk/main.c | 1 - drivers/block/pktcdvd.c | 1 + drivers/block/ps3disk.c | 3 ++- drivers/block/rbd.c | 3 --- drivers/block/rnbd/rnbd-clt.c | 4 --- drivers/block/sunvdc.c | 1 + drivers/block/swim.c | 5 +++- drivers/block/swim3.c | 5 +++- drivers/block/ublk_drv.c | 9 +++---- drivers/block/virtio_blk.c | 4 ++- drivers/block/xen-blkfront.c | 1 - drivers/block/zram/zram_drv.c | 2 -- drivers/cdrom/gdrom.c | 1 + drivers/md/bcache/super.c | 2 -- drivers/md/dm-table.c | 12 --------- drivers/md/md.c | 13 ---------- drivers/mmc/core/queue.c | 1 - drivers/mtd/mtd_blkdevs.c | 1 - drivers/nvdimm/btt.c | 1 - drivers/nvdimm/pmem.c | 1 - drivers/nvme/host/core.c | 1 - drivers/nvme/host/multipath.c | 1 - drivers/s390/block/dasd_genhd.c | 1 - drivers/s390/block/scm_blk.c | 1 - drivers/scsi/sd.c | 4 +-- include/linux/blkdev.h | 10 ++++---- 41 files changed, 83 insertions(+), 88 deletions(-) diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c index 642fb80c5c4e31..8eea7ef9115146 100644 --- a/arch/m68k/emu/nfblock.c +++ b/arch/m68k/emu/nfblock.c @@ -98,6 +98,7 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize) { struct queue_limits lim = { .logical_block_size = bsize, + .features = BLK_FEAT_ROTATIONAL, }; struct nfhd_device *dev; int dev_id = id - NFHD_DEV_OFFSET; diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 19e01691ea0ea7..9f1e76ddda5a26 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -882,7 +882,6 @@ static int ubd_add(int n, char **error_out) goto out_cleanup_tags; } - blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); disk->major = UBD_MAJOR; disk->first_minor = n << UBD_SHIFT; disk->minors = 1 << UBD_SHIFT; diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index defc67909a9c74..d6d2b533a5744d 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -263,6 +263,9 @@ static const struct proc_ops simdisk_proc_ops = { static int __init simdisk_setup(struct simdisk *dev, int which, struct proc_dir_entry *procdir) { + struct queue_limits lim = { + .features = BLK_FEAT_ROTATIONAL, + }; char tmp[2] = { '0' + which, 0 }; int err; @@ -271,7 +274,7 @@ static int __init simdisk_setup(struct simdisk *dev, int which, spin_lock_init(&dev->lock); dev->users = 0; - dev->gd = blk_alloc_disk(NULL, NUMA_NO_NODE); + dev->gd = blk_alloc_disk(&lim, NUMA_NO_NODE); if (IS_ERR(dev->gd)) { err = PTR_ERR(dev->gd); goto out; diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index e8b9db7c30c455..4d0e62ec88f033 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -84,7 +84,6 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(NOMERGES), QUEUE_FLAG_NAME(SAME_COMP), QUEUE_FLAG_NAME(FAIL_IO), - QUEUE_FLAG_NAME(NONROT), QUEUE_FLAG_NAME(IO_STAT), QUEUE_FLAG_NAME(NOXMERGES), QUEUE_FLAG_NAME(ADD_RANDOM), diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 4f524c1d5e08bd..637ed3bbbfb46f 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -263,6 +263,39 @@ static ssize_t queue_dma_alignment_show(struct request_queue *q, char *page) return queue_var_show(queue_dma_alignment(q), page); } +static ssize_t queue_feature_store(struct request_queue *q, const char *page, + size_t count, unsigned int feature) +{ + struct queue_limits lim; + unsigned long val; + ssize_t ret; + + ret = queue_var_store(&val, page, count); + if (ret < 0) + return ret; + + lim = queue_limits_start_update(q); + if (val) + lim.features |= feature; + else + lim.features &= ~feature; + ret = queue_limits_commit_update(q, &lim); + if (ret) + return ret; + return count; +} + +#define QUEUE_SYSFS_FEATURE(_name, _feature) \ +static ssize_t queue_##_name##_show(struct request_queue *q, char *page) \ +{ \ + return sprintf(page, "%u\n", !!(q->limits.features & _feature)); \ +} \ +static ssize_t queue_##_name##_store(struct request_queue *q, \ + const char *page, size_t count) \ +{ \ + return queue_feature_store(q, page, count, _feature); \ +} + #define QUEUE_SYSFS_BIT_FNS(name, flag, neg) \ static ssize_t \ queue_##name##_show(struct request_queue *q, char *page) \ @@ -289,7 +322,7 @@ queue_##name##_store(struct request_queue *q, const char *page, size_t count) \ return ret; \ } -QUEUE_SYSFS_BIT_FNS(nonrot, NONROT, 1); +QUEUE_SYSFS_FEATURE(rotational, BLK_FEAT_ROTATIONAL) QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0); QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0); QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0); @@ -526,7 +559,7 @@ static struct queue_sysfs_entry queue_hw_sector_size_entry = { .show = queue_logical_block_size_show, }; -QUEUE_RW_ENTRY(queue_nonrot, "rotational"); +QUEUE_RW_ENTRY(queue_rotational, "rotational"); QUEUE_RW_ENTRY(queue_iostats, "iostats"); QUEUE_RW_ENTRY(queue_random, "add_random"); QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes"); @@ -624,7 +657,7 @@ static struct attribute *queue_attrs[] = { &queue_write_zeroes_max_entry.attr, &queue_zone_append_max_entry.attr, &queue_zone_write_granularity_entry.attr, - &queue_nonrot_entry.attr, + &queue_rotational_entry.attr, &queue_zoned_entry.attr, &queue_nr_zones_entry.attr, &queue_max_open_zones_entry.attr, diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index a25414228e4741..ff45701f7a5e31 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1776,10 +1776,13 @@ static const struct blk_mq_ops amiflop_mq_ops = { static int fd_alloc_disk(int drive, int system) { + struct queue_limits lim = { + .features = BLK_FEAT_ROTATIONAL, + }; struct gendisk *disk; int err; - disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL, NULL); + disk = blk_mq_alloc_disk(&unit[drive].tag_set, &lim, NULL); if (IS_ERR(disk)) return PTR_ERR(disk); diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index b6dac8cee70fe1..2028795ec61cbb 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -337,6 +337,7 @@ aoeblk_gdalloc(void *vp) struct queue_limits lim = { .max_hw_sectors = aoe_maxsectors, .io_opt = SZ_2M, + .features = BLK_FEAT_ROTATIONAL, }; ulong flags; int late = 0; diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index cacc4ba942a814..4ee10a742bdb93 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -1992,9 +1992,12 @@ static const struct blk_mq_ops ataflop_mq_ops = { static int ataflop_alloc_disk(unsigned int drive, unsigned int type) { + struct queue_limits lim = { + .features = BLK_FEAT_ROTATIONAL, + }; struct gendisk *disk; - disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL, NULL); + disk = blk_mq_alloc_disk(&unit[drive].tag_set, &lim, NULL); if (IS_ERR(disk)) return PTR_ERR(disk); diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 558d8e67056608..b25dc463b5e3a6 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -366,8 +366,6 @@ static int brd_alloc(int i) strscpy(disk->disk_name, buf, DISK_NAME_LEN); set_capacity(disk, rd_size * 2); - /* Tell the block layer that this is not a rotational device */ - blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, disk->queue); blk_queue_flag_set(QUEUE_FLAG_NOWAIT, disk->queue); err = add_disk(disk); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index bf42a46781fa21..2ef29a47807550 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2697,7 +2697,8 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig * connect. */ .max_hw_sectors = DRBD_MAX_BIO_SIZE_SAFE >> 8, - .features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA, + .features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | + BLK_FEAT_ROTATIONAL, }; device = minor_to_device(minor); diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 25c9d85667f1a2..6d7f7df97c3a6c 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4516,7 +4516,8 @@ static bool floppy_available(int drive) static int floppy_alloc_disk(unsigned int drive, unsigned int type) { struct queue_limits lim = { - .max_hw_sectors = 64, + .max_hw_sectors = 64, + .features = BLK_FEAT_ROTATIONAL, }; struct gendisk *disk; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 08d0fc7f17b701..86b5d956dc4e02 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -985,13 +985,11 @@ static int loop_reconfigure_limits(struct loop_device *lo, unsigned short bsize) lim.logical_block_size = bsize; lim.physical_block_size = bsize; lim.io_min = bsize; - lim.features &= ~BLK_FEAT_WRITE_CACHE; + lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL); if (file->f_op->fsync && !(lo->lo_flags & LO_FLAGS_READ_ONLY)) lim.features |= BLK_FEAT_WRITE_CACHE; - if (!backing_bdev || bdev_nonrot(backing_bdev)) - blk_queue_flag_set(QUEUE_FLAG_NONROT, lo->lo_queue); - else - blk_queue_flag_clear(QUEUE_FLAG_NONROT, lo->lo_queue); + if (backing_bdev && !bdev_nonrot(backing_bdev)) + lim.features |= BLK_FEAT_ROTATIONAL; loop_config_discard(lo, &lim); return queue_limits_commit_update(lo->lo_queue, &lim); } diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 43a187609ef794..1dbbf72659d549 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3485,7 +3485,6 @@ static int mtip_block_initialize(struct driver_data *dd) goto start_service_thread; /* Set device limits. */ - blk_queue_flag_set(QUEUE_FLAG_NONROT, dd->queue); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, dd->queue); dma_set_max_seg_size(&dd->pdev->dev, 0x400000); diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c index 27b2187e7a6d55..b9fdeff31cafdf 100644 --- a/drivers/block/n64cart.c +++ b/drivers/block/n64cart.c @@ -150,8 +150,6 @@ static int __init n64cart_probe(struct platform_device *pdev) set_capacity(disk, size >> SECTOR_SHIFT); set_disk_ro(disk, 1); - blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); - err = add_disk(disk); if (err) goto out_cleanup_disk; diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index cb1c86a6a3fb9d..6cddf5baffe02a 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1867,11 +1867,6 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) goto out_err_disk; } - /* - * Tell the block layer that we are not a rotational device - */ - blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); - mutex_init(&nbd->config_lock); refcount_set(&nbd->config_refs, 0); /* diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 21f9d256e88402..83a4ebe4763ae5 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1948,7 +1948,6 @@ static int null_add_dev(struct nullb_device *dev) } nullb->q->queuedata = nullb; - blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q); rv = ida_alloc(&nullb_indexes, GFP_KERNEL); if (rv < 0) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 8a2ce80700109d..7cece5884b9c67 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2622,6 +2622,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) struct queue_limits lim = { .max_hw_sectors = PACKET_MAX_SECTORS, .logical_block_size = CD_FRAMESIZE, + .features = BLK_FEAT_ROTATIONAL, }; int idx; int ret = -ENOMEM; diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index 8b73cf459b5937..ff45ed76646957 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -388,7 +388,8 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) .max_segments = -1, .max_segment_size = dev->bounce_size, .dma_alignment = dev->blk_size - 1, - .features = BLK_FEAT_WRITE_CACHE, + .features = BLK_FEAT_WRITE_CACHE | + BLK_FEAT_ROTATIONAL, }; struct gendisk *gendisk; diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 22ad704f81d8b9..ec1f1c7d4275cd 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4997,9 +4997,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) disk->fops = &rbd_bd_ops; disk->private_data = rbd_dev; - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */ - if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 02c4b173182719..4918b0f68b46cd 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1352,10 +1352,6 @@ static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, if (dev->access_mode == RNBD_ACCESS_RO) set_disk_ro(dev->gd, true); - /* - * Network device does not need rotational - */ - blk_queue_flag_set(QUEUE_FLAG_NONROT, dev->queue); err = add_disk(dev->gd); if (err) put_disk(dev->gd); diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 5286cb8e0824d1..2d38331ee66793 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -791,6 +791,7 @@ static int probe_disk(struct vdc_port *port) .seg_boundary_mask = PAGE_SIZE - 1, .max_segment_size = PAGE_SIZE, .max_segments = port->ring_cookies, + .features = BLK_FEAT_ROTATIONAL, }; struct request_queue *q; struct gendisk *g; diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 6731678f3a41db..126f151c4f2cf0 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -787,6 +787,9 @@ static void swim_cleanup_floppy_disk(struct floppy_state *fs) static int swim_floppy_init(struct swim_priv *swd) { + struct queue_limits lim = { + .features = BLK_FEAT_ROTATIONAL, + }; int err; int drive; struct swim __iomem *base = swd->base; @@ -820,7 +823,7 @@ static int swim_floppy_init(struct swim_priv *swd) goto exit_put_disks; swd->unit[drive].disk = - blk_mq_alloc_disk(&swd->unit[drive].tag_set, NULL, + blk_mq_alloc_disk(&swd->unit[drive].tag_set, &lim, &swd->unit[drive]); if (IS_ERR(swd->unit[drive].disk)) { blk_mq_free_tag_set(&swd->unit[drive].tag_set); diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index a04756ac778ee8..90be1017f7bfcd 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -1189,6 +1189,9 @@ static int swim3_add_device(struct macio_dev *mdev, int index) static int swim3_attach(struct macio_dev *mdev, const struct of_device_id *match) { + struct queue_limits lim = { + .features = BLK_FEAT_ROTATIONAL, + }; struct floppy_state *fs; struct gendisk *disk; int rc; @@ -1210,7 +1213,7 @@ static int swim3_attach(struct macio_dev *mdev, if (rc) goto out_unregister; - disk = blk_mq_alloc_disk(&fs->tag_set, NULL, fs); + disk = blk_mq_alloc_disk(&fs->tag_set, &lim, fs); if (IS_ERR(disk)) { rc = PTR_ERR(disk); goto out_free_tag_set; diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index e45c65c1848d31..4fcde099935868 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -484,14 +484,8 @@ static inline unsigned ublk_pos_to_tag(loff_t pos) static void ublk_dev_param_basic_apply(struct ublk_device *ub) { - struct request_queue *q = ub->ub_disk->queue; const struct ublk_param_basic *p = &ub->params.basic; - if (p->attrs & UBLK_ATTR_ROTATIONAL) - blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); - else - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - if (p->attrs & UBLK_ATTR_READ_ONLY) set_disk_ro(ub->ub_disk, true); @@ -2214,6 +2208,9 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) lim.features |= BLK_FEAT_FUA; } + if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL) + lim.features |= BLK_FEAT_ROTATIONAL; + if (wait_for_completion_interruptible(&ub->completion) != 0) return -EINTR; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index b1a3c293528519..13a2f24f176628 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -1451,7 +1451,9 @@ static int virtblk_read_limits(struct virtio_blk *vblk, static int virtblk_probe(struct virtio_device *vdev) { struct virtio_blk *vblk; - struct queue_limits lim = { }; + struct queue_limits lim = { + .features = BLK_FEAT_ROTATIONAL, + }; int err, index; unsigned int queue_depth; diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 9aafce3e5987bf..fa3a2ba525458b 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1146,7 +1146,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, err = PTR_ERR(gd); goto out_free_tag_set; } - blk_queue_flag_set(QUEUE_FLAG_VIRT, gd->queue); strcpy(gd->disk_name, DEV_NAME); ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3acd7006ad2ccd..aad840fc7e18e3 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2245,8 +2245,6 @@ static int zram_add(void) /* Actual capacity set using sysfs (/sys/block/zram/disksize */ set_capacity(zram->disk, 0); - /* zram devices sort of resembles non-rotational disks */ - blk_queue_flag_set(QUEUE_FLAG_NONROT, zram->disk->queue); blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, zram->disk->queue); blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue); ret = device_add_disk(NULL, zram->disk, zram_disk_groups); diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index eefdd422ad8e9f..71cfe7a85913c4 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -744,6 +744,7 @@ static int probe_gdrom(struct platform_device *devptr) .max_segments = 1, /* set a large max size to get most from DMA */ .max_segment_size = 0x40000, + .features = BLK_FEAT_ROTATIONAL, }; int err; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index cb6595c8b5514e..baa364eedd0051 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -974,8 +974,6 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, d->disk->minors = BCACHE_MINORS; d->disk->fops = ops; d->disk->private_data = d; - - blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue); return 0; out_bioset_exit: diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 03abdae646829c..c062af32970934 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1716,12 +1716,6 @@ static int device_dax_write_cache_enabled(struct dm_target *ti, return false; } -static int device_is_rotational(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) -{ - return !bdev_nonrot(dev->bdev); -} - static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { @@ -1870,12 +1864,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL)) dax_write_cache(t->md->dax_dev, true); - /* Ensure that all underlying devices are non-rotational. */ - if (dm_table_any_dev_attr(t, device_is_rotational, NULL)) - blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); - else - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - /* * Some devices don't use blk_integrity but still want stable pages * because they do their own checksumming. diff --git a/drivers/md/md.c b/drivers/md/md.c index 2f4c5d1755d857..c23423c51fb7c2 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6151,20 +6151,7 @@ int md_run(struct mddev *mddev) if (!mddev_is_dm(mddev)) { struct request_queue *q = mddev->gendisk->queue; - bool nonrot = true; - rdev_for_each(rdev, mddev) { - if (rdev->raid_disk >= 0 && !bdev_nonrot(rdev->bdev)) { - nonrot = false; - break; - } - } - if (mddev->degraded) - nonrot = false; - if (nonrot) - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - else - blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q); /* Set the NOWAIT flags if all underlying devices support it */ diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index 97ff993d31570c..b4f62fa845864c 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -387,7 +387,6 @@ static struct gendisk *mmc_alloc_disk(struct mmc_queue *mq, blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, mq->queue); blk_queue_rq_timeout(mq->queue, 60 * HZ); - blk_queue_flag_set(QUEUE_FLAG_NONROT, mq->queue); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, mq->queue); dma_set_max_seg_size(mmc_dev(host), queue_max_segment_size(mq->queue)); diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 1b9f57f231e8be..bf8369ce7ddf1d 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -375,7 +375,6 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) spin_lock_init(&new->queue_lock); INIT_LIST_HEAD(&new->rq_list); - blk_queue_flag_set(QUEUE_FLAG_NONROT, new->rq); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, new->rq); gd->queue = new->rq; diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index c5f8451b494d6c..e474afa8e9f68d 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1518,7 +1518,6 @@ static int btt_blk_init(struct btt *btt) btt->btt_disk->fops = &btt_fops; btt->btt_disk->private_data = btt; - blk_queue_flag_set(QUEUE_FLAG_NONROT, btt->btt_disk->queue); blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, btt->btt_disk->queue); set_capacity(btt->btt_disk, btt->nlba * btt->sector_size >> 9); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index aff818469c114c..501cf226df0187 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -546,7 +546,6 @@ static int pmem_attach_disk(struct device *dev, } pmem->virt_addr = addr; - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, q); if (pmem->pfn_flags & PFN_MAP) blk_queue_flag_set(QUEUE_FLAG_DAX, q); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9fc5e36fe2e55e..0d753fe71f35b0 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3744,7 +3744,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) if (ctrl->opts && ctrl->opts->data_digest) blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue); - blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue); if (ctrl->ops->supports_pci_p2pdma && ctrl->ops->supports_pci_p2pdma(ctrl)) blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 3d0e23a0a4ddd8..58c13304e558e0 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -549,7 +549,6 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) sprintf(head->disk->disk_name, "nvme%dn%d", ctrl->subsys->instance, head->instance); - blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue); blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue); blk_queue_flag_set(QUEUE_FLAG_IO_STAT, head->disk->queue); /* diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index 4533dd055ca8e3..1aa426b1deddc7 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -68,7 +68,6 @@ int dasd_gendisk_alloc(struct dasd_block *block) blk_mq_free_tag_set(&block->tag_set); return PTR_ERR(gdp); } - blk_queue_flag_set(QUEUE_FLAG_NONROT, gdp->queue); /* Initialize gendisk structure. */ gdp->major = DASD_MAJOR; diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index 1d456a5a3bfb8e..2e2309fa9a0b34 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -475,7 +475,6 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) goto out_tag; } rq = bdev->rq = bdev->gendisk->queue; - blk_queue_flag_set(QUEUE_FLAG_NONROT, rq); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, rq); bdev->gendisk->private_data = scmdev; diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index d8ee4a4d4a6283..a42c3c45e86830 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3318,7 +3318,7 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp, rcu_read_unlock(); if (rot == 1) { - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); + lim->features &= ~BLK_FEAT_ROTATIONAL; blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q); } @@ -3646,7 +3646,7 @@ static int sd_revalidate_disk(struct gendisk *disk) * cause this to be updated correctly and any device which * doesn't support it should be treated as rotational. */ - blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); + lim.features |= BLK_FEAT_ROTATIONAL; blk_queue_flag_set(QUEUE_FLAG_ADD_RANDOM, q); if (scsi_device_supports_vpd(sdp)) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index acdfe5122faa44..988e3248cffeb7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -289,14 +289,16 @@ enum { /* supports passing on the FUA bit */ BLK_FEAT_FUA = (1u << 1), + + /* rotational device (hard drive or floppy) */ + BLK_FEAT_ROTATIONAL = (1u << 2), }; /* * Flags automatically inherited when stacking limits. */ #define BLK_FEAT_INHERIT_MASK \ - (BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA) - + (BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_ROTATIONAL) /* internal flags in queue_limits.flags */ enum { @@ -553,8 +555,6 @@ struct request_queue { #define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */ #define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */ -#define QUEUE_FLAG_NONROT 6 /* non-rotational device (SSD) */ -#define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ #define QUEUE_FLAG_IO_STAT 7 /* do disk/partitions IO accounting */ #define QUEUE_FLAG_NOXMERGES 9 /* No extended merges */ #define QUEUE_FLAG_ADD_RANDOM 10 /* Contributes to random pool */ @@ -589,7 +589,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) #define blk_queue_noxmerges(q) \ test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) -#define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) +#define blk_queue_nonrot(q) ((q)->limits.features & BLK_FEAT_ROTATIONAL) #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) #define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) #define blk_queue_zone_resetall(q) \ From 39a9f1c334f9f27b3b3e6d0005c10ed667268346 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:42 +0200 Subject: [PATCH 1140/1578] block: move the add_random flag to queue_limits Move the add_random flag into the queue_limits feature field so that it can be set atomically with the queue frozen. Note that this also removes code from dm to clear the flag based on the underlying devices, which can't be reached as dm devices will always start out without the flag set. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-16-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 - block/blk-sysfs.c | 6 +++--- drivers/block/mtip32xx/mtip32xx.c | 1 - drivers/md/dm-table.c | 18 ------------------ drivers/mmc/core/queue.c | 2 -- drivers/mtd/mtd_blkdevs.c | 3 --- drivers/s390/block/scm_blk.c | 4 ---- drivers/scsi/scsi_lib.c | 3 +-- drivers/scsi/sd.c | 11 +++-------- include/linux/blkdev.h | 5 +++-- 10 files changed, 10 insertions(+), 44 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 4d0e62ec88f033..6b7edb50bfd3fa 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -86,7 +86,6 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(FAIL_IO), QUEUE_FLAG_NAME(IO_STAT), QUEUE_FLAG_NAME(NOXMERGES), - QUEUE_FLAG_NAME(ADD_RANDOM), QUEUE_FLAG_NAME(SYNCHRONOUS), QUEUE_FLAG_NAME(SAME_FORCE), QUEUE_FLAG_NAME(INIT_DONE), diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 637ed3bbbfb46f..9174aca3b85526 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -323,7 +323,7 @@ queue_##name##_store(struct request_queue *q, const char *page, size_t count) \ } QUEUE_SYSFS_FEATURE(rotational, BLK_FEAT_ROTATIONAL) -QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0); +QUEUE_SYSFS_FEATURE(add_random, BLK_FEAT_ADD_RANDOM) QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0); QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0); #undef QUEUE_SYSFS_BIT_FNS @@ -561,7 +561,7 @@ static struct queue_sysfs_entry queue_hw_sector_size_entry = { QUEUE_RW_ENTRY(queue_rotational, "rotational"); QUEUE_RW_ENTRY(queue_iostats, "iostats"); -QUEUE_RW_ENTRY(queue_random, "add_random"); +QUEUE_RW_ENTRY(queue_add_random, "add_random"); QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes"); #ifdef CONFIG_BLK_WBT @@ -665,7 +665,7 @@ static struct attribute *queue_attrs[] = { &queue_nomerges_entry.attr, &queue_iostats_entry.attr, &queue_stable_writes_entry.attr, - &queue_random_entry.attr, + &queue_add_random_entry.attr, &queue_poll_entry.attr, &queue_wc_entry.attr, &queue_fua_entry.attr, diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 1dbbf72659d549..c6ef0546ffc9d2 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3485,7 +3485,6 @@ static int mtip_block_initialize(struct driver_data *dd) goto start_service_thread; /* Set device limits. */ - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, dd->queue); dma_set_max_seg_size(&dd->pdev->dev, 0x400000); /* Set the capacity of the device in 512 byte sectors. */ diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index c062af32970934..0a3838e45affd4 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1716,14 +1716,6 @@ static int device_dax_write_cache_enabled(struct dm_target *ti, return false; } -static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) -{ - struct request_queue *q = bdev_get_queue(dev->bdev); - - return !blk_queue_add_random(q); -} - static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { @@ -1876,16 +1868,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, else blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q); - /* - * Determine whether or not this queue's I/O timings contribute - * to the entropy pool, Only request-based targets use this. - * Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not - * have it set. - */ - if (blk_queue_add_random(q) && - dm_table_any_dev_attr(t, device_is_not_random, NULL)) - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q); - /* * For a zoned target, setup the zones related queue attributes * and resources necessary for zone append emulation if necessary. diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index b4f62fa845864c..da00904d4a3c7e 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -387,8 +387,6 @@ static struct gendisk *mmc_alloc_disk(struct mmc_queue *mq, blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, mq->queue); blk_queue_rq_timeout(mq->queue, 60 * HZ); - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, mq->queue); - dma_set_max_seg_size(mmc_dev(host), queue_max_segment_size(mq->queue)); INIT_WORK(&mq->recovery_work, mmc_mq_recovery_handler); diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index bf8369ce7ddf1d..47ead84407cdcf 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -374,9 +374,6 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) /* Create the request queue */ spin_lock_init(&new->queue_lock); INIT_LIST_HEAD(&new->rq_list); - - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, new->rq); - gd->queue = new->rq; if (new->readonly) diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index 2e2309fa9a0b34..3fcfe029db1b3a 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -439,7 +439,6 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) .logical_block_size = 1 << 12, }; unsigned int devindex; - struct request_queue *rq; int len, ret; lim.max_segments = min(scmdev->nr_max_block, @@ -474,9 +473,6 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) ret = PTR_ERR(bdev->gendisk); goto out_tag; } - rq = bdev->rq = bdev->gendisk->queue; - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, rq); - bdev->gendisk->private_data = scmdev; bdev->gendisk->fops = &scm_blk_devops; bdev->gendisk->major = scm_major; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index ec39acc986d6ec..54f771ec8cfb5e 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -631,8 +631,7 @@ static bool scsi_end_request(struct request *req, blk_status_t error, if (blk_update_request(req, error, bytes)) return true; - // XXX: - if (blk_queue_add_random(q)) + if (q->limits.features & BLK_FEAT_ADD_RANDOM) add_disk_randomness(req->q->disk); WARN_ON_ONCE(!blk_rq_is_passthrough(req) && diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index a42c3c45e86830..a27f1c7f1b61d5 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3301,7 +3301,6 @@ static void sd_read_block_limits_ext(struct scsi_disk *sdkp) static void sd_read_block_characteristics(struct scsi_disk *sdkp, struct queue_limits *lim) { - struct request_queue *q = sdkp->disk->queue; struct scsi_vpd *vpd; u16 rot; @@ -3317,10 +3316,8 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp, sdkp->zoned = (vpd->data[8] >> 4) & 3; rcu_read_unlock(); - if (rot == 1) { - lim->features &= ~BLK_FEAT_ROTATIONAL; - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q); - } + if (rot == 1) + lim->features &= ~(BLK_FEAT_ROTATIONAL | BLK_FEAT_ADD_RANDOM); if (!sdkp->first_scan) return; @@ -3599,7 +3596,6 @@ static int sd_revalidate_disk(struct gendisk *disk) { struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdp = sdkp->device; - struct request_queue *q = sdkp->disk->queue; sector_t old_capacity = sdkp->capacity; struct queue_limits lim; unsigned char *buffer; @@ -3646,8 +3642,7 @@ static int sd_revalidate_disk(struct gendisk *disk) * cause this to be updated correctly and any device which * doesn't support it should be treated as rotational. */ - lim.features |= BLK_FEAT_ROTATIONAL; - blk_queue_flag_set(QUEUE_FLAG_ADD_RANDOM, q); + lim.features |= (BLK_FEAT_ROTATIONAL | BLK_FEAT_ADD_RANDOM); if (scsi_device_supports_vpd(sdp)) { sd_read_block_provisioning(sdkp); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 988e3248cffeb7..cf1bbf566b2bcd 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -292,6 +292,9 @@ enum { /* rotational device (hard drive or floppy) */ BLK_FEAT_ROTATIONAL = (1u << 2), + + /* contributes to the random number pool */ + BLK_FEAT_ADD_RANDOM = (1u << 3), }; /* @@ -557,7 +560,6 @@ struct request_queue { #define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */ #define QUEUE_FLAG_IO_STAT 7 /* do disk/partitions IO accounting */ #define QUEUE_FLAG_NOXMERGES 9 /* No extended merges */ -#define QUEUE_FLAG_ADD_RANDOM 10 /* Contributes to random pool */ #define QUEUE_FLAG_SYNCHRONOUS 11 /* always completes in submit context */ #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ @@ -591,7 +593,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) #define blk_queue_nonrot(q) ((q)->limits.features & BLK_FEAT_ROTATIONAL) #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) -#define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) #define blk_queue_zone_resetall(q) \ test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags) #define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags) From cdb2497918cc2929691408bac87b58433b45b6d3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:43 +0200 Subject: [PATCH 1141/1578] block: move the io_stat flag setting to queue_limits Move the io_stat flag into the queue_limits feature field so that it can be set atomically with the queue frozen. Simplify md and dm to set the flag unconditionally instead of avoiding setting a simple flag for cases where it already is set by other means, which is a bit pointless. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-17-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 - block/blk-mq.c | 6 +++++- block/blk-sysfs.c | 2 +- drivers/md/dm-table.c | 12 +++++++++--- drivers/md/dm.c | 13 +++---------- drivers/md/md.c | 5 ++--- drivers/nvme/host/multipath.c | 2 +- include/linux/blkdev.h | 9 +++++---- 8 files changed, 26 insertions(+), 24 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 6b7edb50bfd3fa..cbe99444ed1a54 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -84,7 +84,6 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(NOMERGES), QUEUE_FLAG_NAME(SAME_COMP), QUEUE_FLAG_NAME(FAIL_IO), - QUEUE_FLAG_NAME(IO_STAT), QUEUE_FLAG_NAME(NOXMERGES), QUEUE_FLAG_NAME(SYNCHRONOUS), QUEUE_FLAG_NAME(SAME_FORCE), diff --git a/block/blk-mq.c b/block/blk-mq.c index 58b0d6c7cc34d6..cf67dc13f7dd4c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4116,7 +4116,11 @@ struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, struct request_queue *q; int ret; - q = blk_alloc_queue(lim ? lim : &default_lim, set->numa_node); + if (!lim) + lim = &default_lim; + lim->features |= BLK_FEAT_IO_STAT; + + q = blk_alloc_queue(lim, set->numa_node); if (IS_ERR(q)) return q; q->queuedata = queuedata; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 9174aca3b85526..6f58530fb3c08e 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -324,7 +324,7 @@ queue_##name##_store(struct request_queue *q, const char *page, size_t count) \ QUEUE_SYSFS_FEATURE(rotational, BLK_FEAT_ROTATIONAL) QUEUE_SYSFS_FEATURE(add_random, BLK_FEAT_ADD_RANDOM) -QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0); +QUEUE_SYSFS_FEATURE(iostats, BLK_FEAT_IO_STAT) QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0); #undef QUEUE_SYSFS_BIT_FNS diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 0a3838e45affd4..5d5431e531aea9 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -579,6 +579,12 @@ int dm_split_args(int *argc, char ***argvp, char *input) return 0; } +static void dm_set_stacking_limits(struct queue_limits *limits) +{ + blk_set_stacking_limits(limits); + limits->features |= BLK_FEAT_IO_STAT; +} + /* * Impose necessary and sufficient conditions on a devices's table such * that any incoming bio which respects its logical_block_size can be @@ -617,7 +623,7 @@ static int validate_hardware_logical_block_alignment(struct dm_table *t, for (i = 0; i < t->num_targets; i++) { ti = dm_table_get_target(t, i); - blk_set_stacking_limits(&ti_limits); + dm_set_stacking_limits(&ti_limits); /* combine all target devices' limits */ if (ti->type->iterate_devices) @@ -1591,7 +1597,7 @@ int dm_calculate_queue_limits(struct dm_table *t, unsigned int zone_sectors = 0; bool zoned = false; - blk_set_stacking_limits(limits); + dm_set_stacking_limits(limits); t->integrity_supported = true; for (unsigned int i = 0; i < t->num_targets; i++) { @@ -1604,7 +1610,7 @@ int dm_calculate_queue_limits(struct dm_table *t, for (unsigned int i = 0; i < t->num_targets; i++) { struct dm_target *ti = dm_table_get_target(t, i); - blk_set_stacking_limits(&ti_limits); + dm_set_stacking_limits(&ti_limits); if (!ti->type->iterate_devices) { /* Set I/O hints portion of queue limits */ diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 13037d6a6f62a2..8a976cee448bed 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2386,22 +2386,15 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) struct table_device *td; int r; - switch (type) { - case DM_TYPE_REQUEST_BASED: + WARN_ON_ONCE(type == DM_TYPE_NONE); + + if (type == DM_TYPE_REQUEST_BASED) { md->disk->fops = &dm_rq_blk_dops; r = dm_mq_init_request_queue(md, t); if (r) { DMERR("Cannot initialize queue for request-based dm mapped device"); return r; } - break; - case DM_TYPE_BIO_BASED: - case DM_TYPE_DAX_BIO_BASED: - blk_queue_flag_set(QUEUE_FLAG_IO_STAT, md->queue); - break; - case DM_TYPE_NONE: - WARN_ON_ONCE(true); - break; } r = dm_calculate_queue_limits(t, &limits); diff --git a/drivers/md/md.c b/drivers/md/md.c index c23423c51fb7c2..8db0db8d5a27ac 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5787,7 +5787,8 @@ struct mddev *md_alloc(dev_t dev, char *name) int unit; int error; struct queue_limits lim = { - .features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA, + .features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | + BLK_FEAT_IO_STAT, }; /* @@ -6152,8 +6153,6 @@ int md_run(struct mddev *mddev) if (!mddev_is_dm(mddev)) { struct request_queue *q = mddev->gendisk->queue; - blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q); - /* Set the NOWAIT flags if all underlying devices support it */ if (nowait) blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 58c13304e558e0..eea727cfa9e67d 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -538,6 +538,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) blk_set_stacking_limits(&lim); lim.dma_alignment = 3; + lim.features |= BLK_FEAT_IO_STAT; if (head->ids.csi != NVME_CSI_ZNS) lim.max_zone_append_sectors = 0; @@ -550,7 +551,6 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) ctrl->subsys->instance, head->instance); blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue); - blk_queue_flag_set(QUEUE_FLAG_IO_STAT, head->disk->queue); /* * This assumes all controllers that refer to a namespace either * support poll queues or not. That is not a strict guarantee, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index cf1bbf566b2bcd..5fafb2f95fd1a3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -295,6 +295,9 @@ enum { /* contributes to the random number pool */ BLK_FEAT_ADD_RANDOM = (1u << 3), + + /* do disk/partitions IO accounting */ + BLK_FEAT_IO_STAT = (1u << 4), }; /* @@ -558,7 +561,6 @@ struct request_queue { #define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */ #define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */ -#define QUEUE_FLAG_IO_STAT 7 /* do disk/partitions IO accounting */ #define QUEUE_FLAG_NOXMERGES 9 /* No extended merges */ #define QUEUE_FLAG_SYNCHRONOUS 11 /* always completes in submit context */ #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ @@ -577,8 +579,7 @@ struct request_queue { #define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */ #define QUEUE_FLAG_SKIP_TAGSET_QUIESCE 31 /* quiesce_tagset skip the queue*/ -#define QUEUE_FLAG_MQ_DEFAULT ((1UL << QUEUE_FLAG_IO_STAT) | \ - (1UL << QUEUE_FLAG_SAME_COMP) | \ +#define QUEUE_FLAG_MQ_DEFAULT ((1UL << QUEUE_FLAG_SAME_COMP) | \ (1UL << QUEUE_FLAG_NOWAIT)) void blk_queue_flag_set(unsigned int flag, struct request_queue *q); @@ -592,7 +593,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_noxmerges(q) \ test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) #define blk_queue_nonrot(q) ((q)->limits.features & BLK_FEAT_ROTATIONAL) -#define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) +#define blk_queue_io_stat(q) ((q)->limits.features & BLK_FEAT_IO_STAT) #define blk_queue_zone_resetall(q) \ test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags) #define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags) From 1a02f3a73f8c670eddeb44bf52a75ae7f67cfc11 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:44 +0200 Subject: [PATCH 1142/1578] block: move the stable_writes flag to queue_limits Move the stable_writes flag into the queue_limits feature field so that it can be set atomically with the queue frozen. The flag is now inherited by blk_stack_limits, which greatly simplifies the code in dm, and fixed md which previously did not pass on the flag set on lower devices. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-18-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 - block/blk-sysfs.c | 29 +---------------------------- drivers/block/drbd/drbd_main.c | 5 ++--- drivers/block/rbd.c | 9 +++------ drivers/block/zram/zram_drv.c | 2 +- drivers/md/dm-table.c | 19 ------------------- drivers/md/raid5.c | 6 ++++-- drivers/mmc/core/queue.c | 5 +++-- drivers/nvme/host/core.c | 9 +++++---- drivers/nvme/host/multipath.c | 4 ---- drivers/scsi/iscsi_tcp.c | 8 ++++---- include/linux/blkdev.h | 9 ++++++--- 12 files changed, 29 insertions(+), 77 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index cbe99444ed1a54..eb73f1d348e5a9 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -88,7 +88,6 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(SYNCHRONOUS), QUEUE_FLAG_NAME(SAME_FORCE), QUEUE_FLAG_NAME(INIT_DONE), - QUEUE_FLAG_NAME(STABLE_WRITES), QUEUE_FLAG_NAME(POLL), QUEUE_FLAG_NAME(DAX), QUEUE_FLAG_NAME(STATS), diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 6f58530fb3c08e..cde525724831ef 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -296,37 +296,10 @@ static ssize_t queue_##_name##_store(struct request_queue *q, \ return queue_feature_store(q, page, count, _feature); \ } -#define QUEUE_SYSFS_BIT_FNS(name, flag, neg) \ -static ssize_t \ -queue_##name##_show(struct request_queue *q, char *page) \ -{ \ - int bit; \ - bit = test_bit(QUEUE_FLAG_##flag, &q->queue_flags); \ - return queue_var_show(neg ? !bit : bit, page); \ -} \ -static ssize_t \ -queue_##name##_store(struct request_queue *q, const char *page, size_t count) \ -{ \ - unsigned long val; \ - ssize_t ret; \ - ret = queue_var_store(&val, page, count); \ - if (ret < 0) \ - return ret; \ - if (neg) \ - val = !val; \ - \ - if (val) \ - blk_queue_flag_set(QUEUE_FLAG_##flag, q); \ - else \ - blk_queue_flag_clear(QUEUE_FLAG_##flag, q); \ - return ret; \ -} - QUEUE_SYSFS_FEATURE(rotational, BLK_FEAT_ROTATIONAL) QUEUE_SYSFS_FEATURE(add_random, BLK_FEAT_ADD_RANDOM) QUEUE_SYSFS_FEATURE(iostats, BLK_FEAT_IO_STAT) -QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0); -#undef QUEUE_SYSFS_BIT_FNS +QUEUE_SYSFS_FEATURE(stable_writes, BLK_FEAT_STABLE_WRITES); static ssize_t queue_zoned_show(struct request_queue *q, char *page) { diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 2ef29a47807550..f92673f05c7abc 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2698,7 +2698,8 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig */ .max_hw_sectors = DRBD_MAX_BIO_SIZE_SAFE >> 8, .features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | - BLK_FEAT_ROTATIONAL, + BLK_FEAT_ROTATIONAL | + BLK_FEAT_STABLE_WRITES, }; device = minor_to_device(minor); @@ -2737,8 +2738,6 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig sprintf(disk->disk_name, "drbd%d", minor); disk->private_data = device; - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); - device->md_io.page = alloc_page(GFP_KERNEL); if (!device->md_io.page) goto out_no_io_page; diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ec1f1c7d4275cd..008e850555f41a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4949,7 +4949,6 @@ static const struct blk_mq_ops rbd_mq_ops = { static int rbd_init_disk(struct rbd_device *rbd_dev) { struct gendisk *disk; - struct request_queue *q; unsigned int objset_bytes = rbd_dev->layout.object_size * rbd_dev->layout.stripe_count; struct queue_limits lim = { @@ -4979,12 +4978,14 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) lim.max_write_zeroes_sectors = objset_bytes >> SECTOR_SHIFT; } + if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) + lim.features |= BLK_FEAT_STABLE_WRITES; + disk = blk_mq_alloc_disk(&rbd_dev->tag_set, &lim, rbd_dev); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_tag_set; } - q = disk->queue; snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", rbd_dev->dev_id); @@ -4996,10 +4997,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) disk->minors = RBD_MINORS_PER_MAJOR; disk->fops = &rbd_bd_ops; disk->private_data = rbd_dev; - - if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); - rbd_dev->disk = disk; return 0; diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index aad840fc7e18e3..f8f1b5b54795ac 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2208,6 +2208,7 @@ static int zram_add(void) #if ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE .max_write_zeroes_sectors = UINT_MAX, #endif + .features = BLK_FEAT_STABLE_WRITES, }; struct zram *zram; int ret, device_id; @@ -2246,7 +2247,6 @@ static int zram_add(void) /* Actual capacity set using sysfs (/sys/block/zram/disksize */ set_capacity(zram->disk, 0); blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, zram->disk->queue); - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue); ret = device_add_disk(NULL, zram->disk, zram_disk_groups); if (ret) goto out_cleanup_disk; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 5d5431e531aea9..aaf379cb15d91f 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1819,13 +1819,6 @@ static bool dm_table_supports_secure_erase(struct dm_table *t) return true; } -static int device_requires_stable_pages(struct dm_target *ti, - struct dm_dev *dev, sector_t start, - sector_t len, void *data) -{ - return bdev_stable_writes(dev->bdev); -} - int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *limits) { @@ -1862,18 +1855,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL)) dax_write_cache(t->md->dax_dev, true); - /* - * Some devices don't use blk_integrity but still want stable pages - * because they do their own checksumming. - * If any underlying device requires stable pages, a table must require - * them as well. Only targets that support iterate_devices are considered: - * don't want error, zero, etc to require stable pages. - */ - if (dm_table_any_dev_attr(t, device_requires_stable_pages, NULL)) - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); - else - blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q); - /* * For a zoned target, setup the zones related queue attributes * and resources necessary for zone append emulation if necessary. diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 675c68fa6c6403..e875763d69917d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7082,12 +7082,14 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) err = -ENODEV; else if (new != conf->skip_copy) { struct request_queue *q = mddev->gendisk->queue; + struct queue_limits lim = queue_limits_start_update(q); conf->skip_copy = new; if (new) - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); + lim.features |= BLK_FEAT_STABLE_WRITES; else - blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q); + lim.features &= ~BLK_FEAT_STABLE_WRITES; + err = queue_limits_commit_update(q, &lim); } mddev_unlock_and_resume(mddev); return err ?: len; diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index da00904d4a3c7e..d0b3ca8a11f071 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -378,13 +378,14 @@ static struct gendisk *mmc_alloc_disk(struct mmc_queue *mq, lim.max_segments = host->max_segs; } + if (mmc_host_is_spi(host) && host->use_spi_crc) + lim.features |= BLK_FEAT_STABLE_WRITES; + disk = blk_mq_alloc_disk(&mq->tag_set, &lim, mq); if (IS_ERR(disk)) return disk; mq->queue = disk->queue; - if (mmc_host_is_spi(host) && host->use_spi_crc) - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, mq->queue); blk_queue_rq_timeout(mq->queue, 60 * HZ); dma_set_max_seg_size(mmc_dev(host), queue_max_segment_size(mq->queue)); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 0d753fe71f35b0..5ecf762d7c8837 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3724,6 +3724,7 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns) static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) { + struct queue_limits lim = { }; struct nvme_ns *ns; struct gendisk *disk; int node = ctrl->numa_node; @@ -3732,7 +3733,10 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) if (!ns) return; - disk = blk_mq_alloc_disk(ctrl->tagset, NULL, ns); + if (ctrl->opts && ctrl->opts->data_digest) + lim.features |= BLK_FEAT_STABLE_WRITES; + + disk = blk_mq_alloc_disk(ctrl->tagset, &lim, ns); if (IS_ERR(disk)) goto out_free_ns; disk->fops = &nvme_bdev_ops; @@ -3741,9 +3745,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) ns->disk = disk; ns->queue = disk->queue; - if (ctrl->opts && ctrl->opts->data_digest) - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue); - if (ctrl->ops->supports_pci_p2pdma && ctrl->ops->supports_pci_p2pdma(ctrl)) blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index eea727cfa9e67d..173796f2ddea9f 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -868,10 +868,6 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid) nvme_mpath_set_live(ns); } - if (test_bit(QUEUE_FLAG_STABLE_WRITES, &ns->queue->queue_flags) && - ns->head->disk) - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, - ns->head->disk->queue); #ifdef CONFIG_BLK_DEV_ZONED if (blk_queue_is_zoned(ns->queue) && ns->head->disk) ns->head->disk->nr_zones = ns->disk->nr_zones; diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c index 60688f18fac6f7..c708e105963833 100644 --- a/drivers/scsi/iscsi_tcp.c +++ b/drivers/scsi/iscsi_tcp.c @@ -1057,15 +1057,15 @@ static umode_t iscsi_sw_tcp_attr_is_visible(int param_type, int param) return 0; } -static int iscsi_sw_tcp_slave_configure(struct scsi_device *sdev) +static int iscsi_sw_tcp_device_configure(struct scsi_device *sdev, + struct queue_limits *lim) { struct iscsi_sw_tcp_host *tcp_sw_host = iscsi_host_priv(sdev->host); struct iscsi_session *session = tcp_sw_host->session; struct iscsi_conn *conn = session->leadconn; if (conn->datadgst_en) - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, - sdev->request_queue); + lim->features |= BLK_FEAT_STABLE_WRITES; return 0; } @@ -1083,7 +1083,7 @@ static const struct scsi_host_template iscsi_sw_tcp_sht = { .eh_device_reset_handler= iscsi_eh_device_reset, .eh_target_reset_handler = iscsi_eh_recover_target, .dma_boundary = PAGE_SIZE - 1, - .slave_configure = iscsi_sw_tcp_slave_configure, + .device_configure = iscsi_sw_tcp_device_configure, .proc_name = "iscsi_tcp", .this_id = -1, .track_queue_depth = 1, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5fafb2f95fd1a3..8936eb6ba60956 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -298,13 +298,17 @@ enum { /* do disk/partitions IO accounting */ BLK_FEAT_IO_STAT = (1u << 4), + + /* don't modify data until writeback is done */ + BLK_FEAT_STABLE_WRITES = (1u << 5), }; /* * Flags automatically inherited when stacking limits. */ #define BLK_FEAT_INHERIT_MASK \ - (BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_ROTATIONAL) + (BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_ROTATIONAL | \ + BLK_FEAT_STABLE_WRITES) /* internal flags in queue_limits.flags */ enum { @@ -565,7 +569,6 @@ struct request_queue { #define QUEUE_FLAG_SYNCHRONOUS 11 /* always completes in submit context */ #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ -#define QUEUE_FLAG_STABLE_WRITES 15 /* don't modify blks until WB is done */ #define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */ #define QUEUE_FLAG_DAX 19 /* device supports DAX */ #define QUEUE_FLAG_STATS 20 /* track IO start and completion times */ @@ -1323,7 +1326,7 @@ static inline bool bdev_stable_writes(struct block_device *bdev) if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && q->limits.integrity.csum_type != BLK_INTEGRITY_CSUM_NONE) return true; - return test_bit(QUEUE_FLAG_STABLE_WRITES, &q->queue_flags); + return q->limits.features & BLK_FEAT_STABLE_WRITES; } static inline bool blk_queue_write_cache(struct request_queue *q) From aadd5c59c910427c0464c217d5ed588ff14e2502 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:45 +0200 Subject: [PATCH 1143/1578] block: move the synchronous flag to queue_limits Move the synchronous flag into the queue_limits feature field so that it can be set atomically with the queue frozen. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-19-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 - drivers/block/brd.c | 2 +- drivers/block/zram/zram_drv.c | 4 ++-- drivers/nvdimm/btt.c | 3 +-- drivers/nvdimm/pmem.c | 4 ++-- include/linux/blkdev.h | 7 ++++--- 6 files changed, 10 insertions(+), 11 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index eb73f1d348e5a9..957774e40b1d0c 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -85,7 +85,6 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(SAME_COMP), QUEUE_FLAG_NAME(FAIL_IO), QUEUE_FLAG_NAME(NOXMERGES), - QUEUE_FLAG_NAME(SYNCHRONOUS), QUEUE_FLAG_NAME(SAME_FORCE), QUEUE_FLAG_NAME(INIT_DONE), QUEUE_FLAG_NAME(POLL), diff --git a/drivers/block/brd.c b/drivers/block/brd.c index b25dc463b5e3a6..d77deb571dbd06 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -335,6 +335,7 @@ static int brd_alloc(int i) .max_hw_discard_sectors = UINT_MAX, .max_discard_segments = 1, .discard_granularity = PAGE_SIZE, + .features = BLK_FEAT_SYNCHRONOUS, }; list_for_each_entry(brd, &brd_devices, brd_list) @@ -366,7 +367,6 @@ static int brd_alloc(int i) strscpy(disk->disk_name, buf, DISK_NAME_LEN); set_capacity(disk, rd_size * 2); - blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, disk->queue); blk_queue_flag_set(QUEUE_FLAG_NOWAIT, disk->queue); err = add_disk(disk); if (err) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index f8f1b5b54795ac..efcb8d9d274c31 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2208,7 +2208,8 @@ static int zram_add(void) #if ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE .max_write_zeroes_sectors = UINT_MAX, #endif - .features = BLK_FEAT_STABLE_WRITES, + .features = BLK_FEAT_STABLE_WRITES | + BLK_FEAT_SYNCHRONOUS, }; struct zram *zram; int ret, device_id; @@ -2246,7 +2247,6 @@ static int zram_add(void) /* Actual capacity set using sysfs (/sys/block/zram/disksize */ set_capacity(zram->disk, 0); - blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, zram->disk->queue); ret = device_add_disk(NULL, zram->disk, zram_disk_groups); if (ret) goto out_cleanup_disk; diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index e474afa8e9f68d..e79c06d65bb77b 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1501,6 +1501,7 @@ static int btt_blk_init(struct btt *btt) .logical_block_size = btt->sector_size, .max_hw_sectors = UINT_MAX, .max_integrity_segments = 1, + .features = BLK_FEAT_SYNCHRONOUS, }; int rc; @@ -1518,8 +1519,6 @@ static int btt_blk_init(struct btt *btt) btt->btt_disk->fops = &btt_fops; btt->btt_disk->private_data = btt; - blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, btt->btt_disk->queue); - set_capacity(btt->btt_disk, btt->nlba * btt->sector_size >> 9); rc = device_add_disk(&btt->nd_btt->dev, btt->btt_disk, NULL); if (rc) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 501cf226df0187..b821dcf018f6ae 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -455,7 +455,8 @@ static int pmem_attach_disk(struct device *dev, .logical_block_size = pmem_sector_size(ndns), .physical_block_size = PAGE_SIZE, .max_hw_sectors = UINT_MAX, - .features = BLK_FEAT_WRITE_CACHE, + .features = BLK_FEAT_WRITE_CACHE | + BLK_FEAT_SYNCHRONOUS, }; int nid = dev_to_node(dev), fua; struct resource *res = &nsio->res; @@ -546,7 +547,6 @@ static int pmem_attach_disk(struct device *dev, } pmem->virt_addr = addr; - blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, q); if (pmem->pfn_flags & PFN_MAP) blk_queue_flag_set(QUEUE_FLAG_DAX, q); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8936eb6ba60956..cee7b44a142513 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -301,6 +301,9 @@ enum { /* don't modify data until writeback is done */ BLK_FEAT_STABLE_WRITES = (1u << 5), + + /* always completes in submit context */ + BLK_FEAT_SYNCHRONOUS = (1u << 6), }; /* @@ -566,7 +569,6 @@ struct request_queue { #define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */ #define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */ #define QUEUE_FLAG_NOXMERGES 9 /* No extended merges */ -#define QUEUE_FLAG_SYNCHRONOUS 11 /* always completes in submit context */ #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ #define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */ @@ -1315,8 +1317,7 @@ static inline bool bdev_nonrot(struct block_device *bdev) static inline bool bdev_synchronous(struct block_device *bdev) { - return test_bit(QUEUE_FLAG_SYNCHRONOUS, - &bdev_get_queue(bdev)->queue_flags); + return bdev->bd_disk->queue->limits.features & BLK_FEAT_SYNCHRONOUS; } static inline bool bdev_stable_writes(struct block_device *bdev) From f76af42f8bf13d2620084f305f01691de9238fc7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:46 +0200 Subject: [PATCH 1144/1578] block: move the nowait flag to queue_limits Move the nowait flag into the queue_limits feature field so that it can be set atomically with the queue frozen. Stacking drivers are simplified in that they now can simply set the flag, and blk_stack_limits will clear it when the features is not supported by any of the underlying devices. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-20-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 - block/blk-mq.c | 2 +- block/blk-settings.c | 9 +++++++++ drivers/block/brd.c | 4 ++-- drivers/md/dm-table.c | 18 +++--------------- drivers/md/md.c | 18 +----------------- drivers/nvme/host/multipath.c | 3 +-- include/linux/blkdev.h | 9 +++++---- 8 files changed, 22 insertions(+), 42 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 957774e40b1d0c..62b132e9a9ce3b 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -96,7 +96,6 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(ZONE_RESETALL), QUEUE_FLAG_NAME(RQ_ALLOC_TIME), QUEUE_FLAG_NAME(HCTX_ACTIVE), - QUEUE_FLAG_NAME(NOWAIT), QUEUE_FLAG_NAME(SQ_SCHED), QUEUE_FLAG_NAME(SKIP_TAGSET_QUIESCE), }; diff --git a/block/blk-mq.c b/block/blk-mq.c index cf67dc13f7dd4c..43235acc87505f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4118,7 +4118,7 @@ struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, if (!lim) lim = &default_lim; - lim->features |= BLK_FEAT_IO_STAT; + lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; q = blk_alloc_queue(lim, set->numa_node); if (IS_ERR(q)) diff --git a/block/blk-settings.c b/block/blk-settings.c index 536ee202fcdccb..bf4622c19b5c09 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -459,6 +459,15 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->features |= (b->features & BLK_FEAT_INHERIT_MASK); + /* + * BLK_FEAT_NOWAIT needs to be supported both by the stacking driver + * and all underlying devices. The stacking driver sets the flag + * before stacking the limits, and this will clear the flag if any + * of the underlying devices does not support it. + */ + if (!(b->features & BLK_FEAT_NOWAIT)) + t->features &= ~BLK_FEAT_NOWAIT; + t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_user_sectors = min_not_zero(t->max_user_sectors, b->max_user_sectors); diff --git a/drivers/block/brd.c b/drivers/block/brd.c index d77deb571dbd06..a300645cd9d4a5 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -335,7 +335,8 @@ static int brd_alloc(int i) .max_hw_discard_sectors = UINT_MAX, .max_discard_segments = 1, .discard_granularity = PAGE_SIZE, - .features = BLK_FEAT_SYNCHRONOUS, + .features = BLK_FEAT_SYNCHRONOUS | + BLK_FEAT_NOWAIT, }; list_for_each_entry(brd, &brd_devices, brd_list) @@ -367,7 +368,6 @@ static int brd_alloc(int i) strscpy(disk->disk_name, buf, DISK_NAME_LEN); set_capacity(disk, rd_size * 2); - blk_queue_flag_set(QUEUE_FLAG_NOWAIT, disk->queue); err = add_disk(disk); if (err) goto out_cleanup_disk; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index aaf379cb15d91f..84d636712c7284 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -582,7 +582,7 @@ int dm_split_args(int *argc, char ***argvp, char *input) static void dm_set_stacking_limits(struct queue_limits *limits) { blk_set_stacking_limits(limits); - limits->features |= BLK_FEAT_IO_STAT; + limits->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; } /* @@ -1746,12 +1746,6 @@ static bool dm_table_supports_write_zeroes(struct dm_table *t) return true; } -static int device_not_nowait_capable(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) -{ - return !bdev_nowait(dev->bdev); -} - static bool dm_table_supports_nowait(struct dm_table *t) { for (unsigned int i = 0; i < t->num_targets; i++) { @@ -1759,10 +1753,6 @@ static bool dm_table_supports_nowait(struct dm_table *t) if (!dm_target_supports_nowait(ti->type)) return false; - - if (!ti->type->iterate_devices || - ti->type->iterate_devices(ti, device_not_nowait_capable, NULL)) - return false; } return true; @@ -1824,10 +1814,8 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, { int r; - if (dm_table_supports_nowait(t)) - blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q); - else - blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, q); + if (!dm_table_supports_nowait(t)) + limits->features &= ~BLK_FEAT_NOWAIT; if (!dm_table_supports_discards(t)) { limits->max_hw_discard_sectors = 0; diff --git a/drivers/md/md.c b/drivers/md/md.c index 8db0db8d5a27ac..f1c7d4f281c521 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5788,7 +5788,7 @@ struct mddev *md_alloc(dev_t dev, char *name) int error; struct queue_limits lim = { .features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | - BLK_FEAT_IO_STAT, + BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT, }; /* @@ -6150,13 +6150,6 @@ int md_run(struct mddev *mddev) } } - if (!mddev_is_dm(mddev)) { - struct request_queue *q = mddev->gendisk->queue; - - /* Set the NOWAIT flags if all underlying devices support it */ - if (nowait) - blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q); - } if (pers->sync_request) { if (mddev->kobj.sd && sysfs_create_group(&mddev->kobj, &md_redundancy_group)) @@ -7115,15 +7108,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); if (!mddev->thread) md_update_sb(mddev, 1); - /* - * If the new disk does not support REQ_NOWAIT, - * disable on the whole MD. - */ - if (!bdev_nowait(rdev->bdev)) { - pr_info("%s: Disabling nowait because %pg does not support nowait\n", - mdname(mddev), rdev->bdev); - blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->gendisk->queue); - } /* * Kick recovery, maybe this spare has to be added to the * array immediately. diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 173796f2ddea9f..61a162c9cf4e6c 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -538,7 +538,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) blk_set_stacking_limits(&lim); lim.dma_alignment = 3; - lim.features |= BLK_FEAT_IO_STAT; + lim.features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; if (head->ids.csi != NVME_CSI_ZNS) lim.max_zone_append_sectors = 0; @@ -550,7 +550,6 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) sprintf(head->disk->disk_name, "nvme%dn%d", ctrl->subsys->instance, head->instance); - blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue); /* * This assumes all controllers that refer to a namespace either * support poll queues or not. That is not a strict guarantee, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index cee7b44a142513..f3d4519d609d95 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -304,6 +304,9 @@ enum { /* always completes in submit context */ BLK_FEAT_SYNCHRONOUS = (1u << 6), + + /* supports REQ_NOWAIT */ + BLK_FEAT_NOWAIT = (1u << 7), }; /* @@ -580,12 +583,10 @@ struct request_queue { #define QUEUE_FLAG_ZONE_RESETALL 26 /* supports Zone Reset All */ #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ #define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */ -#define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */ #define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */ #define QUEUE_FLAG_SKIP_TAGSET_QUIESCE 31 /* quiesce_tagset skip the queue*/ -#define QUEUE_FLAG_MQ_DEFAULT ((1UL << QUEUE_FLAG_SAME_COMP) | \ - (1UL << QUEUE_FLAG_NOWAIT)) +#define QUEUE_FLAG_MQ_DEFAULT (1UL << QUEUE_FLAG_SAME_COMP) void blk_queue_flag_set(unsigned int flag, struct request_queue *q); void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); @@ -1348,7 +1349,7 @@ static inline bool bdev_fua(struct block_device *bdev) static inline bool bdev_nowait(struct block_device *bdev) { - return test_bit(QUEUE_FLAG_NOWAIT, &bdev_get_queue(bdev)->queue_flags); + return bdev->bd_disk->queue->limits.features & BLK_FEAT_NOWAIT; } static inline bool bdev_is_zoned(struct block_device *bdev) From f467fee48da4500786e145489787b37adae317c3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:47 +0200 Subject: [PATCH 1145/1578] block: move the dax flag to queue_limits Move the dax flag into the queue_limits feature field so that it can be set atomically with the queue frozen. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-21-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 - drivers/md/dm-table.c | 4 ++-- drivers/nvdimm/pmem.c | 7 ++----- drivers/s390/block/dcssblk.c | 2 +- include/linux/blkdev.h | 6 ++++-- 5 files changed, 9 insertions(+), 11 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 62b132e9a9ce3b..f4fa820251ce83 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -88,7 +88,6 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(SAME_FORCE), QUEUE_FLAG_NAME(INIT_DONE), QUEUE_FLAG_NAME(POLL), - QUEUE_FLAG_NAME(DAX), QUEUE_FLAG_NAME(STATS), QUEUE_FLAG_NAME(REGISTERED), QUEUE_FLAG_NAME(QUIESCED), diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 84d636712c7284..e44697037e86f4 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1834,11 +1834,11 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, limits->features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA; if (dm_table_supports_dax(t, device_not_dax_capable)) { - blk_queue_flag_set(QUEUE_FLAG_DAX, q); + limits->features |= BLK_FEAT_DAX; if (dm_table_supports_dax(t, device_not_dax_synchronous_capable)) set_dax_synchronous(t->md->dax_dev); } else - blk_queue_flag_clear(QUEUE_FLAG_DAX, q); + limits->features &= ~BLK_FEAT_DAX; if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL)) dax_write_cache(t->md->dax_dev, true); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index b821dcf018f6ae..1dd74c969d5a09 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -465,7 +465,6 @@ static int pmem_attach_disk(struct device *dev, struct dax_device *dax_dev; struct nd_pfn_sb *pfn_sb; struct pmem_device *pmem; - struct request_queue *q; struct gendisk *disk; void *addr; int rc; @@ -499,6 +498,8 @@ static int pmem_attach_disk(struct device *dev, } if (fua) lim.features |= BLK_FEAT_FUA; + if (is_nd_pfn(dev)) + lim.features |= BLK_FEAT_DAX; if (!devm_request_mem_region(dev, res->start, resource_size(res), dev_name(&ndns->dev))) { @@ -509,7 +510,6 @@ static int pmem_attach_disk(struct device *dev, disk = blk_alloc_disk(&lim, nid); if (IS_ERR(disk)) return PTR_ERR(disk); - q = disk->queue; pmem->disk = disk; pmem->pgmap.owner = pmem; @@ -547,9 +547,6 @@ static int pmem_attach_disk(struct device *dev, } pmem->virt_addr = addr; - if (pmem->pfn_flags & PFN_MAP) - blk_queue_flag_set(QUEUE_FLAG_DAX, q); - disk->fops = &pmem_fops; disk->private_data = pmem; nvdimm_namespace_disk_name(ndns, disk->disk_name); diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 6d1689a2717e5f..d5a5d11ae0dcdf 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -548,6 +548,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char { struct queue_limits lim = { .logical_block_size = 4096, + .features = BLK_FEAT_DAX, }; int rc, i, j, num_of_segments; struct dcssblk_dev_info *dev_info; @@ -643,7 +644,6 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char dev_info->gd->fops = &dcssblk_devops; dev_info->gd->private_data = dev_info; dev_info->gd->flags |= GENHD_FL_NO_PART; - blk_queue_flag_set(QUEUE_FLAG_DAX, dev_info->gd->queue); seg_byte_size = (dev_info->end - dev_info->start + 1); set_capacity(dev_info->gd, seg_byte_size >> 9); // size in sectors diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f3d4519d609d95..7022e06a3dd9a3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -307,6 +307,9 @@ enum { /* supports REQ_NOWAIT */ BLK_FEAT_NOWAIT = (1u << 7), + + /* supports DAX */ + BLK_FEAT_DAX = (1u << 8), }; /* @@ -575,7 +578,6 @@ struct request_queue { #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ #define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */ -#define QUEUE_FLAG_DAX 19 /* device supports DAX */ #define QUEUE_FLAG_STATS 20 /* track IO start and completion times */ #define QUEUE_FLAG_REGISTERED 22 /* queue has been registered to a disk */ #define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */ @@ -602,7 +604,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_io_stat(q) ((q)->limits.features & BLK_FEAT_IO_STAT) #define blk_queue_zone_resetall(q) \ test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags) -#define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags) +#define blk_queue_dax(q) ((q)->limits.features & BLK_FEAT_DAX) #define blk_queue_pci_p2pdma(q) \ test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags) #ifdef CONFIG_BLK_RQ_ALLOC_TIME From 8023e144f9d6e35f8786937e2f0c2fea0aba6dbc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:48 +0200 Subject: [PATCH 1146/1578] block: move the poll flag to queue_limits Move the poll flag into the queue_limits feature field so that it can be set atomically with the queue frozen. Stacking drivers are simplified in that they now can simply set the flag, and blk_stack_limits will clear it when the features is not supported by any of the underlying devices. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-22-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 5 ++-- block/blk-mq-debugfs.c | 1 - block/blk-mq.c | 31 +++++++++++--------- block/blk-settings.c | 10 ++++--- block/blk-sysfs.c | 4 +-- drivers/md/dm-table.c | 54 +++++++++-------------------------- drivers/nvme/host/multipath.c | 12 +------- include/linux/blkdev.h | 4 ++- 8 files changed, 45 insertions(+), 76 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 2b45a4df9a1aa1..8d9fbd353fc7fc 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -791,7 +791,7 @@ void submit_bio_noacct(struct bio *bio) } } - if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + if (!(q->limits.features & BLK_FEAT_POLL)) bio_clear_polled(bio); switch (bio_op(bio)) { @@ -915,8 +915,7 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) return 0; q = bdev_get_queue(bdev); - if (cookie == BLK_QC_T_NONE || - !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + if (cookie == BLK_QC_T_NONE || !(q->limits.features & BLK_FEAT_POLL)) return 0; blk_flush_plug(current->plug, false); diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index f4fa820251ce83..3a21527913840d 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -87,7 +87,6 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(NOXMERGES), QUEUE_FLAG_NAME(SAME_FORCE), QUEUE_FLAG_NAME(INIT_DONE), - QUEUE_FLAG_NAME(POLL), QUEUE_FLAG_NAME(STATS), QUEUE_FLAG_NAME(REGISTERED), QUEUE_FLAG_NAME(QUIESCED), diff --git a/block/blk-mq.c b/block/blk-mq.c index 43235acc87505f..e2b9710ddc5ad1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4109,6 +4109,12 @@ void blk_mq_release(struct request_queue *q) blk_mq_sysfs_deinit(q); } +static bool blk_mq_can_poll(struct blk_mq_tag_set *set) +{ + return set->nr_maps > HCTX_TYPE_POLL && + set->map[HCTX_TYPE_POLL].nr_queues; +} + struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, struct queue_limits *lim, void *queuedata) { @@ -4119,6 +4125,8 @@ struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, if (!lim) lim = &default_lim; lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; + if (blk_mq_can_poll(set)) + lim->features |= BLK_FEAT_POLL; q = blk_alloc_queue(lim, set->numa_node); if (IS_ERR(q)) @@ -4273,17 +4281,6 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, mutex_unlock(&q->sysfs_lock); } -static void blk_mq_update_poll_flag(struct request_queue *q) -{ - struct blk_mq_tag_set *set = q->tag_set; - - if (set->nr_maps > HCTX_TYPE_POLL && - set->map[HCTX_TYPE_POLL].nr_queues) - blk_queue_flag_set(QUEUE_FLAG_POLL, q); - else - blk_queue_flag_clear(QUEUE_FLAG_POLL, q); -} - int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q) { @@ -4311,7 +4308,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->tag_set = set; q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; - blk_mq_update_poll_flag(q); INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); INIT_LIST_HEAD(&q->flush_list); @@ -4798,8 +4794,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, fallback: blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { + struct queue_limits lim; + blk_mq_realloc_hw_ctxs(set, q); - blk_mq_update_poll_flag(q); + if (q->nr_hw_queues != set->nr_hw_queues) { int i = prev_nr_hw_queues; @@ -4811,6 +4809,13 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, set->nr_hw_queues = prev_nr_hw_queues; goto fallback; } + lim = queue_limits_start_update(q); + if (blk_mq_can_poll(set)) + lim.features |= BLK_FEAT_POLL; + else + lim.features &= ~BLK_FEAT_POLL; + if (queue_limits_commit_update(q, &lim) < 0) + pr_warn("updating the poll flag failed\n"); blk_mq_map_swqueue(q); } diff --git a/block/blk-settings.c b/block/blk-settings.c index bf4622c19b5c09..026ba68d829856 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -460,13 +460,15 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->features |= (b->features & BLK_FEAT_INHERIT_MASK); /* - * BLK_FEAT_NOWAIT needs to be supported both by the stacking driver - * and all underlying devices. The stacking driver sets the flag - * before stacking the limits, and this will clear the flag if any - * of the underlying devices does not support it. + * BLK_FEAT_NOWAIT and BLK_FEAT_POLL need to be supported both by the + * stacking driver and all underlying devices. The stacking driver sets + * the flags before stacking the limits, and this will clear the flags + * if any of the underlying devices does not support it. */ if (!(b->features & BLK_FEAT_NOWAIT)) t->features &= ~BLK_FEAT_NOWAIT; + if (!(b->features & BLK_FEAT_POLL)) + t->features &= ~BLK_FEAT_POLL; t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_user_sectors = min_not_zero(t->max_user_sectors, diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index cde525724831ef..da4e96d686f91e 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -394,13 +394,13 @@ static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page, static ssize_t queue_poll_show(struct request_queue *q, char *page) { - return queue_var_show(test_bit(QUEUE_FLAG_POLL, &q->queue_flags), page); + return queue_var_show(q->limits.features & BLK_FEAT_POLL, page); } static ssize_t queue_poll_store(struct request_queue *q, const char *page, size_t count) { - if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + if (!(q->limits.features & BLK_FEAT_POLL)) return -EINVAL; pr_info_ratelimited("writes to the poll attribute are ignored.\n"); pr_info_ratelimited("please use driver specific parameters instead.\n"); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index e44697037e86f4..ca1f136575cff4 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -582,7 +582,7 @@ int dm_split_args(int *argc, char ***argvp, char *input) static void dm_set_stacking_limits(struct queue_limits *limits) { blk_set_stacking_limits(limits); - limits->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; + limits->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT | BLK_FEAT_POLL; } /* @@ -1024,14 +1024,13 @@ bool dm_table_request_based(struct dm_table *t) return __table_type_request_based(dm_table_get_type(t)); } -static bool dm_table_supports_poll(struct dm_table *t); - static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md) { enum dm_queue_mode type = dm_table_get_type(t); unsigned int per_io_data_size = 0, front_pad, io_front_pad; unsigned int min_pool_size = 0, pool_size; struct dm_md_mempools *pools; + unsigned int bioset_flags = 0; if (unlikely(type == DM_TYPE_NONE)) { DMERR("no table type is set, can't allocate mempools"); @@ -1048,6 +1047,9 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * goto init_bs; } + if (md->queue->limits.features & BLK_FEAT_POLL) + bioset_flags |= BIOSET_PERCPU_CACHE; + for (unsigned int i = 0; i < t->num_targets; i++) { struct dm_target *ti = dm_table_get_target(t, i); @@ -1060,8 +1062,7 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * io_front_pad = roundup(per_io_data_size, __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET; - if (bioset_init(&pools->io_bs, pool_size, io_front_pad, - dm_table_supports_poll(t) ? BIOSET_PERCPU_CACHE : 0)) + if (bioset_init(&pools->io_bs, pool_size, io_front_pad, bioset_flags)) goto out_free_pools; if (t->integrity_supported && bioset_integrity_create(&pools->io_bs, pool_size)) @@ -1404,14 +1405,6 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) return &t->targets[(KEYS_PER_NODE * n) + k]; } -static int device_not_poll_capable(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) -{ - struct request_queue *q = bdev_get_queue(dev->bdev); - - return !test_bit(QUEUE_FLAG_POLL, &q->queue_flags); -} - /* * type->iterate_devices() should be called when the sanity check needs to * iterate and check all underlying data devices. iterate_devices() will @@ -1459,19 +1452,6 @@ static int count_device(struct dm_target *ti, struct dm_dev *dev, return 0; } -static bool dm_table_supports_poll(struct dm_table *t) -{ - for (unsigned int i = 0; i < t->num_targets; i++) { - struct dm_target *ti = dm_table_get_target(t, i); - - if (!ti->type->iterate_devices || - ti->type->iterate_devices(ti, device_not_poll_capable, NULL)) - return false; - } - - return true; -} - /* * Check whether a table has no data devices attached using each * target's iterate_devices method. @@ -1817,6 +1797,13 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (!dm_table_supports_nowait(t)) limits->features &= ~BLK_FEAT_NOWAIT; + /* + * The current polling impementation does not support request based + * stacking. + */ + if (!__table_type_bio_based(t->type)) + limits->features &= ~BLK_FEAT_POLL; + if (!dm_table_supports_discards(t)) { limits->max_hw_discard_sectors = 0; limits->discard_granularity = 0; @@ -1858,21 +1845,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, return r; dm_update_crypto_profile(q, t); - - /* - * Check for request-based device is left to - * dm_mq_init_request_queue()->blk_mq_init_allocated_queue(). - * - * For bio-based device, only set QUEUE_FLAG_POLL when all - * underlying devices supporting polling. - */ - if (__table_type_bio_based(t->type)) { - if (dm_table_supports_poll(t)) - blk_queue_flag_set(QUEUE_FLAG_POLL, q); - else - blk_queue_flag_clear(QUEUE_FLAG_POLL, q); - } - return 0; } diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 61a162c9cf4e6c..4933194d00e592 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -538,7 +538,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) blk_set_stacking_limits(&lim); lim.dma_alignment = 3; - lim.features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; + lim.features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT | BLK_FEAT_POLL; if (head->ids.csi != NVME_CSI_ZNS) lim.max_zone_append_sectors = 0; @@ -549,16 +549,6 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) head->disk->private_data = head; sprintf(head->disk->disk_name, "nvme%dn%d", ctrl->subsys->instance, head->instance); - - /* - * This assumes all controllers that refer to a namespace either - * support poll queues or not. That is not a strict guarantee, - * but if the assumption is wrong the effect is only suboptimal - * performance but not correctness problem. - */ - if (ctrl->tagset->nr_maps > HCTX_TYPE_POLL && - ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues) - blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue); return 0; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7022e06a3dd9a3..cd27b66cbacc00 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -310,6 +310,9 @@ enum { /* supports DAX */ BLK_FEAT_DAX = (1u << 8), + + /* supports I/O polling */ + BLK_FEAT_POLL = (1u << 9), }; /* @@ -577,7 +580,6 @@ struct request_queue { #define QUEUE_FLAG_NOXMERGES 9 /* No extended merges */ #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ -#define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */ #define QUEUE_FLAG_STATS 20 /* track IO start and completion times */ #define QUEUE_FLAG_REGISTERED 22 /* queue has been registered to a disk */ #define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */ From b1fc937a55f5735b98d9dceae5bb6ba262501f56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:49 +0200 Subject: [PATCH 1147/1578] block: move the zoned flag into the features field Move the zoned flags into the features field to reclaim a little bit of space. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-23-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 5 ++--- drivers/block/null_blk/zoned.c | 2 +- drivers/block/ublk_drv.c | 2 +- drivers/block/virtio_blk.c | 5 +++-- drivers/md/dm-table.c | 11 ++++++----- drivers/md/dm-zone.c | 2 +- drivers/md/dm-zoned-target.c | 2 +- drivers/nvme/host/zns.c | 2 +- drivers/scsi/sd_zbc.c | 2 +- include/linux/blkdev.h | 9 ++++++--- 10 files changed, 23 insertions(+), 19 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 026ba68d829856..96e07f24bd9aa1 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -68,7 +68,7 @@ static void blk_apply_bdi_limits(struct backing_dev_info *bdi, static int blk_validate_zoned_limits(struct queue_limits *lim) { - if (!lim->zoned) { + if (!(lim->features & BLK_FEAT_ZONED)) { if (WARN_ON_ONCE(lim->max_open_zones) || WARN_ON_ONCE(lim->max_active_zones) || WARN_ON_ONCE(lim->zone_write_granularity) || @@ -602,8 +602,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, b->max_secure_erase_sectors); t->zone_write_granularity = max(t->zone_write_granularity, b->zone_write_granularity); - t->zoned = max(t->zoned, b->zoned); - if (!t->zoned) { + if (!(t->features & BLK_FEAT_ZONED)) { t->zone_write_granularity = 0; t->max_zone_append_sectors = 0; } diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index f118d304f31080..ca8e739e76b981 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -158,7 +158,7 @@ int null_init_zoned_dev(struct nullb_device *dev, sector += dev->zone_size_sects; } - lim->zoned = true; + lim->features |= BLK_FEAT_ZONED; lim->chunk_sectors = dev->zone_size_sects; lim->max_zone_append_sectors = dev->zone_append_max_sectors; lim->max_open_zones = dev->zone_max_open; diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 4fcde099935868..69c16018cbb19a 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2196,7 +2196,7 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) return -EOPNOTSUPP; - lim.zoned = true; + lim.features |= BLK_FEAT_ZONED; lim.max_active_zones = p->max_active_zones; lim.max_open_zones = p->max_open_zones; lim.max_zone_append_sectors = p->max_zone_append_sectors; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 13a2f24f176628..cea45b296f8bec 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -728,7 +728,7 @@ static int virtblk_read_zoned_limits(struct virtio_blk *vblk, dev_dbg(&vdev->dev, "probing host-managed zoned device\n"); - lim->zoned = true; + lim->features |= BLK_FEAT_ZONED; virtio_cread(vdev, struct virtio_blk_config, zoned.max_open_zones, &v); @@ -1546,7 +1546,8 @@ static int virtblk_probe(struct virtio_device *vdev) * All steps that follow use the VQs therefore they need to be * placed after the virtio_device_ready() call above. */ - if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && lim.zoned) { + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + (lim.features & BLK_FEAT_ZONED)) { blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, vblk->disk->queue); err = blk_revalidate_disk_zones(vblk->disk); if (err) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index ca1f136575cff4..df6313c3fe6ba4 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1605,12 +1605,12 @@ int dm_calculate_queue_limits(struct dm_table *t, ti->type->iterate_devices(ti, dm_set_device_limits, &ti_limits); - if (!zoned && ti_limits.zoned) { + if (!zoned && (ti_limits.features & BLK_FEAT_ZONED)) { /* * After stacking all limits, validate all devices * in table support this zoned model and zone sectors. */ - zoned = ti_limits.zoned; + zoned = (ti_limits.features & BLK_FEAT_ZONED); zone_sectors = ti_limits.chunk_sectors; } @@ -1658,12 +1658,12 @@ int dm_calculate_queue_limits(struct dm_table *t, * zoned model on host-managed zoned block devices. * BUT... */ - if (limits->zoned) { + if (limits->features & BLK_FEAT_ZONED) { /* * ...IF the above limits stacking determined a zoned model * validate that all of the table's devices conform to it. */ - zoned = limits->zoned; + zoned = limits->features & BLK_FEAT_ZONED; zone_sectors = limits->chunk_sectors; } if (validate_hardware_zoned(t, zoned, zone_sectors)) @@ -1834,7 +1834,8 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, * For a zoned target, setup the zones related queue attributes * and resources necessary for zone append emulation if necessary. */ - if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && limits->zoned) { + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + (limits->features & limits->features & BLK_FEAT_ZONED)) { r = dm_set_zones_restrictions(t, q, limits); if (r) return r; diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 5d66d916730efa..88d313229b43ff 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -263,7 +263,7 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, if (nr_conv_zones >= ret) { lim->max_open_zones = 0; lim->max_active_zones = 0; - lim->zoned = false; + lim->features &= ~BLK_FEAT_ZONED; clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); disk->nr_zones = 0; return 0; diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 12236e6f46f39c..cd0ee144973f9f 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -1009,7 +1009,7 @@ static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits) limits->max_sectors = chunk_sectors; /* We are exposing a drive-managed zoned block device */ - limits->zoned = false; + limits->features &= ~BLK_FEAT_ZONED; } /* diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index 77aa0f440a6d2a..06f2417aa50de7 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -108,7 +108,7 @@ int nvme_query_zone_info(struct nvme_ns *ns, unsigned lbaf, void nvme_update_zone_info(struct nvme_ns *ns, struct queue_limits *lim, struct nvme_zone_info *zi) { - lim->zoned = 1; + lim->features |= BLK_FEAT_ZONED; lim->max_open_zones = zi->max_open_zones; lim->max_active_zones = zi->max_active_zones; lim->max_zone_append_sectors = ns->ctrl->max_zone_append; diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 360ec980499529..d3f84665946ec4 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -601,7 +601,7 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, struct queue_limits *lim, if (sdkp->device->type != TYPE_ZBC) return 0; - lim->zoned = true; + lim->features |= BLK_FEAT_ZONED; /* * Per ZBC and ZAC specifications, writes in sequential write required diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index cd27b66cbacc00..bdc30c1fb1b57b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -313,6 +313,9 @@ enum { /* supports I/O polling */ BLK_FEAT_POLL = (1u << 9), + + /* is a zoned device */ + BLK_FEAT_ZONED = (1u << 10), }; /* @@ -320,7 +323,7 @@ enum { */ #define BLK_FEAT_INHERIT_MASK \ (BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_ROTATIONAL | \ - BLK_FEAT_STABLE_WRITES) + BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED) /* internal flags in queue_limits.flags */ enum { @@ -372,7 +375,6 @@ struct queue_limits { unsigned char misaligned; unsigned char discard_misaligned; unsigned char raid_partial_stripes_expensive; - bool zoned; unsigned int max_open_zones; unsigned int max_active_zones; @@ -654,7 +656,8 @@ static inline enum rpm_status queue_rpm_status(struct request_queue *q) static inline bool blk_queue_is_zoned(struct request_queue *q) { - return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && q->limits.zoned; + return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + (q->limits.features & BLK_FEAT_ZONED); } #ifdef CONFIG_BLK_DEV_ZONED From a52758a39768f441e468a41da6c15a59d6d6011a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:50 +0200 Subject: [PATCH 1148/1578] block: move the zone_resetall flag to queue_limits Move the zone_resetall flag into the queue_limits feature field so that it can be set atomically with the queue frozen. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-24-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 - drivers/block/null_blk/zoned.c | 3 +-- drivers/block/ublk_drv.c | 4 +--- drivers/block/virtio_blk.c | 3 +-- drivers/nvme/host/zns.c | 3 +-- drivers/scsi/sd_zbc.c | 5 +---- include/linux/blkdev.h | 6 ++++-- 7 files changed, 9 insertions(+), 16 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 3a21527913840d..f2fd72f4414ae8 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -91,7 +91,6 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(REGISTERED), QUEUE_FLAG_NAME(QUIESCED), QUEUE_FLAG_NAME(PCI_P2PDMA), - QUEUE_FLAG_NAME(ZONE_RESETALL), QUEUE_FLAG_NAME(RQ_ALLOC_TIME), QUEUE_FLAG_NAME(HCTX_ACTIVE), QUEUE_FLAG_NAME(SQ_SCHED), diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index ca8e739e76b981..b42c00f1313254 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -158,7 +158,7 @@ int null_init_zoned_dev(struct nullb_device *dev, sector += dev->zone_size_sects; } - lim->features |= BLK_FEAT_ZONED; + lim->features |= BLK_FEAT_ZONED | BLK_FEAT_ZONE_RESETALL; lim->chunk_sectors = dev->zone_size_sects; lim->max_zone_append_sectors = dev->zone_append_max_sectors; lim->max_open_zones = dev->zone_max_open; @@ -171,7 +171,6 @@ int null_register_zoned_dev(struct nullb *nullb) struct request_queue *q = nullb->q; struct gendisk *disk = nullb->disk; - blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); disk->nr_zones = bdev_nr_zones(disk->part0); pr_info("%s: using %s zone append\n", diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 69c16018cbb19a..4fdff13fc23b8a 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -248,8 +248,6 @@ static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) static void ublk_dev_param_zoned_apply(struct ublk_device *ub) { - blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue); - ub->ub_disk->nr_zones = ublk_get_nr_zones(ub); } @@ -2196,7 +2194,7 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) return -EOPNOTSUPP; - lim.features |= BLK_FEAT_ZONED; + lim.features |= BLK_FEAT_ZONED | BLK_FEAT_ZONE_RESETALL; lim.max_active_zones = p->max_active_zones; lim.max_open_zones = p->max_open_zones; lim.max_zone_append_sectors = p->max_zone_append_sectors; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index cea45b296f8bec..6c64a67ab9c901 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -728,7 +728,7 @@ static int virtblk_read_zoned_limits(struct virtio_blk *vblk, dev_dbg(&vdev->dev, "probing host-managed zoned device\n"); - lim->features |= BLK_FEAT_ZONED; + lim->features |= BLK_FEAT_ZONED | BLK_FEAT_ZONE_RESETALL; virtio_cread(vdev, struct virtio_blk_config, zoned.max_open_zones, &v); @@ -1548,7 +1548,6 @@ static int virtblk_probe(struct virtio_device *vdev) */ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && (lim.features & BLK_FEAT_ZONED)) { - blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, vblk->disk->queue); err = blk_revalidate_disk_zones(vblk->disk); if (err) goto out_cleanup_disk; diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index 06f2417aa50de7..99bb89c2495ae3 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -108,13 +108,12 @@ int nvme_query_zone_info(struct nvme_ns *ns, unsigned lbaf, void nvme_update_zone_info(struct nvme_ns *ns, struct queue_limits *lim, struct nvme_zone_info *zi) { - lim->features |= BLK_FEAT_ZONED; + lim->features |= BLK_FEAT_ZONED | BLK_FEAT_ZONE_RESETALL; lim->max_open_zones = zi->max_open_zones; lim->max_active_zones = zi->max_active_zones; lim->max_zone_append_sectors = ns->ctrl->max_zone_append; lim->chunk_sectors = ns->head->zsze = nvme_lba_to_sect(ns->head, zi->zone_size); - blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ns->queue); } static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns, diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index d3f84665946ec4..f7067afac79c14 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -592,8 +592,6 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) int sd_zbc_read_zones(struct scsi_disk *sdkp, struct queue_limits *lim, u8 buf[SD_BUF_SIZE]) { - struct gendisk *disk = sdkp->disk; - struct request_queue *q = disk->queue; unsigned int nr_zones; u32 zone_blocks = 0; int ret; @@ -601,7 +599,7 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, struct queue_limits *lim, if (sdkp->device->type != TYPE_ZBC) return 0; - lim->features |= BLK_FEAT_ZONED; + lim->features |= BLK_FEAT_ZONED | BLK_FEAT_ZONE_RESETALL; /* * Per ZBC and ZAC specifications, writes in sequential write required @@ -630,7 +628,6 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, struct queue_limits *lim, sdkp->early_zone_info.zone_blocks = zone_blocks; /* The drive satisfies the kernel restrictions: set it up */ - blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); if (sdkp->zones_max_open == U32_MAX) lim->max_open_zones = 0; else diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bdc30c1fb1b57b..1077cb8d8fd808 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -316,6 +316,9 @@ enum { /* is a zoned device */ BLK_FEAT_ZONED = (1u << 10), + + /* supports Zone Reset All */ + BLK_FEAT_ZONE_RESETALL = (1u << 11), }; /* @@ -586,7 +589,6 @@ struct request_queue { #define QUEUE_FLAG_REGISTERED 22 /* queue has been registered to a disk */ #define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */ #define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */ -#define QUEUE_FLAG_ZONE_RESETALL 26 /* supports Zone Reset All */ #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ #define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */ #define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */ @@ -607,7 +609,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_nonrot(q) ((q)->limits.features & BLK_FEAT_ROTATIONAL) #define blk_queue_io_stat(q) ((q)->limits.features & BLK_FEAT_IO_STAT) #define blk_queue_zone_resetall(q) \ - test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags) + ((q)->limits.features & BLK_FEAT_ZONE_RESETALL) #define blk_queue_dax(q) ((q)->limits.features & BLK_FEAT_DAX) #define blk_queue_pci_p2pdma(q) \ test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags) From 9c1e42e3c876c66796eda23e79836a4d92613a61 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:51 +0200 Subject: [PATCH 1149/1578] block: move the pci_p2pdma flag to queue_limits Move the pci_p2pdma flag into the queue_limits feature field so that it can be set atomically with the queue frozen. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-25-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 - drivers/nvme/host/core.c | 8 +++----- include/linux/blkdev.h | 7 ++++--- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index f2fd72f4414ae8..8b5a68861c119b 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -90,7 +90,6 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(STATS), QUEUE_FLAG_NAME(REGISTERED), QUEUE_FLAG_NAME(QUIESCED), - QUEUE_FLAG_NAME(PCI_P2PDMA), QUEUE_FLAG_NAME(RQ_ALLOC_TIME), QUEUE_FLAG_NAME(HCTX_ACTIVE), QUEUE_FLAG_NAME(SQ_SCHED), diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5ecf762d7c8837..31e752e8d632cd 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3735,6 +3735,9 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) if (ctrl->opts && ctrl->opts->data_digest) lim.features |= BLK_FEAT_STABLE_WRITES; + if (ctrl->ops->supports_pci_p2pdma && + ctrl->ops->supports_pci_p2pdma(ctrl)) + lim.features |= BLK_FEAT_PCI_P2PDMA; disk = blk_mq_alloc_disk(ctrl->tagset, &lim, ns); if (IS_ERR(disk)) @@ -3744,11 +3747,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) ns->disk = disk; ns->queue = disk->queue; - - if (ctrl->ops->supports_pci_p2pdma && - ctrl->ops->supports_pci_p2pdma(ctrl)) - blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue); - ns->ctrl = ctrl; kref_init(&ns->kref); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1077cb8d8fd808..ab0f7dfba556eb 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -319,6 +319,9 @@ enum { /* supports Zone Reset All */ BLK_FEAT_ZONE_RESETALL = (1u << 11), + + /* supports PCI(e) p2p requests */ + BLK_FEAT_PCI_P2PDMA = (1u << 12), }; /* @@ -588,7 +591,6 @@ struct request_queue { #define QUEUE_FLAG_STATS 20 /* track IO start and completion times */ #define QUEUE_FLAG_REGISTERED 22 /* queue has been registered to a disk */ #define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */ -#define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */ #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ #define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */ #define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */ @@ -611,8 +613,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_zone_resetall(q) \ ((q)->limits.features & BLK_FEAT_ZONE_RESETALL) #define blk_queue_dax(q) ((q)->limits.features & BLK_FEAT_DAX) -#define blk_queue_pci_p2pdma(q) \ - test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags) +#define blk_queue_pci_p2pdma(q) ((q)->limits.features & BLK_FEAT_PCI_P2PDMA) #ifdef CONFIG_BLK_RQ_ALLOC_TIME #define blk_queue_rq_alloc_time(q) \ test_bit(QUEUE_FLAG_RQ_ALLOC_TIME, &(q)->queue_flags) From 8c8f5c85b20d0a7dc0ab9b2a17318130d69ceb5a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:52 +0200 Subject: [PATCH 1150/1578] block: move the skip_tagset_quiesce flag to queue_limits Move the skip_tagset_quiesce flag into the queue_limits feature field so that it can be set atomically with the queue frozen. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-26-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 - drivers/nvme/host/core.c | 8 +++++--- include/linux/blkdev.h | 6 ++++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 8b5a68861c119b..344f9e503bdb32 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -93,7 +93,6 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(RQ_ALLOC_TIME), QUEUE_FLAG_NAME(HCTX_ACTIVE), QUEUE_FLAG_NAME(SQ_SCHED), - QUEUE_FLAG_NAME(SKIP_TAGSET_QUIESCE), }; #undef QUEUE_FLAG_NAME diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 31e752e8d632cd..bf410d10b12006 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4489,13 +4489,15 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, return ret; if (ctrl->ops->flags & NVME_F_FABRICS) { - ctrl->connect_q = blk_mq_alloc_queue(set, NULL, NULL); + struct queue_limits lim = { + .features = BLK_FEAT_SKIP_TAGSET_QUIESCE, + }; + + ctrl->connect_q = blk_mq_alloc_queue(set, &lim, NULL); if (IS_ERR(ctrl->connect_q)) { ret = PTR_ERR(ctrl->connect_q); goto out_free_tag_set; } - blk_queue_flag_set(QUEUE_FLAG_SKIP_TAGSET_QUIESCE, - ctrl->connect_q); } ctrl->tagset = set; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ab0f7dfba556eb..2c433ebf6f2030 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -322,6 +322,9 @@ enum { /* supports PCI(e) p2p requests */ BLK_FEAT_PCI_P2PDMA = (1u << 12), + + /* skip this queue in blk_mq_(un)quiesce_tagset */ + BLK_FEAT_SKIP_TAGSET_QUIESCE = (1u << 13), }; /* @@ -594,7 +597,6 @@ struct request_queue { #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ #define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */ #define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */ -#define QUEUE_FLAG_SKIP_TAGSET_QUIESCE 31 /* quiesce_tagset skip the queue*/ #define QUEUE_FLAG_MQ_DEFAULT (1UL << QUEUE_FLAG_SAME_COMP) @@ -629,7 +631,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags) #define blk_queue_sq_sched(q) test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags) #define blk_queue_skip_tagset_quiesce(q) \ - test_bit(QUEUE_FLAG_SKIP_TAGSET_QUIESCE, &(q)->queue_flags) + ((q)->limits.features & BLK_FEAT_SKIP_TAGSET_QUIESCE) extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); From 339d3948c07b4aa2940aeb874294a7d6782cec16 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:53 +0200 Subject: [PATCH 1151/1578] block: move the bounce flag into the features field Move the bounce flag into the features field to reclaim a little bit of space. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-27-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 1 - block/blk.h | 2 +- drivers/scsi/scsi_lib.c | 2 +- include/linux/blkdev.h | 6 ++++-- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 96e07f24bd9aa1..d0e9096f93ca8a 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -479,7 +479,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, b->max_write_zeroes_sectors); t->max_zone_append_sectors = min(queue_limits_max_zone_append_sectors(t), queue_limits_max_zone_append_sectors(b)); - t->bounce = max(t->bounce, b->bounce); t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask); diff --git a/block/blk.h b/block/blk.h index 79e8d5d4fe0caf..fa32f7fad5d7e6 100644 --- a/block/blk.h +++ b/block/blk.h @@ -394,7 +394,7 @@ struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q); static inline bool blk_queue_may_bounce(struct request_queue *q) { return IS_ENABLED(CONFIG_BOUNCE) && - q->limits.bounce == BLK_BOUNCE_HIGH && + (q->limits.features & BLK_FEAT_BOUNCE_HIGH) && max_low_pfn >= max_pfn; } diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 54f771ec8cfb5e..e2f7bfb2b9e450 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1986,7 +1986,7 @@ void scsi_init_limits(struct Scsi_Host *shost, struct queue_limits *lim) shost->dma_alignment, dma_get_cache_alignment() - 1); if (shost->no_highmem) - lim->bounce = BLK_BOUNCE_HIGH; + lim->features |= BLK_FEAT_BOUNCE_HIGH; dma_set_seg_boundary(dev, shost->dma_boundary); dma_set_max_seg_size(dev, shost->max_segment_size); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2c433ebf6f2030..e96ba7b97288d2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -325,6 +325,9 @@ enum { /* skip this queue in blk_mq_(un)quiesce_tagset */ BLK_FEAT_SKIP_TAGSET_QUIESCE = (1u << 13), + + /* bounce all highmem pages */ + BLK_FEAT_BOUNCE_HIGH = (1u << 14), }; /* @@ -332,7 +335,7 @@ enum { */ #define BLK_FEAT_INHERIT_MASK \ (BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_ROTATIONAL | \ - BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED) + BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED | BLK_FEAT_BOUNCE_HIGH) /* internal flags in queue_limits.flags */ enum { @@ -352,7 +355,6 @@ enum blk_bounce { struct queue_limits { unsigned int features; unsigned int flags; - enum blk_bounce bounce; unsigned long seg_boundary_mask; unsigned long virt_boundary_mask; From 9a3bc8d16e0aacd65c31aaf23a2bced3288a7779 Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Thu, 13 Jun 2024 17:42:46 +0800 Subject: [PATCH 1152/1578] seg6: fix parameter passing when calling NF_HOOK() in End.DX4 and End.DX6 behaviors input_action_end_dx4() and input_action_end_dx6() are called NF_HOOK() for PREROUTING hook, in PREROUTING hook, we should passing a valid indev, and a NULL outdev to NF_HOOK(), otherwise may trigger a NULL pointer dereference, as below: [74830.647293] BUG: kernel NULL pointer dereference, address: 0000000000000090 [74830.655633] #PF: supervisor read access in kernel mode [74830.657888] #PF: error_code(0x0000) - not-present page [74830.659500] PGD 0 P4D 0 [74830.660450] Oops: 0000 [#1] PREEMPT SMP PTI ... [74830.664953] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 [74830.666569] RIP: 0010:rpfilter_mt+0x44/0x15e [ipt_rpfilter] ... [74830.689725] Call Trace: [74830.690402] [74830.690953] ? show_trace_log_lvl+0x1c4/0x2df [74830.692020] ? show_trace_log_lvl+0x1c4/0x2df [74830.693095] ? ipt_do_table+0x286/0x710 [ip_tables] [74830.694275] ? __die_body.cold+0x8/0xd [74830.695205] ? page_fault_oops+0xac/0x140 [74830.696244] ? exc_page_fault+0x62/0x150 [74830.697225] ? asm_exc_page_fault+0x22/0x30 [74830.698344] ? rpfilter_mt+0x44/0x15e [ipt_rpfilter] [74830.699540] ipt_do_table+0x286/0x710 [ip_tables] [74830.700758] ? ip6_route_input+0x19d/0x240 [74830.701752] nf_hook_slow+0x3f/0xb0 [74830.702678] input_action_end_dx4+0x19b/0x1e0 [74830.703735] ? input_action_end_t+0xe0/0xe0 [74830.704734] seg6_local_input_core+0x2d/0x60 [74830.705782] lwtunnel_input+0x5b/0xb0 [74830.706690] __netif_receive_skb_one_core+0x63/0xa0 [74830.707825] process_backlog+0x99/0x140 [74830.709538] __napi_poll+0x2c/0x160 [74830.710673] net_rx_action+0x296/0x350 [74830.711860] __do_softirq+0xcb/0x2ac [74830.713049] do_softirq+0x63/0x90 input_action_end_dx4() passing a NULL indev to NF_HOOK(), and finally trigger a NULL dereference in rpfilter_mt()->rpfilter_is_loopback(): static bool rpfilter_is_loopback(const struct sk_buff *skb, const struct net_device *in) { // in is NULL return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK; } Fixes: 7a3f5b0de364 ("netfilter: add netfilter hooks to SRv6 data plane") Signed-off-by: Jianguo Wu Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/ipv6/seg6_local.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 24e2b4b494cb08..c434940131b1d0 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -941,8 +941,8 @@ static int input_action_end_dx6(struct sk_buff *skb, if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, - dev_net(skb->dev), NULL, skb, NULL, - skb_dst(skb)->dev, input_action_end_dx6_finish); + dev_net(skb->dev), NULL, skb, skb->dev, + NULL, input_action_end_dx6_finish); return input_action_end_dx6_finish(dev_net(skb->dev), NULL, skb); drop: @@ -991,8 +991,8 @@ static int input_action_end_dx4(struct sk_buff *skb, if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, - dev_net(skb->dev), NULL, skb, NULL, - skb_dst(skb)->dev, input_action_end_dx4_finish); + dev_net(skb->dev), NULL, skb, skb->dev, + NULL, input_action_end_dx4_finish); return input_action_end_dx4_finish(dev_net(skb->dev), NULL, skb); drop: From 096597cfe4ea08b1830e775436d76d7c9d6d3037 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 18 Jun 2024 21:44:24 -0700 Subject: [PATCH 1153/1578] thermal: int340x: processor_thermal: Support shared interrupts On some systems the processor thermal device interrupt is shared with other PCI devices. In this case return IRQ_NONE from the interrupt handler when the interrupt is not for the processor thermal device. Signed-off-by: Srinivas Pandruvada Fixes: f0658708e863 ("thermal: int340x: processor_thermal: Use non MSI interrupts by default") Cc: 6.7+ # 6.7+ Signed-off-by: Rafael J. Wysocki --- .../intel/int340x_thermal/processor_thermal_device_pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c index 14e34eabc41913..4a1bfebb1b8e52 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c @@ -150,7 +150,7 @@ static irqreturn_t proc_thermal_irq_handler(int irq, void *devid) { struct proc_thermal_pci *pci_info = devid; struct proc_thermal_device *proc_priv; - int ret = IRQ_HANDLED; + int ret = IRQ_NONE; u32 status; proc_priv = pci_info->proc_priv; @@ -175,6 +175,7 @@ static irqreturn_t proc_thermal_irq_handler(int irq, void *devid) /* Disable enable interrupt flag */ proc_thermal_mmio_write(pci_info, PROC_THERMAL_MMIO_INT_ENABLE_0, 0); pkg_thermal_schedule_work(&pci_info->work); + ret = IRQ_HANDLED; } pci_write_config_byte(pci_info->pdev, 0xdc, 0x01); From 3e05b222382ec67dce7358d50b6006e91d028d8b Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Tue, 18 Jun 2024 22:06:18 -0400 Subject: [PATCH 1154/1578] io_uring: Fix probe of disabled operations io_probe checks io_issue_def->not_supported, but we never really set that field, as we mark non-supported functions through a specific ->prep handler. This means we end up returning IO_URING_OP_SUPPORTED, even for disabled operations. Fix it by just checking the prep handler itself. Fixes: 66f4af93da57 ("io_uring: add support for probing opcodes") Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20240619020620.5301-2-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/opdef.c | 8 ++++++++ io_uring/opdef.h | 4 ++-- io_uring/register.c | 2 +- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 2dd49cf22f642d..a2be3bbca5ffa1 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -751,6 +751,14 @@ const char *io_uring_get_opcode(u8 opcode) return "INVALID"; } +bool io_uring_op_supported(u8 opcode) +{ + if (opcode < IORING_OP_LAST && + io_issue_defs[opcode].prep != io_eopnotsupp_prep) + return true; + return false; +} + void __init io_uring_optable_init(void) { int i; diff --git a/io_uring/opdef.h b/io_uring/opdef.h index 7ee6f5aa90aa3c..14456436ff74ac 100644 --- a/io_uring/opdef.h +++ b/io_uring/opdef.h @@ -17,8 +17,6 @@ struct io_issue_def { unsigned poll_exclusive : 1; /* op supports buffer selection */ unsigned buffer_select : 1; - /* opcode is not supported by this kernel */ - unsigned not_supported : 1; /* skip auditing */ unsigned audit_skip : 1; /* supports ioprio */ @@ -47,5 +45,7 @@ struct io_cold_def { extern const struct io_issue_def io_issue_defs[]; extern const struct io_cold_def io_cold_defs[]; +bool io_uring_op_supported(u8 opcode); + void io_uring_optable_init(void); #endif diff --git a/io_uring/register.c b/io_uring/register.c index f121e02f5e10e6..e28cc226217cdb 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -59,7 +59,7 @@ static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, for (i = 0; i < nr_args; i++) { p->ops[i].op = i; - if (!io_issue_defs[i].not_supported) + if (io_uring_op_supported(i)) p->ops[i].flags = IO_URING_OP_SUPPORTED; } p->ops_len = i; From 6bc9199d0c84f5cd72922223231c7708698059a2 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Tue, 18 Jun 2024 22:06:19 -0400 Subject: [PATCH 1155/1578] io_uring: Allocate only necessary memory in io_probe We write at most IORING_OP_LAST entries in the probe buffer, so we don't need to allocate temporary space for more than that. As a side effect, we no longer can overflow "size". Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20240619020620.5301-3-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/register.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/io_uring/register.c b/io_uring/register.c index e28cc226217cdb..e3c20be5a19803 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -39,9 +39,10 @@ static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, size_t size; int i, ret; + if (nr_args > IORING_OP_LAST) + nr_args = IORING_OP_LAST; + size = struct_size(p, ops, nr_args); - if (size == SIZE_MAX) - return -EOVERFLOW; p = kzalloc(size, GFP_KERNEL); if (!p) return -ENOMEM; @@ -54,8 +55,6 @@ static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, goto out; p->last_op = IORING_OP_LAST - 1; - if (nr_args > IORING_OP_LAST) - nr_args = IORING_OP_LAST; for (i = 0; i < nr_args; i++) { p->ops[i].op = i; From d92a7580392ad4681b1d4f9275d00b95375ebe01 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 17 Jun 2024 17:26:37 +0200 Subject: [PATCH 1156/1578] drm/fbdev-dma: Only set smem_start is enable per module option Only export struct fb_info.fix.smem_start if that is required by the user and the memory does not come from vmalloc(). Setting struct fb_info.fix.smem_start breaks systems where DMA memory is backed by vmalloc address space. An example error is shown below. [ 3.536043] ------------[ cut here ]------------ [ 3.540716] virt_to_phys used for non-linear address: 000000007fc4f540 (0xffff800086001000) [ 3.552628] WARNING: CPU: 4 PID: 61 at arch/arm64/mm/physaddr.c:12 __virt_to_phys+0x68/0x98 [ 3.565455] Modules linked in: [ 3.568525] CPU: 4 PID: 61 Comm: kworker/u12:5 Not tainted 6.6.23-06226-g4986cc3e1b75-dirty #250 [ 3.577310] Hardware name: NXP i.MX95 19X19 board (DT) [ 3.582452] Workqueue: events_unbound deferred_probe_work_func [ 3.588291] pstate: 60400009 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 3.595233] pc : __virt_to_phys+0x68/0x98 [ 3.599246] lr : __virt_to_phys+0x68/0x98 [ 3.603276] sp : ffff800083603990 [ 3.677939] Call trace: [ 3.680393] __virt_to_phys+0x68/0x98 [ 3.684067] drm_fbdev_dma_helper_fb_probe+0x138/0x238 [ 3.689214] __drm_fb_helper_initial_config_and_unlock+0x2b0/0x4c0 [ 3.695385] drm_fb_helper_initial_config+0x4c/0x68 [ 3.700264] drm_fbdev_dma_client_hotplug+0x8c/0xe0 [ 3.705161] drm_client_register+0x60/0xb0 [ 3.709269] drm_fbdev_dma_setup+0x94/0x148 Additionally, DMA memory is assumed to by contiguous in physical address space, which is not guaranteed by vmalloc(). Resolve this by checking the module flag drm_leak_fbdev_smem when DRM allocated the instance of struct fb_info. Fbdev-dma then only sets smem_start only if required (via FBINFO_HIDE_SMEM_START). Also guarantee that the framebuffer is not located in vmalloc address space. Signed-off-by: Thomas Zimmermann Reported-by: Peng Fan (OSS) Closes: https://lore.kernel.org/dri-devel/20240604080328.4024838-1-peng.fan@oss.nxp.com/ Reported-by: Geert Uytterhoeven Closes: https://lore.kernel.org/dri-devel/CAMuHMdX3N0szUvt1VTbroa2zrT1Nye_VzPb5qqCZ7z5gSm7HGw@mail.gmail.com/ Fixes: a51c7663f144 ("drm/fb-helper: Consolidate CONFIG_DRM_FBDEV_LEAK_PHYS_SMEM") Tested-by: Geert Uytterhoeven Reviewed-by: Daniel Vetter Cc: # v6.4+ Link: https://patchwork.freedesktop.org/patch/msgid/20240617152843.11886-1-tzimmermann@suse.de --- drivers/gpu/drm/drm_fb_helper.c | 6 +++--- drivers/gpu/drm/drm_fbdev_dma.c | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c index d612133e2cf7ec..117237d3528bda 100644 --- a/drivers/gpu/drm/drm_fb_helper.c +++ b/drivers/gpu/drm/drm_fb_helper.c @@ -524,6 +524,9 @@ struct fb_info *drm_fb_helper_alloc_info(struct drm_fb_helper *fb_helper) if (!info) return ERR_PTR(-ENOMEM); + if (!drm_leak_fbdev_smem) + info->flags |= FBINFO_HIDE_SMEM_START; + ret = fb_alloc_cmap(&info->cmap, 256, 0); if (ret) goto err_release; @@ -1860,9 +1863,6 @@ __drm_fb_helper_initial_config_and_unlock(struct drm_fb_helper *fb_helper) info = fb_helper->info; info->var.pixclock = 0; - if (!drm_leak_fbdev_smem) - info->flags |= FBINFO_HIDE_SMEM_START; - /* Need to drop locks to avoid recursive deadlock in * register_framebuffer. This is ok because the only thing left to do is * register the fbdev emulation instance in kernel_fb_helper_list. */ diff --git a/drivers/gpu/drm/drm_fbdev_dma.c b/drivers/gpu/drm/drm_fbdev_dma.c index 6c9427bb4053ba..13cd754af311d1 100644 --- a/drivers/gpu/drm/drm_fbdev_dma.c +++ b/drivers/gpu/drm/drm_fbdev_dma.c @@ -130,7 +130,10 @@ static int drm_fbdev_dma_helper_fb_probe(struct drm_fb_helper *fb_helper, info->flags |= FBINFO_READS_FAST; /* signal caching */ info->screen_size = sizes->surface_height * fb->pitches[0]; info->screen_buffer = map.vaddr; - info->fix.smem_start = page_to_phys(virt_to_page(info->screen_buffer)); + if (!(info->flags & FBINFO_HIDE_SMEM_START)) { + if (!drm_WARN_ON(dev, is_vmalloc_addr(info->screen_buffer))) + info->fix.smem_start = page_to_phys(virt_to_page(info->screen_buffer)); + } info->fix.smem_len = info->screen_size; return 0; From a2225e0250c5fa397dcebf6ce65a9f05a114e0cf Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Thu, 13 Jun 2024 17:42:47 +0800 Subject: [PATCH 1157/1578] netfilter: move the sysctl nf_hooks_lwtunnel into the netfilter core Currently, the sysctl net.netfilter.nf_hooks_lwtunnel depends on the nf_conntrack module, but the nf_conntrack module is not always loaded. Therefore, accessing net.netfilter.nf_hooks_lwtunnel may have an error. Move sysctl nf_hooks_lwtunnel into the netfilter core. Fixes: 7a3f5b0de364 ("netfilter: add netfilter hooks to SRv6 data plane") Suggested-by: Pablo Neira Ayuso Signed-off-by: Jianguo Wu Signed-off-by: Pablo Neira Ayuso --- include/net/netns/netfilter.h | 3 ++ net/netfilter/core.c | 13 ++++- net/netfilter/nf_conntrack_standalone.c | 15 ------ net/netfilter/nf_hooks_lwtunnel.c | 67 +++++++++++++++++++++++++ net/netfilter/nf_internals.h | 6 +++ 5 files changed, 87 insertions(+), 17 deletions(-) diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h index 02bbdc577f8e28..a6a0bf4a247e51 100644 --- a/include/net/netns/netfilter.h +++ b/include/net/netns/netfilter.h @@ -15,6 +15,9 @@ struct netns_nf { const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO]; #ifdef CONFIG_SYSCTL struct ctl_table_header *nf_log_dir_header; +#ifdef CONFIG_LWTUNNEL + struct ctl_table_header *nf_lwtnl_dir_header; +#endif #endif struct nf_hook_entries __rcu *hooks_ipv4[NF_INET_NUMHOOKS]; struct nf_hook_entries __rcu *hooks_ipv6[NF_INET_NUMHOOKS]; diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 3126911f504259..b00fc285b33499 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -815,12 +815,21 @@ int __init netfilter_init(void) if (ret < 0) goto err; +#ifdef CONFIG_LWTUNNEL + ret = netfilter_lwtunnel_init(); + if (ret < 0) + goto err_lwtunnel_pernet; +#endif ret = netfilter_log_init(); if (ret < 0) - goto err_pernet; + goto err_log_pernet; return 0; -err_pernet: +err_log_pernet: +#ifdef CONFIG_LWTUNNEL + netfilter_lwtunnel_fini(); +err_lwtunnel_pernet: +#endif unregister_pernet_subsys(&netfilter_net_ops); err: return ret; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 74112e9c5dabc3..6c40bdf8b05ab5 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -22,9 +22,6 @@ #include #include #include -#ifdef CONFIG_LWTUNNEL -#include -#endif #include static bool enable_hooks __read_mostly; @@ -612,9 +609,6 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_PROTO_TIMEOUT_GRE, NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM, #endif -#ifdef CONFIG_LWTUNNEL - NF_SYSCTL_CT_LWTUNNEL, -#endif NF_SYSCTL_CT_LAST_SYSCTL, }; @@ -946,15 +940,6 @@ static struct ctl_table nf_ct_sysctl_table[] = { .proc_handler = proc_dointvec_jiffies, }, #endif -#ifdef CONFIG_LWTUNNEL - [NF_SYSCTL_CT_LWTUNNEL] = { - .procname = "nf_hooks_lwtunnel", - .data = NULL, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = nf_hooks_lwtunnel_sysctl_handler, - }, -#endif }; static struct ctl_table nf_ct_netfilter_table[] = { diff --git a/net/netfilter/nf_hooks_lwtunnel.c b/net/netfilter/nf_hooks_lwtunnel.c index 00e89ffd78f692..7cdb59bb4459f3 100644 --- a/net/netfilter/nf_hooks_lwtunnel.c +++ b/net/netfilter/nf_hooks_lwtunnel.c @@ -3,6 +3,9 @@ #include #include #include +#include + +#include "nf_internals.h" static inline int nf_hooks_lwtunnel_get(void) { @@ -50,4 +53,68 @@ int nf_hooks_lwtunnel_sysctl_handler(struct ctl_table *table, int write, return ret; } EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_sysctl_handler); + +static struct ctl_table nf_lwtunnel_sysctl_table[] = { + { + .procname = "nf_hooks_lwtunnel", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = nf_hooks_lwtunnel_sysctl_handler, + }, +}; + +static int __net_init nf_lwtunnel_net_init(struct net *net) +{ + struct ctl_table_header *hdr; + struct ctl_table *table; + + table = nf_lwtunnel_sysctl_table; + if (!net_eq(net, &init_net)) { + table = kmemdup(nf_lwtunnel_sysctl_table, + sizeof(nf_lwtunnel_sysctl_table), + GFP_KERNEL); + if (!table) + goto err_alloc; + } + + hdr = register_net_sysctl_sz(net, "net/netfilter", table, + ARRAY_SIZE(nf_lwtunnel_sysctl_table)); + if (!hdr) + goto err_reg; + + net->nf.nf_lwtnl_dir_header = hdr; + + return 0; +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static void __net_exit nf_lwtunnel_net_exit(struct net *net) +{ + const struct ctl_table *table; + + table = net->nf.nf_lwtnl_dir_header->ctl_table_arg; + unregister_net_sysctl_table(net->nf.nf_lwtnl_dir_header); + if (!net_eq(net, &init_net)) + kfree(table); +} + +static struct pernet_operations nf_lwtunnel_net_ops = { + .init = nf_lwtunnel_net_init, + .exit = nf_lwtunnel_net_exit, +}; + +int __init netfilter_lwtunnel_init(void) +{ + return register_pernet_subsys(&nf_lwtunnel_net_ops); +} + +void netfilter_lwtunnel_fini(void) +{ + unregister_pernet_subsys(&nf_lwtunnel_net_ops); +} #endif /* CONFIG_SYSCTL */ diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index 832ae64179f0f2..25403023060b6d 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -29,6 +29,12 @@ void nf_queue_nf_hook_drop(struct net *net); /* nf_log.c */ int __init netfilter_log_init(void); +#ifdef CONFIG_LWTUNNEL +/* nf_hooks_lwtunnel.c */ +int __init netfilter_lwtunnel_init(void); +void netfilter_lwtunnel_fini(void); +#endif + /* core.c */ void nf_hook_entries_delete_raw(struct nf_hook_entries __rcu **pp, const struct nf_hook_ops *reg); From 72e50ef99431163203657128050d0697caf3321d Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Thu, 13 Jun 2024 17:42:48 +0800 Subject: [PATCH 1158/1578] selftests: add selftest for the SRv6 End.DX4 behavior with netfilter this selftest is designed for evaluating the SRv6 End.DX4 behavior used with netfilter(rpfilter), in this example, for implementing IPv4 L3 VPN use cases. Signed-off-by: Jianguo Wu Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/config | 1 + .../net/srv6_end_dx4_netfilter_test.sh | 335 ++++++++++++++++++ 3 files changed, 337 insertions(+) create mode 100755 tools/testing/selftests/net/srv6_end_dx4_netfilter_test.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index bd01e4a0be2c29..7a5f7dd320deb3 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -43,6 +43,7 @@ TEST_PROGS += srv6_hl2encap_red_l2vpn_test.sh TEST_PROGS += srv6_end_next_csid_l3vpn_test.sh TEST_PROGS += srv6_end_x_next_csid_l3vpn_test.sh TEST_PROGS += srv6_end_flavors_test.sh +TEST_PROGS += srv6_end_dx4_netfilter_test.sh TEST_PROGS += vrf_strict_mode_test.sh TEST_PROGS += arp_ndisc_evict_nocarrier.sh TEST_PROGS += ndisc_unsolicited_na_test.sh diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index 04de7a6ba6f318..c2766e558f9253 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -101,3 +101,4 @@ CONFIG_NETFILTER_XT_MATCH_POLICY=m CONFIG_CRYPTO_ARIA=y CONFIG_XFRM_INTERFACE=m CONFIG_XFRM_USER=m +CONFIG_IP_NF_MATCH_RPFILTER=m diff --git a/tools/testing/selftests/net/srv6_end_dx4_netfilter_test.sh b/tools/testing/selftests/net/srv6_end_dx4_netfilter_test.sh new file mode 100755 index 00000000000000..e23210aa547fd5 --- /dev/null +++ b/tools/testing/selftests/net/srv6_end_dx4_netfilter_test.sh @@ -0,0 +1,335 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# author: Jianguo Wu +# +# Mostly copied from tools/testing/selftests/net/srv6_end_dt4_l3vpn_test.sh. +# +# This script is designed for testing the support of netfilter hooks for +# SRv6 End.DX4 behavior. +# +# Hereafter a network diagram is shown, where one tenants (named 100) offer +# IPv4 L3 VPN services allowing hosts to communicate with each other across +# an IPv6 network. +# +# Routers rt-1 and rt-2 implement IPv4 L3 VPN services leveraging the SRv6 +# architecture. The key components for such VPNs are: a) SRv6 Encap behavior, +# b) SRv6 End.DX4 behavior. +# +# To explain how an IPv4 L3 VPN based on SRv6 works, let us briefly consider an +# example where, within the same domain of tenant 100, the host hs-1 pings +# the host hs-2. +# +# First of all, L2 reachability of the host hs-2 is taken into account by +# the router rt-1 which acts as an arp proxy. +# +# When the host hs-1 sends an IPv4 packet destined to hs-2, the router rt-1 +# receives the packet on the internal veth-t100 interface, rt-1 contains the +# SRv6 Encap route for encapsulating the IPv4 packet in a IPv6 plus the Segment +# Routing Header (SRH) packet. This packet is sent through the (IPv6) core +# network up to the router rt-2 that receives it on veth0 interface. +# +# The rt-2 router uses the 'localsid' routing table to process incoming +# IPv6+SRH packets which belong to the VPN of the tenant 100. For each of these +# packets, the SRv6 End.DX4 behavior removes the outer IPv6+SRH headers and +# routs the packet to the specified nexthop. Afterwards, the packet is sent to +# the host hs-2 through the veth-t100 interface. +# +# The ping response follows the same processing but this time the role of rt-1 +# and rt-2 are swapped. +# +# And when net.netfilter.nf_hooks_lwtunnel is set to 1 in rt-1 or rt-2, and a +# rpfilter iptables rule is added, SRv6 packets will go through netfilter PREROUTING +# hooks. +# +# +# +-------------------+ +-------------------+ +# | | | | +# | hs-1 netns | | hs-2 netns | +# | | | | +# | +-------------+ | | +-------------+ | +# | | veth0 | | | | veth0 | | +# | | 10.0.0.1/24 | | | | 10.0.0.2/24 | | +# | +-------------+ | | +-------------+ | +# | . | | . | +# +-------------------+ +-------------------+ +# . . +# . . +# . . +# +-----------------------------------+ +-----------------------------------+ +# | . | | . | +# | +---------------+ | | +---------------- | +# | | veth-t100 | | | | veth-t100 | | +# | | 10.0.0.11/24 | +----------+ | | +----------+ | 10.0.0.22/24 | | +# | +-------+-------+ | route | | | | route | +-------+-------- | +# | | table | | | | table | | +# | +----------+ | | +----------+ | +# | +--------------+ | | +--------------+ | +# | | veth0 | | | | veth0 | | +# | | 2001:11::1/64 |.|...|.| 2001:11::2/64 | | +# | +--------------+ | | +--------------+ | +# | | | | +# | rt-1 netns | | rt-2 netns | +# | | | | +# +-----------------------------------+ +-----------------------------------+ +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# | Network configuration | +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# rt-1: localsid table +# +----------------------------------------------------------------+ +# |SID |Action | +# +----------------------------------------------------------------+ +# |fc00:21:100::6004|apply SRv6 End.DX4 nh4 10.0.0.1 dev veth-t100 | +# +----------------------------------------------------------------+ +# +# rt-1: route table +# +---------------------------------------------------+ +# |host |Action | +# +---------------------------------------------------+ +# |10.0.0.2 |apply seg6 encap segs fc00:12:100::6004| +# +---------------------------------------------------+ +# |10.0.0.0/24|forward to dev veth_t100 | +# +---------------------------------------------------+ +# +# +# rt-2: localsid table +# +---------------------------------------------------------------+ +# |SID |Action | +# +---------------------------------------------------------------+ +# |fc00:12:100::6004|apply SRv6 End.DX4 nh4 10.0.0.2 dev veth-t100| +# +---------------------------------------------------------------+ +# +# rt-2: route table +# +---------------------------------------------------+ +# |host |Action | +# +---------------------------------------------------+ +# |10.0.0.1 |apply seg6 encap segs fc00:21:100::6004| +# +---------------------------------------------------+ +# |10.0.0.0/24|forward to dev veth_t100 | +# +---------------------------------------------------+ +# + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +readonly IPv6_RT_NETWORK=2001:11 +readonly IPv4_HS_NETWORK=10.0.0 +readonly SID_LOCATOR=fc00 + +PING_TIMEOUT_SEC=4 + +ret=0 + +PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no} + +log_test() +{ + local rc=$1 + local expected=$2 + local msg="$3" + + if [ ${rc} -eq ${expected} ]; then + nsuccess=$((nsuccess+1)) + printf "\n TEST: %-60s [ OK ]\n" "${msg}" + else + ret=1 + nfail=$((nfail+1)) + printf "\n TEST: %-60s [FAIL]\n" "${msg}" + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read a + [ "$a" = "q" ] && exit 1 + fi + fi +} + +print_log_test_results() +{ + if [ "$TESTS" != "none" ]; then + printf "\nTests passed: %3d\n" ${nsuccess} + printf "Tests failed: %3d\n" ${nfail} + fi +} + +log_section() +{ + echo + echo "################################################################################" + echo "TEST SECTION: $*" + echo "################################################################################" +} + +cleanup() +{ + ip link del veth-rt-1 2>/dev/null || true + ip link del veth-rt-2 2>/dev/null || true + + # destroy routers rt-* and hosts hs-* + for ns in $(ip netns show | grep -E 'rt-*|hs-*'); do + ip netns del ${ns} || true + done +} + +# Setup the basic networking for the routers +setup_rt_networking() +{ + local rt=$1 + local nsname=rt-${rt} + + ip netns add ${nsname} + + ip netns exec ${nsname} sysctl -wq net.ipv6.conf.all.accept_dad=0 + ip netns exec ${nsname} sysctl -wq net.ipv6.conf.default.accept_dad=0 + + ip link set veth-rt-${rt} netns ${nsname} + ip -netns ${nsname} link set veth-rt-${rt} name veth0 + + ip -netns ${nsname} addr add ${IPv6_RT_NETWORK}::${rt}/64 dev veth0 nodad + ip -netns ${nsname} link set veth0 up + ip -netns ${nsname} link set lo up + + ip netns exec ${nsname} sysctl -wq net.ipv4.ip_forward=1 + ip netns exec ${nsname} sysctl -wq net.ipv6.conf.all.forwarding=1 +} + +setup_rt_netfilter() +{ + local rt=$1 + local nsname=rt-${rt} + + ip netns exec ${nsname} sysctl -wq net.netfilter.nf_hooks_lwtunnel=1 + ip netns exec ${nsname} iptables -t raw -A PREROUTING -m rpfilter --invert -j DROP +} + +setup_hs() +{ + local hs=$1 + local rt=$2 + local tid=$3 + local hsname=hs-${hs} + local rtname=rt-${rt} + local rtveth=veth-t${tid} + + # set the networking for the host + ip netns add ${hsname} + + ip -netns ${hsname} link add veth0 type veth peer name ${rtveth} + ip -netns ${hsname} link set ${rtveth} netns ${rtname} + ip -netns ${hsname} addr add ${IPv4_HS_NETWORK}.${hs}/24 dev veth0 + ip -netns ${hsname} link set veth0 up + ip -netns ${hsname} link set lo up + + ip -netns ${rtname} addr add ${IPv4_HS_NETWORK}.${rt}${hs}/24 dev ${rtveth} + ip -netns ${rtname} link set ${rtveth} up + + ip netns exec ${rtname} sysctl -wq net.ipv4.conf.${rtveth}.proxy_arp=1 +} + +setup_vpn_config() +{ + local hssrc=$1 + local rtsrc=$2 + local hsdst=$3 + local rtdst=$4 + local tid=$5 + + local hssrc_name=hs-t${tid}-${hssrc} + local hsdst_name=hs-t${tid}-${hsdst} + local rtsrc_name=rt-${rtsrc} + local rtdst_name=rt-${rtdst} + local vpn_sid=${SID_LOCATOR}:${hssrc}${hsdst}:${tid}::6004 + + # set the encap route for encapsulating packets which arrive from the + # host hssrc and destined to the access router rtsrc. + ip -netns ${rtsrc_name} -4 route add ${IPv4_HS_NETWORK}.${hsdst}/32 \ + encap seg6 mode encap segs ${vpn_sid} dev veth0 + ip -netns ${rtsrc_name} -6 route add ${vpn_sid}/128 \ + via 2001:11::${rtdst} dev veth0 + + # set the decap route for decapsulating packets which arrive from + # the rtdst router and destined to the hsdst host. + ip -netns ${rtdst_name} -6 route add ${vpn_sid}/128 \ + encap seg6local action End.DX4 nh4 ${IPv4_HS_NETWORK}.${hsdst} dev veth-t${tid} +} + +setup() +{ + ip link add veth-rt-1 type veth peer name veth-rt-2 + # setup the networking for router rt-1 and router rt-2 + setup_rt_networking 1 + setup_rt_networking 2 + + # setup two hosts for the tenant 100. + # - host hs-1 is directly connected to the router rt-1; + # - host hs-2 is directly connected to the router rt-2. + setup_hs 1 1 100 + setup_hs 2 2 100 + + # setup the IPv4 L3 VPN which connects the host hs-1 and host hs-2. + setup_vpn_config 1 1 2 2 100 #args: src_host src_router dst_host dst_router tenant + setup_vpn_config 2 2 1 1 100 +} + +check_hs_connectivity() +{ + local hssrc=$1 + local hsdst=$2 + local tid=$3 + + ip netns exec hs-${hssrc} ping -c 1 -W ${PING_TIMEOUT_SEC} \ + ${IPv4_HS_NETWORK}.${hsdst} >/dev/null 2>&1 +} + +check_and_log_hs_connectivity() +{ + local hssrc=$1 + local hsdst=$2 + local tid=$3 + + check_hs_connectivity ${hssrc} ${hsdst} ${tid} + log_test $? 0 "Hosts connectivity: hs-${hssrc} -> hs-${hsdst} (tenant ${tid})" +} + +host_tests() +{ + log_section "SRv6 VPN connectivity test among hosts in the same tenant" + + check_and_log_hs_connectivity 1 2 100 + check_and_log_hs_connectivity 2 1 100 +} + +router_netfilter_tests() +{ + log_section "SRv6 VPN connectivity test with netfilter enabled in routers" + setup_rt_netfilter 1 + setup_rt_netfilter 2 + + check_and_log_hs_connectivity 1 2 100 + check_and_log_hs_connectivity 2 1 100 +} + +if [ "$(id -u)" -ne 0 ];then + echo "SKIP: Need root privileges" + exit $ksft_skip +fi + +if [ ! -x "$(command -v ip)" ]; then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +cleanup &>/dev/null + +setup + +host_tests +router_netfilter_tests + +print_log_test_results + +cleanup &>/dev/null + +exit ${ret} From 221200ffeb065c6bbd196760c168b42305961655 Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Thu, 13 Jun 2024 17:42:49 +0800 Subject: [PATCH 1159/1578] selftests: add selftest for the SRv6 End.DX6 behavior with netfilter this selftest is designed for evaluating the SRv6 End.DX6 behavior used with netfilter(rpfilter), in this example, for implementing IPv6 L3 VPN use cases. Signed-off-by: Jianguo Wu Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/config | 1 + .../net/srv6_end_dx6_netfilter_test.sh | 340 ++++++++++++++++++ 3 files changed, 342 insertions(+) create mode 100755 tools/testing/selftests/net/srv6_end_dx6_netfilter_test.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 7a5f7dd320deb3..d9393569d03a49 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -44,6 +44,7 @@ TEST_PROGS += srv6_end_next_csid_l3vpn_test.sh TEST_PROGS += srv6_end_x_next_csid_l3vpn_test.sh TEST_PROGS += srv6_end_flavors_test.sh TEST_PROGS += srv6_end_dx4_netfilter_test.sh +TEST_PROGS += srv6_end_dx6_netfilter_test.sh TEST_PROGS += vrf_strict_mode_test.sh TEST_PROGS += arp_ndisc_evict_nocarrier.sh TEST_PROGS += ndisc_unsolicited_na_test.sh diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index c2766e558f9253..d4891f7a2bfae1 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -102,3 +102,4 @@ CONFIG_CRYPTO_ARIA=y CONFIG_XFRM_INTERFACE=m CONFIG_XFRM_USER=m CONFIG_IP_NF_MATCH_RPFILTER=m +CONFIG_IP6_NF_MATCH_RPFILTER=m diff --git a/tools/testing/selftests/net/srv6_end_dx6_netfilter_test.sh b/tools/testing/selftests/net/srv6_end_dx6_netfilter_test.sh new file mode 100755 index 00000000000000..9e69a2ed5bc345 --- /dev/null +++ b/tools/testing/selftests/net/srv6_end_dx6_netfilter_test.sh @@ -0,0 +1,340 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# author: Jianguo Wu +# +# Mostly copied from tools/testing/selftests/net/srv6_end_dt6_l3vpn_test.sh. +# +# This script is designed for testing the support of netfilter hooks for +# SRv6 End.DX4 behavior. +# +# Hereafter a network diagram is shown, where one tenants (named 100) offer +# IPv6 L3 VPN services allowing hosts to communicate with each other across +# an IPv6 network. +# +# Routers rt-1 and rt-2 implement IPv6 L3 VPN services leveraging the SRv6 +# architecture. The key components for such VPNs are: a) SRv6 Encap behavior, +# b) SRv6 End.DX4 behavior. +# +# To explain how an IPv6 L3 VPN based on SRv6 works, let us briefly consider an +# example where, within the same domain of tenant 100, the host hs-1 pings +# the host hs-2. +# +# First of all, L2 reachability of the host hs-2 is taken into account by +# the router rt-1 which acts as an arp proxy. +# +# When the host hs-1 sends an IPv6 packet destined to hs-2, the router rt-1 +# receives the packet on the internal veth-t100 interface, rt-1 contains the +# SRv6 Encap route for encapsulating the IPv6 packet in a IPv6 plus the Segment +# Routing Header (SRH) packet. This packet is sent through the (IPv6) core +# network up to the router rt-2 that receives it on veth0 interface. +# +# The rt-2 router uses the 'localsid' routing table to process incoming +# IPv6+SRH packets which belong to the VPN of the tenant 100. For each of these +# packets, the SRv6 End.DX4 behavior removes the outer IPv6+SRH headers and +# routs the packet to the specified nexthop. Afterwards, the packet is sent to +# the host hs-2 through the veth-t100 interface. +# +# The ping response follows the same processing but this time the role of rt-1 +# and rt-2 are swapped. +# +# And when net.netfilter.nf_hooks_lwtunnel is set to 1 in rt-1 or rt-2, and a +# rpfilter iptables rule is added, SRv6 packets will go through netfilter PREROUTING +# hooks. +# +# +# +-------------------+ +-------------------+ +# | | | | +# | hs-1 netns | | hs-2 netns | +# | | | | +# | +-------------+ | | +-------------+ | +# | | veth0 | | | | veth0 | | +# | | cafe::1/64 | | | | cafe::2/64 | | +# | +-------------+ | | +-------------+ | +# | . | | . | +# +-------------------+ +-------------------+ +# . . +# . . +# . . +# +-----------------------------------+ +-----------------------------------+ +# | . | | . | +# | +---------------+ | | +---------------- | +# | | veth-t100 | | | | veth-t100 | | +# | | cafe::11/64 | +----------+ | | +----------+ | cafe::22/64 | | +# | +-------+-------+ | route | | | | route | +-------+-------- | +# | | table | | | | table | | +# | +----------+ | | +----------+ | +# | +--------------+ | | +--------------+ | +# | | veth0 | | | | veth0 | | +# | | 2001:11::1/64 |.|...|.| 2001:11::2/64 | | +# | +--------------+ | | +--------------+ | +# | | | | +# | rt-1 netns | | rt-2 netns | +# | | | | +# +-----------------------------------+ +-----------------------------------+ +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# | Network configuration | +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# rt-1: localsid table +# +----------------------------------------------------------------+ +# |SID |Action | +# +----------------------------------------------------------------+ +# |fc00:21:100::6004|apply SRv6 End.DX6 nh6 cafe::1 dev veth-t100 | +# +----------------------------------------------------------------+ +# +# rt-1: route table +# +---------------------------------------------------+ +# |host |Action | +# +---------------------------------------------------+ +# |cafe::2 |apply seg6 encap segs fc00:12:100::6004| +# +---------------------------------------------------+ +# |cafe::/64 |forward to dev veth_t100 | +# +---------------------------------------------------+ +# +# +# rt-2: localsid table +# +---------------------------------------------------------------+ +# |SID |Action | +# +---------------------------------------------------------------+ +# |fc00:12:100::6004|apply SRv6 End.DX6 nh6 cafe::2 dev veth-t100 | +# +---------------------------------------------------------------+ +# +# rt-2: route table +# +---------------------------------------------------+ +# |host |Action | +# +---------------------------------------------------+ +# |cafe::1 |apply seg6 encap segs fc00:21:100::6004| +# +---------------------------------------------------+ +# |cafe::/64 |forward to dev veth_t100 | +# +---------------------------------------------------+ +# + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +readonly IPv6_RT_NETWORK=2001:11 +readonly IPv6_HS_NETWORK=cafe +readonly SID_LOCATOR=fc00 + +PING_TIMEOUT_SEC=4 + +ret=0 + +PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no} + +log_test() +{ + local rc=$1 + local expected=$2 + local msg="$3" + + if [ ${rc} -eq ${expected} ]; then + nsuccess=$((nsuccess+1)) + printf "\n TEST: %-60s [ OK ]\n" "${msg}" + else + ret=1 + nfail=$((nfail+1)) + printf "\n TEST: %-60s [FAIL]\n" "${msg}" + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read a + [ "$a" = "q" ] && exit 1 + fi + fi +} + +print_log_test_results() +{ + if [ "$TESTS" != "none" ]; then + printf "\nTests passed: %3d\n" ${nsuccess} + printf "Tests failed: %3d\n" ${nfail} + fi +} + +log_section() +{ + echo + echo "################################################################################" + echo "TEST SECTION: $*" + echo "################################################################################" +} + +cleanup() +{ + ip link del veth-rt-1 2>/dev/null || true + ip link del veth-rt-2 2>/dev/null || true + + # destroy routers rt-* and hosts hs-* + for ns in $(ip netns show | grep -E 'rt-*|hs-*'); do + ip netns del ${ns} || true + done +} + +# Setup the basic networking for the routers +setup_rt_networking() +{ + local rt=$1 + local nsname=rt-${rt} + + ip netns add ${nsname} + + ip netns exec ${nsname} sysctl -wq net.ipv6.conf.all.accept_dad=0 + ip netns exec ${nsname} sysctl -wq net.ipv6.conf.default.accept_dad=0 + + ip link set veth-rt-${rt} netns ${nsname} + ip -netns ${nsname} link set veth-rt-${rt} name veth0 + + ip -netns ${nsname} addr add ${IPv6_RT_NETWORK}::${rt}/64 dev veth0 nodad + ip -netns ${nsname} link set veth0 up + ip -netns ${nsname} link set lo up + + ip netns exec ${nsname} sysctl -wq net.ipv6.conf.all.forwarding=1 +} + +setup_rt_netfilter() +{ + local rt=$1 + local nsname=rt-${rt} + + ip netns exec ${nsname} sysctl -wq net.netfilter.nf_hooks_lwtunnel=1 + ip netns exec ${nsname} ip6tables -t raw -A PREROUTING -m rpfilter --invert -j DROP +} + +setup_hs() +{ + local hs=$1 + local rt=$2 + local tid=$3 + local hsname=hs-${hs} + local rtname=rt-${rt} + local rtveth=veth-t${tid} + + # set the networking for the host + ip netns add ${hsname} + + ip -netns ${hsname} link add veth0 type veth peer name ${rtveth} + ip -netns ${hsname} link set ${rtveth} netns ${rtname} + ip -netns ${hsname} addr add ${IPv6_HS_NETWORK}::${hs}/64 dev veth0 nodad + ip -netns ${hsname} link set veth0 up + ip -netns ${hsname} link set lo up + + ip -netns ${rtname} addr add ${IPv6_HS_NETWORK}::${rt}${hs}/64 dev ${rtveth} + ip -netns ${rtname} link set ${rtveth} up + + ip netns exec ${rtname} sysctl -wq net.ipv6.conf.all.accept_dad=0 + ip netns exec ${rtname} sysctl -wq net.ipv6.conf.default.accept_dad=0 + + ip netns exec ${rtname} sysctl -wq net.ipv6.conf.${rtveth}.proxy_ndp=1 +} + +setup_vpn_config() +{ + local hssrc=$1 + local rtsrc=$2 + local hsdst=$3 + local rtdst=$4 + local tid=$5 + + local hssrc_name=hs-t${tid}-${hssrc} + local hsdst_name=hs-t${tid}-${hsdst} + local rtsrc_name=rt-${rtsrc} + local rtdst_name=rt-${rtdst} + local rtveth=veth-t${tid} + local vpn_sid=${SID_LOCATOR}:${hssrc}${hsdst}:${tid}::6004 + + ip -netns ${rtsrc_name} -6 neigh add proxy ${IPv6_HS_NETWORK}::${hsdst} dev ${rtveth} + + # set the encap route for encapsulating packets which arrive from the + # host hssrc and destined to the access router rtsrc. + ip -netns ${rtsrc_name} -6 route add ${IPv6_HS_NETWORK}::${hsdst}/128 \ + encap seg6 mode encap segs ${vpn_sid} dev veth0 + ip -netns ${rtsrc_name} -6 route add ${vpn_sid}/128 \ + via 2001:11::${rtdst} dev veth0 + + # set the decap route for decapsulating packets which arrive from + # the rtdst router and destined to the hsdst host. + ip -netns ${rtdst_name} -6 route add ${vpn_sid}/128 \ + encap seg6local action End.DX6 nh6 ${IPv6_HS_NETWORK}::${hsdst} dev veth-t${tid} +} + +setup() +{ + ip link add veth-rt-1 type veth peer name veth-rt-2 + # setup the networking for router rt-1 and router rt-2 + setup_rt_networking 1 + setup_rt_networking 2 + + # setup two hosts for the tenant 100. + # - host hs-1 is directly connected to the router rt-1; + # - host hs-2 is directly connected to the router rt-2. + setup_hs 1 1 100 + setup_hs 2 2 100 + + # setup the IPv4 L3 VPN which connects the host hs-1 and host hs-2. + setup_vpn_config 1 1 2 2 100 #args: src_host src_router dst_host dst_router tenant + setup_vpn_config 2 2 1 1 100 +} + +check_hs_connectivity() +{ + local hssrc=$1 + local hsdst=$2 + local tid=$3 + + ip netns exec hs-${hssrc} ping -6 -c 1 -W ${PING_TIMEOUT_SEC} \ + ${IPv6_HS_NETWORK}::${hsdst} >/dev/null 2>&1 +} + +check_and_log_hs_connectivity() +{ + local hssrc=$1 + local hsdst=$2 + local tid=$3 + + check_hs_connectivity ${hssrc} ${hsdst} ${tid} + log_test $? 0 "Hosts connectivity: hs-${hssrc} -> hs-${hsdst} (tenant ${tid})" +} + +host_tests() +{ + log_section "SRv6 VPN connectivity test among hosts in the same tenant" + + check_and_log_hs_connectivity 1 2 100 + check_and_log_hs_connectivity 2 1 100 +} + +router_netfilter_tests() +{ + log_section "SRv6 VPN connectivity test with netfilter enabled in routers" + setup_rt_netfilter 1 + setup_rt_netfilter 2 + + check_and_log_hs_connectivity 1 2 100 + check_and_log_hs_connectivity 2 1 100 +} + +if [ "$(id -u)" -ne 0 ];then + echo "SKIP: Need root privileges" + exit $ksft_skip +fi + +if [ ! -x "$(command -v ip)" ]; then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +cleanup &>/dev/null + +setup + +host_tests +router_netfilter_tests + +print_log_test_results + +cleanup &>/dev/null + +exit ${ret} From 98d919dfee1cc402ca29d45da642852d7c9a2301 Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Mon, 17 Jun 2024 12:58:34 +0530 Subject: [PATCH 1160/1578] ASoC: amd: acp: add a null check for chip_pdev structure When acp platform device creation is skipped, chip->chip_pdev value will remain NULL. Add NULL check for chip->chip_pdev structure in snd_acp_resume() function to avoid null pointer dereference. Fixes: 088a40980efb ("ASoC: amd: acp: add pm ops support for acp pci driver") Signed-off-by: Vijendar Mukunda Link: https://msgid.link/r/20240617072844.871468-1-Vijendar.Mukunda@amd.com Signed-off-by: Mark Brown --- sound/soc/amd/acp/acp-pci.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sound/soc/amd/acp/acp-pci.c b/sound/soc/amd/acp/acp-pci.c index ad320b29e87dcb..aa3e72d134518a 100644 --- a/sound/soc/amd/acp/acp-pci.c +++ b/sound/soc/amd/acp/acp-pci.c @@ -199,10 +199,12 @@ static int __maybe_unused snd_acp_resume(struct device *dev) ret = acp_init(chip); if (ret) dev_err(dev, "ACP init failed\n"); - child = chip->chip_pdev->dev; - adata = dev_get_drvdata(&child); - if (adata) - acp_enable_interrupts(adata); + if (chip->chip_pdev) { + child = chip->chip_pdev->dev; + adata = dev_get_drvdata(&child); + if (adata) + acp_enable_interrupts(adata); + } return ret; } From 70fa3900c3ed92158628710e81d274e5cb52f92b Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Mon, 17 Jun 2024 12:58:35 +0530 Subject: [PATCH 1161/1578] ASoC: amd: acp: remove i2s configuration check in acp_i2s_probe() ACP supports different pin configurations for I2S IO. Checking ACP pin configuration value against specific value breaks the functionality for other I2S pin configurations. This check is no longer required in i2s dai driver probe call as i2s configuration check will be verified during acp platform device creation sequence. Remove i2s_mode check in acp_i2s_probe() function. Fixes: b24484c18b10 ("ASoC: amd: acp: ACP code generic to support newer platforms") Signed-off-by: Vijendar Mukunda Link: https://msgid.link/r/20240617072844.871468-2-Vijendar.Mukunda@amd.com Signed-off-by: Mark Brown --- sound/soc/amd/acp/acp-i2s.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/sound/soc/amd/acp/acp-i2s.c b/sound/soc/amd/acp/acp-i2s.c index 60cbc881be6e10..ef12f97ddc69ec 100644 --- a/sound/soc/amd/acp/acp-i2s.c +++ b/sound/soc/amd/acp/acp-i2s.c @@ -588,20 +588,12 @@ static int acp_i2s_probe(struct snd_soc_dai *dai) { struct device *dev = dai->component->dev; struct acp_dev_data *adata = dev_get_drvdata(dev); - struct acp_resource *rsrc = adata->rsrc; - unsigned int val; if (!adata->acp_base) { dev_err(dev, "I2S base is NULL\n"); return -EINVAL; } - val = readl(adata->acp_base + rsrc->i2s_pin_cfg_offset); - if (val != rsrc->i2s_mode) { - dev_err(dev, "I2S Mode not supported val %x\n", val); - return -EINVAL; - } - return 0; } From 379bcd2c9197bf2c429434e8a01cea0ee1852316 Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Mon, 17 Jun 2024 12:58:36 +0530 Subject: [PATCH 1162/1578] ASoC: amd: acp: move chip->flag variable assignment chip->flag variable assignment will be skipped when acp platform device creation is skipped. In this case chip>flag value will not be set. chip->flag variable should be assigned along with other structure variables for 'chip' structure. Move chip->flag variable assignment prior to acp platform device creation. Fixes: 3a94c8ad0aae ("ASoC: amd: acp: add code for scanning acp pdm controller") Signed-off-by: Vijendar Mukunda Link: https://msgid.link/r/20240617072844.871468-3-Vijendar.Mukunda@amd.com Signed-off-by: Mark Brown --- sound/soc/amd/acp/acp-pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/amd/acp/acp-pci.c b/sound/soc/amd/acp/acp-pci.c index aa3e72d134518a..777b5a78d8a9ef 100644 --- a/sound/soc/amd/acp/acp-pci.c +++ b/sound/soc/amd/acp/acp-pci.c @@ -100,6 +100,7 @@ static int acp_pci_probe(struct pci_dev *pci, const struct pci_device_id *pci_id ret = -EINVAL; goto release_regions; } + chip->flag = flag; dmic_dev = platform_device_register_data(dev, "dmic-codec", PLATFORM_DEVID_NONE, NULL, 0); if (IS_ERR(dmic_dev)) { dev_err(dev, "failed to create DMIC device\n"); @@ -139,7 +140,6 @@ static int acp_pci_probe(struct pci_dev *pci, const struct pci_device_id *pci_id } } - chip->flag = flag; memset(&pdevinfo, 0, sizeof(pdevinfo)); pdevinfo.name = chip->name; From e2654a4453ba3dac9baacf9980d841d84e15b869 Mon Sep 17 00:00:00 2001 From: Roman Li Date: Tue, 7 May 2024 16:26:08 -0400 Subject: [PATCH 1163/1578] drm/amd/display: Remove redundant idle optimization check [Why] Disable idle optimization for each atomic commit is unnecessary, and can lead to a potential race condition. [How] Remove idle optimization check from amdgpu_dm_atomic_commit_tail() Fixes: 196107eb1e15 ("drm/amd/display: Add IPS checks before dcn register access") Cc: stable@vger.kernel.org Reviewed-by: Hamza Mahfooz Acked-by: Roman Li Signed-off-by: Roman Li Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index f1d67c6f4b98f4..e426adf95d7de6 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -9169,9 +9169,6 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state) trace_amdgpu_dm_atomic_commit_tail_begin(state); - if (dm->dc->caps.ips_support && dm->dc->idle_optimizations_allowed) - dc_allow_idle_optimizations(dm->dc, false); - drm_atomic_helper_update_legacy_modeset_state(dev, state); drm_dp_mst_atomic_wait_for_dependencies(state); From 84801d4f1e4fbd2c44dddecaec9099bdff100a42 Mon Sep 17 00:00:00 2001 From: Yunxiang Li Date: Thu, 23 May 2024 07:48:19 -0400 Subject: [PATCH 1164/1578] drm/amdgpu: fix locking scope when flushing tlb MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Which method is used to flush tlb does not depend on whether a reset is in progress or not. We should skip flush altogether if the GPU will get reset. So put both path under reset_domain read lock. Signed-off-by: Yunxiang Li Reviewed-by: Christian König Signed-off-by: Alex Deucher CC: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 66 +++++++++++++------------ 1 file changed, 34 insertions(+), 32 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c index be4629cdac0491..08b9dfb6533552 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c @@ -684,12 +684,17 @@ int amdgpu_gmc_flush_gpu_tlb_pasid(struct amdgpu_device *adev, uint16_t pasid, struct amdgpu_ring *ring = &adev->gfx.kiq[inst].ring; struct amdgpu_kiq *kiq = &adev->gfx.kiq[inst]; unsigned int ndw; - signed long r; + int r; uint32_t seq; - if (!adev->gmc.flush_pasid_uses_kiq || !ring->sched.ready || - !down_read_trylock(&adev->reset_domain->sem)) { + /* + * A GPU reset should flush all TLBs anyway, so no need to do + * this while one is ongoing. + */ + if (!down_read_trylock(&adev->reset_domain->sem)) + return 0; + if (!adev->gmc.flush_pasid_uses_kiq || !ring->sched.ready) { if (adev->gmc.flush_tlb_needs_extra_type_2) adev->gmc.gmc_funcs->flush_gpu_tlb_pasid(adev, pasid, 2, all_hub, @@ -703,43 +708,40 @@ int amdgpu_gmc_flush_gpu_tlb_pasid(struct amdgpu_device *adev, uint16_t pasid, adev->gmc.gmc_funcs->flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub, inst); - return 0; - } + r = 0; + } else { + /* 2 dwords flush + 8 dwords fence */ + ndw = kiq->pmf->invalidate_tlbs_size + 8; - /* 2 dwords flush + 8 dwords fence */ - ndw = kiq->pmf->invalidate_tlbs_size + 8; + if (adev->gmc.flush_tlb_needs_extra_type_2) + ndw += kiq->pmf->invalidate_tlbs_size; - if (adev->gmc.flush_tlb_needs_extra_type_2) - ndw += kiq->pmf->invalidate_tlbs_size; + if (adev->gmc.flush_tlb_needs_extra_type_0) + ndw += kiq->pmf->invalidate_tlbs_size; - if (adev->gmc.flush_tlb_needs_extra_type_0) - ndw += kiq->pmf->invalidate_tlbs_size; + spin_lock(&adev->gfx.kiq[inst].ring_lock); + amdgpu_ring_alloc(ring, ndw); + if (adev->gmc.flush_tlb_needs_extra_type_2) + kiq->pmf->kiq_invalidate_tlbs(ring, pasid, 2, all_hub); - spin_lock(&adev->gfx.kiq[inst].ring_lock); - amdgpu_ring_alloc(ring, ndw); - if (adev->gmc.flush_tlb_needs_extra_type_2) - kiq->pmf->kiq_invalidate_tlbs(ring, pasid, 2, all_hub); + if (flush_type == 2 && adev->gmc.flush_tlb_needs_extra_type_0) + kiq->pmf->kiq_invalidate_tlbs(ring, pasid, 0, all_hub); - if (flush_type == 2 && adev->gmc.flush_tlb_needs_extra_type_0) - kiq->pmf->kiq_invalidate_tlbs(ring, pasid, 0, all_hub); + kiq->pmf->kiq_invalidate_tlbs(ring, pasid, flush_type, all_hub); + r = amdgpu_fence_emit_polling(ring, &seq, MAX_KIQ_REG_WAIT); + if (r) { + amdgpu_ring_undo(ring); + spin_unlock(&adev->gfx.kiq[inst].ring_lock); + goto error_unlock_reset; + } - kiq->pmf->kiq_invalidate_tlbs(ring, pasid, flush_type, all_hub); - r = amdgpu_fence_emit_polling(ring, &seq, MAX_KIQ_REG_WAIT); - if (r) { - amdgpu_ring_undo(ring); + amdgpu_ring_commit(ring); spin_unlock(&adev->gfx.kiq[inst].ring_lock); - goto error_unlock_reset; - } - - amdgpu_ring_commit(ring); - spin_unlock(&adev->gfx.kiq[inst].ring_lock); - r = amdgpu_fence_wait_polling(ring, seq, usec_timeout); - if (r < 1) { - dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r); - r = -ETIME; - goto error_unlock_reset; + if (amdgpu_fence_wait_polling(ring, seq, usec_timeout) < 1) { + dev_err(adev->dev, "timeout waiting for kiq fence\n"); + r = -ETIME; + } } - r = 0; error_unlock_reset: up_read(&adev->reset_domain->sem); From 56342da3d8cc15efe9df7f29985ba8d256bdc258 Mon Sep 17 00:00:00 2001 From: Hamza Mahfooz Date: Mon, 3 Jun 2024 10:16:45 -0400 Subject: [PATCH 1165/1578] drm/amd/display: prevent register access while in IPS We can't read/write to DCN registers while in IPS. Since, that can cause the system to hang. So, before proceeding with the access in that scenario, force the system out of IPS. Cc: stable@vger.kernel.org # 6.6+ Reviewed-by: Roman Li Signed-off-by: Hamza Mahfooz Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index e426adf95d7de6..e9ac20bed0f2b7 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -11437,6 +11437,12 @@ void amdgpu_dm_trigger_timing_sync(struct drm_device *dev) mutex_unlock(&adev->dm.dc_lock); } +static inline void amdgpu_dm_exit_ips_for_hw_access(struct dc *dc) +{ + if (dc->ctx->dmub_srv && !dc->ctx->dmub_srv->idle_exit_counter) + dc_exit_ips_for_hw_access(dc); +} + void dm_write_reg_func(const struct dc_context *ctx, uint32_t address, u32 value, const char *func_name) { @@ -11447,6 +11453,8 @@ void dm_write_reg_func(const struct dc_context *ctx, uint32_t address, return; } #endif + + amdgpu_dm_exit_ips_for_hw_access(ctx->dc); cgs_write_register(ctx->cgs_device, address, value); trace_amdgpu_dc_wreg(&ctx->perf_trace->write_count, address, value); } @@ -11470,6 +11478,8 @@ uint32_t dm_read_reg_func(const struct dc_context *ctx, uint32_t address, return 0; } + amdgpu_dm_exit_ips_for_hw_access(ctx->dc); + value = cgs_read_register(ctx->cgs_device, address); trace_amdgpu_dc_rreg(&ctx->perf_trace->read_count, address, value); From 49c9ffabde555c841392858d8b9e6cf58998a50c Mon Sep 17 00:00:00 2001 From: Harish Kasiviswanathan Date: Wed, 5 Jun 2024 09:30:50 -0400 Subject: [PATCH 1166/1578] drm/amdgpu: Indicate CU havest info to CP To achieve full occupancy CP hardware needs to know if CUs in SE are symmetrically or asymmetrically harvested v2: Reset is_symmetric_cus for each loop Signed-off-by: Harish Kasiviswanathan Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index 7b16e8cca86ac0..f5b9f443cfdd79 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -4195,9 +4195,10 @@ static u32 gfx_v9_4_3_get_cu_active_bitmap(struct amdgpu_device *adev, int xcc_i static int gfx_v9_4_3_get_cu_info(struct amdgpu_device *adev, struct amdgpu_cu_info *cu_info) { - int i, j, k, counter, xcc_id, active_cu_number = 0; - u32 mask, bitmap, ao_bitmap, ao_cu_mask = 0; + int i, j, k, prev_counter, counter, xcc_id, active_cu_number = 0; + u32 mask, bitmap, ao_bitmap, ao_cu_mask = 0, tmp; unsigned disable_masks[4 * 4]; + bool is_symmetric_cus; if (!adev || !cu_info) return -EINVAL; @@ -4215,6 +4216,7 @@ static int gfx_v9_4_3_get_cu_info(struct amdgpu_device *adev, mutex_lock(&adev->grbm_idx_mutex); for (xcc_id = 0; xcc_id < NUM_XCC(adev->gfx.xcc_mask); xcc_id++) { + is_symmetric_cus = true; for (i = 0; i < adev->gfx.config.max_shader_engines; i++) { for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) { mask = 1; @@ -4242,6 +4244,15 @@ static int gfx_v9_4_3_get_cu_info(struct amdgpu_device *adev, ao_cu_mask |= (ao_bitmap << (i * 16 + j * 8)); cu_info->ao_cu_bitmap[i][j] = ao_bitmap; } + if (i && is_symmetric_cus && prev_counter != counter) + is_symmetric_cus = false; + prev_counter = counter; + } + if (is_symmetric_cus) { + tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_DEBUG); + tmp = REG_SET_FIELD(tmp, CP_CPC_DEBUG, CPC_HARVESTING_RELAUNCH_DISABLE, 1); + tmp = REG_SET_FIELD(tmp, CP_CPC_DEBUG, CPC_HARVESTING_DISPATCH_DISABLE, 1); + WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_DEBUG, tmp); } gfx_v9_4_3_xcc_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff, xcc_id); From 8bd82363e2ee2eb3a9a8ea1fa94ebe1900d05a71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Wed, 5 Jun 2024 13:27:20 +0200 Subject: [PATCH 1167/1578] drm/amdgpu: revert "take runtime pm reference when we attach a buffer" v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit b8c415e3bf98 ("drm/amdgpu: take runtime pm reference when we attach a buffer") and commit 425285d39afd ("drm/amdgpu: add amdgpu runpm usage trace for separate funcs"). Taking a runtime pm reference for DMA-buf is actually completely unnecessary and even dangerous. The problem is that calling pm_runtime_get_sync() from the DMA-buf callbacks is illegal because we have the reservation locked here which is also taken during resume. So this would deadlock. When the buffer is in GTT it is still accessible even when the GPU is powered down and when it is in VRAM the buffer gets migrated to GTT before powering down. The only use case which would make it mandatory to keep the runtime pm reference would be if we pin the buffer into VRAM, and that's not something we currently do. v2: improve the commit message Signed-off-by: Christian König Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher CC: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 34 --------------------- drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 2 -- drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h | 15 --------- 3 files changed, 51 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c index 055ba2ea4c126f..662d0f28f35876 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c @@ -41,8 +41,6 @@ #include #include #include -#include -#include "amdgpu_trace.h" /** * amdgpu_dma_buf_attach - &dma_buf_ops.attach implementation @@ -58,42 +56,11 @@ static int amdgpu_dma_buf_attach(struct dma_buf *dmabuf, struct drm_gem_object *obj = dmabuf->priv; struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj); struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); - int r; if (pci_p2pdma_distance(adev->pdev, attach->dev, false) < 0) attach->peer2peer = false; - r = pm_runtime_get_sync(adev_to_drm(adev)->dev); - trace_amdgpu_runpm_reference_dumps(1, __func__); - if (r < 0) - goto out; - return 0; - -out: - pm_runtime_put_autosuspend(adev_to_drm(adev)->dev); - trace_amdgpu_runpm_reference_dumps(0, __func__); - return r; -} - -/** - * amdgpu_dma_buf_detach - &dma_buf_ops.detach implementation - * - * @dmabuf: DMA-buf where we remove the attachment from - * @attach: the attachment to remove - * - * Called when an attachment is removed from the DMA-buf. - */ -static void amdgpu_dma_buf_detach(struct dma_buf *dmabuf, - struct dma_buf_attachment *attach) -{ - struct drm_gem_object *obj = dmabuf->priv; - struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj); - struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); - - pm_runtime_mark_last_busy(adev_to_drm(adev)->dev); - pm_runtime_put_autosuspend(adev_to_drm(adev)->dev); - trace_amdgpu_runpm_reference_dumps(0, __func__); } /** @@ -267,7 +234,6 @@ static int amdgpu_dma_buf_begin_cpu_access(struct dma_buf *dma_buf, const struct dma_buf_ops amdgpu_dmabuf_ops = { .attach = amdgpu_dma_buf_attach, - .detach = amdgpu_dma_buf_detach, .pin = amdgpu_dma_buf_pin, .unpin = amdgpu_dma_buf_unpin, .map_dma_buf = amdgpu_dma_buf_map, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index 10832b4704484b..bc3ac73b6b8d0c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -181,7 +181,6 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amd amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr, seq, flags | AMDGPU_FENCE_FLAG_INT); pm_runtime_get_noresume(adev_to_drm(adev)->dev); - trace_amdgpu_runpm_reference_dumps(1, __func__); ptr = &ring->fence_drv.fences[seq & ring->fence_drv.num_fences_mask]; if (unlikely(rcu_dereference_protected(*ptr, 1))) { struct dma_fence *old; @@ -309,7 +308,6 @@ bool amdgpu_fence_process(struct amdgpu_ring *ring) dma_fence_put(fence); pm_runtime_mark_last_busy(adev_to_drm(adev)->dev); pm_runtime_put_autosuspend(adev_to_drm(adev)->dev); - trace_amdgpu_runpm_reference_dumps(0, __func__); } while (last_seq != seq); return true; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h index 7aafeb763e5dde..383fce40d4dd7e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h @@ -554,21 +554,6 @@ TRACE_EVENT(amdgpu_reset_reg_dumps, __entry->value) ); -TRACE_EVENT(amdgpu_runpm_reference_dumps, - TP_PROTO(uint32_t index, const char *func), - TP_ARGS(index, func), - TP_STRUCT__entry( - __field(uint32_t, index) - __string(func, func) - ), - TP_fast_assign( - __entry->index = index; - __assign_str(func); - ), - TP_printk("amdgpu runpm reference dump 0x%x: 0x%s\n", - __entry->index, - __get_str(func)) -); #undef AMDGPU_JOB_GET_TIMELINE_NAME #endif From c60e20f13c27662de36cd5538d6299760780db52 Mon Sep 17 00:00:00 2001 From: Daniel Miess Date: Tue, 28 May 2024 16:17:17 -0400 Subject: [PATCH 1168/1578] drm/amd/display: Change dram_clock_latency to 34us for dcn351 [Why] Intermittent underflow observed when using 4k144 display on dcn351 [How] Update dram_clock_change_latency_us from 11.72us to 34us Reviewed-by: Nicholas Kazlauskas Acked-by: Zaeem Mohamed Signed-off-by: Daniel Miess Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dml/dcn351/dcn351_fpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn351/dcn351_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn351/dcn351_fpu.c index e4f333d4fb54f0..a201dbb743d791 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn351/dcn351_fpu.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn351/dcn351_fpu.c @@ -215,7 +215,7 @@ struct _vcs_dpi_soc_bounding_box_st dcn3_51_soc = { .urgent_latency_pixel_data_only_us = 4.0, .urgent_latency_pixel_mixed_with_vm_data_us = 4.0, .urgent_latency_vm_data_only_us = 4.0, - .dram_clock_change_latency_us = 11.72, + .dram_clock_change_latency_us = 34, .urgent_out_of_order_return_per_channel_pixel_only_bytes = 4096, .urgent_out_of_order_return_per_channel_pixel_and_vm_bytes = 4096, .urgent_out_of_order_return_per_channel_vm_only_bytes = 4096, From 6071607bfefefc50a3907c0ba88878846960d29a Mon Sep 17 00:00:00 2001 From: Paul Hsieh Date: Tue, 28 May 2024 14:36:00 +0800 Subject: [PATCH 1169/1578] drm/amd/display: change dram_clock_latency to 34us for dcn35 [Why & How] Current DRAM setting would cause underflow on customer platform. Modify dram_clock_change_latency_us from 11.72 to 34.0 us as per recommendation from HW team Reviewed-by: Nicholas Kazlauskas Acked-by: Zaeem Mohamed Signed-off-by: Paul Hsieh Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c index 60f251cf973b1b..beed7adbbd43e0 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c @@ -177,7 +177,7 @@ struct _vcs_dpi_soc_bounding_box_st dcn3_5_soc = { .urgent_latency_pixel_data_only_us = 4.0, .urgent_latency_pixel_mixed_with_vm_data_us = 4.0, .urgent_latency_vm_data_only_us = 4.0, - .dram_clock_change_latency_us = 11.72, + .dram_clock_change_latency_us = 34.0, .urgent_out_of_order_return_per_channel_pixel_only_bytes = 4096, .urgent_out_of_order_return_per_channel_pixel_and_vm_bytes = 4096, .urgent_out_of_order_return_per_channel_vm_only_bytes = 4096, From c03d770c0b014a3007a5874bf6b3c3e64d32aaac Mon Sep 17 00:00:00 2001 From: Michael Strauss Date: Tue, 7 May 2024 12:03:15 -0400 Subject: [PATCH 1170/1578] drm/amd/display: Attempt to avoid empty TUs when endpoint is DPIA [WHY] Empty SST TUs are illegal to transmit over a USB4 DP tunnel. Current policy is to configure stream encoder to pack 2 pixels per pclk even when ODM combine is not in use, allowing seamless dynamic ODM reconfiguration. However, in extreme edge cases where average pixel count per TU is less than 2, this can lead to unexpected empty TU generation during compliance testing. For example, VIC 1 with a 1xHBR3 link configuration will average 1.98 pix/TU. [HOW] Calculate average pixel count per TU, and block 2 pixels per clock if endpoint is a DPIA tunnel and pixel clock is low enough that we will never require 2:1 ODM combine. Cc: stable@vger.kernel.org # 6.6+ Reviewed-by: Wenjing Liu Acked-by: Hamza Mahfooz Signed-off-by: Michael Strauss Signed-off-by: Alex Deucher --- .../amd/display/dc/hwss/dcn35/dcn35_hwseq.c | 72 +++++++++++++++++++ .../amd/display/dc/hwss/dcn35/dcn35_hwseq.h | 2 + .../amd/display/dc/hwss/dcn35/dcn35_init.c | 2 +- 3 files changed, 75 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c index 5295f52e4fc843..dcced89c07b380 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c @@ -1439,3 +1439,75 @@ void dcn35_set_long_vblank(struct pipe_ctx **pipe_ctx, } } } + +static bool should_avoid_empty_tu(struct pipe_ctx *pipe_ctx) +{ + /* Calculate average pixel count per TU, return false if under ~2.00 to + * avoid empty TUs. This is only required for DPIA tunneling as empty TUs + * are legal to generate for native DP links. Assume TU size 64 as there + * is currently no scenario where it's reprogrammed from HW default. + * MTPs have no such limitation, so this does not affect MST use cases. + */ + unsigned int pix_clk_mhz; + unsigned int symclk_mhz; + unsigned int avg_pix_per_tu_x1000; + unsigned int tu_size_bytes = 64; + struct dc_crtc_timing *timing = &pipe_ctx->stream->timing; + struct dc_link_settings *link_settings = &pipe_ctx->link_config.dp_link_settings; + const struct dc *dc = pipe_ctx->stream->link->dc; + + if (pipe_ctx->stream->link->ep_type != DISPLAY_ENDPOINT_USB4_DPIA) + return false; + + // Not necessary for MST configurations + if (pipe_ctx->stream->signal == SIGNAL_TYPE_DISPLAY_PORT_MST) + return false; + + pix_clk_mhz = timing->pix_clk_100hz / 10000; + + // If this is true, can't block due to dynamic ODM + if (pix_clk_mhz > dc->clk_mgr->bw_params->clk_table.entries[0].dispclk_mhz) + return false; + + switch (link_settings->link_rate) { + case LINK_RATE_LOW: + symclk_mhz = 162; + break; + case LINK_RATE_HIGH: + symclk_mhz = 270; + break; + case LINK_RATE_HIGH2: + symclk_mhz = 540; + break; + case LINK_RATE_HIGH3: + symclk_mhz = 810; + break; + default: + // We shouldn't be tunneling any other rates, something is wrong + ASSERT(0); + return false; + } + + avg_pix_per_tu_x1000 = (1000 * pix_clk_mhz * tu_size_bytes) + / (symclk_mhz * link_settings->lane_count); + + // Add small empirically-decided margin to account for potential jitter + return (avg_pix_per_tu_x1000 < 2020); +} + +bool dcn35_is_dp_dig_pixel_rate_div_policy(struct pipe_ctx *pipe_ctx) +{ + struct dc *dc = pipe_ctx->stream->ctx->dc; + + if (!is_h_timing_divisible_by_2(pipe_ctx->stream)) + return false; + + if (should_avoid_empty_tu(pipe_ctx)) + return false; + + if (dc_is_dp_signal(pipe_ctx->stream->signal) && !dc->link_srv->dp_is_128b_132b_signal(pipe_ctx) && + dc->debug.enable_dp_dig_pixel_rate_div_policy) + return true; + + return false; +} diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.h b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.h index a731c8880d60aa..f0ea7d1511ae68 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.h +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.h @@ -95,4 +95,6 @@ void dcn35_set_static_screen_control(struct pipe_ctx **pipe_ctx, void dcn35_set_long_vblank(struct pipe_ctx **pipe_ctx, int num_pipes, uint32_t v_total_min, uint32_t v_total_max); +bool dcn35_is_dp_dig_pixel_rate_div_policy(struct pipe_ctx *pipe_ctx); + #endif /* __DC_HWSS_DCN35_H__ */ diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c index df3bf77f3fb46f..199781233fd5f2 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c @@ -158,7 +158,7 @@ static const struct hwseq_private_funcs dcn35_private_funcs = { .setup_hpo_hw_control = dcn35_setup_hpo_hw_control, .calculate_dccg_k1_k2_values = dcn32_calculate_dccg_k1_k2_values, .set_pixels_per_cycle = dcn32_set_pixels_per_cycle, - .is_dp_dig_pixel_rate_div_policy = dcn32_is_dp_dig_pixel_rate_div_policy, + .is_dp_dig_pixel_rate_div_policy = dcn35_is_dp_dig_pixel_rate_div_policy, .dsc_pg_control = dcn35_dsc_pg_control, .dsc_pg_status = dcn32_dsc_pg_status, .enable_plane = dcn35_enable_plane, From 301daa346f0e34a87fb6c1e4a05db2aa0a66b573 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 14 Jun 2024 12:54:52 -0700 Subject: [PATCH 1171/1578] drm/amd/display: Disable CONFIG_DRM_AMD_DC_FP for RISC-V with clang Commit 77acc6b55ae4 ("riscv: add support for kernel-mode FPU") and commit a28e4b672f04 ("drm/amd/display: use ARCH_HAS_KERNEL_FPU_SUPPORT") enabled support for CONFIG_DRM_AMD_DC_FP with RISC-V. Unfortunately, this exposed -Wframe-larger-than warnings (which become fatal with CONFIG_WERROR=y) when building ARCH=riscv allmodconfig with clang: drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn32/display_mode_vba_32.c:58:13: error: stack frame size (2448) exceeds limit (2048) in 'DISPCLKDPPCLKDCFCLKDeepSleepPrefetchParametersWatermarksAndPerformanceCalculation' [-Werror,-Wframe-larger-than] 58 | static void DISPCLKDPPCLKDCFCLKDeepSleepPrefetchParametersWatermarksAndPerformanceCalculation( | ^ 1 error generated. Many functions in this file use a large number of parameters, which must be passed on the stack at a certain pointer due to register exhaustion, which can cause high stack usage when inlining and issues with stack slot analysis get involved. While the compiler can and should do better (as GCC uses less than half the amount of stack space for the same function), it is not as simple as a fix as adjusting the functions not to take a large number of parameters. Unfortunately, modifying these files to avoid the problem is a difficult to justify approach because any revisions to the files in the kernel tree never make it back to the original source (so copies of the code for newer hardware revisions just reintroduce the issue) and the files are hard to read/modify due to being "gcc-parsable HW gospel, coming straight from HW engineers". Avoid building the problematic code for RISC-V by modifying the existing condition for arm64 that exists for the same reason. Factor out the logical not to make the condition a little more readable naturally. Fixes: a28e4b672f04 ("drm/amd/display: use ARCH_HAS_KERNEL_FPU_SUPPORT") Reported-by: Palmer Dabbelt Closes: https://lore.kernel.org/20240530145741.7506-2-palmer@rivosinc.com/ Reviewed-by: Harry Wentland Signed-off-by: Nathan Chancellor Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/Kconfig b/drivers/gpu/drm/amd/display/Kconfig index 5fcd4f778dc3dd..47b8b49da8a72b 100644 --- a/drivers/gpu/drm/amd/display/Kconfig +++ b/drivers/gpu/drm/amd/display/Kconfig @@ -8,7 +8,7 @@ config DRM_AMD_DC depends on BROKEN || !CC_IS_CLANG || ARM64 || RISCV || SPARC64 || X86_64 select SND_HDA_COMPONENT if SND_HDA_CORE # !CC_IS_CLANG: https://github.com/ClangBuiltLinux/linux/issues/1752 - select DRM_AMD_DC_FP if ARCH_HAS_KERNEL_FPU_SUPPORT && (!ARM64 || !CC_IS_CLANG) + select DRM_AMD_DC_FP if ARCH_HAS_KERNEL_FPU_SUPPORT && !(CC_IS_CLANG && (ARM64 || RISCV)) help Choose this option if you want to use the new display engine support for AMDGPU. This adds required support for Vega and From 8bf0287528da1992c5e49d757b99ad6bbc34b522 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 19 Jun 2024 14:46:48 -0500 Subject: [PATCH 1172/1578] cifs: fix typo in module parameter enable_gcm_256 enable_gcm_256 (which allows the server to require the strongest encryption) is enabled by default, but the modinfo description incorrectly showed it disabled by default. Fix the typo. Cc: stable@vger.kernel.org Fixes: fee742b50289 ("smb3.1.1: enable negotiating stronger encryption by default") Signed-off-by: Steve French --- fs/smb/client/cifsfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index bb86fc0641d83a..6397fdefd876db 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -134,7 +134,7 @@ module_param(enable_oplocks, bool, 0644); MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1"); module_param(enable_gcm_256, bool, 0644); -MODULE_PARM_DESC(enable_gcm_256, "Enable requesting strongest (256 bit) GCM encryption. Default: n/N/0"); +MODULE_PARM_DESC(enable_gcm_256, "Enable requesting strongest (256 bit) GCM encryption. Default: y/Y/0"); module_param(require_gcm_256, bool, 0644); MODULE_PARM_DESC(require_gcm_256, "Require strongest (256 bit) GCM encryption. Default: n/N/0"); From a498df5421fd737d11bfd152428ba6b1c8538321 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 20 May 2024 09:11:45 -0400 Subject: [PATCH 1173/1578] drm/radeon: fix UBSAN warning in kv_dpm.c Adds bounds check for sumo_vid_mapping_entry. Reviewed-by: Mario Limonciello Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/radeon/sumo_dpm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/radeon/sumo_dpm.c b/drivers/gpu/drm/radeon/sumo_dpm.c index 21d27e6235f396..b11f7c5bbcbe96 100644 --- a/drivers/gpu/drm/radeon/sumo_dpm.c +++ b/drivers/gpu/drm/radeon/sumo_dpm.c @@ -1619,6 +1619,8 @@ void sumo_construct_vid_mapping_table(struct radeon_device *rdev, for (i = 0; i < SUMO_MAX_HARDWARE_POWERLEVELS; i++) { if (table[i].ulSupportedSCLK != 0) { + if (table[i].usVoltageIndex >= SUMO_MAX_NUMBER_VOLTAGES) + continue; vid_mapping_table->entries[table[i].usVoltageIndex].vid_7bit = table[i].usVoltageID; vid_mapping_table->entries[table[i].usVoltageIndex].vid_2bit = From f0d576f840153392d04b2d52cf3adab8f62e8cb6 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 20 May 2024 09:05:21 -0400 Subject: [PATCH 1174/1578] drm/amdgpu: fix UBSAN warning in kv_dpm.c Adds bounds check for sumo_vid_mapping_entry. Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3392 Reviewed-by: Mario Limonciello Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c b/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c index 6bb42d04b2479a..e8b6989a40f35a 100644 --- a/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c +++ b/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c @@ -164,6 +164,8 @@ static void sumo_construct_vid_mapping_table(struct amdgpu_device *adev, for (i = 0; i < SUMO_MAX_HARDWARE_POWERLEVELS; i++) { if (table[i].ulSupportedSCLK != 0) { + if (table[i].usVoltageIndex >= SUMO_MAX_NUMBER_VOLTAGES) + continue; vid_mapping_table->entries[table[i].usVoltageIndex].vid_7bit = table[i].usVoltageID; vid_mapping_table->entries[table[i].usVoltageIndex].vid_2bit = From e356d321d0240663a09b139fa3658ddbca163e27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Fri, 31 May 2024 10:56:00 +0200 Subject: [PATCH 1175/1578] drm/amdgpu: cleanup MES11 command submission MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The approach of having a separate WB slot for each submission doesn't really work well and for example breaks GPU reset. Use a status query packet for the fence update instead since those should always succeed we can use the fence of the original packet to signal the state of the operation. While at it cleanup the coding style. Fixes: eef016ba8986 ("drm/amdgpu/mes11: Use a separate fence per transaction") Reviewed-by: Mukul Joshi Signed-off-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 76 ++++++++++++++++---------- 1 file changed, 48 insertions(+), 28 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 0d1407f2500596..32d4519541c6b8 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -154,18 +154,18 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, void *pkt, int size, int api_status_off) { - int ndw = size / 4; - signed long r; - union MESAPI__MISC *x_pkt = pkt; - struct MES_API_STATUS *api_status; + union MESAPI__QUERY_MES_STATUS mes_status_pkt; + signed long timeout = 3000000; /* 3000 ms */ struct amdgpu_device *adev = mes->adev; struct amdgpu_ring *ring = &mes->ring; - unsigned long flags; - signed long timeout = 3000000; /* 3000 ms */ + struct MES_API_STATUS *api_status; + union MESAPI__MISC *x_pkt = pkt; const char *op_str, *misc_op_str; - u32 fence_offset; - u64 fence_gpu_addr; - u64 *fence_ptr; + unsigned long flags; + u64 status_gpu_addr; + u32 status_offset; + u64 *status_ptr; + signed long r; int ret; if (x_pkt->header.opcode >= MES_SCH_API_MAX) @@ -177,28 +177,38 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, /* Worst case in sriov where all other 15 VF timeout, each VF needs about 600ms */ timeout = 15 * 600 * 1000; } - BUG_ON(size % 4 != 0); - ret = amdgpu_device_wb_get(adev, &fence_offset); + ret = amdgpu_device_wb_get(adev, &status_offset); if (ret) return ret; - fence_gpu_addr = - adev->wb.gpu_addr + (fence_offset * 4); - fence_ptr = (u64 *)&adev->wb.wb[fence_offset]; - *fence_ptr = 0; + + status_gpu_addr = adev->wb.gpu_addr + (status_offset * 4); + status_ptr = (u64 *)&adev->wb.wb[status_offset]; + *status_ptr = 0; spin_lock_irqsave(&mes->ring_lock, flags); - if (amdgpu_ring_alloc(ring, ndw)) { - spin_unlock_irqrestore(&mes->ring_lock, flags); - amdgpu_device_wb_free(adev, fence_offset); - return -ENOMEM; - } + r = amdgpu_ring_alloc(ring, (size + sizeof(mes_status_pkt)) / 4); + if (r) + goto error_unlock_free; api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off); - api_status->api_completion_fence_addr = fence_gpu_addr; + api_status->api_completion_fence_addr = status_gpu_addr; api_status->api_completion_fence_value = 1; - amdgpu_ring_write_multiple(ring, pkt, ndw); + amdgpu_ring_write_multiple(ring, pkt, size / 4); + + memset(&mes_status_pkt, 0, sizeof(mes_status_pkt)); + mes_status_pkt.header.type = MES_API_TYPE_SCHEDULER; + mes_status_pkt.header.opcode = MES_SCH_API_QUERY_SCHEDULER_STATUS; + mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS; + mes_status_pkt.api_status.api_completion_fence_addr = + ring->fence_drv.gpu_addr; + mes_status_pkt.api_status.api_completion_fence_value = + ++ring->fence_drv.sync_seq; + + amdgpu_ring_write_multiple(ring, &mes_status_pkt, + sizeof(mes_status_pkt) / 4); + amdgpu_ring_commit(ring); spin_unlock_irqrestore(&mes->ring_lock, flags); @@ -206,15 +216,16 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, misc_op_str = mes_v11_0_get_misc_op_string(x_pkt); if (misc_op_str) - dev_dbg(adev->dev, "MES msg=%s (%s) was emitted\n", op_str, misc_op_str); + dev_dbg(adev->dev, "MES msg=%s (%s) was emitted\n", op_str, + misc_op_str); else if (op_str) dev_dbg(adev->dev, "MES msg=%s was emitted\n", op_str); else - dev_dbg(adev->dev, "MES msg=%d was emitted\n", x_pkt->header.opcode); + dev_dbg(adev->dev, "MES msg=%d was emitted\n", + x_pkt->header.opcode); - r = amdgpu_mes_fence_wait_polling(fence_ptr, (u64)1, timeout); - amdgpu_device_wb_free(adev, fence_offset); - if (r < 1) { + r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout); + if (r < 1 || !*status_ptr) { if (misc_op_str) dev_err(adev->dev, "MES failed to respond to msg=%s (%s)\n", @@ -229,10 +240,19 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, while (halt_if_hws_hang) schedule(); - return -ETIMEDOUT; + r = -ETIMEDOUT; + goto error_wb_free; } + amdgpu_device_wb_free(adev, status_offset); return 0; + +error_unlock_free: + spin_unlock_irqrestore(&mes->ring_lock, flags); + +error_wb_free: + amdgpu_device_wb_free(adev, status_offset); + return r; } static int convert_to_mes_queue_type(int queue_type) From ed5a4484f074aa2bfb1dad99ff3628ea8da4acdc Mon Sep 17 00:00:00 2001 From: Likun Gao Date: Wed, 12 Jun 2024 14:30:40 +0800 Subject: [PATCH 1176/1578] drm/amdgpu: init TA fw for psp v14 Add support to init TA firmware for psp v14. Signed-off-by: Likun Gao Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/psp_v14_0.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v14_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v14_0.c index f08a32c1869460..40b28298af301f 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v14_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v14_0.c @@ -32,7 +32,9 @@ #include "mp/mp_14_0_2_sh_mask.h" MODULE_FIRMWARE("amdgpu/psp_14_0_2_sos.bin"); +MODULE_FIRMWARE("amdgpu/psp_14_0_2_ta.bin"); MODULE_FIRMWARE("amdgpu/psp_14_0_3_sos.bin"); +MODULE_FIRMWARE("amdgpu/psp_14_0_3_ta.bin"); /* For large FW files the time to complete can be very long */ #define USBC_PD_POLLING_LIMIT_S 240 @@ -64,6 +66,9 @@ static int psp_v14_0_init_microcode(struct psp_context *psp) case IP_VERSION(14, 0, 2): case IP_VERSION(14, 0, 3): err = psp_init_sos_microcode(psp, ucode_prefix); + if (err) + return err; + err = psp_init_ta_microcode(psp, ucode_prefix); if (err) return err; break; From f770a6e9a3d7a90f77863b51325614f37a57fef5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Jun 2024 19:28:13 -0400 Subject: [PATCH 1177/1578] bcachefs: Fix initialization order for srcu barrier btree_iter_init() needs to happen before key_cache_init(), to initialize btree_trans_barrier Reported-by: syzbot+3cca837c2183f8f6fcaf@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 65e239d329157c..635da5b3439cf2 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -912,9 +912,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_io_clock_init(&c->io_clock[WRITE]) ?: bch2_fs_journal_init(&c->journal) ?: bch2_fs_replicas_init(c) ?: + bch2_fs_btree_iter_init(c) ?: bch2_fs_btree_cache_init(c) ?: bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: - bch2_fs_btree_iter_init(c) ?: bch2_fs_btree_interior_update_init(c) ?: bch2_fs_buckets_waiting_for_journal_init(c) ?: bch2_fs_btree_write_buffer_init(c) ?: From d47df4f616d523b4ef832d03ec28b2e6d838067b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Jun 2024 19:51:15 -0400 Subject: [PATCH 1178/1578] bcachefs: Fix array-index-out-of-bounds We use 0 size arrays as markers, but ubsan doesn't know that - cast them to a pointer to fix the splat. Also, make sure this code gets tested a bit more. Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.c | 2 +- fs/bcachefs/bkey_methods.c | 6 +++++- fs/bcachefs/bkey_methods.h | 3 ++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index f46978e5cb7c63..94a1d1982fa889 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -1064,7 +1064,7 @@ void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k) { const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current; u8 *l = k->key_start; - u8 *h = (u8 *) (k->_data + f->key_u64s) - 1; + u8 *h = (u8 *) ((u64 *) k->_data + f->key_u64s) - 1; while (l < h) { swap(*l, *h); diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index c2c3dae521865c..bd32aac0519213 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -398,8 +398,12 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, for (i = 0; i < nr_compat; i++) switch (!write ? i : nr_compat - 1 - i) { case 0: - if (big_endian != CPU_BIG_ENDIAN) + if (big_endian != CPU_BIG_ENDIAN) { + bch2_bkey_swab_key(f, k); + } else if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { bch2_bkey_swab_key(f, k); + bch2_bkey_swab_key(f, k); + } break; case 1: if (version < bcachefs_metadata_version_bkey_renumber) diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 726ef74837638f..baef0722f5fb60 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -129,7 +129,8 @@ static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id, struct bkey_packed *k) { if (version < bcachefs_metadata_version_current || - big_endian != CPU_BIG_ENDIAN) + big_endian != CPU_BIG_ENDIAN || + IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) __bch2_bkey_compat(level, btree_id, version, big_endian, write, f, k); From 3727ca56049d893859b68f70e50092250de79f28 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Jun 2024 09:09:52 -0400 Subject: [PATCH 1179/1578] bcachefs: Fix a locking bug in the do_discard_fast() path We can't discard a bucket while it's still open; this needs the bucket_is_open_safe() version, which takes the open_buckets lock. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index c4b6601f5b7482..d2241f2b40feda 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -882,7 +882,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, closure_wake_up(&c->freelist_wait); if (statechange(a->data_type == BCH_DATA_need_discard) && - !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && + !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) && bucket_flushed(new_a)) bch2_discard_one_bucket_fast(c, new.k->p); From d406545613b5c2716d5658038c46861863510b90 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Jun 2024 09:20:41 -0400 Subject: [PATCH 1180/1578] bcachefs: Fix shift overflow in read_one_super() Reported-by: syzbot+9f74cb4006b83e2a3df1@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 055478d21e9ef8..b156fc85b8a3eb 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -649,9 +649,10 @@ static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf bytes = vstruct_bytes(sb->sb); - if (bytes > 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits)) { - prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)", - bytes, 512UL << sb->sb->layout.sb_max_size_bits); + u64 sb_size = 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits); + if (bytes > sb_size) { + prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %llu)", + bytes, sb_size); return -BCH_ERR_invalid_sb_too_big; } From e3fd3faa453ce4cf4b6a0f3e29ee77d5d1b243a8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Jun 2024 09:28:01 -0400 Subject: [PATCH 1181/1578] bcachefs: Fix btree ID bitmasks these should be 64 bit bitmasks, not 32 bit. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 5 +++-- fs/bcachefs/btree_types.h | 16 ++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 90c12fe2a2cd3b..5d3c5b5e34af80 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1382,9 +1382,10 @@ enum btree_id { /* * Maximum number of btrees that we will _ever_ have under the current scheme, - * where we refer to them with bitfields + * where we refer to them with 64 bit bitfields - and we also need a bit for + * the interior btree node type: */ -#define BTREE_ID_NR_MAX 64 +#define BTREE_ID_NR_MAX 63 static inline bool btree_id_is_alloc(enum btree_id id) { diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index d63db4fefe7343..87f485e9c552d9 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -761,13 +761,13 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type) static inline bool btree_node_type_is_extents(enum btree_node_type type) { - const unsigned mask = 0 + const u64 mask = 0 #define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1)) BCH_BTREE_IDS() #undef x ; - return (1U << type) & mask; + return BIT_ULL(type) & mask; } static inline bool btree_id_is_extents(enum btree_id btree) @@ -777,35 +777,35 @@ static inline bool btree_id_is_extents(enum btree_id btree) static inline bool btree_type_has_snapshots(enum btree_id id) { - const unsigned mask = 0 + const u64 mask = 0 #define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr) BCH_BTREE_IDS() #undef x ; - return (1U << id) & mask; + return BIT_ULL(id) & mask; } static inline bool btree_type_has_snapshot_field(enum btree_id id) { - const unsigned mask = 0 + const u64 mask = 0 #define x(name, nr, flags, ...) |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr) BCH_BTREE_IDS() #undef x ; - return (1U << id) & mask; + return BIT_ULL(id) & mask; } static inline bool btree_type_has_ptrs(enum btree_id id) { - const unsigned mask = 0 + const u64 mask = 0 #define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_DATA)) << nr) BCH_BTREE_IDS() #undef x ; - return (1U << id) & mask; + return BIT_ULL(id) & mask; } struct btree_root { From 9e7cfb35e2668e542c333ed3ec4b0a951dd332ee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Jun 2024 09:26:54 -0400 Subject: [PATCH 1182/1578] bcachefs: Check for invalid btree IDs We can only handle btree IDs up to 62, since the btree id (plus the type for interior btree nodes) has to fit ito a 64 bit bitmask - check for invalid ones to avoid invalid shifts later. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 8 +++++++- fs/bcachefs/sb-errors_format.h | 6 +++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index cf513fc79ce480..e632da69196ccf 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -326,6 +326,12 @@ static int journal_replay_entry_early(struct bch_fs *c, case BCH_JSET_ENTRY_btree_root: { struct btree_root *r; + if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX, + c, invalid_btree_id, + "invalid btree id %u (max %u)", + entry->btree_id, BTREE_ID_NR_MAX)) + return 0; + while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) { ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL }); if (ret) @@ -415,7 +421,7 @@ static int journal_replay_entry_early(struct bch_fs *c, atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time)); } } - +fsck_err: return ret; } diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 84d2763bd597bd..1d1251f1bb205a 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -273,7 +273,11 @@ x(sb_clean_entry_overrun, 267) \ x(btree_ptr_v2_written_0, 268) \ x(subvol_snapshot_bad, 269) \ - x(subvol_inode_bad, 270) + x(subvol_inode_bad, 270) \ + x(alloc_key_stripe_sectors_wrong, 271) \ + x(accounting_mismatch, 272) \ + x(accounting_replicas_not_marked, 273) \ + x(invalid_btree_id, 274) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, From dbf4d79b7fc7e9bf5d1546f6dfffd789ea061221 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Jun 2024 09:36:34 -0400 Subject: [PATCH 1183/1578] bcachefs: Fix early init error path in journal code We shouln't be running the journal shutdown sequence if we never fully initialized the journal. Reported-by: syzbot+ffd2270f0bca3322ee00@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index adec8e1ea73eab..dac2f498ae8b61 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1167,6 +1167,9 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) void bch2_fs_journal_stop(struct journal *j) { + if (!test_bit(JOURNAL_running, &j->flags)) + return; + bch2_journal_reclaim_stop(j); bch2_journal_flush_all_pins(j); From 1ba44217f8258f92c56644ca4fad4462f1941e33 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Jun 2024 09:51:01 -0400 Subject: [PATCH 1184/1578] bcachefs: delete_dead_snapshots() doesn't need to go RW We've been moving away from going RW lazily; if we want to go RW we do that in set_may_go_rw(), and if we didn't go RW we don't need to delete dead snapshots. Reported-by: syzbot+4366624c0b5aac4906cf@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 51918acfd72681..961b5f56358c8a 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1565,13 +1565,6 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) return 0; - if (!test_bit(BCH_FS_started, &c->flags)) { - ret = bch2_fs_read_write_early(c); - bch_err_msg(c, ret, "deleting dead snapshots: error going rw"); - if (ret) - return ret; - } - trans = bch2_trans_get(c); /* From cff07e2739d81cf33eb2a378a6136eced852b8cb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Jun 2024 10:06:03 -0400 Subject: [PATCH 1185/1578] bcachefs: Guard against overflowing LRU_TIME_BITS LRUs only have 48 bits for the time field (i.e. LRU order); thus we need overflow checks and guards. Reported-by: syzbot+df3bf3f088dcaa728857@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 22 +++++++++++++++------- fs/bcachefs/alloc_background.h | 8 +++++++- fs/bcachefs/bcachefs.h | 5 +++++ fs/bcachefs/bcachefs_format.h | 3 +++ fs/bcachefs/lru.h | 3 --- fs/bcachefs/sb-errors_format.h | 3 ++- 6 files changed, 32 insertions(+), 12 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index d2241f2b40feda..e258de70457893 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -259,6 +259,14 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, "invalid data type (got %u should be %u)", a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); + for (unsigned i = 0; i < 2; i++) + bkey_fsck_err_on(a.v->io_time[i] > LRU_TIME_MAX, + c, err, + alloc_key_io_time_bad, + "invalid io_time[%s]: %llu, max %llu", + i == READ ? "read" : "write", + a.v->io_time[i], LRU_TIME_MAX); + switch (a.v->data_type) { case BCH_DATA_free: case BCH_DATA_need_gc_gens: @@ -757,8 +765,8 @@ int bch2_trigger_alloc(struct btree_trans *trans, alloc_data_type_set(new_a, new_a->data_type); if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) { - new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); - new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); + new_a->io_time[READ] = bch2_current_io_time(c, READ); + new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE); SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); } @@ -781,7 +789,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, if (new_a->data_type == BCH_DATA_cached && !new_a->io_time[READ]) - new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); + new_a->io_time[READ] = bch2_current_io_time(c, READ); u64 old_lru = alloc_lru_idx_read(*old_a); u64 new_lru = alloc_lru_idx_read(*new_a); @@ -1579,7 +1587,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, if (ret) goto err; - a_mut->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); + a_mut->v.io_time[READ] = bch2_current_io_time(c, READ); ret = bch2_trans_update(trans, alloc_iter, &a_mut->k_i, BTREE_TRIGGER_norun); if (ret) @@ -1975,8 +1983,8 @@ static int invalidate_one_bucket(struct btree_trans *trans, a->v.data_type = 0; a->v.dirty_sectors = 0; a->v.cached_sectors = 0; - a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); - a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); + a->v.io_time[READ] = bch2_current_io_time(c, READ); + a->v.io_time[WRITE] = bch2_current_io_time(c, WRITE); ret = bch2_trans_commit(trans, NULL, NULL, BCH_WATERMARK_btree| @@ -2204,7 +2212,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, if (ret) return ret; - now = atomic64_read(&c->io_clock[rw].now); + now = bch2_current_io_time(c, rw); if (a->v.io_time[rw] == now) goto out; diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index ae31a94be6f913..c3cc3c5ba5b63f 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -141,7 +141,13 @@ static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a, !bch2_bucket_sectors_fragmented(ca, a)) return 0; - u64 d = bch2_bucket_sectors_dirty(a); + /* + * avoid overflowing LRU_TIME_BITS on a corrupted fs, when + * bucket_sectors_dirty is (much) bigger than bucket_size + */ + u64 d = min(bch2_bucket_sectors_dirty(a), + ca->mi.bucket_size); + return div_u64(d * (1ULL << 31), ca->mi.bucket_size); } diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 2992a644d822c0..a6b83ecab7ce50 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -1214,6 +1214,11 @@ static inline s64 bch2_current_time(const struct bch_fs *c) return timespec_to_bch2_time(c, now); } +static inline u64 bch2_current_io_time(const struct bch_fs *c, int rw) +{ + return max(1ULL, (u64) atomic64_read(&c->io_clock[rw].now) & LRU_TIME_MAX); +} + static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c) { struct stdio_redirect *stdio = c->stdio; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 5d3c5b5e34af80..4b98fed1ee9a4c 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -476,6 +476,9 @@ struct bch_lru { #define LRU_ID_STRIPES (1U << 16) +#define LRU_TIME_BITS 48 +#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) + /* Optional/variable size superblock sections: */ struct bch_sb_field { diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index fb11ab0dd00ea9..bd71ba77de078a 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -2,9 +2,6 @@ #ifndef _BCACHEFS_LRU_H #define _BCACHEFS_LRU_H -#define LRU_TIME_BITS 48 -#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) - static inline u64 lru_pos_id(struct bpos pos) { return pos.inode >> LRU_TIME_BITS; diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 1d1251f1bb205a..1768e5c49f999e 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -277,7 +277,8 @@ x(alloc_key_stripe_sectors_wrong, 271) \ x(accounting_mismatch, 272) \ x(accounting_replicas_not_marked, 273) \ - x(invalid_btree_id, 274) + x(invalid_btree_id, 274) \ + x(alloc_key_io_time_bad, 275) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, From 2e9940d4a19507deb29b3e05571fcaaed88155e2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Jun 2024 13:15:16 -0400 Subject: [PATCH 1186/1578] bcachefs: Handle cached data LRU wraparound We only have 48 bits for the LRU time field, which is insufficient to prevent wraparound. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 46 ++++++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index e258de70457893..7b5909764d1481 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -2019,6 +2019,21 @@ static int invalidate_one_bucket(struct btree_trans *trans, goto out; } +static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter, + struct bch_dev *ca, bool *wrapped) +{ + struct bkey_s_c k; +again: + k = bch2_btree_iter_peek_upto(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); + if (!k.k && !*wrapped) { + bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0)); + *wrapped = true; + goto again; + } + + return k; +} + static void bch2_do_invalidates_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); @@ -2032,12 +2047,33 @@ static void bch2_do_invalidates_work(struct work_struct *work) for_each_member_device(c, ca) { s64 nr_to_invalidate = should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); + struct btree_iter iter; + bool wrapped = false; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, + lru_pos(ca->dev_idx, 0, + ((bch2_current_io_time(c, READ) + U32_MAX) & + LRU_TIME_MAX)), 0); - ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru, - lru_pos(ca->dev_idx, 0, 0), - lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX), - BTREE_ITER_intent, k, - invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate)); + while (true) { + bch2_trans_begin(trans); + + struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped); + ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + if (!k.k) + break; + + ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate); + if (ret) + break; + + bch2_btree_iter_advance(&iter); + } + bch2_trans_iter_exit(trans, &iter); if (ret < 0) { bch2_dev_put(ca); From ddd118ab45e848b1956ef8c8ef84963a554b5b58 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Jun 2024 11:31:00 -0400 Subject: [PATCH 1187/1578] bcachefs: Fix bch2_sb_downgrade_update() Missing enum conversion Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-downgrade.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 3fb23e399ffb3e..4710b61631f0f2 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -228,7 +228,7 @@ int bch2_sb_downgrade_update(struct bch_fs *c) dst = (void *) &darray_top(table); dst->version = cpu_to_le16(src->version); - dst->recovery_passes[0] = cpu_to_le64(src->recovery_passes); + dst->recovery_passes[0] = cpu_to_le64(bch2_recovery_passes_to_stable(src->recovery_passes)); dst->recovery_passes[1] = 0; dst->nr_errors = cpu_to_le16(src->nr_errors); for (unsigned i = 0; i < src->nr_errors; i++) From 0a2a507d404eebbc168e8b1264edf0ac8c6047b4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 19 Jun 2024 08:43:15 -0400 Subject: [PATCH 1188/1578] bcachefs: set_worker_desc() for delete_dead_snapshots this is long running - help users see what's going on Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 961b5f56358c8a..4ef98e696673fd 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1680,6 +1680,8 @@ void bch2_delete_dead_snapshots_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); + set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name); + bch2_delete_dead_snapshots(c); bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); } From a56da69799bd5f0c72bdc0fb64c3e3d8c1b1bb36 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 19 Jun 2024 15:47:44 -0400 Subject: [PATCH 1189/1578] bcachefs: Fix bch2_trans_put() reference: https://github.com/koverstreet/bcachefs/issues/692 trans->ref is the reference used by the cycle detector, which walks btree_trans objects of other threads to walk the graph of held locks and issue wakeups when an abort is required. We have to wait for the ref to go to 1 before freeing trans->paths or clearing trans->locking_wait.task. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 3694c600a3add8..3a1419d1788856 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3161,6 +3161,7 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) list_add_done: seqmutex_unlock(&c->btree_trans_lock); got_trans: + trans->ref.closure_get_happened = false; trans->c = c; trans->last_begin_time = local_clock(); trans->fn_idx = fn_idx; @@ -3235,7 +3236,6 @@ void bch2_trans_put(struct btree_trans *trans) trans_for_each_update(trans, i) __btree_path_put(trans->paths + i->path, true); trans->nr_updates = 0; - trans->locking_wait.task = NULL; check_btree_paths_leaked(trans); @@ -3256,6 +3256,13 @@ void bch2_trans_put(struct btree_trans *trans) if (unlikely(trans->journal_replay_not_finished)) bch2_journal_keys_put(c); + /* + * trans->ref protects trans->locking_wait.task, btree_paths arary; used + * by cycle detector + */ + closure_sync(&trans->ref); + trans->locking_wait.task = NULL; + unsigned long *paths_allocated = trans->paths_allocated; trans->paths_allocated = NULL; trans->paths = NULL; @@ -3273,8 +3280,6 @@ void bch2_trans_put(struct btree_trans *trans) trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans); if (trans) { - closure_sync(&trans->ref); - seqmutex_lock(&c->btree_trans_lock); list_del(&trans->list); seqmutex_unlock(&c->btree_trans_lock); From 02a176d42a88805b3520148a4eee28b0760cd8c0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 18 Jun 2024 12:39:14 -0700 Subject: [PATCH 1190/1578] ipv6: bring NLM_DONE out to a separate recv() again MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit under Fixes optimized the number of recv() calls needed during RTM_GETROUTE dumps, but we got multiple reports of applications hanging on recv() calls. Applications expect that a route dump will be terminated with a recv() reading an individual NLM_DONE message. Coalescing NLM_DONE is perfectly legal in netlink, but even tho reporters fixed the code in respective projects, chances are it will take time for those applications to get updated. So revert to old behavior (for now)? This is an IPv6 version of commit 460b0d33cf10 ("inet: bring NLM_DONE out to a separate recv() again"). Reported-by: Maciej Żenczykowski Link: https://lore.kernel.org/all/CANP3RGc1RG71oPEBXNx_WZFP9AyphJefdO4paczN92n__ds4ow@mail.gmail.com Reported-by: Stefano Brivio Link: https://lore.kernel.org/all/20240315124808.033ff58d@elisabeth Reported-by: Ilya Maximets Link: https://lore.kernel.org/all/02b50aae-f0e9-47a4-8365-a977a85975d3@ovn.org Fixes: 5fc68320c1fb ("ipv6: remove RTNL protection from inet6_dump_fib()") Tested-by: Ilya Maximets Link: https://lore.kernel.org/r/20240618193914.561782-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_fib.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 6e57c03e3255f0..83e4f9855ae128 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2514,7 +2514,8 @@ int __init fib6_init(void) goto out_kmem_cache_create; ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, NULL, - inet6_dump_fib, RTNL_FLAG_DUMP_UNLOCKED); + inet6_dump_fib, RTNL_FLAG_DUMP_UNLOCKED | + RTNL_FLAG_DUMP_SPLIT_NLM_DONE); if (ret) goto out_unregister_subsys; From 74382aebc9035470ec4c789bdb0d09d8c14f261e Mon Sep 17 00:00:00 2001 From: Marcin Szycik Date: Tue, 18 Jun 2024 14:02:05 -0700 Subject: [PATCH 1191/1578] ice: Fix VSI list rule with ICE_SW_LKUP_LAST type Adding/updating VSI list rule, as well as allocating/freeing VSI list resource are called several times with type ICE_SW_LKUP_LAST, which fails because ice_update_vsi_list_rule() and ice_aq_alloc_free_vsi_list() consider it invalid. Allow calling these functions with ICE_SW_LKUP_LAST. This fixes at least one issue in switchdev mode, where the same rule with different action cannot be added, e.g.: tc filter add dev $PF1 ingress protocol arp prio 0 flower skip_sw \ dst_mac ff:ff:ff:ff:ff:ff action mirred egress redirect dev $VF1_PR tc filter add dev $PF1 ingress protocol arp prio 0 flower skip_sw \ dst_mac ff:ff:ff:ff:ff:ff action mirred egress redirect dev $VF2_PR Fixes: 0f94570d0cae ("ice: allow adding advanced rules") Suggested-by: Michal Swiatkowski Reviewed-by: Michal Swiatkowski Reviewed-by: Przemek Kitszel Signed-off-by: Marcin Szycik Reviewed-by: Jacob Keller Reviewed-by: Simon Horman Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen Link: https://lore.kernel.org/r/20240618210206.981885-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_switch.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c index 94d6670d090138..1191031b2a43d1 100644 --- a/drivers/net/ethernet/intel/ice/ice_switch.c +++ b/drivers/net/ethernet/intel/ice/ice_switch.c @@ -1899,7 +1899,8 @@ ice_aq_alloc_free_vsi_list(struct ice_hw *hw, u16 *vsi_list_id, lkup_type == ICE_SW_LKUP_ETHERTYPE_MAC || lkup_type == ICE_SW_LKUP_PROMISC || lkup_type == ICE_SW_LKUP_PROMISC_VLAN || - lkup_type == ICE_SW_LKUP_DFLT) { + lkup_type == ICE_SW_LKUP_DFLT || + lkup_type == ICE_SW_LKUP_LAST) { sw_buf->res_type = cpu_to_le16(ICE_AQC_RES_TYPE_VSI_LIST_REP); } else if (lkup_type == ICE_SW_LKUP_VLAN) { if (opc == ice_aqc_opc_alloc_res) @@ -2922,7 +2923,8 @@ ice_update_vsi_list_rule(struct ice_hw *hw, u16 *vsi_handle_arr, u16 num_vsi, lkup_type == ICE_SW_LKUP_ETHERTYPE_MAC || lkup_type == ICE_SW_LKUP_PROMISC || lkup_type == ICE_SW_LKUP_PROMISC_VLAN || - lkup_type == ICE_SW_LKUP_DFLT) + lkup_type == ICE_SW_LKUP_DFLT || + lkup_type == ICE_SW_LKUP_LAST) rule_type = remove ? ICE_AQC_SW_RULES_T_VSI_LIST_CLEAR : ICE_AQC_SW_RULES_T_VSI_LIST_SET; else if (lkup_type == ICE_SW_LKUP_VLAN) From f9ae848904289ddb16c7c9e4553ed4c64300de49 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Wed, 19 Jun 2024 01:29:04 +0100 Subject: [PATCH 1192/1578] net/tcp_ao: Don't leak ao_info on error-path It seems I introduced it together with TCP_AO_CMDF_AO_REQUIRED, on version 5 [1] of TCP-AO patches. Quite frustrative that having all these selftests that I've written, running kmemtest & kcov was always in todo. [1]: https://lore.kernel.org/netdev/20230215183335.800122-5-dima@arista.com/ Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/netdev/20240617072451.1403e1d2@kernel.org/ Fixes: 0aadc73995d0 ("net/tcp: Prevent TCP-MD5 with TCP-AO being set") Cc: stable@vger.kernel.org Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240619-tcp-ao-required-leak-v1-1-6408f3c94247@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_ao.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index 37c42b63ff9934..09c0fa6756b7d5 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -1968,8 +1968,10 @@ static int tcp_ao_info_cmd(struct sock *sk, unsigned short int family, first = true; } - if (cmd.ao_required && tcp_ao_required_verify(sk)) - return -EKEYREJECTED; + if (cmd.ao_required && tcp_ao_required_verify(sk)) { + err = -EKEYREJECTED; + goto out; + } /* For sockets in TCP_CLOSED it's possible set keys that aren't * matching the future peer (address/port/VRF/etc), From d21d44dbdde83c4a8553c95de1853e63e88d7954 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Mon, 17 Jun 2024 17:47:36 +0200 Subject: [PATCH 1193/1578] drm/xe/vf: Don't touch GuC irq registers if using memory irqs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On platforms where VFs are using memory based interrupts, we missed invalid access to no longer existing interrupt registers, as we keep them marked with XE_REG_OPTION_VF. To fix that just either setup memirq vectors in GuC or enable legacy interrupts. Fixes: aef4eb7c7dec ("drm/xe/vf: Setup memory based interrupts in GuC") Signed-off-by: Michal Wajdeczko Cc: Matt Roper Reviewed-by: Matt Roper Link: https://patchwork.freedesktop.org/patch/msgid/20240617154736.685-1-michal.wajdeczko@intel.com (cherry picked from commit f0ccd2d805e55e12b430d5d6b9acd9f891af455e) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_guc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c index 240e7a4bbff1a9..5faca4fc2fef5a 100644 --- a/drivers/gpu/drm/xe/xe_guc.c +++ b/drivers/gpu/drm/xe/xe_guc.c @@ -631,8 +631,6 @@ int xe_guc_enable_communication(struct xe_guc *guc) struct xe_device *xe = guc_to_xe(guc); int err; - guc_enable_irq(guc); - if (IS_SRIOV_VF(xe) && xe_device_has_memirq(xe)) { struct xe_gt *gt = guc_to_gt(guc); struct xe_tile *tile = gt_to_tile(gt); @@ -640,6 +638,8 @@ int xe_guc_enable_communication(struct xe_guc *guc) err = xe_memirq_init_guc(&tile->sriov.vf.memirq, guc); if (err) return err; + } else { + guc_enable_irq(guc); } xe_mmio_rmw32(guc_to_gt(guc), PMINTRMSK, From 9b1effff19cdf2230d3ecb07ff4038a0da32e9cc Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Wed, 19 Jun 2024 17:16:02 +0100 Subject: [PATCH 1194/1578] ALSA: hda: cs35l56: Select SERIAL_MULTI_INSTANTIATE The ACPI IDs used in the CS35L56 HDA drivers are all handled by the serial multi-instantiate driver which starts multiple Linux device instances from a single ACPI Device() node. As serial multi-instantiate is not an optional part of the system add it as a dependency in Kconfig so that it is not overlooked. Signed-off-by: Simon Trimmer Link: https://lore.kernel.org/20240619161602.117452-1-simont@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/Kconfig b/sound/pci/hda/Kconfig index 0da625533afc27..e59df40a0007a8 100644 --- a/sound/pci/hda/Kconfig +++ b/sound/pci/hda/Kconfig @@ -162,6 +162,7 @@ config SND_HDA_SCODEC_CS35L56_I2C depends on ACPI || COMPILE_TEST depends on SND_SOC select FW_CS_DSP + select SERIAL_MULTI_INSTANTIATE select SND_HDA_GENERIC select SND_SOC_CS35L56_SHARED select SND_HDA_SCODEC_CS35L56 @@ -178,6 +179,7 @@ config SND_HDA_SCODEC_CS35L56_SPI depends on ACPI || COMPILE_TEST depends on SND_SOC select FW_CS_DSP + select SERIAL_MULTI_INSTANTIATE select SND_HDA_GENERIC select SND_SOC_CS35L56_SHARED select SND_HDA_SCODEC_CS35L56 From 6cd4a78d962bebbaf8beb7d2ead3f34120e3f7b2 Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Mon, 17 Jun 2024 22:02:05 +0100 Subject: [PATCH 1195/1578] net: do not leave a dangling sk pointer, when socket creation fails It is possible to trigger a use-after-free by: * attaching an fentry probe to __sock_release() and the probe calling the bpf_get_socket_cookie() helper * running traceroute -I 1.1.1.1 on a freshly booted VM A KASAN enabled kernel will log something like below (decoded and stripped): ================================================================== BUG: KASAN: slab-use-after-free in __sock_gen_cookie (./arch/x86/include/asm/atomic64_64.h:15 ./include/linux/atomic/atomic-arch-fallback.h:2583 ./include/linux/atomic/atomic-instrumented.h:1611 net/core/sock_diag.c:29) Read of size 8 at addr ffff888007110dd8 by task traceroute/299 CPU: 2 PID: 299 Comm: traceroute Tainted: G E 6.10.0-rc2+ #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 Call Trace: dump_stack_lvl (lib/dump_stack.c:117 (discriminator 1)) print_report (mm/kasan/report.c:378 mm/kasan/report.c:488) ? __sock_gen_cookie (./arch/x86/include/asm/atomic64_64.h:15 ./include/linux/atomic/atomic-arch-fallback.h:2583 ./include/linux/atomic/atomic-instrumented.h:1611 net/core/sock_diag.c:29) kasan_report (mm/kasan/report.c:603) ? __sock_gen_cookie (./arch/x86/include/asm/atomic64_64.h:15 ./include/linux/atomic/atomic-arch-fallback.h:2583 ./include/linux/atomic/atomic-instrumented.h:1611 net/core/sock_diag.c:29) kasan_check_range (mm/kasan/generic.c:183 mm/kasan/generic.c:189) __sock_gen_cookie (./arch/x86/include/asm/atomic64_64.h:15 ./include/linux/atomic/atomic-arch-fallback.h:2583 ./include/linux/atomic/atomic-instrumented.h:1611 net/core/sock_diag.c:29) bpf_get_socket_ptr_cookie (./arch/x86/include/asm/preempt.h:94 ./include/linux/sock_diag.h:42 net/core/filter.c:5094 net/core/filter.c:5092) bpf_prog_875642cf11f1d139___sock_release+0x6e/0x8e bpf_trampoline_6442506592+0x47/0xaf __sock_release (net/socket.c:652) __sock_create (net/socket.c:1601) ... Allocated by task 299 on cpu 2 at 78.328492s: kasan_save_stack (mm/kasan/common.c:48) kasan_save_track (mm/kasan/common.c:68) __kasan_slab_alloc (mm/kasan/common.c:312 mm/kasan/common.c:338) kmem_cache_alloc_noprof (mm/slub.c:3941 mm/slub.c:4000 mm/slub.c:4007) sk_prot_alloc (net/core/sock.c:2075) sk_alloc (net/core/sock.c:2134) inet_create (net/ipv4/af_inet.c:327 net/ipv4/af_inet.c:252) __sock_create (net/socket.c:1572) __sys_socket (net/socket.c:1660 net/socket.c:1644 net/socket.c:1706) __x64_sys_socket (net/socket.c:1718) do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) Freed by task 299 on cpu 2 at 78.328502s: kasan_save_stack (mm/kasan/common.c:48) kasan_save_track (mm/kasan/common.c:68) kasan_save_free_info (mm/kasan/generic.c:582) poison_slab_object (mm/kasan/common.c:242) __kasan_slab_free (mm/kasan/common.c:256) kmem_cache_free (mm/slub.c:4437 mm/slub.c:4511) __sk_destruct (net/core/sock.c:2117 net/core/sock.c:2208) inet_create (net/ipv4/af_inet.c:397 net/ipv4/af_inet.c:252) __sock_create (net/socket.c:1572) __sys_socket (net/socket.c:1660 net/socket.c:1644 net/socket.c:1706) __x64_sys_socket (net/socket.c:1718) do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) Fix this by clearing the struct socket reference in sk_common_release() to cover all protocol families create functions, which may already attached the reference to the sk object with sock_init_data(). Fixes: c5dbb89fc2ac ("bpf: Expose bpf_get_socket_cookie to tracing programs") Suggested-by: Kuniyuki Iwashima Signed-off-by: Ignat Korchagin Cc: stable@vger.kernel.org Link: https://lore.kernel.org/netdev/20240613194047.36478-1-kuniyu@amazon.com/T/ Reviewed-by: Kuniyuki Iwashima Reviewed-by: D. Wythe Link: https://lore.kernel.org/r/20240617210205.67311-1-ignat@cloudflare.com Signed-off-by: Paolo Abeni --- net/core/sock.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/sock.c b/net/core/sock.c index 8629f9aecf91a3..100e975073ca50 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3742,6 +3742,9 @@ void sk_common_release(struct sock *sk) sk->sk_prot->unhash(sk); + if (sk->sk_socket) + sk->sk_socket->sk = NULL; + /* * In this point socket cannot receive new packets, but it is possible * that some packets are in flight because some CPU runs receiver and From ebc4fc34eae8ddfbef49f2bdaced1bf4167ef80d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 27 May 2024 16:24:41 +0300 Subject: [PATCH 1196/1578] mmc: sdhci-pci: Convert PCIBIOS_* return codes to errnos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jmicron_pmos() and sdhci_pci_probe() use pci_{read,write}_config_byte() that return PCIBIOS_* codes. The return code is then returned as is by jmicron_probe() and sdhci_pci_probe(). Similarly, the return code is also returned as is from jmicron_resume(). Both probe and resume functions should return normal errnos. Convert PCIBIOS_* returns code using pcibios_err_to_errno() into normal errno before returning them the fix these issues. Fixes: 7582041ff3d4 ("mmc: sdhci-pci: fix simple_return.cocci warnings") Fixes: 45211e215984 ("sdhci: toggle JMicron PMOS setting") Cc: stable@vger.kernel.org Signed-off-by: Ilpo Järvinen Acked-by: Adrian Hunter Link: https://lore.kernel.org/r/20240527132443.14038-1-ilpo.jarvinen@linux.intel.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-pci-core.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/mmc/host/sdhci-pci-core.c b/drivers/mmc/host/sdhci-pci-core.c index ef89ec382bfef9..23e6ba70144c78 100644 --- a/drivers/mmc/host/sdhci-pci-core.c +++ b/drivers/mmc/host/sdhci-pci-core.c @@ -1326,7 +1326,7 @@ static int jmicron_pmos(struct sdhci_pci_chip *chip, int on) ret = pci_read_config_byte(chip->pdev, 0xAE, &scratch); if (ret) - return ret; + goto fail; /* * Turn PMOS on [bit 0], set over current detection to 2.4 V @@ -1337,7 +1337,10 @@ static int jmicron_pmos(struct sdhci_pci_chip *chip, int on) else scratch &= ~0x47; - return pci_write_config_byte(chip->pdev, 0xAE, scratch); + ret = pci_write_config_byte(chip->pdev, 0xAE, scratch); + +fail: + return pcibios_err_to_errno(ret); } static int jmicron_probe(struct sdhci_pci_chip *chip) @@ -2202,7 +2205,7 @@ static int sdhci_pci_probe(struct pci_dev *pdev, ret = pci_read_config_byte(pdev, PCI_SLOT_INFO, &slots); if (ret) - return ret; + return pcibios_err_to_errno(ret); slots = PCI_SLOT_INFO_SLOTS(slots) + 1; dev_dbg(&pdev->dev, "found %d slot(s)\n", slots); @@ -2211,7 +2214,7 @@ static int sdhci_pci_probe(struct pci_dev *pdev, ret = pci_read_config_byte(pdev, PCI_SLOT_INFO, &first_bar); if (ret) - return ret; + return pcibios_err_to_errno(ret); first_bar &= PCI_SLOT_INFO_FIRST_BAR_MASK; From a91bf3b3beadbb4f8b3bbc7969fb2ae1615e25c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 27 May 2024 16:24:42 +0300 Subject: [PATCH 1197/1578] mmc: sdhci-pci-o2micro: Convert PCIBIOS_* return codes to errnos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sdhci_pci_o2_probe() uses pci_read_config_{byte,dword}() that return PCIBIOS_* codes. The return code is then returned as is but as sdhci_pci_o2_probe() is probe function chain, it should return normal errnos. Convert PCIBIOS_* returns code using pcibios_err_to_errno() into normal errno before returning them. Add a label for read failure so that the conversion can be done in one place rather than on all of the return statements. Fixes: 3d757ddbd68c ("mmc: sdhci-pci-o2micro: add Bayhub new chip GG8 support for UHS-I") Fixes: d599005afde8 ("mmc: sdhci-pci-o2micro: Add missing checks in sdhci_pci_o2_probe") Fixes: 706adf6bc31c ("mmc: sdhci-pci-o2micro: Add SeaBird SeaEagle SD3 support") Fixes: 01acf6917aed ("mmc: sdhci-pci: add support of O2Micro/BayHubTech SD hosts") Fixes: 26daa1ed40c6 ("mmc: sdhci: Disable ADMA on some O2Micro SD/MMC parts.") Cc: stable@vger.kernel.org Signed-off-by: Ilpo Järvinen Acked-by: Adrian Hunter Link: https://lore.kernel.org/r/20240527132443.14038-2-ilpo.jarvinen@linux.intel.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-pci-o2micro.c | 41 +++++++++++++++------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/drivers/mmc/host/sdhci-pci-o2micro.c b/drivers/mmc/host/sdhci-pci-o2micro.c index d4a02184784a34..058bef1c7e4190 100644 --- a/drivers/mmc/host/sdhci-pci-o2micro.c +++ b/drivers/mmc/host/sdhci-pci-o2micro.c @@ -823,7 +823,7 @@ static int sdhci_pci_o2_probe(struct sdhci_pci_chip *chip) ret = pci_read_config_byte(chip->pdev, O2_SD_LOCK_WP, &scratch); if (ret) - return ret; + goto read_fail; scratch &= 0x7f; pci_write_config_byte(chip->pdev, O2_SD_LOCK_WP, scratch); @@ -834,7 +834,7 @@ static int sdhci_pci_o2_probe(struct sdhci_pci_chip *chip) ret = pci_read_config_byte(chip->pdev, O2_SD_CLKREQ, &scratch); if (ret) - return ret; + goto read_fail; scratch |= 0x20; pci_write_config_byte(chip->pdev, O2_SD_CLKREQ, scratch); @@ -843,7 +843,7 @@ static int sdhci_pci_o2_probe(struct sdhci_pci_chip *chip) */ ret = pci_read_config_byte(chip->pdev, O2_SD_CAPS, &scratch); if (ret) - return ret; + goto read_fail; scratch |= 0x01; pci_write_config_byte(chip->pdev, O2_SD_CAPS, scratch); pci_write_config_byte(chip->pdev, O2_SD_CAPS, 0x73); @@ -856,7 +856,7 @@ static int sdhci_pci_o2_probe(struct sdhci_pci_chip *chip) ret = pci_read_config_byte(chip->pdev, O2_SD_INF_MOD, &scratch); if (ret) - return ret; + goto read_fail; scratch |= 0x08; pci_write_config_byte(chip->pdev, O2_SD_INF_MOD, scratch); @@ -864,7 +864,7 @@ static int sdhci_pci_o2_probe(struct sdhci_pci_chip *chip) ret = pci_read_config_byte(chip->pdev, O2_SD_LOCK_WP, &scratch); if (ret) - return ret; + goto read_fail; scratch |= 0x80; pci_write_config_byte(chip->pdev, O2_SD_LOCK_WP, scratch); break; @@ -875,7 +875,7 @@ static int sdhci_pci_o2_probe(struct sdhci_pci_chip *chip) ret = pci_read_config_byte(chip->pdev, O2_SD_LOCK_WP, &scratch); if (ret) - return ret; + goto read_fail; scratch &= 0x7f; pci_write_config_byte(chip->pdev, O2_SD_LOCK_WP, scratch); @@ -886,7 +886,7 @@ static int sdhci_pci_o2_probe(struct sdhci_pci_chip *chip) O2_SD_FUNC_REG0, &scratch_32); if (ret) - return ret; + goto read_fail; scratch_32 = ((scratch_32 & 0xFF000000) >> 24); /* Check Whether subId is 0x11 or 0x12 */ @@ -898,7 +898,7 @@ static int sdhci_pci_o2_probe(struct sdhci_pci_chip *chip) O2_SD_FUNC_REG4, &scratch_32); if (ret) - return ret; + goto read_fail; /* Enable Base Clk setting change */ scratch_32 |= O2_SD_FREG4_ENABLE_CLK_SET; @@ -921,7 +921,7 @@ static int sdhci_pci_o2_probe(struct sdhci_pci_chip *chip) ret = pci_read_config_dword(chip->pdev, O2_SD_CLK_SETTING, &scratch_32); if (ret) - return ret; + goto read_fail; scratch_32 &= ~(0xFF00); scratch_32 |= 0x07E0C800; @@ -931,14 +931,14 @@ static int sdhci_pci_o2_probe(struct sdhci_pci_chip *chip) ret = pci_read_config_dword(chip->pdev, O2_SD_CLKREQ, &scratch_32); if (ret) - return ret; + goto read_fail; scratch_32 |= 0x3; pci_write_config_dword(chip->pdev, O2_SD_CLKREQ, scratch_32); ret = pci_read_config_dword(chip->pdev, O2_SD_PLL_SETTING, &scratch_32); if (ret) - return ret; + goto read_fail; scratch_32 &= ~(0x1F3F070E); scratch_32 |= 0x18270106; @@ -949,7 +949,7 @@ static int sdhci_pci_o2_probe(struct sdhci_pci_chip *chip) ret = pci_read_config_dword(chip->pdev, O2_SD_CAP_REG2, &scratch_32); if (ret) - return ret; + goto read_fail; scratch_32 &= ~(0xE0); pci_write_config_dword(chip->pdev, O2_SD_CAP_REG2, scratch_32); @@ -961,7 +961,7 @@ static int sdhci_pci_o2_probe(struct sdhci_pci_chip *chip) ret = pci_read_config_byte(chip->pdev, O2_SD_LOCK_WP, &scratch); if (ret) - return ret; + goto read_fail; scratch |= 0x80; pci_write_config_byte(chip->pdev, O2_SD_LOCK_WP, scratch); break; @@ -971,7 +971,7 @@ static int sdhci_pci_o2_probe(struct sdhci_pci_chip *chip) ret = pci_read_config_byte(chip->pdev, O2_SD_LOCK_WP, &scratch); if (ret) - return ret; + goto read_fail; scratch &= 0x7f; pci_write_config_byte(chip->pdev, O2_SD_LOCK_WP, scratch); @@ -979,7 +979,7 @@ static int sdhci_pci_o2_probe(struct sdhci_pci_chip *chip) ret = pci_read_config_dword(chip->pdev, O2_SD_PLL_SETTING, &scratch_32); if (ret) - return ret; + goto read_fail; if ((scratch_32 & 0xff000000) == 0x01000000) { scratch_32 &= 0x0000FFFF; @@ -998,7 +998,7 @@ static int sdhci_pci_o2_probe(struct sdhci_pci_chip *chip) O2_SD_FUNC_REG4, &scratch_32); if (ret) - return ret; + goto read_fail; scratch_32 |= (1 << 22); pci_write_config_dword(chip->pdev, O2_SD_FUNC_REG4, scratch_32); @@ -1017,7 +1017,7 @@ static int sdhci_pci_o2_probe(struct sdhci_pci_chip *chip) ret = pci_read_config_byte(chip->pdev, O2_SD_LOCK_WP, &scratch); if (ret) - return ret; + goto read_fail; scratch |= 0x80; pci_write_config_byte(chip->pdev, O2_SD_LOCK_WP, scratch); break; @@ -1028,7 +1028,7 @@ static int sdhci_pci_o2_probe(struct sdhci_pci_chip *chip) /* UnLock WP */ ret = pci_read_config_byte(chip->pdev, O2_SD_LOCK_WP, &scratch); if (ret) - return ret; + goto read_fail; scratch &= 0x7f; pci_write_config_byte(chip->pdev, O2_SD_LOCK_WP, scratch); @@ -1057,13 +1057,16 @@ static int sdhci_pci_o2_probe(struct sdhci_pci_chip *chip) /* Lock WP */ ret = pci_read_config_byte(chip->pdev, O2_SD_LOCK_WP, &scratch); if (ret) - return ret; + goto read_fail; scratch |= 0x80; pci_write_config_byte(chip->pdev, O2_SD_LOCK_WP, scratch); break; } return 0; + +read_fail: + return pcibios_err_to_errno(ret); } #ifdef CONFIG_PM_SLEEP From d77dc388cd61dfdafe30b98025fa827498378199 Mon Sep 17 00:00:00 2001 From: Kamal Dasu Date: Mon, 3 Jun 2024 18:08:34 -0400 Subject: [PATCH 1198/1578] mmc: sdhci-brcmstb: check R1_STATUS for erase/trim/discard When erase/trim/discard completion was converted to mmc_poll_for_busy(), optional support to poll with the host_ops->card_busy() callback was also added. The common sdhci's ->card_busy() turns out not to be working as expected for the sdhci-brcmstb variant, as it keeps returning busy beyond the card's busy period. In particular, this leads to the below splat for mmc_do_erase() when running a discard (BLKSECDISCARD) operation during mkfs.f2fs: Info: [/dev/mmcblk1p9] Discarding device [ 39.597258] sysrq: Show Blocked State [ 39.601183] task:mkfs.f2fs state:D stack:0 pid:1561 tgid:1561 ppid:1542 flags:0x0000000d [ 39.610609] Call trace: [ 39.613098] __switch_to+0xd8/0xf4 [ 39.616582] __schedule+0x440/0x4f4 [ 39.620137] schedule+0x2c/0x48 [ 39.623341] schedule_hrtimeout_range_clock+0xe0/0x114 [ 39.628562] schedule_hrtimeout_range+0x10/0x18 [ 39.633169] usleep_range_state+0x5c/0x90 [ 39.637253] __mmc_poll_for_busy+0xec/0x128 [ 39.641514] mmc_poll_for_busy+0x48/0x70 [ 39.645511] mmc_do_erase+0x1ec/0x210 [ 39.649237] mmc_erase+0x1b4/0x1d4 [ 39.652701] mmc_blk_mq_issue_rq+0x35c/0x6ac [ 39.657037] mmc_mq_queue_rq+0x18c/0x214 [ 39.661022] blk_mq_dispatch_rq_list+0x3a8/0x528 [ 39.665722] __blk_mq_sched_dispatch_requests+0x3a0/0x4ac [ 39.671198] blk_mq_sched_dispatch_requests+0x28/0x5c [ 39.676322] blk_mq_run_hw_queue+0x11c/0x12c [ 39.680668] blk_mq_flush_plug_list+0x200/0x33c [ 39.685278] blk_add_rq_to_plug+0x68/0xd8 [ 39.689365] blk_mq_submit_bio+0x3a4/0x458 [ 39.693539] __submit_bio+0x1c/0x80 [ 39.697096] submit_bio_noacct_nocheck+0x94/0x174 [ 39.701875] submit_bio_noacct+0x1b0/0x22c [ 39.706042] submit_bio+0xac/0xe8 [ 39.709424] blk_next_bio+0x4c/0x5c [ 39.712973] blkdev_issue_secure_erase+0x118/0x170 [ 39.717835] blkdev_common_ioctl+0x374/0x728 [ 39.722175] blkdev_ioctl+0x8c/0x2b0 [ 39.725816] vfs_ioctl+0x24/0x40 [ 39.729117] __arm64_sys_ioctl+0x5c/0x8c [ 39.733114] invoke_syscall+0x68/0xec [ 39.736839] el0_svc_common.constprop.0+0x70/0xd8 [ 39.741609] do_el0_svc+0x18/0x20 [ 39.744981] el0_svc+0x68/0x94 [ 39.748107] el0t_64_sync_handler+0x88/0x124 [ 39.752455] el0t_64_sync+0x168/0x16c To fix the problem let's override the host_ops->card_busy() callback by setting it to NULL, which forces the mmc core to poll with a CMD13 and checking the R1_STATUS in the mmc_busy_cb() function. Signed-off-by: Kamal Dasu Fixes: 0d84c3e6a5b2 ("mmc: core: Convert to mmc_poll_for_busy() for erase/trim/discard") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240603220834.21989-2-kamal.dasu@broadcom.com [Ulf: Clarified the commit message] Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-brcmstb.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/mmc/host/sdhci-brcmstb.c b/drivers/mmc/host/sdhci-brcmstb.c index 9053526fa212a1..150fb477b7cc95 100644 --- a/drivers/mmc/host/sdhci-brcmstb.c +++ b/drivers/mmc/host/sdhci-brcmstb.c @@ -24,6 +24,7 @@ #define BRCMSTB_MATCH_FLAGS_NO_64BIT BIT(0) #define BRCMSTB_MATCH_FLAGS_BROKEN_TIMEOUT BIT(1) #define BRCMSTB_MATCH_FLAGS_HAS_CLOCK_GATE BIT(2) +#define BRCMSTB_MATCH_FLAGS_USE_CARD_BUSY BIT(4) #define BRCMSTB_PRIV_FLAGS_HAS_CQE BIT(0) #define BRCMSTB_PRIV_FLAGS_GATE_CLOCK BIT(1) @@ -384,6 +385,9 @@ static int sdhci_brcmstb_probe(struct platform_device *pdev) if (match_priv->flags & BRCMSTB_MATCH_FLAGS_BROKEN_TIMEOUT) host->quirks |= SDHCI_QUIRK_BROKEN_TIMEOUT_VAL; + if (!(match_priv->flags & BRCMSTB_MATCH_FLAGS_USE_CARD_BUSY)) + host->mmc_host_ops.card_busy = NULL; + /* Change the base clock frequency if the DT property exists */ if (device_property_read_u32(&pdev->dev, "clock-frequency", &priv->base_freq_hz) != 0) From 84bb8d8bbd8384081c3fc5c4f20b223524af529d Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 6 Jun 2024 20:17:20 +0200 Subject: [PATCH 1199/1578] Revert "mmc: moxart-mmc: Use sg_miter for PIO" This reverts commit 3ee0e7c3e67cab83ffbbe7707b43df8d41c9fe47. The patch is not working for unknown reasons and I would need access to the hardware to fix the bug. This shouldn't matter anyway: the Moxa Art is not expected to use highmem, and sg_miter() is only necessary to have to properly deal with highmem. Reported-by: Sergei Antonov Signed-off-by: Linus Walleij Fixes: 3ee0e7c3e67c ("mmc: moxart-mmc: Use sg_miter for PIO") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240606-mmc-moxart-revert-v1-1-a01c2f40de9c@linaro.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/moxart-mmc.c | 78 +++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 35 deletions(-) diff --git a/drivers/mmc/host/moxart-mmc.c b/drivers/mmc/host/moxart-mmc.c index 9a5f75163acaee..8ede4ce9327149 100644 --- a/drivers/mmc/host/moxart-mmc.c +++ b/drivers/mmc/host/moxart-mmc.c @@ -131,10 +131,12 @@ struct moxart_host { struct dma_async_tx_descriptor *tx_desc; struct mmc_host *mmc; struct mmc_request *mrq; + struct scatterlist *cur_sg; struct completion dma_complete; struct completion pio_complete; - struct sg_mapping_iter sg_miter; + u32 num_sg; + u32 data_remain; u32 data_len; u32 fifo_width; u32 timeout; @@ -146,6 +148,35 @@ struct moxart_host { bool is_removed; }; +static inline void moxart_init_sg(struct moxart_host *host, + struct mmc_data *data) +{ + host->cur_sg = data->sg; + host->num_sg = data->sg_len; + host->data_remain = host->cur_sg->length; + + if (host->data_remain > host->data_len) + host->data_remain = host->data_len; +} + +static inline int moxart_next_sg(struct moxart_host *host) +{ + int remain; + struct mmc_data *data = host->mrq->cmd->data; + + host->cur_sg++; + host->num_sg--; + + if (host->num_sg > 0) { + host->data_remain = host->cur_sg->length; + remain = host->data_len - data->bytes_xfered; + if (remain > 0 && remain < host->data_remain) + host->data_remain = remain; + } + + return host->num_sg; +} + static int moxart_wait_for_status(struct moxart_host *host, u32 mask, u32 *status) { @@ -278,29 +309,14 @@ static void moxart_transfer_dma(struct mmc_data *data, struct moxart_host *host) static void moxart_transfer_pio(struct moxart_host *host) { - struct sg_mapping_iter *sgm = &host->sg_miter; struct mmc_data *data = host->mrq->cmd->data; u32 *sgp, len = 0, remain, status; if (host->data_len == data->bytes_xfered) return; - /* - * By updating sgm->consumes this will get a proper pointer into the - * buffer at any time. - */ - if (!sg_miter_next(sgm)) { - /* This shold not happen */ - dev_err(mmc_dev(host->mmc), "ran out of scatterlist prematurely\n"); - data->error = -EINVAL; - complete(&host->pio_complete); - return; - } - sgp = sgm->addr; - remain = sgm->length; - if (remain > host->data_len) - remain = host->data_len; - sgm->consumed = 0; + sgp = sg_virt(host->cur_sg); + remain = host->data_remain; if (data->flags & MMC_DATA_WRITE) { while (remain > 0) { @@ -315,7 +331,6 @@ static void moxart_transfer_pio(struct moxart_host *host) sgp++; len += 4; } - sgm->consumed += len; remain -= len; } @@ -332,22 +347,22 @@ static void moxart_transfer_pio(struct moxart_host *host) sgp++; len += 4; } - sgm->consumed += len; remain -= len; } } - data->bytes_xfered += sgm->consumed; - if (host->data_len == data->bytes_xfered) { + data->bytes_xfered += host->data_remain - remain; + host->data_remain = remain; + + if (host->data_len != data->bytes_xfered) + moxart_next_sg(host); + else complete(&host->pio_complete); - return; - } } static void moxart_prepare_data(struct moxart_host *host) { struct mmc_data *data = host->mrq->cmd->data; - unsigned int flags = SG_MITER_ATOMIC; /* Used from IRQ */ u32 datactrl; int blksz_bits; @@ -358,19 +373,15 @@ static void moxart_prepare_data(struct moxart_host *host) blksz_bits = ffs(data->blksz) - 1; BUG_ON(1 << blksz_bits != data->blksz); + moxart_init_sg(host, data); + datactrl = DCR_DATA_EN | (blksz_bits & DCR_BLK_SIZE); - if (data->flags & MMC_DATA_WRITE) { - flags |= SG_MITER_FROM_SG; + if (data->flags & MMC_DATA_WRITE) datactrl |= DCR_DATA_WRITE; - } else { - flags |= SG_MITER_TO_SG; - } if (moxart_use_dma(host)) datactrl |= DCR_DMA_EN; - else - sg_miter_start(&host->sg_miter, data->sg, data->sg_len, flags); writel(DCR_DATA_FIFO_RESET, host->base + REG_DATA_CONTROL); writel(MASK_DATA | FIFO_URUN | FIFO_ORUN, host->base + REG_CLEAR); @@ -443,9 +454,6 @@ static void moxart_request(struct mmc_host *mmc, struct mmc_request *mrq) } request_done: - if (!moxart_use_dma(host)) - sg_miter_stop(&host->sg_miter); - spin_unlock_irqrestore(&host->lock, flags); mmc_request_done(host->mmc, mrq); } From 8851346912a1fa33e7a5966fe51f07313b274627 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Tue, 18 Jun 2024 09:38:21 +0200 Subject: [PATCH 1200/1578] net: stmmac: Assign configured channel value to EXTTS event Assign the configured channel value to the EXTTS event in the timestamp interrupt handler. Without assigning the correct channel, applications like ts2phc will refuse to accept the event, resulting in errors such as: ... ts2phc[656.834]: config item end1.ts2phc.pin_index is 0 ts2phc[656.834]: config item end1.ts2phc.channel is 3 ts2phc[656.834]: config item end1.ts2phc.extts_polarity is 2 ts2phc[656.834]: config item end1.ts2phc.extts_correction is 0 ... ts2phc[656.862]: extts on unexpected channel ts2phc[658.141]: extts on unexpected channel ts2phc[659.140]: extts on unexpected channel Fixes: f4da56529da60 ("net: stmmac: Add support for external trigger timestamping") Cc: stable@vger.kernel.org Signed-off-by: Oleksij Rempel Reviewed-by: Wojciech Drewek Link: https://lore.kernel.org/r/20240618073821.619751-1-o.rempel@pengutronix.de Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c index f05bd757dfe525..5ef52ef2698fbe 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c @@ -218,6 +218,7 @@ static void timestamp_interrupt(struct stmmac_priv *priv) { u32 num_snapshot, ts_status, tsync_int; struct ptp_clock_event event; + u32 acr_value, channel; unsigned long flags; u64 ptp_time; int i; @@ -243,12 +244,15 @@ static void timestamp_interrupt(struct stmmac_priv *priv) num_snapshot = (ts_status & GMAC_TIMESTAMP_ATSNS_MASK) >> GMAC_TIMESTAMP_ATSNS_SHIFT; + acr_value = readl(priv->ptpaddr + PTP_ACR); + channel = ilog2(FIELD_GET(PTP_ACR_MASK, acr_value)); + for (i = 0; i < num_snapshot; i++) { read_lock_irqsave(&priv->ptp_lock, flags); get_ptptime(priv->ptpaddr, &ptp_time); read_unlock_irqrestore(&priv->ptp_lock, flags); event.type = PTP_CLOCK_EXTTS; - event.index = 0; + event.index = channel; event.timestamp = ptp_time; ptp_clock_event(priv->ptp_clock, &event); } From a23800f08a60787dfbf2b87b2e6ed411cb629859 Mon Sep 17 00:00:00 2001 From: Chenliang Li Date: Wed, 19 Jun 2024 14:38:19 +0800 Subject: [PATCH 1201/1578] io_uring/rsrc: fix incorrect assignment of iter->nr_segs in io_import_fixed In io_import_fixed when advancing the iter within the first bvec, the iter->nr_segs is set to bvec->bv_len. nr_segs should be the number of bvecs, plus we don't need to adjust it here, so just remove it. Fixes: b000ae0ec2d7 ("io_uring/rsrc: optimise single entry advance") Signed-off-by: Chenliang Li Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20240619063819.2445-1-cliang01.li@samsung.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index edb9c5baf2e290..570bfa6a31aa93 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1068,7 +1068,6 @@ int io_import_fixed(int ddir, struct iov_iter *iter, * branch doesn't expect non PAGE_SIZE'd chunks. */ iter->bvec = bvec; - iter->nr_segs = bvec->bv_len; iter->count -= offset; iter->iov_offset = offset; } else { From f6860b6069b92559f5cdb65f48e2d82051eaebca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 19 Jun 2024 17:45:33 +0200 Subject: [PATCH 1202/1578] block: remove the unused blk_bounce enum The enum has been replaced with the BLK_FEAT_BOUNCE_HIGH flag. Reported-by: Keith Busch Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240619154623.450048-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e96ba7b97288d2..f7d275e3fb2c1e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -343,15 +343,6 @@ enum { BLK_FLAGS_WRITE_CACHE_DISABLED = (1u << 31), }; -/* - * BLK_BOUNCE_NONE: never bounce (default) - * BLK_BOUNCE_HIGH: bounce all highmem pages - */ -enum blk_bounce { - BLK_BOUNCE_NONE, - BLK_BOUNCE_HIGH, -}; - struct queue_limits { unsigned int features; unsigned int flags; From 4e54ea72edd68d074be2403f3efc67ff0541e298 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 19 Jun 2024 17:45:34 +0200 Subject: [PATCH 1203/1578] block: fix spelling and grammar for in writeback_cache_control.rst Suggested-by: Damien Le Moal Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240619154623.450048-3-hch@lst.de Signed-off-by: Jens Axboe --- Documentation/block/writeback_cache_control.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/block/writeback_cache_control.rst b/Documentation/block/writeback_cache_control.rst index c575e08beda8e3..c3707d07178045 100644 --- a/Documentation/block/writeback_cache_control.rst +++ b/Documentation/block/writeback_cache_control.rst @@ -70,8 +70,8 @@ flag in the features field of the queue_limits structure. Implementation details for bio based block drivers -------------------------------------------------- -For bio based drivers the REQ_PREFLUSH and REQ_FUA bit are simplify passed on -to the driver if the drivers sets the BLK_FEAT_WRITE_CACHE flag and the drivers +For bio based drivers the REQ_PREFLUSH and REQ_FUA bit are simply passed on to +the driver if the driver sets the BLK_FEAT_WRITE_CACHE flag and the driver needs to handle them. *NOTE*: The REQ_FUA bit also gets passed on when the BLK_FEAT_FUA flags is @@ -89,7 +89,7 @@ When the BLK_FEAT_WRITE_CACHE flag is set, REQ_OP_WRITE | REQ_PREFLUSH requests with a payload are automatically turned into a sequence of a REQ_OP_FLUSH request followed by the actual write by the block layer. -When the BLK_FEAT_FUA flags is set, the REQ_FUA bit simplify passed on for the +When the BLK_FEAT_FUA flags is set, the REQ_FUA bit is simply passed on for the REQ_OP_WRITE request, else a REQ_OP_FLUSH request is sent by the block layer after the completion of the write request for bio submissions with the REQ_FUA bit set. From bae1c74316b86c67c95658c3a0cd312cec9aad77 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 19 Jun 2024 17:45:35 +0200 Subject: [PATCH 1204/1578] block: renumber and rename the cache disabled flag Start with the first bit, and drop the plural-S from the name. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240619154623.450048-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 6 +++--- include/linux/blkdev.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index da4e96d686f91e..59e6d111ed059a 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -429,7 +429,7 @@ static ssize_t queue_io_timeout_store(struct request_queue *q, const char *page, static ssize_t queue_wc_show(struct request_queue *q, char *page) { - if (q->limits.features & BLK_FLAGS_WRITE_CACHE_DISABLED) + if (q->limits.features & BLK_FLAG_WRITE_CACHE_DISABLED) return sprintf(page, "write through\n"); return sprintf(page, "write back\n"); } @@ -452,9 +452,9 @@ static ssize_t queue_wc_store(struct request_queue *q, const char *page, lim = queue_limits_start_update(q); if (disable) - lim.flags |= BLK_FLAGS_WRITE_CACHE_DISABLED; + lim.flags |= BLK_FLAG_WRITE_CACHE_DISABLED; else - lim.flags &= ~BLK_FLAGS_WRITE_CACHE_DISABLED; + lim.flags &= ~BLK_FLAG_WRITE_CACHE_DISABLED; err = queue_limits_commit_update(q, &lim); if (err) return err; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f7d275e3fb2c1e..713a98b6dbba08 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -339,8 +339,8 @@ enum { /* internal flags in queue_limits.flags */ enum { - /* do not send FLUSH or FUA command despite advertised write cache */ - BLK_FLAGS_WRITE_CACHE_DISABLED = (1u << 31), + /* do not send FLUSH/FUA commands despite advertising a write cache */ + BLK_FLAG_WRITE_CACHE_DISABLED = (1u << 0), }; struct queue_limits { @@ -1339,7 +1339,7 @@ static inline bool bdev_stable_writes(struct block_device *bdev) static inline bool blk_queue_write_cache(struct request_queue *q) { return (q->limits.features & BLK_FEAT_WRITE_CACHE) && - !(q->limits.flags & BLK_FLAGS_WRITE_CACHE_DISABLED); + !(q->limits.flags & BLK_FLAG_WRITE_CACHE_DISABLED); } static inline bool bdev_write_cache(struct block_device *bdev) From 5543217be468268dfedf504f4969771b9a377353 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 19 Jun 2024 17:45:36 +0200 Subject: [PATCH 1205/1578] block: move the misaligned flag into the features field Move the misaligned flags into the features field to reclaim a little bit of space. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240619154623.450048-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 20 ++++++++++---------- include/linux/blkdev.h | 4 +++- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index d0e9096f93ca8a..a1b10404e500bc 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -258,7 +258,7 @@ static int blk_validate_limits(struct queue_limits *lim) if (lim->alignment_offset) { lim->alignment_offset &= (lim->physical_block_size - 1); - lim->misaligned = 0; + lim->features &= ~BLK_FEAT_MISALIGNED; } if (!(lim->features & BLK_FEAT_WRITE_CACHE)) @@ -470,6 +470,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, if (!(b->features & BLK_FEAT_POLL)) t->features &= ~BLK_FEAT_POLL; + t->flags |= (b->flags & BLK_FEAT_MISALIGNED); + t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_user_sectors = min_not_zero(t->max_user_sectors, b->max_user_sectors); @@ -494,8 +496,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); - t->misaligned |= b->misaligned; - alignment = queue_limit_alignment_offset(b, start); /* Bottom device has different alignment. Check that it is @@ -509,7 +509,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, /* Verify that top and bottom intervals line up */ if (max(top, bottom) % min(top, bottom)) { - t->misaligned = 1; + t->flags |= BLK_FEAT_MISALIGNED; ret = -1; } } @@ -531,28 +531,28 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, /* Physical block size a multiple of the logical block size? */ if (t->physical_block_size & (t->logical_block_size - 1)) { t->physical_block_size = t->logical_block_size; - t->misaligned = 1; + t->flags |= BLK_FEAT_MISALIGNED; ret = -1; } /* Minimum I/O a multiple of the physical block size? */ if (t->io_min & (t->physical_block_size - 1)) { t->io_min = t->physical_block_size; - t->misaligned = 1; + t->flags |= BLK_FEAT_MISALIGNED; ret = -1; } /* Optimal I/O a multiple of the physical block size? */ if (t->io_opt & (t->physical_block_size - 1)) { t->io_opt = 0; - t->misaligned = 1; + t->flags |= BLK_FEAT_MISALIGNED; ret = -1; } /* chunk_sectors a multiple of the physical block size? */ if ((t->chunk_sectors << 9) & (t->physical_block_size - 1)) { t->chunk_sectors = 0; - t->misaligned = 1; + t->flags |= BLK_FEAT_MISALIGNED; ret = -1; } @@ -566,7 +566,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, /* Verify that new alignment_offset is on a logical block boundary */ if (t->alignment_offset & (t->logical_block_size - 1)) { - t->misaligned = 1; + t->flags |= BLK_FEAT_MISALIGNED; ret = -1; } @@ -729,7 +729,7 @@ int bdev_alignment_offset(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); - if (q->limits.misaligned) + if (q->limits.flags & BLK_FEAT_MISALIGNED) return -1; if (bdev_is_partition(bdev)) return queue_limit_alignment_offset(&q->limits, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 713a98b6dbba08..7ad2b1240fc0bf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -341,6 +341,9 @@ enum { enum { /* do not send FLUSH/FUA commands despite advertising a write cache */ BLK_FLAG_WRITE_CACHE_DISABLED = (1u << 0), + + /* I/O topology is misaligned */ + BLK_FEAT_MISALIGNED = (1u << 1), }; struct queue_limits { @@ -374,7 +377,6 @@ struct queue_limits { unsigned short max_integrity_segments; unsigned short max_discard_segments; - unsigned char misaligned; unsigned char discard_misaligned; unsigned char raid_partial_stripes_expensive; unsigned int max_open_zones; From 4cac3d3a712b5c76d462b29b73b9e58c0b6d9946 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 19 Jun 2024 17:45:37 +0200 Subject: [PATCH 1206/1578] block: remove the discard_alignment flag queue_limits.discard_alignment is never read except in the places where it is stacked into another limit. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240619154623.450048-6-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 10 ---------- drivers/md/dm-cache-target.c | 1 - drivers/md/dm-clone-target.c | 1 - drivers/md/dm-table.c | 1 - include/linux/blkdev.h | 1 - 5 files changed, 14 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index a1b10404e500bc..62588d9a38e39a 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -578,16 +578,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, if (b->discard_granularity) { alignment = queue_limit_discard_alignment(b, start); - if (t->discard_granularity != 0 && - t->discard_alignment != alignment) { - top = t->discard_granularity + t->discard_alignment; - bottom = b->discard_granularity + alignment; - - /* Verify that top and bottom intervals line up */ - if ((max(top, bottom) % min(top, bottom)) != 0) - t->discard_misaligned = 1; - } - t->max_discard_sectors = min_not_zero(t->max_discard_sectors, b->max_discard_sectors); t->max_hw_discard_sectors = min_not_zero(t->max_hw_discard_sectors, diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 16884b5850532f..2d8dd9283ff4cf 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -3403,7 +3403,6 @@ static void set_discard_limits(struct cache *cache, struct queue_limits *limits) limits->max_hw_discard_sectors = origin_limits->max_hw_discard_sectors; limits->discard_granularity = origin_limits->discard_granularity; limits->discard_alignment = origin_limits->discard_alignment; - limits->discard_misaligned = origin_limits->discard_misaligned; } static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index ad79b52ffc1434..b4384a8b13e360 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -2059,7 +2059,6 @@ static void set_discard_limits(struct clone *clone, struct queue_limits *limits) limits->max_hw_discard_sectors = dest_limits->max_hw_discard_sectors; limits->discard_granularity = dest_limits->discard_granularity; limits->discard_alignment = dest_limits->discard_alignment; - limits->discard_misaligned = dest_limits->discard_misaligned; limits->max_discard_segments = dest_limits->max_discard_segments; } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index df6313c3fe6ba4..502ebc78d490f6 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1808,7 +1808,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, limits->max_hw_discard_sectors = 0; limits->discard_granularity = 0; limits->discard_alignment = 0; - limits->discard_misaligned = 0; } if (!dm_table_supports_write_zeroes(t)) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7ad2b1240fc0bf..86410ce41bf60e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -377,7 +377,6 @@ struct queue_limits { unsigned short max_integrity_segments; unsigned short max_discard_segments; - unsigned char discard_misaligned; unsigned char raid_partial_stripes_expensive; unsigned int max_open_zones; unsigned int max_active_zones; From 7d4dec525f5fd555037486af4d02dd3682655ba1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 19 Jun 2024 17:45:38 +0200 Subject: [PATCH 1207/1578] block: move the raid_partial_stripes_expensive flag into the features field Move the raid_partial_stripes_expensive flags into the features field to reclaim a little bit of space. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240619154623.450048-7-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 4 ---- drivers/md/bcache/super.c | 4 ++-- drivers/md/raid5.c | 2 +- include/linux/blkdev.h | 7 +++++-- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 62588d9a38e39a..008fed84edb432 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -556,10 +556,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, ret = -1; } - t->raid_partial_stripes_expensive = - max(t->raid_partial_stripes_expensive, - b->raid_partial_stripes_expensive); - /* Find lowest common alignment_offset */ t->alignment_offset = lcm_not_zero(t->alignment_offset, alignment) % max(t->physical_block_size, t->io_min); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index baa364eedd0051..283b2511c6d21f 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1416,8 +1416,8 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) } if (bdev_io_opt(dc->bdev)) - dc->partial_stripes_expensive = - q->limits.raid_partial_stripes_expensive; + dc->partial_stripes_expensive = q->limits.features & + BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE; ret = bcache_device_init(&dc->disk, block_size, bdev_nr_sectors(dc->bdev) - dc->sb.data_offset, diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e875763d69917d..72f91eaa3201c4 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7707,7 +7707,7 @@ static int raid5_set_limits(struct mddev *mddev) blk_set_stacking_limits(&lim); lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded); - lim.raid_partial_stripes_expensive = 1; + lim.features |= BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE; lim.discard_granularity = stripe; lim.max_write_zeroes_sectors = 0; mddev_stack_rdev_limits(mddev, &lim, 0); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 86410ce41bf60e..1fa2b148c20696 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -328,6 +328,9 @@ enum { /* bounce all highmem pages */ BLK_FEAT_BOUNCE_HIGH = (1u << 14), + + /* undocumented magic for bcache */ + BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE = (1u << 15), }; /* @@ -335,7 +338,8 @@ enum { */ #define BLK_FEAT_INHERIT_MASK \ (BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_ROTATIONAL | \ - BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED | BLK_FEAT_BOUNCE_HIGH) + BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED | BLK_FEAT_BOUNCE_HIGH | \ + BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE) /* internal flags in queue_limits.flags */ enum { @@ -377,7 +381,6 @@ struct queue_limits { unsigned short max_integrity_segments; unsigned short max_discard_segments; - unsigned char raid_partial_stripes_expensive; unsigned int max_open_zones; unsigned int max_active_zones; From 5ddb88f22eb97218d9295e69c39e0ff7cc64e09c Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Thu, 20 Jun 2024 10:57:21 +0200 Subject: [PATCH 1208/1578] rust: block: do not use removed queue flag API `blk_queue_flag_set` and `blk_queue_flag_clear` was removed in favor of a new API. This caused a build error for Rust block device abstractions. Thus, use the new feature passing API instead of the old removed API. Fixes: bd4a633b6f7c ("block: move the nonrot flag to queue_limits") Signed-off-by: Andreas Hindborg Link: https://lore.kernel.org/r/20240620085721.1218296-1-nmi@metaspace.dk Signed-off-by: Jens Axboe --- rust/kernel/block/mq/gen_disk.rs | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/rust/kernel/block/mq/gen_disk.rs b/rust/kernel/block/mq/gen_disk.rs index e06044b549e0be..f548a619984778 100644 --- a/rust/kernel/block/mq/gen_disk.rs +++ b/rust/kernel/block/mq/gen_disk.rs @@ -100,6 +100,9 @@ impl GenDiskBuilder { lim.logical_block_size = self.logical_block_size; lim.physical_block_size = self.physical_block_size; + if self.rotational { + lim.features = bindings::BLK_FEAT_ROTATIONAL; + } // SAFETY: `tagset.raw_tag_set()` points to a valid and initialized tag set let gendisk = from_err_ptr(unsafe { @@ -152,20 +155,6 @@ impl GenDiskBuilder { // operation, so we will not race. unsafe { bindings::set_capacity(gendisk, self.capacity_sectors) }; - if !self.rotational { - // SAFETY: `gendisk` points to a valid and initialized instance of - // `struct gendisk`. This operation uses a relaxed atomic bit flip - // operation, so there is no race on this field. - unsafe { bindings::blk_queue_flag_set(bindings::QUEUE_FLAG_NONROT, (*gendisk).queue) }; - } else { - // SAFETY: `gendisk` points to a valid and initialized instance of - // `struct gendisk`. This operation uses a relaxed atomic bit flip - // operation, so there is no race on this field. - unsafe { - bindings::blk_queue_flag_clear(bindings::QUEUE_FLAG_NONROT, (*gendisk).queue) - }; - } - crate::error::to_result( // SAFETY: `gendisk` points to a valid and initialized instance of // `struct gendisk`. From 33dfafa90285c0873a24d633877d505ab8e3fc20 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 19 Jun 2024 09:55:48 -0400 Subject: [PATCH 1209/1578] bcachefs: Fix safe errors by default i.e. the start of automatic self healing: If errors=continue or fix_safe, we now automatically fix simple errors without user intervention. New error action option: fix_safe This replaces the existing errors=ro option, which gets a new slot, i.e. existing errors=ro users now get errors=fix_safe. This is currently only enabled for a limited set of errors - initially just disk accounting; errors we would never not want to fix, and we don't want to require user intervention (i.e. to make sure a bug report gets filed). Errors will still be counted in the superblock, so we (developers) will still know they've been occuring if a bug report gets filed (as bug reports typically include the errors superblock section). Eventually we'll be enabling this for a much wider set of errors, after we've done thorough error injection testing. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 5 +- fs/bcachefs/error.c | 19 +- fs/bcachefs/error.h | 7 - fs/bcachefs/opts.h | 2 +- fs/bcachefs/sb-errors_format.h | 564 +++++++++++++++++---------------- 5 files changed, 308 insertions(+), 289 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 4b98fed1ee9a4c..e3b1bde489c3b0 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -990,8 +990,9 @@ enum bch_version_upgrade_opts { #define BCH_ERROR_ACTIONS() \ x(continue, 0) \ - x(ro, 1) \ - x(panic, 2) + x(fix_safe, 1) \ + x(panic, 2) \ + x(ro, 3) enum bch_error_actions { #define x(t, n) BCH_ON_ERROR_##t = n, diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index c66eeffcd7f2a8..d95c40f1b6af8d 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -15,6 +15,7 @@ bool bch2_inconsistent_error(struct bch_fs *c) switch (c->opts.errors) { case BCH_ON_ERROR_continue: return false; + case BCH_ON_ERROR_fix_safe: case BCH_ON_ERROR_ro: if (bch2_fs_emergency_read_only(c)) bch_err(c, "inconsistency detected - emergency read only at journal seq %llu", @@ -191,6 +192,12 @@ static void prt_actioning(struct printbuf *out, const char *action) prt_str(out, "ing"); } +static const u8 fsck_flags_extra[] = { +#define x(t, n, flags) [BCH_FSCK_ERR_##t] = flags, + BCH_SB_ERRS() +#undef x +}; + int bch2_fsck_err(struct bch_fs *c, enum bch_fsck_flags flags, enum bch_sb_error_id err, @@ -203,6 +210,9 @@ int bch2_fsck_err(struct bch_fs *c, int ret = -BCH_ERR_fsck_ignore; const char *action_orig = "fix?", *action = action_orig; + if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) + flags |= fsck_flags_extra[err]; + if ((flags & FSCK_CAN_FIX) && test_bit(err, c->sb.errors_silent)) return -BCH_ERR_fsck_fix; @@ -265,7 +275,14 @@ int bch2_fsck_err(struct bch_fs *c, prt_printf(out, bch2_log_msg(c, "")); #endif - if (!test_bit(BCH_FS_fsck_running, &c->flags)) { + if ((flags & FSCK_CAN_FIX) && + (flags & FSCK_AUTOFIX) && + (c->opts.errors == BCH_ON_ERROR_continue || + c->opts.errors == BCH_ON_ERROR_fix_safe)) { + prt_str(out, ", "); + prt_actioning(out, action); + ret = -BCH_ERR_fsck_fix; + } else if (!test_bit(BCH_FS_fsck_running, &c->flags)) { if (c->opts.errors != BCH_ON_ERROR_continue || !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { prt_str(out, ", shutting down"); diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 36caedf72d89ab..777711504c35ce 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -108,13 +108,6 @@ struct fsck_err_state { char *last_msg; }; -enum bch_fsck_flags { - FSCK_CAN_FIX = 1 << 0, - FSCK_CAN_IGNORE = 1 << 1, - FSCK_NEED_FSCK = 1 << 2, - FSCK_NO_RATELIMIT = 1 << 3, -}; - #define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err) __printf(4, 5) __cold diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 25530e0bb2f386..b197ec90d4cb02 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -137,7 +137,7 @@ enum fsck_err_opts { x(errors, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_error_actions), \ - BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \ + BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \ NULL, "Action to take on filesystem error") \ x(metadata_replicas, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 1768e5c49f999e..d6f35a99c4291c 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -2,286 +2,294 @@ #ifndef _BCACHEFS_SB_ERRORS_FORMAT_H #define _BCACHEFS_SB_ERRORS_FORMAT_H -#define BCH_SB_ERRS() \ - x(clean_but_journal_not_empty, 0) \ - x(dirty_but_no_journal_entries, 1) \ - x(dirty_but_no_journal_entries_post_drop_nonflushes, 2) \ - x(sb_clean_journal_seq_mismatch, 3) \ - x(sb_clean_btree_root_mismatch, 4) \ - x(sb_clean_missing, 5) \ - x(jset_unsupported_version, 6) \ - x(jset_unknown_csum, 7) \ - x(jset_last_seq_newer_than_seq, 8) \ - x(jset_past_bucket_end, 9) \ - x(jset_seq_blacklisted, 10) \ - x(journal_entries_missing, 11) \ - x(journal_entry_replicas_not_marked, 12) \ - x(journal_entry_past_jset_end, 13) \ - x(journal_entry_replicas_data_mismatch, 14) \ - x(journal_entry_bkey_u64s_0, 15) \ - x(journal_entry_bkey_past_end, 16) \ - x(journal_entry_bkey_bad_format, 17) \ - x(journal_entry_bkey_invalid, 18) \ - x(journal_entry_btree_root_bad_size, 19) \ - x(journal_entry_blacklist_bad_size, 20) \ - x(journal_entry_blacklist_v2_bad_size, 21) \ - x(journal_entry_blacklist_v2_start_past_end, 22) \ - x(journal_entry_usage_bad_size, 23) \ - x(journal_entry_data_usage_bad_size, 24) \ - x(journal_entry_clock_bad_size, 25) \ - x(journal_entry_clock_bad_rw, 26) \ - x(journal_entry_dev_usage_bad_size, 27) \ - x(journal_entry_dev_usage_bad_dev, 28) \ - x(journal_entry_dev_usage_bad_pad, 29) \ - x(btree_node_unreadable, 30) \ - x(btree_node_fault_injected, 31) \ - x(btree_node_bad_magic, 32) \ - x(btree_node_bad_seq, 33) \ - x(btree_node_unsupported_version, 34) \ - x(btree_node_bset_older_than_sb_min, 35) \ - x(btree_node_bset_newer_than_sb, 36) \ - x(btree_node_data_missing, 37) \ - x(btree_node_bset_after_end, 38) \ - x(btree_node_replicas_sectors_written_mismatch, 39) \ - x(btree_node_replicas_data_mismatch, 40) \ - x(bset_unknown_csum, 41) \ - x(bset_bad_csum, 42) \ - x(bset_past_end_of_btree_node, 43) \ - x(bset_wrong_sector_offset, 44) \ - x(bset_empty, 45) \ - x(bset_bad_seq, 46) \ - x(bset_blacklisted_journal_seq, 47) \ - x(first_bset_blacklisted_journal_seq, 48) \ - x(btree_node_bad_btree, 49) \ - x(btree_node_bad_level, 50) \ - x(btree_node_bad_min_key, 51) \ - x(btree_node_bad_max_key, 52) \ - x(btree_node_bad_format, 53) \ - x(btree_node_bkey_past_bset_end, 54) \ - x(btree_node_bkey_bad_format, 55) \ - x(btree_node_bad_bkey, 56) \ - x(btree_node_bkey_out_of_order, 57) \ - x(btree_root_bkey_invalid, 58) \ - x(btree_root_read_error, 59) \ - x(btree_root_bad_min_key, 60) \ - x(btree_root_bad_max_key, 61) \ - x(btree_node_read_error, 62) \ - x(btree_node_topology_bad_min_key, 63) \ - x(btree_node_topology_bad_max_key, 64) \ - x(btree_node_topology_overwritten_by_prev_node, 65) \ - x(btree_node_topology_overwritten_by_next_node, 66) \ - x(btree_node_topology_interior_node_empty, 67) \ - x(fs_usage_hidden_wrong, 68) \ - x(fs_usage_btree_wrong, 69) \ - x(fs_usage_data_wrong, 70) \ - x(fs_usage_cached_wrong, 71) \ - x(fs_usage_reserved_wrong, 72) \ - x(fs_usage_persistent_reserved_wrong, 73) \ - x(fs_usage_nr_inodes_wrong, 74) \ - x(fs_usage_replicas_wrong, 75) \ - x(dev_usage_buckets_wrong, 76) \ - x(dev_usage_sectors_wrong, 77) \ - x(dev_usage_fragmented_wrong, 78) \ - x(dev_usage_buckets_ec_wrong, 79) \ - x(bkey_version_in_future, 80) \ - x(bkey_u64s_too_small, 81) \ - x(bkey_invalid_type_for_btree, 82) \ - x(bkey_extent_size_zero, 83) \ - x(bkey_extent_size_greater_than_offset, 84) \ - x(bkey_size_nonzero, 85) \ - x(bkey_snapshot_nonzero, 86) \ - x(bkey_snapshot_zero, 87) \ - x(bkey_at_pos_max, 88) \ - x(bkey_before_start_of_btree_node, 89) \ - x(bkey_after_end_of_btree_node, 90) \ - x(bkey_val_size_nonzero, 91) \ - x(bkey_val_size_too_small, 92) \ - x(alloc_v1_val_size_bad, 93) \ - x(alloc_v2_unpack_error, 94) \ - x(alloc_v3_unpack_error, 95) \ - x(alloc_v4_val_size_bad, 96) \ - x(alloc_v4_backpointers_start_bad, 97) \ - x(alloc_key_data_type_bad, 98) \ - x(alloc_key_empty_but_have_data, 99) \ - x(alloc_key_dirty_sectors_0, 100) \ - x(alloc_key_data_type_inconsistency, 101) \ - x(alloc_key_to_missing_dev_bucket, 102) \ - x(alloc_key_cached_inconsistency, 103) \ - x(alloc_key_cached_but_read_time_zero, 104) \ - x(alloc_key_to_missing_lru_entry, 105) \ - x(alloc_key_data_type_wrong, 106) \ - x(alloc_key_gen_wrong, 107) \ - x(alloc_key_dirty_sectors_wrong, 108) \ - x(alloc_key_cached_sectors_wrong, 109) \ - x(alloc_key_stripe_wrong, 110) \ - x(alloc_key_stripe_redundancy_wrong, 111) \ - x(bucket_sector_count_overflow, 112) \ - x(bucket_metadata_type_mismatch, 113) \ - x(need_discard_key_wrong, 114) \ - x(freespace_key_wrong, 115) \ - x(freespace_hole_missing, 116) \ - x(bucket_gens_val_size_bad, 117) \ - x(bucket_gens_key_wrong, 118) \ - x(bucket_gens_hole_wrong, 119) \ - x(bucket_gens_to_invalid_dev, 120) \ - x(bucket_gens_to_invalid_buckets, 121) \ - x(bucket_gens_nonzero_for_invalid_buckets, 122) \ - x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \ - x(need_discard_freespace_key_bad, 124) \ - x(backpointer_bucket_offset_wrong, 125) \ - x(backpointer_to_missing_device, 126) \ - x(backpointer_to_missing_alloc, 127) \ - x(backpointer_to_missing_ptr, 128) \ - x(lru_entry_at_time_0, 129) \ - x(lru_entry_to_invalid_bucket, 130) \ - x(lru_entry_bad, 131) \ - x(btree_ptr_val_too_big, 132) \ - x(btree_ptr_v2_val_too_big, 133) \ - x(btree_ptr_has_non_ptr, 134) \ - x(extent_ptrs_invalid_entry, 135) \ - x(extent_ptrs_no_ptrs, 136) \ - x(extent_ptrs_too_many_ptrs, 137) \ - x(extent_ptrs_redundant_crc, 138) \ - x(extent_ptrs_redundant_stripe, 139) \ - x(extent_ptrs_unwritten, 140) \ - x(extent_ptrs_written_and_unwritten, 141) \ - x(ptr_to_invalid_device, 142) \ - x(ptr_to_duplicate_device, 143) \ - x(ptr_after_last_bucket, 144) \ - x(ptr_before_first_bucket, 145) \ - x(ptr_spans_multiple_buckets, 146) \ - x(ptr_to_missing_backpointer, 147) \ - x(ptr_to_missing_alloc_key, 148) \ - x(ptr_to_missing_replicas_entry, 149) \ - x(ptr_to_missing_stripe, 150) \ - x(ptr_to_incorrect_stripe, 151) \ - x(ptr_gen_newer_than_bucket_gen, 152) \ - x(ptr_too_stale, 153) \ - x(stale_dirty_ptr, 154) \ - x(ptr_bucket_data_type_mismatch, 155) \ - x(ptr_cached_and_erasure_coded, 156) \ - x(ptr_crc_uncompressed_size_too_small, 157) \ - x(ptr_crc_csum_type_unknown, 158) \ - x(ptr_crc_compression_type_unknown, 159) \ - x(ptr_crc_redundant, 160) \ - x(ptr_crc_uncompressed_size_too_big, 161) \ - x(ptr_crc_nonce_mismatch, 162) \ - x(ptr_stripe_redundant, 163) \ - x(reservation_key_nr_replicas_invalid, 164) \ - x(reflink_v_refcount_wrong, 165) \ - x(reflink_p_to_missing_reflink_v, 166) \ - x(stripe_pos_bad, 167) \ - x(stripe_val_size_bad, 168) \ - x(stripe_sector_count_wrong, 169) \ - x(snapshot_tree_pos_bad, 170) \ - x(snapshot_tree_to_missing_snapshot, 171) \ - x(snapshot_tree_to_missing_subvol, 172) \ - x(snapshot_tree_to_wrong_subvol, 173) \ - x(snapshot_tree_to_snapshot_subvol, 174) \ - x(snapshot_pos_bad, 175) \ - x(snapshot_parent_bad, 176) \ - x(snapshot_children_not_normalized, 177) \ - x(snapshot_child_duplicate, 178) \ - x(snapshot_child_bad, 179) \ - x(snapshot_skiplist_not_normalized, 180) \ - x(snapshot_skiplist_bad, 181) \ - x(snapshot_should_not_have_subvol, 182) \ - x(snapshot_to_bad_snapshot_tree, 183) \ - x(snapshot_bad_depth, 184) \ - x(snapshot_bad_skiplist, 185) \ - x(subvol_pos_bad, 186) \ - x(subvol_not_master_and_not_snapshot, 187) \ - x(subvol_to_missing_root, 188) \ - x(subvol_root_wrong_bi_subvol, 189) \ - x(bkey_in_missing_snapshot, 190) \ - x(inode_pos_inode_nonzero, 191) \ - x(inode_pos_blockdev_range, 192) \ - x(inode_unpack_error, 193) \ - x(inode_str_hash_invalid, 194) \ - x(inode_v3_fields_start_bad, 195) \ - x(inode_snapshot_mismatch, 196) \ - x(inode_unlinked_but_clean, 197) \ - x(inode_unlinked_but_nlink_nonzero, 198) \ - x(inode_checksum_type_invalid, 199) \ - x(inode_compression_type_invalid, 200) \ - x(inode_subvol_root_but_not_dir, 201) \ - x(inode_i_size_dirty_but_clean, 202) \ - x(inode_i_sectors_dirty_but_clean, 203) \ - x(inode_i_sectors_wrong, 204) \ - x(inode_dir_wrong_nlink, 205) \ - x(inode_dir_multiple_links, 206) \ - x(inode_multiple_links_but_nlink_0, 207) \ - x(inode_wrong_backpointer, 208) \ - x(inode_wrong_nlink, 209) \ - x(inode_unreachable, 210) \ - x(deleted_inode_but_clean, 211) \ - x(deleted_inode_missing, 212) \ - x(deleted_inode_is_dir, 213) \ - x(deleted_inode_not_unlinked, 214) \ - x(extent_overlapping, 215) \ - x(extent_in_missing_inode, 216) \ - x(extent_in_non_reg_inode, 217) \ - x(extent_past_end_of_inode, 218) \ - x(dirent_empty_name, 219) \ - x(dirent_val_too_big, 220) \ - x(dirent_name_too_long, 221) \ - x(dirent_name_embedded_nul, 222) \ - x(dirent_name_dot_or_dotdot, 223) \ - x(dirent_name_has_slash, 224) \ - x(dirent_d_type_wrong, 225) \ - x(inode_bi_parent_wrong, 226) \ - x(dirent_in_missing_dir_inode, 227) \ - x(dirent_in_non_dir_inode, 228) \ - x(dirent_to_missing_inode, 229) \ - x(dirent_to_missing_subvol, 230) \ - x(dirent_to_itself, 231) \ - x(quota_type_invalid, 232) \ - x(xattr_val_size_too_small, 233) \ - x(xattr_val_size_too_big, 234) \ - x(xattr_invalid_type, 235) \ - x(xattr_name_invalid_chars, 236) \ - x(xattr_in_missing_inode, 237) \ - x(root_subvol_missing, 238) \ - x(root_dir_missing, 239) \ - x(root_inode_not_dir, 240) \ - x(dir_loop, 241) \ - x(hash_table_key_duplicate, 242) \ - x(hash_table_key_wrong_offset, 243) \ - x(unlinked_inode_not_on_deleted_list, 244) \ - x(reflink_p_front_pad_bad, 245) \ - x(journal_entry_dup_same_device, 246) \ - x(inode_bi_subvol_missing, 247) \ - x(inode_bi_subvol_wrong, 248) \ - x(inode_points_to_missing_dirent, 249) \ - x(inode_points_to_wrong_dirent, 250) \ - x(inode_bi_parent_nonzero, 251) \ - x(dirent_to_missing_parent_subvol, 252) \ - x(dirent_not_visible_in_parent_subvol, 253) \ - x(subvol_fs_path_parent_wrong, 254) \ - x(subvol_root_fs_path_parent_nonzero, 255) \ - x(subvol_children_not_set, 256) \ - x(subvol_children_bad, 257) \ - x(subvol_loop, 258) \ - x(subvol_unreachable, 259) \ - x(btree_node_bkey_bad_u64s, 260) \ - x(btree_node_topology_empty_interior_node, 261) \ - x(btree_ptr_v2_min_key_bad, 262) \ - x(btree_root_unreadable_and_scan_found_nothing, 263) \ - x(snapshot_node_missing, 264) \ - x(dup_backpointer_to_bad_csum_extent, 265) \ - x(btree_bitmap_not_marked, 266) \ - x(sb_clean_entry_overrun, 267) \ - x(btree_ptr_v2_written_0, 268) \ - x(subvol_snapshot_bad, 269) \ - x(subvol_inode_bad, 270) \ - x(alloc_key_stripe_sectors_wrong, 271) \ - x(accounting_mismatch, 272) \ - x(accounting_replicas_not_marked, 273) \ - x(invalid_btree_id, 274) \ - x(alloc_key_io_time_bad, 275) +enum bch_fsck_flags { + FSCK_CAN_FIX = 1 << 0, + FSCK_CAN_IGNORE = 1 << 1, + FSCK_NEED_FSCK = 1 << 2, + FSCK_NO_RATELIMIT = 1 << 3, + FSCK_AUTOFIX = 1 << 4, +}; + +#define BCH_SB_ERRS() \ + x(clean_but_journal_not_empty, 0, 0) \ + x(dirty_but_no_journal_entries, 1, 0) \ + x(dirty_but_no_journal_entries_post_drop_nonflushes, 2, 0) \ + x(sb_clean_journal_seq_mismatch, 3, 0) \ + x(sb_clean_btree_root_mismatch, 4, 0) \ + x(sb_clean_missing, 5, 0) \ + x(jset_unsupported_version, 6, 0) \ + x(jset_unknown_csum, 7, 0) \ + x(jset_last_seq_newer_than_seq, 8, 0) \ + x(jset_past_bucket_end, 9, 0) \ + x(jset_seq_blacklisted, 10, 0) \ + x(journal_entries_missing, 11, 0) \ + x(journal_entry_replicas_not_marked, 12, 0) \ + x(journal_entry_past_jset_end, 13, 0) \ + x(journal_entry_replicas_data_mismatch, 14, 0) \ + x(journal_entry_bkey_u64s_0, 15, 0) \ + x(journal_entry_bkey_past_end, 16, 0) \ + x(journal_entry_bkey_bad_format, 17, 0) \ + x(journal_entry_bkey_invalid, 18, 0) \ + x(journal_entry_btree_root_bad_size, 19, 0) \ + x(journal_entry_blacklist_bad_size, 20, 0) \ + x(journal_entry_blacklist_v2_bad_size, 21, 0) \ + x(journal_entry_blacklist_v2_start_past_end, 22, 0) \ + x(journal_entry_usage_bad_size, 23, 0) \ + x(journal_entry_data_usage_bad_size, 24, 0) \ + x(journal_entry_clock_bad_size, 25, 0) \ + x(journal_entry_clock_bad_rw, 26, 0) \ + x(journal_entry_dev_usage_bad_size, 27, 0) \ + x(journal_entry_dev_usage_bad_dev, 28, 0) \ + x(journal_entry_dev_usage_bad_pad, 29, 0) \ + x(btree_node_unreadable, 30, 0) \ + x(btree_node_fault_injected, 31, 0) \ + x(btree_node_bad_magic, 32, 0) \ + x(btree_node_bad_seq, 33, 0) \ + x(btree_node_unsupported_version, 34, 0) \ + x(btree_node_bset_older_than_sb_min, 35, 0) \ + x(btree_node_bset_newer_than_sb, 36, 0) \ + x(btree_node_data_missing, 37, 0) \ + x(btree_node_bset_after_end, 38, 0) \ + x(btree_node_replicas_sectors_written_mismatch, 39, 0) \ + x(btree_node_replicas_data_mismatch, 40, 0) \ + x(bset_unknown_csum, 41, 0) \ + x(bset_bad_csum, 42, 0) \ + x(bset_past_end_of_btree_node, 43, 0) \ + x(bset_wrong_sector_offset, 44, 0) \ + x(bset_empty, 45, 0) \ + x(bset_bad_seq, 46, 0) \ + x(bset_blacklisted_journal_seq, 47, 0) \ + x(first_bset_blacklisted_journal_seq, 48, 0) \ + x(btree_node_bad_btree, 49, 0) \ + x(btree_node_bad_level, 50, 0) \ + x(btree_node_bad_min_key, 51, 0) \ + x(btree_node_bad_max_key, 52, 0) \ + x(btree_node_bad_format, 53, 0) \ + x(btree_node_bkey_past_bset_end, 54, 0) \ + x(btree_node_bkey_bad_format, 55, 0) \ + x(btree_node_bad_bkey, 56, 0) \ + x(btree_node_bkey_out_of_order, 57, 0) \ + x(btree_root_bkey_invalid, 58, 0) \ + x(btree_root_read_error, 59, 0) \ + x(btree_root_bad_min_key, 60, 0) \ + x(btree_root_bad_max_key, 61, 0) \ + x(btree_node_read_error, 62, 0) \ + x(btree_node_topology_bad_min_key, 63, 0) \ + x(btree_node_topology_bad_max_key, 64, 0) \ + x(btree_node_topology_overwritten_by_prev_node, 65, 0) \ + x(btree_node_topology_overwritten_by_next_node, 66, 0) \ + x(btree_node_topology_interior_node_empty, 67, 0) \ + x(fs_usage_hidden_wrong, 68, FSCK_AUTOFIX) \ + x(fs_usage_btree_wrong, 69, FSCK_AUTOFIX) \ + x(fs_usage_data_wrong, 70, FSCK_AUTOFIX) \ + x(fs_usage_cached_wrong, 71, FSCK_AUTOFIX) \ + x(fs_usage_reserved_wrong, 72, FSCK_AUTOFIX) \ + x(fs_usage_persistent_reserved_wrong, 73, FSCK_AUTOFIX) \ + x(fs_usage_nr_inodes_wrong, 74, FSCK_AUTOFIX) \ + x(fs_usage_replicas_wrong, 75, FSCK_AUTOFIX) \ + x(dev_usage_buckets_wrong, 76, FSCK_AUTOFIX) \ + x(dev_usage_sectors_wrong, 77, FSCK_AUTOFIX) \ + x(dev_usage_fragmented_wrong, 78, FSCK_AUTOFIX) \ + x(dev_usage_buckets_ec_wrong, 79, FSCK_AUTOFIX) \ + x(bkey_version_in_future, 80, 0) \ + x(bkey_u64s_too_small, 81, 0) \ + x(bkey_invalid_type_for_btree, 82, 0) \ + x(bkey_extent_size_zero, 83, 0) \ + x(bkey_extent_size_greater_than_offset, 84, 0) \ + x(bkey_size_nonzero, 85, 0) \ + x(bkey_snapshot_nonzero, 86, 0) \ + x(bkey_snapshot_zero, 87, 0) \ + x(bkey_at_pos_max, 88, 0) \ + x(bkey_before_start_of_btree_node, 89, 0) \ + x(bkey_after_end_of_btree_node, 90, 0) \ + x(bkey_val_size_nonzero, 91, 0) \ + x(bkey_val_size_too_small, 92, 0) \ + x(alloc_v1_val_size_bad, 93, 0) \ + x(alloc_v2_unpack_error, 94, 0) \ + x(alloc_v3_unpack_error, 95, 0) \ + x(alloc_v4_val_size_bad, 96, 0) \ + x(alloc_v4_backpointers_start_bad, 97, 0) \ + x(alloc_key_data_type_bad, 98, 0) \ + x(alloc_key_empty_but_have_data, 99, 0) \ + x(alloc_key_dirty_sectors_0, 100, 0) \ + x(alloc_key_data_type_inconsistency, 101, 0) \ + x(alloc_key_to_missing_dev_bucket, 102, 0) \ + x(alloc_key_cached_inconsistency, 103, 0) \ + x(alloc_key_cached_but_read_time_zero, 104, 0) \ + x(alloc_key_to_missing_lru_entry, 105, 0) \ + x(alloc_key_data_type_wrong, 106, FSCK_AUTOFIX) \ + x(alloc_key_gen_wrong, 107, FSCK_AUTOFIX) \ + x(alloc_key_dirty_sectors_wrong, 108, FSCK_AUTOFIX) \ + x(alloc_key_cached_sectors_wrong, 109, FSCK_AUTOFIX) \ + x(alloc_key_stripe_wrong, 110, FSCK_AUTOFIX) \ + x(alloc_key_stripe_redundancy_wrong, 111, FSCK_AUTOFIX) \ + x(bucket_sector_count_overflow, 112, 0) \ + x(bucket_metadata_type_mismatch, 113, 0) \ + x(need_discard_key_wrong, 114, 0) \ + x(freespace_key_wrong, 115, 0) \ + x(freespace_hole_missing, 116, 0) \ + x(bucket_gens_val_size_bad, 117, 0) \ + x(bucket_gens_key_wrong, 118, 0) \ + x(bucket_gens_hole_wrong, 119, 0) \ + x(bucket_gens_to_invalid_dev, 120, 0) \ + x(bucket_gens_to_invalid_buckets, 121, 0) \ + x(bucket_gens_nonzero_for_invalid_buckets, 122, 0) \ + x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \ + x(need_discard_freespace_key_bad, 124, 0) \ + x(backpointer_bucket_offset_wrong, 125, 0) \ + x(backpointer_to_missing_device, 126, 0) \ + x(backpointer_to_missing_alloc, 127, 0) \ + x(backpointer_to_missing_ptr, 128, 0) \ + x(lru_entry_at_time_0, 129, 0) \ + x(lru_entry_to_invalid_bucket, 130, 0) \ + x(lru_entry_bad, 131, 0) \ + x(btree_ptr_val_too_big, 132, 0) \ + x(btree_ptr_v2_val_too_big, 133, 0) \ + x(btree_ptr_has_non_ptr, 134, 0) \ + x(extent_ptrs_invalid_entry, 135, 0) \ + x(extent_ptrs_no_ptrs, 136, 0) \ + x(extent_ptrs_too_many_ptrs, 137, 0) \ + x(extent_ptrs_redundant_crc, 138, 0) \ + x(extent_ptrs_redundant_stripe, 139, 0) \ + x(extent_ptrs_unwritten, 140, 0) \ + x(extent_ptrs_written_and_unwritten, 141, 0) \ + x(ptr_to_invalid_device, 142, 0) \ + x(ptr_to_duplicate_device, 143, 0) \ + x(ptr_after_last_bucket, 144, 0) \ + x(ptr_before_first_bucket, 145, 0) \ + x(ptr_spans_multiple_buckets, 146, 0) \ + x(ptr_to_missing_backpointer, 147, 0) \ + x(ptr_to_missing_alloc_key, 148, 0) \ + x(ptr_to_missing_replicas_entry, 149, 0) \ + x(ptr_to_missing_stripe, 150, 0) \ + x(ptr_to_incorrect_stripe, 151, 0) \ + x(ptr_gen_newer_than_bucket_gen, 152, 0) \ + x(ptr_too_stale, 153, 0) \ + x(stale_dirty_ptr, 154, 0) \ + x(ptr_bucket_data_type_mismatch, 155, 0) \ + x(ptr_cached_and_erasure_coded, 156, 0) \ + x(ptr_crc_uncompressed_size_too_small, 157, 0) \ + x(ptr_crc_csum_type_unknown, 158, 0) \ + x(ptr_crc_compression_type_unknown, 159, 0) \ + x(ptr_crc_redundant, 160, 0) \ + x(ptr_crc_uncompressed_size_too_big, 161, 0) \ + x(ptr_crc_nonce_mismatch, 162, 0) \ + x(ptr_stripe_redundant, 163, 0) \ + x(reservation_key_nr_replicas_invalid, 164, 0) \ + x(reflink_v_refcount_wrong, 165, 0) \ + x(reflink_p_to_missing_reflink_v, 166, 0) \ + x(stripe_pos_bad, 167, 0) \ + x(stripe_val_size_bad, 168, 0) \ + x(stripe_sector_count_wrong, 169, 0) \ + x(snapshot_tree_pos_bad, 170, 0) \ + x(snapshot_tree_to_missing_snapshot, 171, 0) \ + x(snapshot_tree_to_missing_subvol, 172, 0) \ + x(snapshot_tree_to_wrong_subvol, 173, 0) \ + x(snapshot_tree_to_snapshot_subvol, 174, 0) \ + x(snapshot_pos_bad, 175, 0) \ + x(snapshot_parent_bad, 176, 0) \ + x(snapshot_children_not_normalized, 177, 0) \ + x(snapshot_child_duplicate, 178, 0) \ + x(snapshot_child_bad, 179, 0) \ + x(snapshot_skiplist_not_normalized, 180, 0) \ + x(snapshot_skiplist_bad, 181, 0) \ + x(snapshot_should_not_have_subvol, 182, 0) \ + x(snapshot_to_bad_snapshot_tree, 183, 0) \ + x(snapshot_bad_depth, 184, 0) \ + x(snapshot_bad_skiplist, 185, 0) \ + x(subvol_pos_bad, 186, 0) \ + x(subvol_not_master_and_not_snapshot, 187, 0) \ + x(subvol_to_missing_root, 188, 0) \ + x(subvol_root_wrong_bi_subvol, 189, 0) \ + x(bkey_in_missing_snapshot, 190, 0) \ + x(inode_pos_inode_nonzero, 191, 0) \ + x(inode_pos_blockdev_range, 192, 0) \ + x(inode_unpack_error, 193, 0) \ + x(inode_str_hash_invalid, 194, 0) \ + x(inode_v3_fields_start_bad, 195, 0) \ + x(inode_snapshot_mismatch, 196, 0) \ + x(inode_unlinked_but_clean, 197, 0) \ + x(inode_unlinked_but_nlink_nonzero, 198, 0) \ + x(inode_checksum_type_invalid, 199, 0) \ + x(inode_compression_type_invalid, 200, 0) \ + x(inode_subvol_root_but_not_dir, 201, 0) \ + x(inode_i_size_dirty_but_clean, 202, 0) \ + x(inode_i_sectors_dirty_but_clean, 203, 0) \ + x(inode_i_sectors_wrong, 204, 0) \ + x(inode_dir_wrong_nlink, 205, 0) \ + x(inode_dir_multiple_links, 206, 0) \ + x(inode_multiple_links_but_nlink_0, 207, 0) \ + x(inode_wrong_backpointer, 208, 0) \ + x(inode_wrong_nlink, 209, 0) \ + x(inode_unreachable, 210, 0) \ + x(deleted_inode_but_clean, 211, 0) \ + x(deleted_inode_missing, 212, 0) \ + x(deleted_inode_is_dir, 213, 0) \ + x(deleted_inode_not_unlinked, 214, 0) \ + x(extent_overlapping, 215, 0) \ + x(extent_in_missing_inode, 216, 0) \ + x(extent_in_non_reg_inode, 217, 0) \ + x(extent_past_end_of_inode, 218, 0) \ + x(dirent_empty_name, 219, 0) \ + x(dirent_val_too_big, 220, 0) \ + x(dirent_name_too_long, 221, 0) \ + x(dirent_name_embedded_nul, 222, 0) \ + x(dirent_name_dot_or_dotdot, 223, 0) \ + x(dirent_name_has_slash, 224, 0) \ + x(dirent_d_type_wrong, 225, 0) \ + x(inode_bi_parent_wrong, 226, 0) \ + x(dirent_in_missing_dir_inode, 227, 0) \ + x(dirent_in_non_dir_inode, 228, 0) \ + x(dirent_to_missing_inode, 229, 0) \ + x(dirent_to_missing_subvol, 230, 0) \ + x(dirent_to_itself, 231, 0) \ + x(quota_type_invalid, 232, 0) \ + x(xattr_val_size_too_small, 233, 0) \ + x(xattr_val_size_too_big, 234, 0) \ + x(xattr_invalid_type, 235, 0) \ + x(xattr_name_invalid_chars, 236, 0) \ + x(xattr_in_missing_inode, 237, 0) \ + x(root_subvol_missing, 238, 0) \ + x(root_dir_missing, 239, 0) \ + x(root_inode_not_dir, 240, 0) \ + x(dir_loop, 241, 0) \ + x(hash_table_key_duplicate, 242, 0) \ + x(hash_table_key_wrong_offset, 243, 0) \ + x(unlinked_inode_not_on_deleted_list, 244, 0) \ + x(reflink_p_front_pad_bad, 245, 0) \ + x(journal_entry_dup_same_device, 246, 0) \ + x(inode_bi_subvol_missing, 247, 0) \ + x(inode_bi_subvol_wrong, 248, 0) \ + x(inode_points_to_missing_dirent, 249, 0) \ + x(inode_points_to_wrong_dirent, 250, 0) \ + x(inode_bi_parent_nonzero, 251, 0) \ + x(dirent_to_missing_parent_subvol, 252, 0) \ + x(dirent_not_visible_in_parent_subvol, 253, 0) \ + x(subvol_fs_path_parent_wrong, 254, 0) \ + x(subvol_root_fs_path_parent_nonzero, 255, 0) \ + x(subvol_children_not_set, 256, 0) \ + x(subvol_children_bad, 257, 0) \ + x(subvol_loop, 258, 0) \ + x(subvol_unreachable, 259, 0) \ + x(btree_node_bkey_bad_u64s, 260, 0) \ + x(btree_node_topology_empty_interior_node, 261, 0) \ + x(btree_ptr_v2_min_key_bad, 262, 0) \ + x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \ + x(snapshot_node_missing, 264, 0) \ + x(dup_backpointer_to_bad_csum_extent, 265, 0) \ + x(btree_bitmap_not_marked, 266, 0) \ + x(sb_clean_entry_overrun, 267, 0) \ + x(btree_ptr_v2_written_0, 268, 0) \ + x(subvol_snapshot_bad, 269, 0) \ + x(subvol_inode_bad, 270, 0) \ + x(alloc_key_stripe_sectors_wrong, 271, 0) \ + x(accounting_mismatch, 272, 0) \ + x(accounting_replicas_not_marked, 273, 0) \ + x(invalid_btree_id, 274, 0) \ + x(alloc_key_io_time_bad, 275, 0) enum bch_sb_error_id { -#define x(t, n) BCH_FSCK_ERR_##t = n, +#define x(t, n, ...) BCH_FSCK_ERR_##t = n, BCH_SB_ERRS() #undef x BCH_SB_ERR_MAX From c6cab97cdfd14571a17b9453b1d339eaa3b77c0b Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Thu, 20 Jun 2024 09:22:42 +0800 Subject: [PATCH 1210/1578] bcachefs: fix alignment of VMA for memory mapped files on THP With CONFIG_READ_ONLY_THP_FOR_FS, the Linux kernel supports using THPs for read-only mmapped files, such as shared libraries. However, the kernel makes no attempt to actually align those mappings on 2MB boundaries, which makes it impossible to use those THPs most of the time. This issue applies to general file mapping THP as well as existing setups using CONFIG_READ_ONLY_THP_FOR_FS. This is easily fixed by using thp_get_unmapped_area for the unmapped_area function in bcachefs, which is what ext2, ext4, fuse, xfs and btrfs all use. Similar to commit b0c582233a85 ("btrfs: fix alignment of VMA for memory mapped files on THP"). Signed-off-by: Youling Tang Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 77126992dba8c2..8314d3e1582d31 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1157,6 +1157,7 @@ static const struct file_operations bch_file_operations = { .read_iter = bch2_read_iter, .write_iter = bch2_write_iter, .mmap = bch2_mmap, + .get_unmapped_area = thp_get_unmapped_area, .fsync = bch2_fsync, .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, From 8ad04409921f4c405859a651c9a9e5ff5eb5e8a9 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 18 Jun 2024 14:53:11 -0700 Subject: [PATCH 1211/1578] bnxt_en: Update firmware interface to 1.10.3.44 The relevant change is the max_tso_segs value returned by firmware in the HWRM_FUNC_QCAPS response. This value will be used in the next patch to cap the TSO segments. Signed-off-by: Michael Chan Link: https://lore.kernel.org/r/20240618215313.29631-2-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h | 311 ++++++++++-------- 1 file changed, 178 insertions(+), 133 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h index 06ea86c80be18a..f219709f956355 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h @@ -2,7 +2,7 @@ * * Copyright (c) 2014-2016 Broadcom Corporation * Copyright (c) 2014-2018 Broadcom Limited - * Copyright (c) 2018-2023 Broadcom Inc. + * Copyright (c) 2018-2024 Broadcom Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -500,7 +500,11 @@ struct cmd_nums { #define HWRM_TFC_IF_TBL_GET 0x399UL #define HWRM_TFC_TBL_SCOPE_CONFIG_GET 0x39aUL #define HWRM_TFC_RESC_USAGE_QUERY 0x39bUL + #define HWRM_QUEUE_PFCWD_TIMEOUT_QCAPS 0x39cUL + #define HWRM_QUEUE_PFCWD_TIMEOUT_CFG 0x39dUL + #define HWRM_QUEUE_PFCWD_TIMEOUT_QCFG 0x39eUL #define HWRM_SV 0x400UL + #define HWRM_DBG_LOG_BUFFER_FLUSH 0xff0fUL #define HWRM_DBG_READ_DIRECT 0xff10UL #define HWRM_DBG_READ_INDIRECT 0xff11UL #define HWRM_DBG_WRITE_DIRECT 0xff12UL @@ -609,8 +613,8 @@ struct hwrm_err_output { #define HWRM_VERSION_MAJOR 1 #define HWRM_VERSION_MINOR 10 #define HWRM_VERSION_UPDATE 3 -#define HWRM_VERSION_RSVD 39 -#define HWRM_VERSION_STR "1.10.3.39" +#define HWRM_VERSION_RSVD 44 +#define HWRM_VERSION_STR "1.10.3.44" /* hwrm_ver_get_input (size:192b/24B) */ struct hwrm_ver_get_input { @@ -664,6 +668,7 @@ struct hwrm_ver_get_output { #define VER_GET_RESP_DEV_CAPS_CFG_CFA_TFLIB_SUPPORTED 0x2000UL #define VER_GET_RESP_DEV_CAPS_CFG_CFA_TRUFLOW_SUPPORTED 0x4000UL #define VER_GET_RESP_DEV_CAPS_CFG_SECURE_BOOT_CAPABLE 0x8000UL + #define VER_GET_RESP_DEV_CAPS_CFG_SECURE_SOC_CAPABLE 0x10000UL u8 roce_fw_maj_8b; u8 roce_fw_min_8b; u8 roce_fw_bld_8b; @@ -843,7 +848,9 @@ struct hwrm_async_event_cmpl { #define ASYNC_EVENT_CMPL_EVENT_ID_HW_DOORBELL_RECOVERY_READ_ERROR 0x49UL #define ASYNC_EVENT_CMPL_EVENT_ID_CTX_ERROR 0x4aUL #define ASYNC_EVENT_CMPL_EVENT_ID_UDCC_SESSION_CHANGE 0x4bUL - #define ASYNC_EVENT_CMPL_EVENT_ID_MAX_RGTR_EVENT_ID 0x4cUL + #define ASYNC_EVENT_CMPL_EVENT_ID_DBG_BUF_PRODUCER 0x4cUL + #define ASYNC_EVENT_CMPL_EVENT_ID_PEER_MMAP_CHANGE 0x4dUL + #define ASYNC_EVENT_CMPL_EVENT_ID_MAX_RGTR_EVENT_ID 0x4eUL #define ASYNC_EVENT_CMPL_EVENT_ID_FW_TRACE_MSG 0xfeUL #define ASYNC_EVENT_CMPL_EVENT_ID_HWRM_ERROR 0xffUL #define ASYNC_EVENT_CMPL_EVENT_ID_LAST ASYNC_EVENT_CMPL_EVENT_ID_HWRM_ERROR @@ -1326,13 +1333,13 @@ struct hwrm_async_event_cmpl_error_report_base { u8 timestamp_lo; __le16 timestamp_hi; __le32 event_data1; - #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_MASK 0xffUL - #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_SFT 0 - #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_RESERVED 0x0UL - #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_PAUSE_STORM 0x1UL - #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_INVALID_SIGNAL 0x2UL - #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_NVM 0x3UL - #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DOORBELL_DROP_THRESHOLD 0x4UL + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_MASK 0xffUL + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_SFT 0 + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_RESERVED 0x0UL + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_PAUSE_STORM 0x1UL + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_INVALID_SIGNAL 0x2UL + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_NVM 0x3UL + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DOORBELL_DROP_THRESHOLD 0x4UL #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_THERMAL_THRESHOLD 0x5UL #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DUAL_DATA_RATE_NOT_SUPPORTED 0x6UL #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_LAST ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DUAL_DATA_RATE_NOT_SUPPORTED @@ -1814,6 +1821,9 @@ struct hwrm_func_qcaps_output { #define FUNC_QCAPS_RESP_FLAGS_EXT2_SW_MAX_RESOURCE_LIMITS_SUPPORTED 0x800000UL #define FUNC_QCAPS_RESP_FLAGS_EXT2_TF_INGRESS_NIC_FLOW_SUPPORTED 0x1000000UL #define FUNC_QCAPS_RESP_FLAGS_EXT2_LPBK_STATS_SUPPORTED 0x2000000UL + #define FUNC_QCAPS_RESP_FLAGS_EXT2_TF_EGRESS_NIC_FLOW_SUPPORTED 0x4000000UL + #define FUNC_QCAPS_RESP_FLAGS_EXT2_MULTI_LOSSLESS_QUEUES_SUPPORTED 0x8000000UL + #define FUNC_QCAPS_RESP_FLAGS_EXT2_PEER_MMAP_SUPPORTED 0x10000000UL __le16 tunnel_disable_flag; #define FUNC_QCAPS_RESP_TUNNEL_DISABLE_FLAG_DISABLE_VXLAN 0x1UL #define FUNC_QCAPS_RESP_TUNNEL_DISABLE_FLAG_DISABLE_NGE 0x2UL @@ -1828,7 +1838,7 @@ struct hwrm_func_qcaps_output { #define FUNC_QCAPS_RESP_XID_PARTITION_CAP_RX_CK 0x2UL u8 device_serial_number[8]; __le16 ctxs_per_partition; - u8 unused_2[2]; + __le16 max_tso_segs; __le32 roce_vf_max_av; __le32 roce_vf_max_cq; __le32 roce_vf_max_mrw; @@ -2449,6 +2459,7 @@ struct hwrm_func_drv_rgtr_input { #define FUNC_DRV_RGTR_REQ_FLAGS_NPAR_1_2_SUPPORT 0x200UL #define FUNC_DRV_RGTR_REQ_FLAGS_ASYM_QUEUE_CFG_SUPPORT 0x400UL #define FUNC_DRV_RGTR_REQ_FLAGS_TF_INGRESS_NIC_FLOW_MODE 0x800UL + #define FUNC_DRV_RGTR_REQ_FLAGS_TF_EGRESS_NIC_FLOW_MODE 0x1000UL __le32 enables; #define FUNC_DRV_RGTR_REQ_ENABLES_OS_TYPE 0x1UL #define FUNC_DRV_RGTR_REQ_ENABLES_VER 0x2UL @@ -3660,22 +3671,24 @@ struct hwrm_func_backing_store_cfg_v2_input { __le16 target_id; __le64 resp_addr; __le16 type; - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_QP 0x0UL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SRQ 0x1UL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_CQ 0x2UL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_VNIC 0x3UL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_STAT 0x4UL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SP_TQM_RING 0x5UL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_FP_TQM_RING 0x6UL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_MRAV 0xeUL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_TIM 0xfUL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_MP_TQM_RING 0x15UL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SQ_DB_SHADOW 0x16UL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_RQ_DB_SHADOW 0x17UL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SRQ_DB_SHADOW 0x18UL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_CQ_DB_SHADOW 0x19UL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_TBL_SCOPE 0x1cUL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_XID_PARTITION 0x1dUL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_QP 0x0UL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SRQ 0x1UL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_CQ 0x2UL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_VNIC 0x3UL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_STAT 0x4UL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SP_TQM_RING 0x5UL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_FP_TQM_RING 0x6UL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_MRAV 0xeUL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_TIM 0xfUL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_TX_CK 0x13UL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_RX_CK 0x14UL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_MP_TQM_RING 0x15UL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SQ_DB_SHADOW 0x16UL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_RQ_DB_SHADOW 0x17UL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SRQ_DB_SHADOW 0x18UL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_CQ_DB_SHADOW 0x19UL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_TBL_SCOPE 0x1cUL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_XID_PARTITION 0x1dUL #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SRT_TRACE 0x1eUL #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SRT2_TRACE 0x1fUL #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_CRT_TRACE 0x20UL @@ -3772,18 +3785,20 @@ struct hwrm_func_backing_store_qcfg_v2_output { __le16 seq_id; __le16 resp_len; __le16 type; - #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_QP 0x0UL - #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_SRQ 0x1UL - #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_CQ 0x2UL - #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_VNIC 0x3UL - #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_STAT 0x4UL - #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_SP_TQM_RING 0x5UL - #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_FP_TQM_RING 0x6UL - #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_MRAV 0xeUL - #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_TIM 0xfUL - #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_MP_TQM_RING 0x15UL - #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_TBL_SCOPE 0x1cUL - #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_XID_PARTITION 0x1dUL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_QP 0x0UL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_SRQ 0x1UL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_CQ 0x2UL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_VNIC 0x3UL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_STAT 0x4UL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_SP_TQM_RING 0x5UL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_FP_TQM_RING 0x6UL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_MRAV 0xeUL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_TIM 0xfUL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_TX_CK 0x13UL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_RX_CK 0x14UL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_MP_TQM_RING 0x15UL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_TBL_SCOPE 0x1cUL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_XID_PARTITION 0x1dUL #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_SRT_TRACE 0x1eUL #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_SRT2_TRACE 0x1fUL #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_CRT_TRACE 0x20UL @@ -3876,22 +3891,24 @@ struct hwrm_func_backing_store_qcaps_v2_input { __le16 target_id; __le64 resp_addr; __le16 type; - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_QP 0x0UL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SRQ 0x1UL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_CQ 0x2UL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_VNIC 0x3UL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_STAT 0x4UL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SP_TQM_RING 0x5UL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_FP_TQM_RING 0x6UL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_MRAV 0xeUL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_TIM 0xfUL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_MP_TQM_RING 0x15UL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SQ_DB_SHADOW 0x16UL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_RQ_DB_SHADOW 0x17UL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SRQ_DB_SHADOW 0x18UL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_CQ_DB_SHADOW 0x19UL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_TBL_SCOPE 0x1cUL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_XID_PARTITION 0x1dUL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_QP 0x0UL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SRQ 0x1UL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_CQ 0x2UL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_VNIC 0x3UL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_STAT 0x4UL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SP_TQM_RING 0x5UL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_FP_TQM_RING 0x6UL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_MRAV 0xeUL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_TIM 0xfUL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_TX_CK 0x13UL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_RX_CK 0x14UL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_MP_TQM_RING 0x15UL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SQ_DB_SHADOW 0x16UL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_RQ_DB_SHADOW 0x17UL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SRQ_DB_SHADOW 0x18UL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_CQ_DB_SHADOW 0x19UL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_TBL_SCOPE 0x1cUL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_XID_PARTITION 0x1dUL #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SRT_TRACE 0x1eUL #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SRT2_TRACE 0x1fUL #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_CRT_TRACE 0x20UL @@ -3911,22 +3928,24 @@ struct hwrm_func_backing_store_qcaps_v2_output { __le16 seq_id; __le16 resp_len; __le16 type; - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_QP 0x0UL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SRQ 0x1UL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_CQ 0x2UL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_VNIC 0x3UL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_STAT 0x4UL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SP_TQM_RING 0x5UL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_FP_TQM_RING 0x6UL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_MRAV 0xeUL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_TIM 0xfUL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_MP_TQM_RING 0x15UL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SQ_DB_SHADOW 0x16UL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_RQ_DB_SHADOW 0x17UL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SRQ_DB_SHADOW 0x18UL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_CQ_DB_SHADOW 0x19UL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_TBL_SCOPE 0x1cUL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_XID_PARTITION 0x1dUL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_QP 0x0UL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SRQ 0x1UL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_CQ 0x2UL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_VNIC 0x3UL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_STAT 0x4UL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SP_TQM_RING 0x5UL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_FP_TQM_RING 0x6UL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_MRAV 0xeUL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_TIM 0xfUL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_TX_CK 0x13UL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_RX_CK 0x14UL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_MP_TQM_RING 0x15UL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SQ_DB_SHADOW 0x16UL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_RQ_DB_SHADOW 0x17UL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SRQ_DB_SHADOW 0x18UL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_CQ_DB_SHADOW 0x19UL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_TBL_SCOPE 0x1cUL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_XID_PARTITION 0x1dUL #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SRT_TRACE 0x1eUL #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SRT2_TRACE 0x1fUL #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_CRT_TRACE 0x20UL @@ -4202,7 +4221,8 @@ struct hwrm_port_phy_cfg_input { #define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_100GB_PAM4_112 0x3eaUL #define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_200GB_PAM4_112 0x7d2UL #define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_400GB_PAM4_112 0xfa2UL - #define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_LAST PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_400GB_PAM4_112 + #define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_800GB_PAM4_112 0x1f42UL + #define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_LAST PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_800GB_PAM4_112 __le16 auto_link_speeds2_mask; #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_1GB 0x1UL #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_10GB 0x2UL @@ -4217,6 +4237,7 @@ struct hwrm_port_phy_cfg_input { #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_100GB_PAM4_112 0x400UL #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_200GB_PAM4_112 0x800UL #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_400GB_PAM4_112 0x1000UL + #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_800GB_PAM4_112 0x2000UL u8 unused_2[6]; }; @@ -4292,6 +4313,7 @@ struct hwrm_port_phy_qcfg_output { #define PORT_PHY_QCFG_RESP_LINK_SPEED_100GB 0x3e8UL #define PORT_PHY_QCFG_RESP_LINK_SPEED_200GB 0x7d0UL #define PORT_PHY_QCFG_RESP_LINK_SPEED_400GB 0xfa0UL + #define PORT_PHY_QCFG_RESP_LINK_SPEED_800GB 0x1f40UL #define PORT_PHY_QCFG_RESP_LINK_SPEED_10MB 0xffffUL #define PORT_PHY_QCFG_RESP_LINK_SPEED_LAST PORT_PHY_QCFG_RESP_LINK_SPEED_10MB u8 duplex_cfg; @@ -4451,7 +4473,13 @@ struct hwrm_port_phy_qcfg_output { #define PORT_PHY_QCFG_RESP_PHY_TYPE_400G_BASESR4 0x35UL #define PORT_PHY_QCFG_RESP_PHY_TYPE_400G_BASELR4 0x36UL #define PORT_PHY_QCFG_RESP_PHY_TYPE_400G_BASEER4 0x37UL - #define PORT_PHY_QCFG_RESP_PHY_TYPE_LAST PORT_PHY_QCFG_RESP_PHY_TYPE_400G_BASEER4 + #define PORT_PHY_QCFG_RESP_PHY_TYPE_800G_BASECR8 0x38UL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_800G_BASESR8 0x39UL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_800G_BASELR8 0x3aUL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_800G_BASEER8 0x3bUL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_800G_BASEFR8 0x3cUL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_800G_BASEDR8 0x3dUL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_LAST PORT_PHY_QCFG_RESP_PHY_TYPE_800G_BASEDR8 u8 media_type; #define PORT_PHY_QCFG_RESP_MEDIA_TYPE_UNKNOWN 0x0UL #define PORT_PHY_QCFG_RESP_MEDIA_TYPE_TP 0x1UL @@ -5049,33 +5077,43 @@ struct hwrm_port_qstats_ext_output { u8 valid; }; -/* hwrm_port_lpbk_qstats_input (size:128b/16B) */ +/* hwrm_port_lpbk_qstats_input (size:256b/32B) */ struct hwrm_port_lpbk_qstats_input { __le16 req_type; __le16 cmpl_ring; __le16 seq_id; __le16 target_id; __le64 resp_addr; + __le16 lpbk_stat_size; + u8 flags; + #define PORT_LPBK_QSTATS_REQ_FLAGS_COUNTER_MASK 0x1UL + u8 unused_0[5]; + __le64 lpbk_stat_host_addr; }; -/* hwrm_port_lpbk_qstats_output (size:768b/96B) */ +/* hwrm_port_lpbk_qstats_output (size:128b/16B) */ struct hwrm_port_lpbk_qstats_output { __le16 error_code; __le16 req_type; __le16 seq_id; __le16 resp_len; + __le16 lpbk_stat_size; + u8 unused_0[5]; + u8 valid; +}; + +/* port_lpbk_stats (size:640b/80B) */ +struct port_lpbk_stats { __le64 lpbk_ucast_frames; __le64 lpbk_mcast_frames; __le64 lpbk_bcast_frames; __le64 lpbk_ucast_bytes; __le64 lpbk_mcast_bytes; __le64 lpbk_bcast_bytes; - __le64 tx_stat_discard; - __le64 tx_stat_error; - __le64 rx_stat_discard; - __le64 rx_stat_error; - u8 unused_0[7]; - u8 valid; + __le64 lpbk_tx_discards; + __le64 lpbk_tx_errors; + __le64 lpbk_rx_discards; + __le64 lpbk_rx_errors; }; /* hwrm_port_ecn_qstats_input (size:256b/32B) */ @@ -5140,13 +5178,15 @@ struct hwrm_port_clr_stats_output { u8 valid; }; -/* hwrm_port_lpbk_clr_stats_input (size:128b/16B) */ +/* hwrm_port_lpbk_clr_stats_input (size:192b/24B) */ struct hwrm_port_lpbk_clr_stats_input { __le16 req_type; __le16 cmpl_ring; __le16 seq_id; __le16 target_id; __le64 resp_addr; + __le16 port_id; + u8 unused_0[6]; }; /* hwrm_port_lpbk_clr_stats_output (size:128b/16B) */ @@ -5287,10 +5327,11 @@ struct hwrm_port_phy_qcaps_output { #define PORT_PHY_QCAPS_RESP_SUPPORTED_PAM4_SPEEDS_FORCE_MODE_100G 0x2UL #define PORT_PHY_QCAPS_RESP_SUPPORTED_PAM4_SPEEDS_FORCE_MODE_200G 0x4UL __le16 flags2; - #define PORT_PHY_QCAPS_RESP_FLAGS2_PAUSE_UNSUPPORTED 0x1UL - #define PORT_PHY_QCAPS_RESP_FLAGS2_PFC_UNSUPPORTED 0x2UL - #define PORT_PHY_QCAPS_RESP_FLAGS2_BANK_ADDR_SUPPORTED 0x4UL - #define PORT_PHY_QCAPS_RESP_FLAGS2_SPEEDS2_SUPPORTED 0x8UL + #define PORT_PHY_QCAPS_RESP_FLAGS2_PAUSE_UNSUPPORTED 0x1UL + #define PORT_PHY_QCAPS_RESP_FLAGS2_PFC_UNSUPPORTED 0x2UL + #define PORT_PHY_QCAPS_RESP_FLAGS2_BANK_ADDR_SUPPORTED 0x4UL + #define PORT_PHY_QCAPS_RESP_FLAGS2_SPEEDS2_SUPPORTED 0x8UL + #define PORT_PHY_QCAPS_RESP_FLAGS2_REMOTE_LPBK_UNSUPPORTED 0x10UL u8 internal_port_cnt; u8 unused_0; __le16 supported_speeds2_force_mode; @@ -7443,17 +7484,17 @@ struct hwrm_cfa_l2_filter_cfg_input { __le16 target_id; __le64 resp_addr; __le32 flags; - #define CFA_L2_FILTER_CFG_REQ_FLAGS_PATH 0x1UL - #define CFA_L2_FILTER_CFG_REQ_FLAGS_PATH_TX 0x0UL - #define CFA_L2_FILTER_CFG_REQ_FLAGS_PATH_RX 0x1UL - #define CFA_L2_FILTER_CFG_REQ_FLAGS_PATH_LAST CFA_L2_FILTER_CFG_REQ_FLAGS_PATH_RX - #define CFA_L2_FILTER_CFG_REQ_FLAGS_DROP 0x2UL - #define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_MASK 0xcUL - #define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_SFT 2 - #define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_NO_ROCE_L2 (0x0UL << 2) - #define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_L2 (0x1UL << 2) - #define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_ROCE (0x2UL << 2) - #define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_LAST CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_ROCE + #define CFA_L2_FILTER_CFG_REQ_FLAGS_PATH 0x1UL + #define CFA_L2_FILTER_CFG_REQ_FLAGS_PATH_TX 0x0UL + #define CFA_L2_FILTER_CFG_REQ_FLAGS_PATH_RX 0x1UL + #define CFA_L2_FILTER_CFG_REQ_FLAGS_PATH_LAST CFA_L2_FILTER_CFG_REQ_FLAGS_PATH_RX + #define CFA_L2_FILTER_CFG_REQ_FLAGS_DROP 0x2UL + #define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_MASK 0xcUL + #define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_SFT 2 + #define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_NO_ROCE_L2 (0x0UL << 2) + #define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_L2 (0x1UL << 2) + #define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_ROCE (0x2UL << 2) + #define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_LAST CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_ROCE #define CFA_L2_FILTER_CFG_REQ_FLAGS_REMAP_OP_MASK 0x30UL #define CFA_L2_FILTER_CFG_REQ_FLAGS_REMAP_OP_SFT 4 #define CFA_L2_FILTER_CFG_REQ_FLAGS_REMAP_OP_NO_UPDATE (0x0UL << 4) @@ -8520,17 +8561,17 @@ struct hwrm_tunnel_dst_port_query_input { __le16 target_id; __le64 resp_addr; u8 tunnel_type; - #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_VXLAN 0x1UL - #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_GENEVE 0x5UL - #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_VXLAN_V4 0x9UL - #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_IPGRE_V1 0xaUL - #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_L2_ETYPE 0xbUL - #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_VXLAN_GPE_V6 0xcUL - #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_CUSTOM_GRE 0xdUL - #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ECPRI 0xeUL - #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_SRV6 0xfUL - #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_VXLAN_GPE 0x10UL - #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_GRE 0x11UL + #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_VXLAN 0x1UL + #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_GENEVE 0x5UL + #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_VXLAN_V4 0x9UL + #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_IPGRE_V1 0xaUL + #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_L2_ETYPE 0xbUL + #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_VXLAN_GPE_V6 0xcUL + #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_CUSTOM_GRE 0xdUL + #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ECPRI 0xeUL + #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_SRV6 0xfUL + #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_VXLAN_GPE 0x10UL + #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_GRE 0x11UL #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ULP_DYN_UPAR 0x12UL #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES01 0x13UL #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES02 0x14UL @@ -8576,17 +8617,17 @@ struct hwrm_tunnel_dst_port_alloc_input { __le16 target_id; __le64 resp_addr; u8 tunnel_type; - #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN 0x1UL - #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_GENEVE 0x5UL - #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN_V4 0x9UL - #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_IPGRE_V1 0xaUL - #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_L2_ETYPE 0xbUL - #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE_V6 0xcUL - #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_CUSTOM_GRE 0xdUL - #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ECPRI 0xeUL - #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_SRV6 0xfUL - #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE 0x10UL - #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_GRE 0x11UL + #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN 0x1UL + #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_GENEVE 0x5UL + #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN_V4 0x9UL + #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_IPGRE_V1 0xaUL + #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_L2_ETYPE 0xbUL + #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE_V6 0xcUL + #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_CUSTOM_GRE 0xdUL + #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ECPRI 0xeUL + #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_SRV6 0xfUL + #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE 0x10UL + #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_GRE 0x11UL #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ULP_DYN_UPAR 0x12UL #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES01 0x13UL #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES02 0x14UL @@ -8635,17 +8676,17 @@ struct hwrm_tunnel_dst_port_free_input { __le16 target_id; __le64 resp_addr; u8 tunnel_type; - #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN 0x1UL - #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_GENEVE 0x5UL - #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN_V4 0x9UL - #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_IPGRE_V1 0xaUL - #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_L2_ETYPE 0xbUL - #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN_GPE_V6 0xcUL - #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_CUSTOM_GRE 0xdUL - #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ECPRI 0xeUL - #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_SRV6 0xfUL - #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN_GPE 0x10UL - #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_GRE 0x11UL + #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN 0x1UL + #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_GENEVE 0x5UL + #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN_V4 0x9UL + #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_IPGRE_V1 0xaUL + #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_L2_ETYPE 0xbUL + #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN_GPE_V6 0xcUL + #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_CUSTOM_GRE 0xdUL + #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ECPRI 0xeUL + #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_SRV6 0xfUL + #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN_GPE 0x10UL + #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_GRE 0x11UL #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ULP_DYN_UPAR 0x12UL #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES01 0x13UL #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES02 0x14UL @@ -9109,6 +9150,7 @@ struct hwrm_struct_hdr { #define STRUCT_HDR_STRUCT_ID_LLDP_GENERIC 0x424UL #define STRUCT_HDR_STRUCT_ID_LLDP_DEVICE 0x426UL #define STRUCT_HDR_STRUCT_ID_POWER_BKUP 0x427UL + #define STRUCT_HDR_STRUCT_ID_PEER_MMAP 0x429UL #define STRUCT_HDR_STRUCT_ID_AFM_OPAQUE 0x1UL #define STRUCT_HDR_STRUCT_ID_PORT_DESCRIPTION 0xaUL #define STRUCT_HDR_STRUCT_ID_RSS_V2 0x64UL @@ -9758,6 +9800,9 @@ struct hwrm_dbg_coredump_initiate_input { __le16 instance; __le16 unused_0; u8 seg_flags; + #define DBG_COREDUMP_INITIATE_REQ_SEG_FLAGS_LIVE_DATA 0x1UL + #define DBG_COREDUMP_INITIATE_REQ_SEG_FLAGS_CRASH_DATA 0x2UL + #define DBG_COREDUMP_INITIATE_REQ_SEG_FLAGS_COLLECT_CTX_L1_CACHE 0x4UL u8 unused_1[7]; }; @@ -10433,13 +10478,13 @@ struct hwrm_selftest_irq_output { /* dbc_dbc (size:64b/8B) */ struct dbc_dbc { - u32 index; + __le32 index; #define DBC_DBC_INDEX_MASK 0xffffffUL #define DBC_DBC_INDEX_SFT 0 #define DBC_DBC_EPOCH 0x1000000UL #define DBC_DBC_TOGGLE_MASK 0x6000000UL #define DBC_DBC_TOGGLE_SFT 25 - u32 type_path_xid; + __le32 type_path_xid; #define DBC_DBC_XID_MASK 0xfffffUL #define DBC_DBC_XID_SFT 0 #define DBC_DBC_PATH_MASK 0x3000000UL From b7bfcb4c7ce44fd0070ce8bccbc91c56341f05c1 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 18 Jun 2024 14:53:12 -0700 Subject: [PATCH 1212/1578] bnxt_en: Set TSO max segs on devices with limits Firmware will now advertise a non-zero TSO max segments if the device has a limit. 0 means no limit. The latest 5760X chip (early revs) has a limit of 2047 that cannot be exceeded. If exceeded, the chip will send out just a small number of segments. Call netif_set_tso_max_segs() if the device has a limit. Fixes: 2012a6abc876 ("bnxt_en: Add 5760X (P7) PCI IDs") Reviewed-by: Ajit Khaparde Reviewed-by: Somnath Kotur Signed-off-by: Michael Chan Link: https://lore.kernel.org/r/20240618215313.29631-3-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 3 +++ drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 + 2 files changed, 4 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index c437ca1c0fd399..89d29d6d75175b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -8996,6 +8996,7 @@ static int __bnxt_hwrm_func_qcaps(struct bnxt *bp) memcpy(vf->mac_addr, resp->mac_address, ETH_ALEN); #endif } + bp->tso_max_segs = le16_to_cpu(resp->max_tso_segs); hwrm_func_qcaps_exit: hwrm_req_drop(bp, req); @@ -15363,6 +15364,8 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) dev->priv_flags |= IFF_UNICAST_FLT; netif_set_tso_max_size(dev, GSO_MAX_SIZE); + if (bp->tso_max_segs) + netif_set_tso_max_segs(dev, bp->tso_max_segs); dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | NETDEV_XDP_ACT_RX_SG; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index bbc7edccd5a4d0..9cf0acfa04e574 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -2318,6 +2318,7 @@ struct bnxt { u8 rss_hash_key_updated:1; u16 max_mtu; + u16 tso_max_segs; u8 max_tc; u8 max_lltc; /* lossless TCs */ struct bnxt_queue_info q_info[BNXT_MAX_QUEUE]; From 1e7962114c10957fe4d10a15eb714578a394e90b Mon Sep 17 00:00:00 2001 From: Pavan Chebbi Date: Tue, 18 Jun 2024 14:53:13 -0700 Subject: [PATCH 1213/1578] bnxt_en: Restore PTP tx_avail count in case of skb_pad() error The current code only restores PTP tx_avail count when we get DMA mapping errors. Fix it so that the PTP tx_avail count will be restored for both DMA mapping errors and skb_pad() errors. Otherwise PTP TX timestamp will not be available after a PTP packet hits the skb_pad() error. Fixes: 83bb623c968e ("bnxt_en: Transmit and retrieve packet timestamps") Reviewed-by: Andy Gospodarek Signed-off-by: Pavan Chebbi Signed-off-by: Michael Chan Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240618215313.29631-4-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 89d29d6d75175b..a6d69a45fa0145 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -732,9 +732,6 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; tx_dma_error: - if (BNXT_TX_PTP_IS_SET(lflags)) - atomic_inc(&bp->ptp_cfg->tx_avail); - last_frag = i; /* start back at beginning and unmap skb */ @@ -756,6 +753,8 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) tx_free: dev_kfree_skb_any(skb); tx_kick_pending: + if (BNXT_TX_PTP_IS_SET(lflags)) + atomic_inc(&bp->ptp_cfg->tx_avail); if (txr->kick_pending) bnxt_txr_db_kick(bp, txr, txr->tx_prod); txr->tx_buf_ring[txr->tx_prod].skb = NULL; From 48dea8f7bb011608fd969749a1980f8311ef45f2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 19 Jun 2024 08:17:48 +0200 Subject: [PATCH 1214/1578] selftests: virtio_net: add forgotten config options One may use tools/testing/selftests/drivers/net/virtio_net/config for example for vng build command like this one: $ vng -v -b -f tools/testing/selftests/drivers/net/virtio_net/config In that case, the needed kernel config options are not turned on. Add the missed kernel config options. Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/netdev/20240617072614.75fe79e7@kernel.org/ Reported-by: Matthieu Baerts Closes: https://lore.kernel.org/netdev/1a63f209-b1d4-4809-bc30-295a5cafa296@kernel.org/ Fixes: ccfaed04db5e ("selftests: virtio_net: add initial tests") Signed-off-by: Jiri Pirko Reviewed-by: Xuan Zhuo Acked-by: Michael S. Tsirkin Link: https://lore.kernel.org/r/20240619061748.1869404-1-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/virtio_net/config | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/drivers/net/virtio_net/config b/tools/testing/selftests/drivers/net/virtio_net/config index f35de0542b6089..bcf7555eaffead 100644 --- a/tools/testing/selftests/drivers/net/virtio_net/config +++ b/tools/testing/selftests/drivers/net/virtio_net/config @@ -1,2 +1,8 @@ -CONFIG_VIRTIO_NET=y +CONFIG_BPF_SYSCALL=y +CONFIG_CGROUP_BPF=y +CONFIG_IPV6=y +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_NET_L3_MASTER_DEV=y +CONFIG_NET_VRF=m CONFIG_VIRTIO_DEBUG=y +CONFIG_VIRTIO_NET=y From fba383985354e83474f95f36d7c65feb75dba19d Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Wed, 19 Jun 2024 15:28:03 +0200 Subject: [PATCH 1215/1578] net: usb: rtl8150 fix unintiatilzed variables in rtl8150_get_link_ksettings This functions retrieves values by passing a pointer. As the function that retrieves them can fail before touching the pointers, the variables must be initialized. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+5186630949e3c55f0799@syzkaller.appspotmail.com Signed-off-by: Oliver Neukum Link: https://lore.kernel.org/r/20240619132816.11526-1-oneukum@suse.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/rtl8150.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/rtl8150.c b/drivers/net/usb/rtl8150.c index 97afd7335d8685..01a3b2417a5401 100644 --- a/drivers/net/usb/rtl8150.c +++ b/drivers/net/usb/rtl8150.c @@ -778,7 +778,8 @@ static int rtl8150_get_link_ksettings(struct net_device *netdev, struct ethtool_link_ksettings *ecmd) { rtl8150_t *dev = netdev_priv(netdev); - short lpa, bmcr; + short lpa = 0; + short bmcr = 0; u32 supported; supported = (SUPPORTED_10baseT_Half | From fbd64f902b93fe9658b855b9892ae59ef6ea22b9 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 14 Jun 2024 11:00:49 +0300 Subject: [PATCH 1216/1578] mmc: sdhci: Do not invert write-protect twice mmc_of_parse() reads device property "wp-inverted" and sets MMC_CAP2_RO_ACTIVE_HIGH if it is true. MMC_CAP2_RO_ACTIVE_HIGH is used to invert a write-protect (AKA read-only) GPIO value. sdhci_get_property() also reads "wp-inverted" and sets SDHCI_QUIRK_INVERTED_WRITE_PROTECT which is used to invert the write-protect value as well but also acts upon a value read out from the SDHCI_PRESENT_STATE register. Many drivers call both mmc_of_parse() and sdhci_get_property(), so that both MMC_CAP2_RO_ACTIVE_HIGH and SDHCI_QUIRK_INVERTED_WRITE_PROTECT will be set if the controller has device property "wp-inverted". Amend the logic in sdhci_check_ro() to allow for that possibility, so that the write-protect value is not inverted twice. Also do not invert the value if it is a negative error value. Note that callers treat an error the same as not-write-protected, so the result is functionally the same in that case. Also do not invert the value if sdhci host operation ->get_ro() is used. None of the users of that callback set SDHCI_QUIRK_INVERTED_WRITE_PROTECT directly or indirectly, but two do call mmc_gpio_get_ro(), so leave it to them to deal with that if they ever set SDHCI_QUIRK_INVERTED_WRITE_PROTECT in the future. Fixes: 6d5cd068ee59 ("mmc: sdhci: use WP GPIO in sdhci_check_ro()") Signed-off-by: Adrian Hunter Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240614080051.4005-2-adrian.hunter@intel.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 746f4cf7ab0338..81b81d7bb3d81b 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -2515,26 +2515,34 @@ EXPORT_SYMBOL_GPL(sdhci_get_cd_nogpio); static int sdhci_check_ro(struct sdhci_host *host) { + bool allow_invert = false; unsigned long flags; int is_readonly; spin_lock_irqsave(&host->lock, flags); - if (host->flags & SDHCI_DEVICE_DEAD) + if (host->flags & SDHCI_DEVICE_DEAD) { is_readonly = 0; - else if (host->ops->get_ro) + } else if (host->ops->get_ro) { is_readonly = host->ops->get_ro(host); - else if (mmc_can_gpio_ro(host->mmc)) + } else if (mmc_can_gpio_ro(host->mmc)) { is_readonly = mmc_gpio_get_ro(host->mmc); - else + /* Do not invert twice */ + allow_invert = !(host->mmc->caps2 & MMC_CAP2_RO_ACTIVE_HIGH); + } else { is_readonly = !(sdhci_readl(host, SDHCI_PRESENT_STATE) & SDHCI_WRITE_PROTECT); + allow_invert = true; + } spin_unlock_irqrestore(&host->lock, flags); - /* This quirk needs to be replaced by a callback-function later */ - return host->quirks & SDHCI_QUIRK_INVERTED_WRITE_PROTECT ? - !is_readonly : is_readonly; + if (is_readonly >= 0 && + allow_invert && + (host->quirks & SDHCI_QUIRK_INVERTED_WRITE_PROTECT)) + is_readonly = !is_readonly; + + return is_readonly; } #define SAMPLE_COUNT 5 From ab069ce125965a5e282f7b53b86aee76ab32975c Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 14 Jun 2024 11:00:50 +0300 Subject: [PATCH 1217/1578] mmc: sdhci: Do not lock spinlock around mmc_gpio_get_ro() sdhci_check_ro() can call mmc_gpio_get_ro() while holding the sdhci host->lock spinlock. That would be a problem if the GPIO access done by mmc_gpio_get_ro() needed to sleep. However, host->lock is not needed anyway. The mmc core ensures that host operations do not race with each other, and asynchronous callbacks like the interrupt handler, software timeouts, completion work etc, cannot affect sdhci_check_ro(). So remove the locking. Fixes: 6d5cd068ee59 ("mmc: sdhci: use WP GPIO in sdhci_check_ro()") Signed-off-by: Adrian Hunter Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240614080051.4005-3-adrian.hunter@intel.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 81b81d7bb3d81b..112584aa077231 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -2516,11 +2516,8 @@ EXPORT_SYMBOL_GPL(sdhci_get_cd_nogpio); static int sdhci_check_ro(struct sdhci_host *host) { bool allow_invert = false; - unsigned long flags; int is_readonly; - spin_lock_irqsave(&host->lock, flags); - if (host->flags & SDHCI_DEVICE_DEAD) { is_readonly = 0; } else if (host->ops->get_ro) { @@ -2535,8 +2532,6 @@ static int sdhci_check_ro(struct sdhci_host *host) allow_invert = true; } - spin_unlock_irqrestore(&host->lock, flags); - if (is_readonly >= 0 && allow_invert && (host->quirks & SDHCI_QUIRK_INVERTED_WRITE_PROTECT)) From 90f3feb24172185f1832636264943e8b5e289245 Mon Sep 17 00:00:00 2001 From: Elinor Montmasson Date: Thu, 20 Jun 2024 15:25:03 +0200 Subject: [PATCH 1218/1578] ASoC: fsl-asoc-card: set priv->pdev before using it priv->pdev pointer was set after being used in fsl_asoc_card_audmux_init(). Move this assignment at the start of the probe function, so sub-functions can correctly use pdev through priv. fsl_asoc_card_audmux_init() dereferences priv->pdev to get access to the dev struct, used with dev_err macros. As priv is zero-initialised, there would be a NULL pointer dereference. Note that if priv->dev is dereferenced before assignment but never used, for example if there is no error to be printed, the driver won't crash probably due to compiler optimisations. Fixes: 708b4351f08c ("ASoC: fsl: Add Freescale Generic ASoC Sound Card with ASRC support") Signed-off-by: Elinor Montmasson Link: https://patch.msgid.link/20240620132511.4291-2-elinor.montmasson@savoirfairelinux.com Signed-off-by: Mark Brown --- sound/soc/fsl/fsl-asoc-card.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/soc/fsl/fsl-asoc-card.c b/sound/soc/fsl/fsl-asoc-card.c index 5ddc0c2fe53ff4..eb67689dcd6e62 100644 --- a/sound/soc/fsl/fsl-asoc-card.c +++ b/sound/soc/fsl/fsl-asoc-card.c @@ -559,6 +559,8 @@ static int fsl_asoc_card_probe(struct platform_device *pdev) if (!priv) return -ENOMEM; + priv->pdev = pdev; + cpu_np = of_parse_phandle(np, "audio-cpu", 0); /* Give a chance to old DT binding */ if (!cpu_np) @@ -787,7 +789,6 @@ static int fsl_asoc_card_probe(struct platform_device *pdev) } /* Initialize sound card */ - priv->pdev = pdev; priv->card.dev = &pdev->dev; priv->card.owner = THIS_MODULE; ret = snd_soc_of_parse_card_name(&priv->card, "model"); From 7c7b1be19b228b450c2945ec379d7fc6bfef9852 Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Wed, 19 Jun 2024 14:27:02 +0200 Subject: [PATCH 1219/1578] Input: ads7846 - use spi_device_id table As the driver supports more devices over time the single MODULE_ALIAS is complete and raises several warnings: SPI driver ads7846 has no spi_device_id for ti,tsc2046 SPI driver ads7846 has no spi_device_id for ti,ads7843 SPI driver ads7846 has no spi_device_id for ti,ads7845 SPI driver ads7846 has no spi_device_id for ti,ads7873 Fix this by adding a spi_device_id table and removing the manual MODULE_ALIAS. Signed-off-by: Alexander Stein Link: https://lore.kernel.org/r/20240619122703.2081476-1-alexander.stein@ew.tq-group.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/ads7846.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/input/touchscreen/ads7846.c b/drivers/input/touchscreen/ads7846.c index d2bbb436a77df9..4d13db13b9e578 100644 --- a/drivers/input/touchscreen/ads7846.c +++ b/drivers/input/touchscreen/ads7846.c @@ -1111,6 +1111,16 @@ static const struct of_device_id ads7846_dt_ids[] = { }; MODULE_DEVICE_TABLE(of, ads7846_dt_ids); +static const struct spi_device_id ads7846_spi_ids[] = { + { "tsc2046", 7846 }, + { "ads7843", 7843 }, + { "ads7845", 7845 }, + { "ads7846", 7846 }, + { "ads7873", 7873 }, + { }, +}; +MODULE_DEVICE_TABLE(spi, ads7846_spi_ids); + static const struct ads7846_platform_data *ads7846_get_props(struct device *dev) { struct ads7846_platform_data *pdata; @@ -1386,10 +1396,10 @@ static struct spi_driver ads7846_driver = { }, .probe = ads7846_probe, .remove = ads7846_remove, + .id_table = ads7846_spi_ids, }; module_spi_driver(ads7846_driver); MODULE_DESCRIPTION("ADS7846 TouchScreen Driver"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("spi:ads7846"); From 5a5696a11f7e0c5ce3185b6234d02996f8267108 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Thu, 20 Jun 2024 09:42:18 -0700 Subject: [PATCH 1220/1578] nvme-apple: add missing MODULE_DESCRIPTION() make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/nvme/host/nvme-apple.o Add the missing invocation of the MODULE_DESCRIPTION() macro. Reviewed-by: Eric Curtin Reviewed-by: Sven Peter Reviewed-by: Sagi Grimberg Signed-off-by: Jeff Johnson Signed-off-by: Keith Busch --- drivers/nvme/host/apple.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index dd6ec0865141a9..0cfa39361d3b60 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -1602,4 +1602,5 @@ static struct platform_driver apple_nvme_driver = { module_platform_driver(apple_nvme_driver); MODULE_AUTHOR("Sven Peter "); +MODULE_DESCRIPTION("Apple ANS NVM Express device driver"); MODULE_LICENSE("GPL"); From fd80731e5e9d1402cb2f85022a6abf9b1982ec5f Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Thu, 20 Jun 2024 11:37:39 +0200 Subject: [PATCH 1221/1578] usb: gadget: printer: SS+ support We need to treat super speed plus as super speed, not the default, which is full speed. Signed-off-by: Oliver Neukum Cc: stable Link: https://lore.kernel.org/r/20240620093800.28901-1-oneukum@suse.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_printer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/gadget/function/f_printer.c b/drivers/usb/gadget/function/f_printer.c index ba7d180cc9e6da..4c0b7c2970f147 100644 --- a/drivers/usb/gadget/function/f_printer.c +++ b/drivers/usb/gadget/function/f_printer.c @@ -213,6 +213,7 @@ static inline struct usb_endpoint_descriptor *ep_desc(struct usb_gadget *gadget, struct usb_endpoint_descriptor *ss) { switch (gadget->speed) { + case USB_SPEED_SUPER_PLUS: case USB_SPEED_SUPER: return ss; case USB_SPEED_HIGH: From e587a7633dfee8987a999cf253f7c52a8e09276c Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Thu, 20 Jun 2024 13:40:26 +0200 Subject: [PATCH 1222/1578] usb: gadget: printer: fix races against disable printer_read() and printer_write() guard against the race against disable() by checking the dev->interface flag, which in turn is guarded by a spinlock. These functions, however, drop the lock on multiple occasions. This means that the test has to be redone after reacquiring the lock and before doing IO. Add the tests. This also addresses CVE-2024-25741 Fixes: 7f2ca14d2f9b9 ("usb: gadget: function: printer: Interface is disabled and returns error") Cc: stable Signed-off-by: Oliver Neukum Link: https://lore.kernel.org/r/20240620114039.5767-1-oneukum@suse.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_printer.c | 39 ++++++++++++++++++------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/drivers/usb/gadget/function/f_printer.c b/drivers/usb/gadget/function/f_printer.c index 4c0b7c2970f147..44e20c6c36d32b 100644 --- a/drivers/usb/gadget/function/f_printer.c +++ b/drivers/usb/gadget/function/f_printer.c @@ -450,11 +450,8 @@ printer_read(struct file *fd, char __user *buf, size_t len, loff_t *ptr) mutex_lock(&dev->lock_printer_io); spin_lock_irqsave(&dev->lock, flags); - if (dev->interface < 0) { - spin_unlock_irqrestore(&dev->lock, flags); - mutex_unlock(&dev->lock_printer_io); - return -ENODEV; - } + if (dev->interface < 0) + goto out_disabled; /* We will use this flag later to check if a printer reset happened * after we turn interrupts back on. @@ -462,6 +459,9 @@ printer_read(struct file *fd, char __user *buf, size_t len, loff_t *ptr) dev->reset_printer = 0; setup_rx_reqs(dev); + /* this dropped the lock - need to retest */ + if (dev->interface < 0) + goto out_disabled; bytes_copied = 0; current_rx_req = dev->current_rx_req; @@ -495,6 +495,8 @@ printer_read(struct file *fd, char __user *buf, size_t len, loff_t *ptr) wait_event_interruptible(dev->rx_wait, (likely(!list_empty(&dev->rx_buffers)))); spin_lock_irqsave(&dev->lock, flags); + if (dev->interface < 0) + goto out_disabled; } /* We have data to return then copy it to the caller's buffer.*/ @@ -538,6 +540,9 @@ printer_read(struct file *fd, char __user *buf, size_t len, loff_t *ptr) return -EAGAIN; } + if (dev->interface < 0) + goto out_disabled; + /* If we not returning all the data left in this RX request * buffer then adjust the amount of data left in the buffer. * Othewise if we are done with this RX request buffer then @@ -567,6 +572,11 @@ printer_read(struct file *fd, char __user *buf, size_t len, loff_t *ptr) return bytes_copied; else return -EAGAIN; + +out_disabled: + spin_unlock_irqrestore(&dev->lock, flags); + mutex_unlock(&dev->lock_printer_io); + return -ENODEV; } static ssize_t @@ -587,11 +597,8 @@ printer_write(struct file *fd, const char __user *buf, size_t len, loff_t *ptr) mutex_lock(&dev->lock_printer_io); spin_lock_irqsave(&dev->lock, flags); - if (dev->interface < 0) { - spin_unlock_irqrestore(&dev->lock, flags); - mutex_unlock(&dev->lock_printer_io); - return -ENODEV; - } + if (dev->interface < 0) + goto out_disabled; /* Check if a printer reset happens while we have interrupts on */ dev->reset_printer = 0; @@ -614,6 +621,8 @@ printer_write(struct file *fd, const char __user *buf, size_t len, loff_t *ptr) wait_event_interruptible(dev->tx_wait, (likely(!list_empty(&dev->tx_reqs)))); spin_lock_irqsave(&dev->lock, flags); + if (dev->interface < 0) + goto out_disabled; } while (likely(!list_empty(&dev->tx_reqs)) && len) { @@ -663,6 +672,9 @@ printer_write(struct file *fd, const char __user *buf, size_t len, loff_t *ptr) return -EAGAIN; } + if (dev->interface < 0) + goto out_disabled; + list_add(&req->list, &dev->tx_reqs_active); /* here, we unlock, and only unlock, to avoid deadlock. */ @@ -675,6 +687,8 @@ printer_write(struct file *fd, const char __user *buf, size_t len, loff_t *ptr) mutex_unlock(&dev->lock_printer_io); return -EAGAIN; } + if (dev->interface < 0) + goto out_disabled; } spin_unlock_irqrestore(&dev->lock, flags); @@ -686,6 +700,11 @@ printer_write(struct file *fd, const char __user *buf, size_t len, loff_t *ptr) return bytes_copied; else return -EAGAIN; + +out_disabled: + spin_unlock_irqrestore(&dev->lock, flags); + mutex_unlock(&dev->lock_printer_io); + return -ENODEV; } static int From 2eabb655a968b862bc0c31629a09f0fbf3c80d51 Mon Sep 17 00:00:00 2001 From: Nikita Zhandarovich Date: Sun, 9 Jun 2024 06:15:46 -0700 Subject: [PATCH 1223/1578] usb: atm: cxacru: fix endpoint checking in cxacru_bind() Syzbot is still reporting quite an old issue [1] that occurs due to incomplete checking of present usb endpoints. As such, wrong endpoints types may be used at urb sumbitting stage which in turn triggers a warning in usb_submit_urb(). Fix the issue by verifying that required endpoint types are present for both in and out endpoints, taking into account cmd endpoint type. Unfortunately, this patch has not been tested on real hardware. [1] Syzbot report: usb 1-1: BOGUS urb xfer, pipe 1 != type 3 WARNING: CPU: 0 PID: 8667 at drivers/usb/core/urb.c:502 usb_submit_urb+0xed2/0x18a0 drivers/usb/core/urb.c:502 Modules linked in: CPU: 0 PID: 8667 Comm: kworker/0:4 Not tainted 5.14.0-rc4-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: usb_hub_wq hub_event RIP: 0010:usb_submit_urb+0xed2/0x18a0 drivers/usb/core/urb.c:502 ... Call Trace: cxacru_cm+0x3c0/0x8e0 drivers/usb/atm/cxacru.c:649 cxacru_card_status+0x22/0xd0 drivers/usb/atm/cxacru.c:760 cxacru_bind+0x7ac/0x11a0 drivers/usb/atm/cxacru.c:1209 usbatm_usb_probe+0x321/0x1ae0 drivers/usb/atm/usbatm.c:1055 cxacru_usb_probe+0xdf/0x1e0 drivers/usb/atm/cxacru.c:1363 usb_probe_interface+0x315/0x7f0 drivers/usb/core/driver.c:396 call_driver_probe drivers/base/dd.c:517 [inline] really_probe+0x23c/0xcd0 drivers/base/dd.c:595 __driver_probe_device+0x338/0x4d0 drivers/base/dd.c:747 driver_probe_device+0x4c/0x1a0 drivers/base/dd.c:777 __device_attach_driver+0x20b/0x2f0 drivers/base/dd.c:894 bus_for_each_drv+0x15f/0x1e0 drivers/base/bus.c:427 __device_attach+0x228/0x4a0 drivers/base/dd.c:965 bus_probe_device+0x1e4/0x290 drivers/base/bus.c:487 device_add+0xc2f/0x2180 drivers/base/core.c:3354 usb_set_configuration+0x113a/0x1910 drivers/usb/core/message.c:2170 usb_generic_driver_probe+0xba/0x100 drivers/usb/core/generic.c:238 usb_probe_device+0xd9/0x2c0 drivers/usb/core/driver.c:293 Reported-and-tested-by: syzbot+00c18ee8497dd3be6ade@syzkaller.appspotmail.com Fixes: 902ffc3c707c ("USB: cxacru: Use a bulk/int URB to access the command endpoint") Cc: stable Signed-off-by: Nikita Zhandarovich Link: https://lore.kernel.org/r/20240609131546.3932-1-n.zhandarovich@fintech.ru Signed-off-by: Greg Kroah-Hartman --- drivers/usb/atm/cxacru.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/usb/atm/cxacru.c b/drivers/usb/atm/cxacru.c index 4ce7cba2b48aa3..8f3b9a0a38e1dd 100644 --- a/drivers/usb/atm/cxacru.c +++ b/drivers/usb/atm/cxacru.c @@ -1131,6 +1131,7 @@ static int cxacru_bind(struct usbatm_data *usbatm_instance, struct cxacru_data *instance; struct usb_device *usb_dev = interface_to_usbdev(intf); struct usb_host_endpoint *cmd_ep = usb_dev->ep_in[CXACRU_EP_CMD]; + struct usb_endpoint_descriptor *in, *out; int ret; /* instance init */ @@ -1177,6 +1178,19 @@ static int cxacru_bind(struct usbatm_data *usbatm_instance, goto fail; } + if (usb_endpoint_xfer_int(&cmd_ep->desc)) + ret = usb_find_common_endpoints(intf->cur_altsetting, + NULL, NULL, &in, &out); + else + ret = usb_find_common_endpoints(intf->cur_altsetting, + &in, &out, NULL, NULL); + + if (ret) { + usb_err(usbatm_instance, "cxacru_bind: interface has incorrect endpoints\n"); + ret = -ENODEV; + goto fail; + } + if ((cmd_ep->desc.bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == USB_ENDPOINT_XFER_INT) { usb_fill_int_urb(instance->rcv_urb, From 8e1ec117efdfd4b2f59f57bd0ad16b4edf5b963f Mon Sep 17 00:00:00 2001 From: Fabrice Gasnier Date: Wed, 12 Jun 2024 14:46:56 +0200 Subject: [PATCH 1224/1578] usb: ucsi: stm32: fix command completion handling Sometimes errors are seen, when doing DR swap, like: [ 24.672481] ucsi-stm32g0-i2c 0-0035: UCSI_GET_PDOS failed (-5) [ 24.720188] ucsi-stm32g0-i2c 0-0035: ucsi_handle_connector_change: GET_CONNECTOR_STATUS failed (-5) There may be some race, which lead to read CCI, before the command complete flag is set, hence returning -EIO. Similar fix has been done also in ucsi_acpi [1]. In case of a spurious or otherwise delayed notification it is possible that CCI still reports the previous completion. The UCSI spec is aware of this and provides two completion bits in CCI, one for normal commands and one for acks. As acks and commands alternate the notification handler can determine if the completion bit is from the current command. To fix this add the ACK_PENDING bit for ucsi_stm32g0 and only complete commands if the completion bit matches. [1] https://lore.kernel.org/lkml/20240121204123.275441-3-lk@c--e.de/ Fixes: 72849d4fcee7 ("usb: typec: ucsi: stm32g0: add support for stm32g0 controller") Signed-off-by: Fabrice Gasnier Link: https://lore.kernel.org/stable/20240612124656.2305603-1-fabrice.gasnier%40foss.st.com Cc: stable Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240612124656.2305603-1-fabrice.gasnier@foss.st.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/ucsi/ucsi_stm32g0.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/usb/typec/ucsi/ucsi_stm32g0.c b/drivers/usb/typec/ucsi/ucsi_stm32g0.c index ac48b776311442..ac69288e8bb08f 100644 --- a/drivers/usb/typec/ucsi/ucsi_stm32g0.c +++ b/drivers/usb/typec/ucsi/ucsi_stm32g0.c @@ -65,6 +65,7 @@ struct ucsi_stm32g0 { struct device *dev; unsigned long flags; #define COMMAND_PENDING 1 +#define ACK_PENDING 2 const char *fw_name; struct ucsi *ucsi; bool suspended; @@ -396,9 +397,13 @@ static int ucsi_stm32g0_sync_write(struct ucsi *ucsi, unsigned int offset, const size_t len) { struct ucsi_stm32g0 *g0 = ucsi_get_drvdata(ucsi); + bool ack = UCSI_COMMAND(*(u64 *)val) == UCSI_ACK_CC_CI; int ret; - set_bit(COMMAND_PENDING, &g0->flags); + if (ack) + set_bit(ACK_PENDING, &g0->flags); + else + set_bit(COMMAND_PENDING, &g0->flags); ret = ucsi_stm32g0_async_write(ucsi, offset, val, len); if (ret) @@ -406,9 +411,14 @@ static int ucsi_stm32g0_sync_write(struct ucsi *ucsi, unsigned int offset, const if (!wait_for_completion_timeout(&g0->complete, msecs_to_jiffies(5000))) ret = -ETIMEDOUT; + else + return 0; out_clear_bit: - clear_bit(COMMAND_PENDING, &g0->flags); + if (ack) + clear_bit(ACK_PENDING, &g0->flags); + else + clear_bit(COMMAND_PENDING, &g0->flags); return ret; } @@ -429,8 +439,9 @@ static irqreturn_t ucsi_stm32g0_irq_handler(int irq, void *data) if (UCSI_CCI_CONNECTOR(cci)) ucsi_connector_change(g0->ucsi, UCSI_CCI_CONNECTOR(cci)); - if (test_bit(COMMAND_PENDING, &g0->flags) && - cci & (UCSI_CCI_ACK_COMPLETE | UCSI_CCI_COMMAND_COMPLETE)) + if (cci & UCSI_CCI_ACK_COMPLETE && test_and_clear_bit(ACK_PENDING, &g0->flags)) + complete(&g0->complete); + if (cci & UCSI_CCI_COMMAND_COMPLETE && test_and_clear_bit(COMMAND_PENDING, &g0->flags)) complete(&g0->complete); return IRQ_HANDLED; From 9e3caa9dd51b23e232f095a98336a84f42e4a7f2 Mon Sep 17 00:00:00 2001 From: Diogo Ivo Date: Wed, 12 Jun 2024 14:13:10 +0100 Subject: [PATCH 1225/1578] usb: typec: ucsi_acpi: Add LG Gram quirk Some LG Gram laptops report a bogus connector change event after a GET_PDOS command for the partner's source PDOs, which disappears from the CCI after acknowledging the command. However, the subsequent GET_CONNECTOR_STATUS in ucsi_handle_connector_change() still reports this bogus change in bits 5 and 6, leading to the UCSI core re-checking the partner's source PDOs and thus to an infinite loop. Fix this by adding a quirk that signals when a potentially buggy GET_PDOS command is used, checks the status change report and clears it if it is a bogus event before sending it to the UCSI core. Signed-off-by: Diogo Ivo Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240612-gram_quirk-v1-1-52b0ff0e1546@tecnico.ulisboa.pt Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/ucsi/ucsi_acpi.c | 61 ++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/drivers/usb/typec/ucsi/ucsi_acpi.c b/drivers/usb/typec/ucsi/ucsi_acpi.c index 8d112c3edae51f..adf32ca0f761b2 100644 --- a/drivers/usb/typec/ucsi/ucsi_acpi.c +++ b/drivers/usb/typec/ucsi/ucsi_acpi.c @@ -25,6 +25,7 @@ struct ucsi_acpi { unsigned long flags; #define UCSI_ACPI_COMMAND_PENDING 1 #define UCSI_ACPI_ACK_PENDING 2 +#define UCSI_ACPI_CHECK_BOGUS_EVENT 3 guid_t guid; u64 cmd; }; @@ -128,6 +129,58 @@ static const struct ucsi_operations ucsi_zenbook_ops = { .async_write = ucsi_acpi_async_write }; +static int ucsi_gram_read(struct ucsi *ucsi, unsigned int offset, + void *val, size_t val_len) +{ + u16 bogus_change = UCSI_CONSTAT_POWER_LEVEL_CHANGE | + UCSI_CONSTAT_PDOS_CHANGE; + struct ucsi_acpi *ua = ucsi_get_drvdata(ucsi); + struct ucsi_connector_status *status; + int ret; + + ret = ucsi_acpi_read(ucsi, offset, val, val_len); + if (ret < 0) + return ret; + + if (UCSI_COMMAND(ua->cmd) == UCSI_GET_CONNECTOR_STATUS && + test_bit(UCSI_ACPI_CHECK_BOGUS_EVENT, &ua->flags) && + offset == UCSI_MESSAGE_IN) { + status = (struct ucsi_connector_status *)val; + + /* Clear the bogus change */ + if (status->change == bogus_change) + status->change = 0; + + clear_bit(UCSI_ACPI_CHECK_BOGUS_EVENT, &ua->flags); + } + + return ret; +} + +static int ucsi_gram_sync_write(struct ucsi *ucsi, unsigned int offset, + const void *val, size_t val_len) +{ + struct ucsi_acpi *ua = ucsi_get_drvdata(ucsi); + int ret; + + ret = ucsi_acpi_sync_write(ucsi, offset, val, val_len); + if (ret < 0) + return ret; + + if (UCSI_COMMAND(ua->cmd) == UCSI_GET_PDOS && + ua->cmd & UCSI_GET_PDOS_PARTNER_PDO(1) && + ua->cmd & UCSI_GET_PDOS_SRC_PDOS) + set_bit(UCSI_ACPI_CHECK_BOGUS_EVENT, &ua->flags); + + return ret; +} + +static const struct ucsi_operations ucsi_gram_ops = { + .read = ucsi_gram_read, + .sync_write = ucsi_gram_sync_write, + .async_write = ucsi_acpi_async_write +}; + static const struct dmi_system_id ucsi_acpi_quirks[] = { { .matches = { @@ -136,6 +189,14 @@ static const struct dmi_system_id ucsi_acpi_quirks[] = { }, .driver_data = (void *)&ucsi_zenbook_ops, }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LG Electronics"), + DMI_MATCH(DMI_PRODUCT_FAMILY, "LG gram PC"), + DMI_MATCH(DMI_PRODUCT_NAME, "90Q"), + }, + .driver_data = (void *)&ucsi_gram_ops, + }, { } }; From de644a4a86be04ed8a43ef8267d0f7d021941c5e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 17 Jun 2024 12:31:30 +0300 Subject: [PATCH 1226/1578] usb: musb: da8xx: fix a resource leak in probe() Call usb_phy_generic_unregister() if of_platform_populate() fails. Fixes: d6299b6efbf6 ("usb: musb: Add support of CPPI 4.1 DMA controller to DA8xx") Cc: stable Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/69af1b1d-d3f4-492b-bcea-359ca5949f30@moroto.mountain Signed-off-by: Greg Kroah-Hartman --- drivers/usb/musb/da8xx.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/usb/musb/da8xx.c b/drivers/usb/musb/da8xx.c index 8abf3a567e30a8..108d9a593a80d3 100644 --- a/drivers/usb/musb/da8xx.c +++ b/drivers/usb/musb/da8xx.c @@ -556,7 +556,7 @@ static int da8xx_probe(struct platform_device *pdev) ret = of_platform_populate(pdev->dev.of_node, NULL, da8xx_auxdata_lookup, &pdev->dev); if (ret) - return ret; + goto err_unregister_phy; pinfo = da8xx_dev_info; pinfo.parent = &pdev->dev; @@ -571,9 +571,13 @@ static int da8xx_probe(struct platform_device *pdev) ret = PTR_ERR_OR_ZERO(glue->musb); if (ret) { dev_err(&pdev->dev, "failed to register musb device: %d\n", ret); - usb_phy_generic_unregister(glue->usb_phy); + goto err_unregister_phy; } + return 0; + +err_unregister_phy: + usb_phy_generic_unregister(glue->usb_phy); return ret; } From c68942624e254a4e8a65afcd3c17ed95acda5489 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Thu, 13 Jun 2024 14:14:48 +0200 Subject: [PATCH 1227/1578] usb: typec: ucsi: glink: fix child node release in probe function The device_for_each_child_node() macro requires explicit calls to fwnode_handle_put() in all early exits of the loop if the child node is not required outside. Otherwise, the child node's refcount is not decremented and the resource is not released. The current implementation of pmic_glink_ucsi_probe() makes use of the device_for_each_child_node(), but does not release the child node on early returns. Add the missing calls to fwnode_handle_put(). Cc: stable@vger.kernel.org Fixes: c6165ed2f425 ("usb: ucsi: glink: use the connector orientation GPIO to provide switch events") Signed-off-by: Javier Carrasco Reviewed-by: Dmitry Baryshkov Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240613-ucsi-glink-release-node-v1-1-f7629a56f70a@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/ucsi/ucsi_glink.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/usb/typec/ucsi/ucsi_glink.c b/drivers/usb/typec/ucsi/ucsi_glink.c index 985a880e86da43..2fa973afe4e68b 100644 --- a/drivers/usb/typec/ucsi/ucsi_glink.c +++ b/drivers/usb/typec/ucsi/ucsi_glink.c @@ -372,6 +372,7 @@ static int pmic_glink_ucsi_probe(struct auxiliary_device *adev, ret = fwnode_property_read_u32(fwnode, "reg", &port); if (ret < 0) { dev_err(dev, "missing reg property of %pOFn\n", fwnode); + fwnode_handle_put(fwnode); return ret; } @@ -386,9 +387,11 @@ static int pmic_glink_ucsi_probe(struct auxiliary_device *adev, if (!desc) continue; - if (IS_ERR(desc)) + if (IS_ERR(desc)) { + fwnode_handle_put(fwnode); return dev_err_probe(dev, PTR_ERR(desc), "unable to acquire orientation gpio\n"); + } ucsi->port_orientation[port] = desc; } From 7838de15bb700c2898a7d741db9b1f3cbc86c136 Mon Sep 17 00:00:00 2001 From: Meng Li Date: Tue, 18 Jun 2024 11:19:18 +0800 Subject: [PATCH 1228/1578] usb: dwc3: core: remove lock of otg mode during gadget suspend/resume to avoid deadlock When config CONFIG_USB_DWC3_DUAL_ROLE is selected, and trigger system to enter suspend status with below command: echo mem > /sys/power/state There will be a deadlock issue occurring. Detailed invoking path as below: dwc3_suspend_common() spin_lock_irqsave(&dwc->lock, flags); <-- 1st dwc3_gadget_suspend(dwc); dwc3_gadget_soft_disconnect(dwc); spin_lock_irqsave(&dwc->lock, flags); <-- 2nd This issue is exposed by commit c7ebd8149ee5 ("usb: dwc3: gadget: Fix NULL pointer dereference in dwc3_gadget_suspend") that removes the code of checking whether dwc->gadget_driver is NULL or not. It causes the following code is executed and deadlock occurs when trying to get the spinlock. In fact, the root cause is the commit 5265397f9442("usb: dwc3: Remove DWC3 locking during gadget suspend/resume") that forgot to remove the lock of otg mode. So, remove the redundant lock of otg mode during gadget suspend/resume. Fixes: 5265397f9442 ("usb: dwc3: Remove DWC3 locking during gadget suspend/resume") Cc: Xu Yang Cc: stable@vger.kernel.org Signed-off-by: Meng Li Acked-by: Thinh Nguyen Link: https://lore.kernel.org/r/20240618031918.2585799-1-Meng.Li@windriver.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/core.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c index 7ee61a89520b69..9d47c3aa577710 100644 --- a/drivers/usb/dwc3/core.c +++ b/drivers/usb/dwc3/core.c @@ -2250,7 +2250,6 @@ static int dwc3_core_init_for_resume(struct dwc3 *dwc) static int dwc3_suspend_common(struct dwc3 *dwc, pm_message_t msg) { - unsigned long flags; u32 reg; int i; @@ -2293,9 +2292,7 @@ static int dwc3_suspend_common(struct dwc3 *dwc, pm_message_t msg) break; if (dwc->current_otg_role == DWC3_OTG_ROLE_DEVICE) { - spin_lock_irqsave(&dwc->lock, flags); dwc3_gadget_suspend(dwc); - spin_unlock_irqrestore(&dwc->lock, flags); synchronize_irq(dwc->irq_gadget); } @@ -2312,7 +2309,6 @@ static int dwc3_suspend_common(struct dwc3 *dwc, pm_message_t msg) static int dwc3_resume_common(struct dwc3 *dwc, pm_message_t msg) { - unsigned long flags; int ret; u32 reg; int i; @@ -2366,9 +2362,7 @@ static int dwc3_resume_common(struct dwc3 *dwc, pm_message_t msg) if (dwc->current_otg_role == DWC3_OTG_ROLE_HOST) { dwc3_otg_host_init(dwc); } else if (dwc->current_otg_role == DWC3_OTG_ROLE_DEVICE) { - spin_lock_irqsave(&dwc->lock, flags); dwc3_gadget_resume(dwc); - spin_unlock_irqrestore(&dwc->lock, flags); } break; From dba7567c2fbbf10a4de2471cdb0e16e5572dc007 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Thu, 13 Jun 2024 12:20:47 +0800 Subject: [PATCH 1229/1578] usb: gadget: aspeed_udc: fix device address configuration In the aspeed UDC setup, we configure the UDC hardware with the assigned USB device address. However, we have an off-by-one in the bitmask, so we're only setting the lower 6 bits of the address (USB addresses being 7 bits, and the hardware bitmask being bits 0:6). This means that device enumeration fails if the assigned address is greater than 64: [ 344.607255] usb 1-1: new high-speed USB device number 63 using ehci-platform [ 344.808459] usb 1-1: New USB device found, idVendor=cc00, idProduct=cc00, bcdDevice= 6.10 [ 344.817684] usb 1-1: New USB device strings: Mfr=1, Product=2, SerialNumber=3 [ 344.825671] usb 1-1: Product: Test device [ 344.831075] usb 1-1: Manufacturer: Test vendor [ 344.836335] usb 1-1: SerialNumber: 00 [ 349.917181] usb 1-1: USB disconnect, device number 63 [ 352.036775] usb 1-1: new high-speed USB device number 64 using ehci-platform [ 352.249432] usb 1-1: device descriptor read/all, error -71 [ 352.696740] usb 1-1: new high-speed USB device number 65 using ehci-platform [ 352.909431] usb 1-1: device descriptor read/all, error -71 Use the correct mask of 0x7f (rather than 0x3f), and generate this through the GENMASK macro, so we have numbers that correspond exactly to the hardware register definition. Fixes: 055276c13205 ("usb: gadget: add Aspeed ast2600 udc driver") Cc: stable@vger.kernel.org Reviewed-by: Neal Liu Reviewed-by: Andrew Jeffery Signed-off-by: Jeremy Kerr Link: https://lore.kernel.org/r/20240613-aspeed-udc-v2-1-29501ce9cb7a@codeconstruct.com.au Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/aspeed_udc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/udc/aspeed_udc.c b/drivers/usb/gadget/udc/aspeed_udc.c index 3916c8e2ba01f1..821a6ab5da56f8 100644 --- a/drivers/usb/gadget/udc/aspeed_udc.c +++ b/drivers/usb/gadget/udc/aspeed_udc.c @@ -66,8 +66,8 @@ #define USB_UPSTREAM_EN BIT(0) /* Main config reg */ -#define UDC_CFG_SET_ADDR(x) ((x) & 0x3f) -#define UDC_CFG_ADDR_MASK (0x3f) +#define UDC_CFG_SET_ADDR(x) ((x) & UDC_CFG_ADDR_MASK) +#define UDC_CFG_ADDR_MASK GENMASK(6, 0) /* Interrupt ctrl & status reg */ #define UDC_IRQ_EP_POOL_NAK BIT(17) From f3ced000a2df53f4b12849e121769045a81a3b22 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 10 Jun 2024 18:48:45 -0700 Subject: [PATCH 1230/1578] KVM: x86: Always sync PIR to IRR prior to scanning I/O APIC routes Sync pending posted interrupts to the IRR prior to re-scanning I/O APIC routes, irrespective of whether the I/O APIC is emulated by userspace or by KVM. If a level-triggered interrupt routed through the I/O APIC is pending or in-service for a vCPU, KVM needs to intercept EOIs on said vCPU even if the vCPU isn't the destination for the new routing, e.g. if servicing an interrupt using the old routing races with I/O APIC reconfiguration. Commit fceb3a36c29a ("KVM: x86: ioapic: Fix level-triggered EOI and userspace I/OAPIC reconfigure race") fixed the common cases, but kvm_apic_pending_eoi() only checks if an interrupt is in the local APIC's IRR or ISR, i.e. misses the uncommon case where an interrupt is pending in the PIR. Failure to intercept EOI can manifest as guest hangs with Windows 11 if the guest uses the RTC as its timekeeping source, e.g. if the VMM doesn't expose a more modern form of time to the guest. Cc: stable@vger.kernel.org Cc: Adamos Ttofari Cc: Raghavendra Rao Ananta Reviewed-by: Jim Mattson Signed-off-by: Sean Christopherson Message-ID: <20240611014845.82795-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8c9e4281d978d7..0763a0f72a067f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10718,13 +10718,12 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256); + static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); + if (irqchip_split(vcpu->kvm)) kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); - else { - static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); - if (ioapic_in_kernel(vcpu->kvm)) - kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); - } + else if (ioapic_in_kernel(vcpu->kvm)) + kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); if (is_guest_mode(vcpu)) vcpu->arch.load_eoi_exitmap_pending = true; From b018589013d6db43fdc894c635d6590e0a7e3285 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 10 Jun 2024 09:34:27 -0700 Subject: [PATCH 1231/1578] MAINTAINERS: Drop Wanpeng Li as a Reviewer for KVM Paravirt support Drop Wanpeng as a KVM PARAVIRT reviewer as his @tencent.com email is bouncing, and according to lore[*], the last activity from his @gmail.com address was almost two years ago. [*] https://lore.kernel.org/all/CANRm+Cwj29M9HU3=JRUOaKDR+iDKgr0eNMWQi0iLkR5THON-bg@mail.gmail.com Cc: Wanpeng Li Cc: Like Xu Signed-off-by: Sean Christopherson Message-ID: <20240610163427.3359426-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 8754ac2c259dc9..5d62fe9495e6d3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12383,7 +12383,6 @@ F: drivers/video/backlight/ktz8866.c KVM PARAVIRT (KVM/paravirt) M: Paolo Bonzini -R: Wanpeng Li R: Vitaly Kuznetsov L: kvm@vger.kernel.org S: Supported From f474092c6fe1e2154a35308a1a1aef3212c3ecf2 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 10 Jun 2024 13:31:21 +0300 Subject: [PATCH 1232/1578] kvm: do not account temporary allocations to kmem Some allocations done by KVM are temporary, they are created as result of program actions, but can't exists for arbitrary long times. They should have been GFP_TEMPORARY (rip!). OTOH, kvm-nx-lpage-recovery and kvm-pit kernel threads exist for as long as VM exists but their task_struct memory is not accounted. This is story for another day. Signed-off-by: Alexey Dobriyan Message-ID: Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 14841acb8b959b..8e422c2c9450fd 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4427,7 +4427,7 @@ static long kvm_vcpu_ioctl(struct file *filp, struct kvm_regs *kvm_regs; r = -ENOMEM; - kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT); + kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); if (!kvm_regs) goto out; r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); @@ -4454,8 +4454,7 @@ static long kvm_vcpu_ioctl(struct file *filp, break; } case KVM_GET_SREGS: { - kvm_sregs = kzalloc(sizeof(struct kvm_sregs), - GFP_KERNEL_ACCOUNT); + kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); r = -ENOMEM; if (!kvm_sregs) goto out; @@ -4547,7 +4546,7 @@ static long kvm_vcpu_ioctl(struct file *filp, break; } case KVM_GET_FPU: { - fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT); + fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); r = -ENOMEM; if (!fpu) goto out; @@ -6210,7 +6209,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) active = kvm_active_vms; mutex_unlock(&kvm_lock); - env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT); + env = kzalloc(sizeof(*env), GFP_KERNEL); if (!env) return; @@ -6226,7 +6225,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) add_uevent_var(env, "PID=%d", kvm->userspace_pid); if (!IS_ERR(kvm->debugfs_dentry)) { - char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT); + char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL); if (p) { tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX); From ce5291e56081730ec7d87bc9aa41f3de73ff3256 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Jun 2024 20:30:00 +0100 Subject: [PATCH 1233/1578] cifs: Defer read completion Defer read completion from the I/O thread to the cifsiod thread so as not to slow down the I/O thread. This restores the behaviour of v6.9. Fixes: 3ee1a1fc3981 ("cifs: Cut over to using netfslib") Signed-off-by: David Howells cc: Paulo Alcantara cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/smb2pdu.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 38a06e8a0f90f9..e213cecd509460 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4484,6 +4484,16 @@ smb2_new_read_req(void **buf, unsigned int *total_len, return rc; } +static void smb2_readv_worker(struct work_struct *work) +{ + struct cifs_io_subrequest *rdata = + container_of(work, struct cifs_io_subrequest, subreq.work); + + netfs_subreq_terminated(&rdata->subreq, + (rdata->result == 0 || rdata->result == -EAGAIN) ? + rdata->got_bytes : rdata->result, true); +} + static void smb2_readv_callback(struct mid_q_entry *mid) { @@ -4578,9 +4588,8 @@ smb2_readv_callback(struct mid_q_entry *mid) rdata->result = 0; } rdata->credits.value = 0; - netfs_subreq_terminated(&rdata->subreq, - (rdata->result == 0 || rdata->result == -EAGAIN) ? - rdata->got_bytes : rdata->result, true); + INIT_WORK(&rdata->subreq.work, smb2_readv_worker); + queue_work(cifsiod_wq, &rdata->subreq.work); release_mid(mid); add_credits(server, &credits, 0); } From 969b3010cbfcf58de65399dff8252c41b5e79292 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Jun 2024 18:31:28 +0100 Subject: [PATCH 1234/1578] cifs: Only pick a channel once per read request In cifs, only pick a channel when setting up a read request rather than doing so individually for every subrequest and instead use that channel for all. This mirrors what the code in v6.9 does. Signed-off-by: David Howells cc: Steve French cc: Paulo Alcantara cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 1 + fs/smb/client/file.c | 14 +++----------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 73482734a8d8e9..0978997ddfa6bc 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1494,6 +1494,7 @@ struct cifs_aio_ctx { struct cifs_io_request { struct netfs_io_request rreq; struct cifsFileInfo *cfile; + struct TCP_Server_Info *server; }; /* asynchronous read support */ diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 1e269e0bc75b35..4dbd80168a2bcd 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -134,17 +134,15 @@ static void cifs_issue_write(struct netfs_io_subrequest *subreq) static bool cifs_clamp_length(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; - struct TCP_Server_Info *server; struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq); struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); + struct TCP_Server_Info *server = req->server; struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb); size_t rsize = 0; int rc; rdata->xid = get_xid(); rdata->have_xid = true; - - server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses); rdata->server = server; if (cifs_sb->ctx->rsize == 0) @@ -203,14 +201,7 @@ static void cifs_req_issue_read(struct netfs_io_subrequest *subreq) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); rdata->pid = pid; - rc = adjust_credits(rdata->server, &rdata->credits, rdata->subreq.len); - if (!rc) { - if (rdata->req->cfile->invalidHandle) - rc = -EAGAIN; - else - rc = rdata->server->ops->async_readv(rdata); - } - + rc = rdata->server->ops->async_readv(rdata); out: if (rc) netfs_subreq_terminated(subreq, rc, false); @@ -250,6 +241,7 @@ static int cifs_init_request(struct netfs_io_request *rreq, struct file *file) open_file = file->private_data; rreq->netfs_priv = file->private_data; req->cfile = cifsFileInfo_get(open_file); + req->server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses); } else if (rreq->origin != NETFS_WRITEBACK) { WARN_ON_ONCE(1); return -EIO; From 3f59138580bf8006fa99641b5803d0f683709f10 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Jun 2024 18:31:29 +0100 Subject: [PATCH 1235/1578] cifs: Move the 'pid' from the subreq to the req Move the reference pid from the cifs_io_subrequest struct to the cifs_io_request struct as it's the same for all subreqs of a particular request. Signed-off-by: David Howells cc: Paulo Alcantara cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 2 +- fs/smb/client/cifssmb.c | 8 ++++---- fs/smb/client/file.c | 11 +++-------- fs/smb/client/smb2pdu.c | 4 ++-- 4 files changed, 10 insertions(+), 15 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 0978997ddfa6bc..557b68e99d0a09 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1495,6 +1495,7 @@ struct cifs_io_request { struct netfs_io_request rreq; struct cifsFileInfo *cfile; struct TCP_Server_Info *server; + pid_t pid; }; /* asynchronous read support */ @@ -1505,7 +1506,6 @@ struct cifs_io_subrequest { struct cifs_io_request *req; }; ssize_t got_bytes; - pid_t pid; unsigned int xid; int result; bool have_xid; diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 25e9ab947c1715..595c4b673707e9 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1345,8 +1345,8 @@ cifs_async_readv(struct cifs_io_subrequest *rdata) if (rc) return rc; - smb->hdr.Pid = cpu_to_le16((__u16)rdata->pid); - smb->hdr.PidHigh = cpu_to_le16((__u16)(rdata->pid >> 16)); + smb->hdr.Pid = cpu_to_le16((__u16)rdata->req->pid); + smb->hdr.PidHigh = cpu_to_le16((__u16)(rdata->req->pid >> 16)); smb->AndXCommand = 0xFF; /* none */ smb->Fid = rdata->req->cfile->fid.netfid; @@ -1689,8 +1689,8 @@ cifs_async_writev(struct cifs_io_subrequest *wdata) if (rc) goto async_writev_out; - smb->hdr.Pid = cpu_to_le16((__u16)wdata->pid); - smb->hdr.PidHigh = cpu_to_le16((__u16)(wdata->pid >> 16)); + smb->hdr.Pid = cpu_to_le16((__u16)wdata->req->pid); + smb->hdr.PidHigh = cpu_to_le16((__u16)(wdata->req->pid >> 16)); smb->AndXCommand = 0xFF; /* none */ smb->Fid = wdata->req->cfile->fid.netfid; diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 4dbd80168a2bcd..f1f2573bb18df2 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -177,15 +177,8 @@ static void cifs_req_issue_read(struct netfs_io_subrequest *subreq) struct netfs_io_request *rreq = subreq->rreq; struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq); struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); - struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb); - pid_t pid; int rc = 0; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) - pid = req->cfile->pid; - else - pid = current->tgid; // Ummm... This may be a workqueue - cifs_dbg(FYI, "%s: op=%08x[%x] mapping=%p len=%zu/%zu\n", __func__, rreq->debug_id, subreq->debug_index, rreq->mapping, subreq->transferred, subreq->len); @@ -199,7 +192,6 @@ static void cifs_req_issue_read(struct netfs_io_subrequest *subreq) } __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); - rdata->pid = pid; rc = rdata->server->ops->async_readv(rdata); out: @@ -236,12 +228,15 @@ static int cifs_init_request(struct netfs_io_request *rreq, struct file *file) rreq->rsize = cifs_sb->ctx->rsize; rreq->wsize = cifs_sb->ctx->wsize; + req->pid = current->tgid; // Ummm... This may be a workqueue if (file) { open_file = file->private_data; rreq->netfs_priv = file->private_data; req->cfile = cifsFileInfo_get(open_file); req->server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) + req->pid = req->cfile->pid; } else if (rreq->origin != NETFS_WRITEBACK) { WARN_ON_ONCE(1); return -EIO; diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index e213cecd509460..2ae2dbb6202b3d 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4621,7 +4621,7 @@ smb2_async_readv(struct cifs_io_subrequest *rdata) io_parms.length = rdata->subreq.len; io_parms.persistent_fid = rdata->req->cfile->fid.persistent_fid; io_parms.volatile_fid = rdata->req->cfile->fid.volatile_fid; - io_parms.pid = rdata->pid; + io_parms.pid = rdata->req->pid; rc = smb2_new_read_req( (void **) &buf, &total_len, &io_parms, rdata, 0, 0); @@ -4873,7 +4873,7 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) .length = wdata->subreq.len, .persistent_fid = wdata->req->cfile->fid.persistent_fid, .volatile_fid = wdata->req->cfile->fid.volatile_fid, - .pid = wdata->pid, + .pid = wdata->req->pid, }; io_parms = &_io_parms; From e7c3696d4692e8046d25f6e63f983e934e12f2c5 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 15 May 2024 10:55:28 +0100 Subject: [PATCH 1236/1578] firmware: psci: Fix return value from psci_system_suspend() Currently we return the value from invoke_psci_fn() directly as return value from psci_system_suspend(). It is wrong to send the PSCI interface return value directly. psci_to_linux_errno() provide the mapping from PSCI return value to the one that can be returned to the callers within the kernel. Use psci_to_linux_errno() to convert and return the correct value from psci_system_suspend(). Fixes: faf7ec4a92c0 ("drivers: firmware: psci: add system suspend support") Acked-by: Mark Rutland Signed-off-by: Sudeep Holla Link: https://lore.kernel.org/r/20240515095528.1949992-1-sudeep.holla@arm.com Signed-off-by: Arnd Bergmann --- drivers/firmware/psci/psci.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index d9629ff8786199..2328ca58bba61f 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -497,10 +497,12 @@ int psci_cpu_suspend_enter(u32 state) static int psci_system_suspend(unsigned long unused) { + int err; phys_addr_t pa_cpu_resume = __pa_symbol(cpu_resume); - return invoke_psci_fn(PSCI_FN_NATIVE(1_0, SYSTEM_SUSPEND), + err = invoke_psci_fn(PSCI_FN_NATIVE(1_0, SYSTEM_SUSPEND), pa_cpu_resume, 0, 0); + return psci_to_linux_errno(err); } static int psci_system_suspend_enter(suspend_state_t state) From c31745d2c508796a0996c88bf2e55f552d513f65 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 11 Jun 2024 04:22:18 -0400 Subject: [PATCH 1237/1578] virt: guest_memfd: fix reference leak on hwpoisoned page If kvm_gmem_get_pfn() detects an hwpoisoned page, it returns -EHWPOISON but it does not put back the reference that kvm_gmem_get_folio() had grabbed. Add the forgotten folio_put(). Fixes: a7800aa80ea4 ("KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory") Cc: stable@vger.kernel.org Reviewed-by: Liam Merwick Reviewed-by: Isaku Yamahata Signed-off-by: Paolo Bonzini --- virt/kvm/guest_memfd.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 0f4e0cf4f158b1..747fe251e445bb 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -510,8 +510,10 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, } if (folio_test_hwpoison(folio)) { + folio_unlock(folio); + folio_put(folio); r = -EHWPOISON; - goto out_unlock; + goto out_fput; } page = folio_file_page(folio, index); @@ -522,7 +524,6 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, r = 0; -out_unlock: folio_unlock(folio); out_fput: fput(file); From 8d1dfd51c84e202df05a999ce82cb27554f7d152 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 20 Jun 2024 12:53:50 +0000 Subject: [PATCH 1238/1578] block: Pass blk_queue_get_max_sectors() a request pointer Currently blk_queue_get_max_sectors() is passed a enum req_op. In future the value returned from blk_queue_get_max_sectors() may depend on certain request flags, so pass a request pointer. Reviewed-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Luis Chamberlain Reviewed-by: Martin K. Petersen Signed-off-by: John Garry Reviewed-by: Hannes Reinecke Acked-by: Darrick J. Wong Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20240620125359.2684798-2-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- block/blk-merge.c | 3 ++- block/blk-mq.c | 2 +- block/blk.h | 6 ++++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 8534c35e04976a..8957e08e020c2d 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -593,7 +593,8 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq, if (blk_rq_is_passthrough(rq)) return q->limits.max_hw_sectors; - max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); + max_sectors = blk_queue_get_max_sectors(rq); + if (!q->limits.chunk_sectors || req_op(rq) == REQ_OP_DISCARD || req_op(rq) == REQ_OP_SECURE_ERASE) diff --git a/block/blk-mq.c b/block/blk-mq.c index e2b9710ddc5ad1..47fe9d19b8f109 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3036,7 +3036,7 @@ void blk_mq_submit_bio(struct bio *bio) blk_status_t blk_insert_cloned_request(struct request *rq) { struct request_queue *q = rq->q; - unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); + unsigned int max_sectors = blk_queue_get_max_sectors(rq); unsigned int max_segments = blk_rq_get_max_segments(rq); blk_status_t ret; diff --git a/block/blk.h b/block/blk.h index fa32f7fad5d7e6..20c5718815e2e7 100644 --- a/block/blk.h +++ b/block/blk.h @@ -182,9 +182,11 @@ static inline unsigned int blk_rq_get_max_segments(struct request *rq) return queue_max_segments(rq->q); } -static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, - enum req_op op) +static inline unsigned int blk_queue_get_max_sectors(struct request *rq) { + struct request_queue *q = rq->q; + enum req_op op = req_op(rq); + if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)) return min(q->limits.max_discard_sectors, UINT_MAX >> SECTOR_SHIFT); From f70167a7a6e7e8a6911f3a216dc044cbfe7c1983 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 20 Jun 2024 12:53:51 +0000 Subject: [PATCH 1239/1578] block: Generalize chunk_sectors support as boundary support The purpose of the chunk_sectors limit is to ensure that a mergeble request fits within the boundary of the chunck_sector value. Such a feature will be useful for other request_queue boundary limits, so generalize the chunk_sectors merge code. This idea was proposed by Hannes Reinecke. Reviewed-by: Martin K. Petersen Signed-off-by: John Garry Reviewed-by: Hannes Reinecke Acked-by: Darrick J. Wong Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20240620125359.2684798-3-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- block/blk-merge.c | 20 ++++++++++++++------ drivers/md/dm.c | 2 +- include/linux/blkdev.h | 13 +++++++------ 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 8957e08e020c2d..68969e27c83193 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -154,6 +154,11 @@ static struct bio *bio_split_write_zeroes(struct bio *bio, return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs); } +static inline unsigned int blk_boundary_sectors(const struct queue_limits *lim) +{ + return lim->chunk_sectors; +} + /* * Return the maximum number of sectors from the start of a bio that may be * submitted as a single request to a block device. If enough sectors remain, @@ -167,12 +172,13 @@ static inline unsigned get_max_io_size(struct bio *bio, { unsigned pbs = lim->physical_block_size >> SECTOR_SHIFT; unsigned lbs = lim->logical_block_size >> SECTOR_SHIFT; + unsigned boundary_sectors = blk_boundary_sectors(lim); unsigned max_sectors = lim->max_sectors, start, end; - if (lim->chunk_sectors) { + if (boundary_sectors) { max_sectors = min(max_sectors, - blk_chunk_sectors_left(bio->bi_iter.bi_sector, - lim->chunk_sectors)); + blk_boundary_sectors_left(bio->bi_iter.bi_sector, + boundary_sectors)); } start = bio->bi_iter.bi_sector & (pbs - 1); @@ -588,19 +594,21 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq, sector_t offset) { struct request_queue *q = rq->q; - unsigned int max_sectors; + struct queue_limits *lim = &q->limits; + unsigned int max_sectors, boundary_sectors; if (blk_rq_is_passthrough(rq)) return q->limits.max_hw_sectors; + boundary_sectors = blk_boundary_sectors(lim); max_sectors = blk_queue_get_max_sectors(rq); - if (!q->limits.chunk_sectors || + if (!boundary_sectors || req_op(rq) == REQ_OP_DISCARD || req_op(rq) == REQ_OP_SECURE_ERASE) return max_sectors; return min(max_sectors, - blk_chunk_sectors_left(offset, q->limits.chunk_sectors)); + blk_boundary_sectors_left(offset, boundary_sectors)); } static inline int ll_new_hw_segment(struct request *req, struct bio *bio, diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 8a976cee448bed..7d107ae06e1ae1 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1188,7 +1188,7 @@ static sector_t __max_io_len(struct dm_target *ti, sector_t sector, return len; return min_t(sector_t, len, min(max_sectors ? : queue_max_sectors(ti->table->md->queue), - blk_chunk_sectors_left(target_offset, max_granularity))); + blk_boundary_sectors_left(target_offset, max_granularity))); } static inline sector_t max_io_len(struct dm_target *ti, sector_t sector) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0e8253c1507a51..fb7d4c21bba87f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -907,14 +907,15 @@ static inline bool bio_straddles_zones(struct bio *bio) } /* - * Return how much of the chunk is left to be used for I/O at a given offset. + * Return how much within the boundary is left to be used for I/O at a given + * offset. */ -static inline unsigned int blk_chunk_sectors_left(sector_t offset, - unsigned int chunk_sectors) +static inline unsigned int blk_boundary_sectors_left(sector_t offset, + unsigned int boundary_sectors) { - if (unlikely(!is_power_of_2(chunk_sectors))) - return chunk_sectors - sector_div(offset, chunk_sectors); - return chunk_sectors - (offset & (chunk_sectors - 1)); + if (unlikely(!is_power_of_2(boundary_sectors))) + return boundary_sectors - sector_div(offset, boundary_sectors); + return boundary_sectors - (offset & (boundary_sectors - 1)); } /** From c34fc6f26ab86d03a2d47446f42b6cd492dfdc56 Mon Sep 17 00:00:00 2001 From: Prasad Singamsetty Date: Thu, 20 Jun 2024 12:53:52 +0000 Subject: [PATCH 1240/1578] fs: Initial atomic write support An atomic write is a write issued with torn-write protection, meaning that for a power failure or any other hardware failure, all or none of the data from the write will be stored, but never a mix of old and new data. Userspace may add flag RWF_ATOMIC to pwritev2() to indicate that the write is to be issued with torn-write prevention, according to special alignment and length rules. For any syscall interface utilizing struct iocb, add IOCB_ATOMIC for iocb->ki_flags field to indicate the same. A call to statx will give the relevant atomic write info for a file: - atomic_write_unit_min - atomic_write_unit_max - atomic_write_segments_max Both min and max values must be a power-of-2. Applications can avail of atomic write feature by ensuring that the total length of a write is a power-of-2 in size and also sized between atomic_write_unit_min and atomic_write_unit_max, inclusive. Applications must ensure that the write is at a naturally-aligned offset in the file wrt the total write length. The value in atomic_write_segments_max indicates the upper limit for IOV_ITER iovcnt. Add file mode flag FMODE_CAN_ATOMIC_WRITE, so files which do not have the flag set will have RWF_ATOMIC rejected and not just ignored. Add a type argument to kiocb_set_rw_flags() to allows reads which have RWF_ATOMIC set to be rejected. Helper function generic_atomic_write_valid() can be used by FSes to verify compliant writes. There we check for iov_iter type is for ubuf, which implies iovcnt==1 for pwritev2(), which is an initial restriction for atomic_write_segments_max. Initially the only user will be bdev file operations write handler. We will rely on the block BIO submission path to ensure write sizes are compliant for the bdev, so we don't need to check atomic writes sizes yet. Signed-off-by: Prasad Singamsetty jpg: merge into single patch and much rewrite Acked-by: Darrick J. Wong Reviewed-by: Martin K. Petersen Signed-off-by: John Garry Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20240620125359.2684798-4-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- fs/aio.c | 8 ++++---- fs/btrfs/ioctl.c | 2 +- fs/read_write.c | 18 +++++++++++++++++- include/linux/fs.h | 17 +++++++++++++++-- include/uapi/linux/fs.h | 5 ++++- io_uring/rw.c | 9 ++++----- 6 files changed, 45 insertions(+), 14 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 57c9f7c077e601..93ef59d358b389 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1516,7 +1516,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res) iocb_put(iocb); } -static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) +static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb, int rw_type) { int ret; @@ -1542,7 +1542,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) } else req->ki_ioprio = get_current_ioprio(); - ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); + ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags, rw_type); if (unlikely(ret)) return ret; @@ -1594,7 +1594,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb, struct file *file; int ret; - ret = aio_prep_rw(req, iocb); + ret = aio_prep_rw(req, iocb, READ); if (ret) return ret; file = req->ki_filp; @@ -1621,7 +1621,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, struct file *file; int ret; - ret = aio_prep_rw(req, iocb); + ret = aio_prep_rw(req, iocb, WRITE); if (ret) return ret; file = req->ki_filp; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index efd5d6e9589e09..6ad524b894fc85 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4627,7 +4627,7 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool goto out_iov; init_sync_kiocb(&kiocb, file); - ret = kiocb_set_rw_flags(&kiocb, 0); + ret = kiocb_set_rw_flags(&kiocb, 0, WRITE); if (ret) goto out_iov; kiocb.ki_pos = pos; diff --git a/fs/read_write.c b/fs/read_write.c index ef6339391351c2..90e283b31ca181 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -730,7 +730,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, ssize_t ret; init_sync_kiocb(&kiocb, filp); - ret = kiocb_set_rw_flags(&kiocb, flags); + ret = kiocb_set_rw_flags(&kiocb, flags, type); if (ret) return ret; kiocb.ki_pos = (ppos ? *ppos : 0); @@ -1736,3 +1736,19 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out) return 0; } + +bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos) +{ + size_t len = iov_iter_count(iter); + + if (!iter_is_ubuf(iter)) + return false; + + if (!is_power_of_2(len)) + return false; + + if (!IS_ALIGNED(pos, len)) + return false; + + return true; +} diff --git a/include/linux/fs.h b/include/linux/fs.h index 0283cf366c2a93..e049414bef7da6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -125,8 +125,10 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_EXEC ((__force fmode_t)(1 << 5)) /* File writes are restricted (block device specific) */ #define FMODE_WRITE_RESTRICTED ((__force fmode_t)(1 << 6)) +/* File supports atomic writes */ +#define FMODE_CAN_ATOMIC_WRITE ((__force fmode_t)(1 << 7)) -/* FMODE_* bits 7 to 8 */ +/* FMODE_* bit 8 */ /* 32bit hashes as llseek() offset (for directories) */ #define FMODE_32BITHASH ((__force fmode_t)(1 << 9)) @@ -317,6 +319,7 @@ struct readahead_control; #define IOCB_SYNC (__force int) RWF_SYNC #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND +#define IOCB_ATOMIC (__force int) RWF_ATOMIC /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) @@ -351,6 +354,7 @@ struct readahead_control; { IOCB_SYNC, "SYNC" }, \ { IOCB_NOWAIT, "NOWAIT" }, \ { IOCB_APPEND, "APPEND" }, \ + { IOCB_ATOMIC, "ATOMIC"}, \ { IOCB_EVENTFD, "EVENTFD"}, \ { IOCB_DIRECT, "DIRECT" }, \ { IOCB_WRITE, "WRITE" }, \ @@ -3403,7 +3407,8 @@ static inline int iocb_flags(struct file *file) return res; } -static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) +static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, + int rw_type) { int kiocb_flags = 0; @@ -3422,6 +3427,12 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) return -EOPNOTSUPP; kiocb_flags |= IOCB_NOIO; } + if (flags & RWF_ATOMIC) { + if (rw_type != WRITE) + return -EOPNOTSUPP; + if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE)) + return -EOPNOTSUPP; + } kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; @@ -3613,4 +3624,6 @@ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, extern int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice); +bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos); + #endif /* _LINUX_FS_H */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 45e4e64fd6643c..191a7e88a8ab7d 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -329,9 +329,12 @@ typedef int __bitwise __kernel_rwf_t; /* per-IO negation of O_APPEND */ #define RWF_NOAPPEND ((__force __kernel_rwf_t)0x00000020) +/* Atomic Write */ +#define RWF_ATOMIC ((__force __kernel_rwf_t)0x00000040) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND | RWF_NOAPPEND) + RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC) /* Pagemap ioctl */ #define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) diff --git a/io_uring/rw.c b/io_uring/rw.c index 1a2128459cb4c8..c004d21e2f12e3 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -772,7 +772,7 @@ static bool need_complete_io(struct io_kiocb *req) S_ISBLK(file_inode(req->file)->i_mode); } -static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) +static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct kiocb *kiocb = &rw->kiocb; @@ -787,7 +787,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) req->flags |= io_file_get_flags(file); kiocb->ki_flags = file->f_iocb_flags; - ret = kiocb_set_rw_flags(kiocb, rw->flags); + ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type); if (unlikely(ret)) return ret; kiocb->ki_flags |= IOCB_ALLOC_CACHE; @@ -832,8 +832,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(ret < 0)) return ret; } - - ret = io_rw_init_file(req, FMODE_READ); + ret = io_rw_init_file(req, FMODE_READ, READ); if (unlikely(ret)) return ret; req->cqe.res = iov_iter_count(&io->iter); @@ -1013,7 +1012,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) ssize_t ret, ret2; loff_t *ppos; - ret = io_rw_init_file(req, FMODE_WRITE); + ret = io_rw_init_file(req, FMODE_WRITE, WRITE); if (unlikely(ret)) return ret; req->cqe.res = iov_iter_count(&io->iter); From 0f9ca80fa4f9670ba09721e4e36b8baf086a500c Mon Sep 17 00:00:00 2001 From: Prasad Singamsetty Date: Thu, 20 Jun 2024 12:53:53 +0000 Subject: [PATCH 1241/1578] fs: Add initial atomic write support info to statx Extend statx system call to return additional info for atomic write support support for a file. Helper function generic_fill_statx_atomic_writes() can be used by FSes to fill in the relevant statx fields. For now atomic_write_segments_max will always be 1, otherwise some rules would need to be imposed on iovec length and alignment, which we don't want now. Signed-off-by: Prasad Singamsetty jpg: relocate bdev support to another patch Reviewed-by: Darrick J. Wong Reviewed-by: Martin K. Petersen Signed-off-by: John Garry Acked-by: Darrick J. Wong Link: https://lore.kernel.org/r/20240620125359.2684798-5-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- fs/stat.c | 34 ++++++++++++++++++++++++++++++++++ include/linux/fs.h | 3 +++ include/linux/stat.h | 3 +++ include/uapi/linux/stat.h | 12 ++++++++++-- 4 files changed, 50 insertions(+), 2 deletions(-) diff --git a/fs/stat.c b/fs/stat.c index 70bd3e888cfa30..72d0e6357b91f1 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -89,6 +89,37 @@ void generic_fill_statx_attr(struct inode *inode, struct kstat *stat) } EXPORT_SYMBOL(generic_fill_statx_attr); +/** + * generic_fill_statx_atomic_writes - Fill in atomic writes statx attributes + * @stat: Where to fill in the attribute flags + * @unit_min: Minimum supported atomic write length in bytes + * @unit_max: Maximum supported atomic write length in bytes + * + * Fill in the STATX{_ATTR}_WRITE_ATOMIC flags in the kstat structure from + * atomic write unit_min and unit_max values. + */ +void generic_fill_statx_atomic_writes(struct kstat *stat, + unsigned int unit_min, + unsigned int unit_max) +{ + /* Confirm that the request type is known */ + stat->result_mask |= STATX_WRITE_ATOMIC; + + /* Confirm that the file attribute type is known */ + stat->attributes_mask |= STATX_ATTR_WRITE_ATOMIC; + + if (unit_min) { + stat->atomic_write_unit_min = unit_min; + stat->atomic_write_unit_max = unit_max; + /* Initially only allow 1x segment */ + stat->atomic_write_segments_max = 1; + + /* Confirm atomic writes are actually supported */ + stat->attributes |= STATX_ATTR_WRITE_ATOMIC; + } +} +EXPORT_SYMBOL_GPL(generic_fill_statx_atomic_writes); + /** * vfs_getattr_nosec - getattr without security checks * @path: file to get attributes from @@ -659,6 +690,9 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) tmp.stx_dio_mem_align = stat->dio_mem_align; tmp.stx_dio_offset_align = stat->dio_offset_align; tmp.stx_subvol = stat->subvol; + tmp.stx_atomic_write_unit_min = stat->atomic_write_unit_min; + tmp.stx_atomic_write_unit_max = stat->atomic_write_unit_max; + tmp.stx_atomic_write_segments_max = stat->atomic_write_segments_max; return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0; } diff --git a/include/linux/fs.h b/include/linux/fs.h index e049414bef7da6..db26b4a70c628b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3235,6 +3235,9 @@ extern const struct inode_operations page_symlink_inode_operations; extern void kfree_link(void *); void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *); void generic_fill_statx_attr(struct inode *inode, struct kstat *stat); +void generic_fill_statx_atomic_writes(struct kstat *stat, + unsigned int unit_min, + unsigned int unit_max); extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int); extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int); void __inode_add_bytes(struct inode *inode, loff_t bytes); diff --git a/include/linux/stat.h b/include/linux/stat.h index bf92441dbad284..3d900c86981c5b 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -54,6 +54,9 @@ struct kstat { u32 dio_offset_align; u64 change_cookie; u64 subvol; + u32 atomic_write_unit_min; + u32 atomic_write_unit_max; + u32 atomic_write_segments_max; }; /* These definitions are internal to the kernel for now. Mainly used by nfsd. */ diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index 67626d53531664..887a2528644168 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -126,9 +126,15 @@ struct statx { __u64 stx_mnt_id; __u32 stx_dio_mem_align; /* Memory buffer alignment for direct I/O */ __u32 stx_dio_offset_align; /* File offset alignment for direct I/O */ - __u64 stx_subvol; /* Subvolume identifier */ /* 0xa0 */ - __u64 __spare3[11]; /* Spare space for future expansion */ + __u64 stx_subvol; /* Subvolume identifier */ + __u32 stx_atomic_write_unit_min; /* Min atomic write unit in bytes */ + __u32 stx_atomic_write_unit_max; /* Max atomic write unit in bytes */ + /* 0xb0 */ + __u32 stx_atomic_write_segments_max; /* Max atomic write segment count */ + __u32 __spare1[1]; + /* 0xb8 */ + __u64 __spare3[9]; /* Spare space for future expansion */ /* 0x100 */ }; @@ -157,6 +163,7 @@ struct statx { #define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */ #define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */ #define STATX_SUBVOL 0x00008000U /* Want/got stx_subvol */ +#define STATX_WRITE_ATOMIC 0x00010000U /* Want/got atomic_write_* fields */ #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ @@ -192,6 +199,7 @@ struct statx { #define STATX_ATTR_MOUNT_ROOT 0x00002000 /* Root of a mount */ #define STATX_ATTR_VERITY 0x00100000 /* [I] Verity protected file */ #define STATX_ATTR_DAX 0x00200000 /* File is currently in DAX state */ +#define STATX_ATTR_WRITE_ATOMIC 0x00400000 /* File supports atomic write operations */ #endif /* _UAPI_LINUX_STAT_H */ From 9da3d1e912f3953196e66991d75208cde3e845e1 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 20 Jun 2024 12:53:54 +0000 Subject: [PATCH 1242/1578] block: Add core atomic write support Add atomic write support, as follows: - add helper functions to get request_queue atomic write limits - report request_queue atomic write support limits to sysfs and update Doc - support to safely merge atomic writes - deal with splitting atomic writes - misc helper functions - add a per-request atomic write flag New request_queue limits are added, as follows: - atomic_write_hw_max is set by the block driver and is the maximum length of an atomic write which the device may support. It is not necessarily a power-of-2. - atomic_write_max_sectors is derived from atomic_write_hw_max_sectors and max_hw_sectors. It is always a power-of-2. Atomic writes may be merged, and atomic_write_max_sectors would be the limit on a merged atomic write request size. This value is not capped at max_sectors, as the value in max_sectors can be controlled from userspace, and it would only cause trouble if userspace could limit atomic_write_unit_max_bytes and the other atomic write limits. - atomic_write_hw_unit_{min,max} are set by the block driver and are the min/max length of an atomic write unit which the device may support. They both must be a power-of-2. Typically atomic_write_hw_unit_max will hold the same value as atomic_write_hw_max. - atomic_write_unit_{min,max} are derived from atomic_write_hw_unit_{min,max}, max_hw_sectors, and block core limits. Both min and max values must be a power-of-2. - atomic_write_hw_boundary is set by the block driver. If non-zero, it indicates an LBA space boundary at which an atomic write straddles no longer is atomically executed by the disk. The value must be a power-of-2. Note that it would be acceptable to enforce a rule that atomic_write_hw_boundary_sectors is a multiple of atomic_write_hw_unit_max, but the resultant code would be more complicated. All atomic writes limits are by default set 0 to indicate no atomic write support. Even though it is assumed by Linux that a logical block can always be atomically written, we ignore this as it is not of particular interest. Stacked devices are just not supported either for now. An atomic write must always be submitted to the block driver as part of a single request. As such, only a single BIO must be submitted to the block layer for an atomic write. When a single atomic write BIO is submitted, it cannot be split. As such, atomic_write_unit_{max, min}_bytes are limited by the maximum guaranteed BIO size which will not be required to be split. This max size is calculated by request_queue max segments and the number of bvecs a BIO can fit, BIO_MAX_VECS. Currently we rely on userspace issuing a write with iovcnt=1 for pwritev2() - as such, we can rely on each segment containing PAGE_SIZE of data, apart from the first+last, which each can fit logical block size of data. The first+last will be LBS length/aligned as we rely on direct IO alignment rules also. New sysfs files are added to report the following atomic write limits: - atomic_write_unit_max_bytes - same as atomic_write_unit_max_sectors in bytes - atomic_write_unit_min_bytes - same as atomic_write_unit_min_sectors in bytes - atomic_write_boundary_bytes - same as atomic_write_hw_boundary_sectors in bytes - atomic_write_max_bytes - same as atomic_write_max_sectors in bytes Atomic writes may only be merged with other atomic writes and only under the following conditions: - total resultant request length <= atomic_write_max_bytes - the merged write does not straddle a boundary Helper function bdev_can_atomic_write() is added to indicate whether atomic writes may be issued to a bdev. If a bdev is a partition, the partition start must be aligned with both atomic_write_unit_min_sectors and atomic_write_hw_boundary_sectors. FSes will rely on the block layer to validate that an atomic write BIO submitted will be of valid size, so add blk_validate_atomic_write_op_size() for this purpose. Userspace expects an atomic write which is of invalid size to be rejected with -EINVAL, so add BLK_STS_INVAL for this. Also use BLK_STS_INVAL for when a BIO needs to be split, as this should mean an invalid size BIO. Flag REQ_ATOMIC is used for indicating an atomic write. Co-developed-by: Himanshu Madhani Signed-off-by: Himanshu Madhani Reviewed-by: Martin K. Petersen Signed-off-by: John Garry Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20240620125359.2684798-6-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- Documentation/ABI/stable/sysfs-block | 53 +++++++++++++++++ block/blk-core.c | 19 ++++++ block/blk-merge.c | 50 ++++++++++++++-- block/blk-settings.c | 88 ++++++++++++++++++++++++++++ block/blk-sysfs.c | 33 +++++++++++ block/blk.h | 3 + include/linux/blk_types.h | 8 ++- include/linux/blkdev.h | 55 +++++++++++++++++ 8 files changed, 304 insertions(+), 5 deletions(-) diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index 831f19a32e080c..cea8856f798dde 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -21,6 +21,59 @@ Description: device is offset from the internal allocation unit's natural alignment. +What: /sys/block//atomic_write_max_bytes +Date: February 2024 +Contact: Himanshu Madhani +Description: + [RO] This parameter specifies the maximum atomic write + size reported by the device. This parameter is relevant + for merging of writes, where a merged atomic write + operation must not exceed this number of bytes. + This parameter may be greater than the value in + atomic_write_unit_max_bytes as + atomic_write_unit_max_bytes will be rounded down to a + power-of-two and atomic_write_unit_max_bytes may also be + limited by some other queue limits, such as max_segments. + This parameter - along with atomic_write_unit_min_bytes + and atomic_write_unit_max_bytes - will not be larger than + max_hw_sectors_kb, but may be larger than max_sectors_kb. + + +What: /sys/block//atomic_write_unit_min_bytes +Date: February 2024 +Contact: Himanshu Madhani +Description: + [RO] This parameter specifies the smallest block which can + be written atomically with an atomic write operation. All + atomic write operations must begin at a + atomic_write_unit_min boundary and must be multiples of + atomic_write_unit_min. This value must be a power-of-two. + + +What: /sys/block//atomic_write_unit_max_bytes +Date: February 2024 +Contact: Himanshu Madhani +Description: + [RO] This parameter defines the largest block which can be + written atomically with an atomic write operation. This + value must be a multiple of atomic_write_unit_min and must + be a power-of-two. This value will not be larger than + atomic_write_max_bytes. + + +What: /sys/block//atomic_write_boundary_bytes +Date: February 2024 +Contact: Himanshu Madhani +Description: + [RO] A device may need to internally split an atomic write I/O + which straddles a given logical block address boundary. This + parameter specifies the size in bytes of the atomic boundary if + one is reported by the device. This value must be a + power-of-two and at least the size as in + atomic_write_unit_max_bytes. + Any attempt to merge atomic write I/Os must not result in a + merged I/O which crosses this boundary (if any). + What: /sys/block//diskseq Date: February 2021 diff --git a/block/blk-core.c b/block/blk-core.c index 8d9fbd353fc7fc..6fc1a5a1980db3 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -174,6 +174,8 @@ static const struct { /* Command duration limit device-side timeout */ [BLK_STS_DURATION_LIMIT] = { -ETIME, "duration limit exceeded" }, + [BLK_STS_INVAL] = { -EINVAL, "invalid" }, + /* everything else not covered above: */ [BLK_STS_IOERR] = { -EIO, "I/O" }, }; @@ -739,6 +741,18 @@ void submit_bio_noacct_nocheck(struct bio *bio) __submit_bio_noacct(bio); } +static blk_status_t blk_validate_atomic_write_op_size(struct request_queue *q, + struct bio *bio) +{ + if (bio->bi_iter.bi_size > queue_atomic_write_unit_max_bytes(q)) + return BLK_STS_INVAL; + + if (bio->bi_iter.bi_size % queue_atomic_write_unit_min_bytes(q)) + return BLK_STS_INVAL; + + return BLK_STS_OK; +} + /** * submit_bio_noacct - re-submit a bio to the block device layer for I/O * @bio: The bio describing the location in memory and on the device. @@ -797,6 +811,11 @@ void submit_bio_noacct(struct bio *bio) switch (bio_op(bio)) { case REQ_OP_READ: case REQ_OP_WRITE: + if (bio->bi_opf & REQ_ATOMIC) { + status = blk_validate_atomic_write_op_size(q, bio); + if (status != BLK_STS_OK) + goto end_io; + } break; case REQ_OP_FLUSH: /* diff --git a/block/blk-merge.c b/block/blk-merge.c index 68969e27c83193..cff20bcc0252a7 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -154,8 +154,16 @@ static struct bio *bio_split_write_zeroes(struct bio *bio, return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs); } -static inline unsigned int blk_boundary_sectors(const struct queue_limits *lim) +static inline unsigned int blk_boundary_sectors(const struct queue_limits *lim, + bool is_atomic) { + /* + * chunk_sectors must be a multiple of atomic_write_boundary_sectors if + * both non-zero. + */ + if (is_atomic && lim->atomic_write_boundary_sectors) + return lim->atomic_write_boundary_sectors; + return lim->chunk_sectors; } @@ -172,8 +180,18 @@ static inline unsigned get_max_io_size(struct bio *bio, { unsigned pbs = lim->physical_block_size >> SECTOR_SHIFT; unsigned lbs = lim->logical_block_size >> SECTOR_SHIFT; - unsigned boundary_sectors = blk_boundary_sectors(lim); - unsigned max_sectors = lim->max_sectors, start, end; + bool is_atomic = bio->bi_opf & REQ_ATOMIC; + unsigned boundary_sectors = blk_boundary_sectors(lim, is_atomic); + unsigned max_sectors, start, end; + + /* + * We ignore lim->max_sectors for atomic writes because it may less + * than the actual bio size, which we cannot tolerate. + */ + if (is_atomic) + max_sectors = lim->atomic_write_max_sectors; + else + max_sectors = lim->max_sectors; if (boundary_sectors) { max_sectors = min(max_sectors, @@ -311,6 +329,11 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, *segs = nsegs; return NULL; split: + if (bio->bi_opf & REQ_ATOMIC) { + bio->bi_status = BLK_STS_INVAL; + bio_endio(bio); + return ERR_PTR(-EINVAL); + } /* * We can't sanely support splitting for a REQ_NOWAIT bio. End it * with EAGAIN if splitting is required and return an error pointer. @@ -596,11 +619,12 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq, struct request_queue *q = rq->q; struct queue_limits *lim = &q->limits; unsigned int max_sectors, boundary_sectors; + bool is_atomic = rq->cmd_flags & REQ_ATOMIC; if (blk_rq_is_passthrough(rq)) return q->limits.max_hw_sectors; - boundary_sectors = blk_boundary_sectors(lim); + boundary_sectors = blk_boundary_sectors(lim, is_atomic); max_sectors = blk_queue_get_max_sectors(rq); if (!boundary_sectors || @@ -806,6 +830,18 @@ static enum elv_merge blk_try_req_merge(struct request *req, return ELEVATOR_NO_MERGE; } +static bool blk_atomic_write_mergeable_rq_bio(struct request *rq, + struct bio *bio) +{ + return (rq->cmd_flags & REQ_ATOMIC) == (bio->bi_opf & REQ_ATOMIC); +} + +static bool blk_atomic_write_mergeable_rqs(struct request *rq, + struct request *next) +{ + return (rq->cmd_flags & REQ_ATOMIC) == (next->cmd_flags & REQ_ATOMIC); +} + /* * For non-mq, this has to be called with the request spinlock acquired. * For mq with scheduling, the appropriate queue wide lock should be held. @@ -829,6 +865,9 @@ static struct request *attempt_merge(struct request_queue *q, if (req->ioprio != next->ioprio) return NULL; + if (!blk_atomic_write_mergeable_rqs(req, next)) + return NULL; + /* * If we are allowed to merge, then append bio list * from next to rq and release next. merge_requests_fn @@ -960,6 +999,9 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (rq->ioprio != bio_prio(bio)) return false; + if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false) + return false; + return true; } diff --git a/block/blk-settings.c b/block/blk-settings.c index b193068040569c..37fe4c8f6b6ba1 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -135,6 +135,92 @@ static int blk_validate_integrity_limits(struct queue_limits *lim) return 0; } +/* + * Returns max guaranteed bytes which we can fit in a bio. + * + * We request that an atomic_write is ITER_UBUF iov_iter (so a single vector), + * so we assume that we can fit in at least PAGE_SIZE in a segment, apart from + * the first and last segments. + */ +static +unsigned int blk_queue_max_guaranteed_bio(struct queue_limits *lim) +{ + unsigned int max_segments = min(BIO_MAX_VECS, lim->max_segments); + unsigned int length; + + length = min(max_segments, 2) * lim->logical_block_size; + if (max_segments > 2) + length += (max_segments - 2) * PAGE_SIZE; + + return length; +} + +static void blk_atomic_writes_update_limits(struct queue_limits *lim) +{ + unsigned int unit_limit = min(lim->max_hw_sectors << SECTOR_SHIFT, + blk_queue_max_guaranteed_bio(lim)); + + unit_limit = rounddown_pow_of_two(unit_limit); + + lim->atomic_write_max_sectors = + min(lim->atomic_write_hw_max >> SECTOR_SHIFT, + lim->max_hw_sectors); + lim->atomic_write_unit_min = + min(lim->atomic_write_hw_unit_min, unit_limit); + lim->atomic_write_unit_max = + min(lim->atomic_write_hw_unit_max, unit_limit); + lim->atomic_write_boundary_sectors = + lim->atomic_write_hw_boundary >> SECTOR_SHIFT; +} + +static void blk_validate_atomic_write_limits(struct queue_limits *lim) +{ + unsigned int chunk_sectors = lim->chunk_sectors; + unsigned int boundary_sectors; + + if (!lim->atomic_write_hw_max) + goto unsupported; + + boundary_sectors = lim->atomic_write_hw_boundary >> SECTOR_SHIFT; + + if (boundary_sectors) { + /* + * A feature of boundary support is that it disallows bios to + * be merged which would result in a merged request which + * crosses either a chunk sector or atomic write HW boundary, + * even though chunk sectors may be just set for performance. + * For simplicity, disallow atomic writes for a chunk sector + * which is non-zero and smaller than atomic write HW boundary. + * Furthermore, chunk sectors must be a multiple of atomic + * write HW boundary. Otherwise boundary support becomes + * complicated. + * Devices which do not conform to these rules can be dealt + * with if and when they show up. + */ + if (WARN_ON_ONCE(do_div(chunk_sectors, boundary_sectors))) + goto unsupported; + + /* + * The boundary size just needs to be a multiple of unit_max + * (and not necessarily a power-of-2), so this following check + * could be relaxed in future. + * Furthermore, if needed, unit_max could even be reduced so + * that it is compliant with a !power-of-2 boundary. + */ + if (!is_power_of_2(boundary_sectors)) + goto unsupported; + } + + blk_atomic_writes_update_limits(lim); + return; + +unsupported: + lim->atomic_write_max_sectors = 0; + lim->atomic_write_boundary_sectors = 0; + lim->atomic_write_unit_min = 0; + lim->atomic_write_unit_max = 0; +} + /* * Check that the limits in lim are valid, initialize defaults for unset * values, and cap values based on others where needed. @@ -272,6 +358,8 @@ static int blk_validate_limits(struct queue_limits *lim) if (!(lim->features & BLK_FEAT_WRITE_CACHE)) lim->features &= ~BLK_FEAT_FUA; + blk_validate_atomic_write_limits(lim); + err = blk_validate_integrity_limits(lim); if (err) return err; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 59e6d111ed059a..1a984179f3acc5 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -118,6 +118,30 @@ static ssize_t queue_max_discard_segments_show(struct request_queue *q, return queue_var_show(queue_max_discard_segments(q), page); } +static ssize_t queue_atomic_write_max_bytes_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_atomic_write_max_bytes(q), page); +} + +static ssize_t queue_atomic_write_boundary_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_atomic_write_boundary_bytes(q), page); +} + +static ssize_t queue_atomic_write_unit_min_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_atomic_write_unit_min_bytes(q), page); +} + +static ssize_t queue_atomic_write_unit_max_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_atomic_write_unit_max_bytes(q), page); +} + static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page) { return queue_var_show(q->limits.max_integrity_segments, page); @@ -505,6 +529,11 @@ QUEUE_RO_ENTRY(queue_discard_max_hw, "discard_max_hw_bytes"); QUEUE_RW_ENTRY(queue_discard_max, "discard_max_bytes"); QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data"); +QUEUE_RO_ENTRY(queue_atomic_write_max_bytes, "atomic_write_max_bytes"); +QUEUE_RO_ENTRY(queue_atomic_write_boundary, "atomic_write_boundary_bytes"); +QUEUE_RO_ENTRY(queue_atomic_write_unit_max, "atomic_write_unit_max_bytes"); +QUEUE_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes"); + QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes"); QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes"); @@ -626,6 +655,10 @@ static struct attribute *queue_attrs[] = { &queue_discard_max_entry.attr, &queue_discard_max_hw_entry.attr, &queue_discard_zeroes_data_entry.attr, + &queue_atomic_write_max_bytes_entry.attr, + &queue_atomic_write_boundary_entry.attr, + &queue_atomic_write_unit_min_entry.attr, + &queue_atomic_write_unit_max_entry.attr, &queue_write_same_max_entry.attr, &queue_write_zeroes_max_entry.attr, &queue_zone_append_max_entry.attr, diff --git a/block/blk.h b/block/blk.h index 20c5718815e2e7..d0a986d8ee507e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -194,6 +194,9 @@ static inline unsigned int blk_queue_get_max_sectors(struct request *rq) if (unlikely(op == REQ_OP_WRITE_ZEROES)) return q->limits.max_write_zeroes_sectors; + if (rq->cmd_flags & REQ_ATOMIC) + return q->limits.atomic_write_max_sectors; + return q->limits.max_sectors; } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 781c4500491bde..632edd71f8c63c 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -162,6 +162,11 @@ typedef u16 blk_short_t; */ #define BLK_STS_DURATION_LIMIT ((__force blk_status_t)17) +/* + * Invalid size or alignment. + */ +#define BLK_STS_INVAL ((__force blk_status_t)19) + /** * blk_path_error - returns true if error may be path related * @error: status the request was completed with @@ -370,7 +375,7 @@ enum req_flag_bits { __REQ_SWAP, /* swap I/O */ __REQ_DRV, /* for driver use */ __REQ_FS_PRIVATE, /* for file system (submitter) use */ - + __REQ_ATOMIC, /* for atomic write operations */ /* * Command specific flags, keep last: */ @@ -402,6 +407,7 @@ enum req_flag_bits { #define REQ_SWAP (__force blk_opf_t)(1ULL << __REQ_SWAP) #define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) +#define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC) #define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fb7d4c21bba87f..4816f3b1d52892 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -377,6 +377,16 @@ struct queue_limits { unsigned int discard_alignment; unsigned int zone_write_granularity; + /* atomic write limits */ + unsigned int atomic_write_hw_max; + unsigned int atomic_write_max_sectors; + unsigned int atomic_write_hw_boundary; + unsigned int atomic_write_boundary_sectors; + unsigned int atomic_write_hw_unit_min; + unsigned int atomic_write_unit_min; + unsigned int atomic_write_hw_unit_max; + unsigned int atomic_write_unit_max; + unsigned short max_segments; unsigned short max_integrity_segments; unsigned short max_discard_segments; @@ -1403,6 +1413,30 @@ static inline int queue_dma_alignment(const struct request_queue *q) return q ? q->limits.dma_alignment : 511; } +static inline unsigned int +queue_atomic_write_unit_max_bytes(const struct request_queue *q) +{ + return q->limits.atomic_write_unit_max; +} + +static inline unsigned int +queue_atomic_write_unit_min_bytes(const struct request_queue *q) +{ + return q->limits.atomic_write_unit_min; +} + +static inline unsigned int +queue_atomic_write_boundary_bytes(const struct request_queue *q) +{ + return q->limits.atomic_write_boundary_sectors << SECTOR_SHIFT; +} + +static inline unsigned int +queue_atomic_write_max_bytes(const struct request_queue *q) +{ + return q->limits.atomic_write_max_sectors << SECTOR_SHIFT; +} + static inline unsigned int bdev_dma_alignment(struct block_device *bdev) { return queue_dma_alignment(bdev_get_queue(bdev)); @@ -1644,6 +1678,27 @@ struct io_comp_batch { void (*complete)(struct io_comp_batch *); }; +static inline bool bdev_can_atomic_write(struct block_device *bdev) +{ + struct request_queue *bd_queue = bdev->bd_queue; + struct queue_limits *limits = &bd_queue->limits; + + if (!limits->atomic_write_unit_min) + return false; + + if (bdev_is_partition(bdev)) { + sector_t bd_start_sect = bdev->bd_start_sect; + unsigned int alignment = + max(limits->atomic_write_unit_min, + limits->atomic_write_hw_boundary); + + if (!IS_ALIGNED(bd_start_sect, alignment >> SECTOR_SHIFT)) + return false; + } + + return true; +} + #define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } #endif /* _LINUX_BLKDEV_H */ From 9abcfbd235f59fb5b6379e5bc0231dad831ebace Mon Sep 17 00:00:00 2001 From: Prasad Singamsetty Date: Thu, 20 Jun 2024 12:53:55 +0000 Subject: [PATCH 1243/1578] block: Add atomic write support for statx Extend statx system call to return additional info for atomic write support support if the specified file is a block device. Reviewed-by: Martin K. Petersen Signed-off-by: Prasad Singamsetty Signed-off-by: John Garry Reviewed-by: Keith Busch Acked-by: Darrick J. Wong Reviewed-by: Darrick J. Wong Reviewed-by: Luis Chamberlain Link: https://lore.kernel.org/r/20240620125359.2684798-7-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- block/bdev.c | 36 ++++++++++++++++++++++++++---------- fs/stat.c | 16 +++++++++------- include/linux/blkdev.h | 6 ++++-- 3 files changed, 39 insertions(+), 19 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index ced4ac990ec88f..1b4af2cc3b1e6c 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -1260,23 +1260,39 @@ void sync_bdevs(bool wait) } /* - * Handle STATX_DIOALIGN for block devices. - * - * Note that the inode passed to this is the inode of a block device node file, - * not the block device's internal inode. Therefore it is *not* valid to use - * I_BDEV() here; the block device has to be looked up by i_rdev instead. + * Handle STATX_{DIOALIGN, WRITE_ATOMIC} for block devices. */ -void bdev_statx_dioalign(struct inode *inode, struct kstat *stat) +void bdev_statx(struct inode *backing_inode, struct kstat *stat, + u32 request_mask) { struct block_device *bdev; - bdev = blkdev_get_no_open(inode->i_rdev); + if (!(request_mask & (STATX_DIOALIGN | STATX_WRITE_ATOMIC))) + return; + + /* + * Note that backing_inode is the inode of a block device node file, + * not the block device's internal inode. Therefore it is *not* valid + * to use I_BDEV() here; the block device has to be looked up by i_rdev + * instead. + */ + bdev = blkdev_get_no_open(backing_inode->i_rdev); if (!bdev) return; - stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; - stat->dio_offset_align = bdev_logical_block_size(bdev); - stat->result_mask |= STATX_DIOALIGN; + if (request_mask & STATX_DIOALIGN) { + stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; + stat->dio_offset_align = bdev_logical_block_size(bdev); + stat->result_mask |= STATX_DIOALIGN; + } + + if (request_mask & STATX_WRITE_ATOMIC && bdev_can_atomic_write(bdev)) { + struct request_queue *bd_queue = bdev->bd_queue; + + generic_fill_statx_atomic_writes(stat, + queue_atomic_write_unit_min_bytes(bd_queue), + queue_atomic_write_unit_max_bytes(bd_queue)); + } blkdev_put_no_open(bdev); } diff --git a/fs/stat.c b/fs/stat.c index 72d0e6357b91f1..bd0698dfd7b36e 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -265,6 +265,7 @@ static int vfs_statx(int dfd, struct filename *filename, int flags, { struct path path; unsigned int lookup_flags = getname_statx_lookup_flags(flags); + struct inode *backing_inode; int error; if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | AT_EMPTY_PATH | @@ -290,13 +291,14 @@ static int vfs_statx(int dfd, struct filename *filename, int flags, stat->attributes |= STATX_ATTR_MOUNT_ROOT; stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT; - /* Handle STATX_DIOALIGN for block devices. */ - if (request_mask & STATX_DIOALIGN) { - struct inode *inode = d_backing_inode(path.dentry); - - if (S_ISBLK(inode->i_mode)) - bdev_statx_dioalign(inode, stat); - } + /* + * If this is a block device inode, override the filesystem + * attributes with the block device specific parameters that need to be + * obtained from the bdev backing inode. + */ + backing_inode = d_backing_inode(path.dentry); + if (S_ISBLK(backing_inode->i_mode)) + bdev_statx(backing_inode, stat, request_mask); path_put(&path); if (retry_estale(error, lookup_flags)) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4816f3b1d52892..2800231046a98b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1638,7 +1638,8 @@ int sync_blockdev(struct block_device *bdev); int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend); int sync_blockdev_nowait(struct block_device *bdev); void sync_bdevs(bool wait); -void bdev_statx_dioalign(struct inode *inode, struct kstat *stat); +void bdev_statx(struct inode *backing_inode, struct kstat *stat, + u32 request_mask); void printk_all_partitions(void); int __init early_lookup_bdev(const char *pathname, dev_t *dev); #else @@ -1656,7 +1657,8 @@ static inline int sync_blockdev_nowait(struct block_device *bdev) static inline void sync_bdevs(bool wait) { } -static inline void bdev_statx_dioalign(struct inode *inode, struct kstat *stat) +static inline void bdev_statx(struct inode *backing_inode, struct kstat *stat, + u32 request_mask) { } static inline void printk_all_partitions(void) From caf336f81b3a3ca744e335972e86ec7244512d4a Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 20 Jun 2024 12:53:56 +0000 Subject: [PATCH 1244/1578] block: Add fops atomic write support Support atomic writes by submitting a single BIO with the REQ_ATOMIC set. It must be ensured that the atomic write adheres to its rules, like naturally aligned offset, so call blkdev_dio_invalid() -> blkdev_atomic_write_valid() [with renaming blkdev_dio_unaligned() to blkdev_dio_invalid()] for this purpose. The BIO submission path currently checks for atomic writes which are too large, so no need to check here. In blkdev_direct_IO(), if the nr_pages exceeds BIO_MAX_VECS, then we cannot produce a single BIO, so error in this case. Finally set FMODE_CAN_ATOMIC_WRITE when the bdev can support atomic writes and the associated file flag is for O_DIRECT. Reviewed-by: Martin K. Petersen Signed-off-by: John Garry Reviewed-by: Keith Busch Acked-by: Darrick J. Wong Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20240620125359.2684798-8-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- block/fops.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/block/fops.c b/block/fops.c index 376265935714fd..be36c9fbd500bb 100644 --- a/block/fops.c +++ b/block/fops.c @@ -34,9 +34,12 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb) return opf; } -static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos, - struct iov_iter *iter) +static bool blkdev_dio_invalid(struct block_device *bdev, loff_t pos, + struct iov_iter *iter, bool is_atomic) { + if (is_atomic && !generic_atomic_write_valid(iter, pos)) + return true; + return pos & (bdev_logical_block_size(bdev) - 1) || !bdev_iter_is_aligned(bdev, iter); } @@ -72,6 +75,8 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; bio.bi_ioprio = iocb->ki_ioprio; + if (iocb->ki_flags & IOCB_ATOMIC) + bio.bi_opf |= REQ_ATOMIC; ret = bio_iov_iter_get_pages(&bio, iter); if (unlikely(ret)) @@ -343,6 +348,9 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, task_io_account_write(bio->bi_iter.bi_size); } + if (iocb->ki_flags & IOCB_ATOMIC) + bio->bi_opf |= REQ_ATOMIC; + if (iocb->ki_flags & IOCB_NOWAIT) bio->bi_opf |= REQ_NOWAIT; @@ -359,12 +367,13 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); + bool is_atomic = iocb->ki_flags & IOCB_ATOMIC; unsigned int nr_pages; if (!iov_iter_count(iter)) return 0; - if (blkdev_dio_unaligned(bdev, iocb->ki_pos, iter)) + if (blkdev_dio_invalid(bdev, iocb->ki_pos, iter, is_atomic)) return -EINVAL; nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); @@ -373,6 +382,8 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return __blkdev_direct_IO_simple(iocb, iter, bdev, nr_pages); return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages); + } else if (is_atomic) { + return -EINVAL; } return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages)); } @@ -612,6 +623,9 @@ static int blkdev_open(struct inode *inode, struct file *filp) if (!bdev) return -ENXIO; + if (bdev_can_atomic_write(bdev) && filp->f_flags & O_DIRECT) + filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; + ret = bdev_open(bdev, mode, filp->private_data, NULL, filp); if (ret) blkdev_put_no_open(bdev); From bf4ae8f2e6407a779c0368eb0f3e047a8333be17 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 20 Jun 2024 12:53:57 +0000 Subject: [PATCH 1245/1578] scsi: sd: Atomic write support Support is divided into two main areas: - reading VPD pages and setting sdev request_queue limits - support WRITE ATOMIC (16) command and tracing The relevant block limits VPD page need to be read to allow the block layer request_queue atomic write limits to be set. These VPD page limits are described in sbc4r22 section 6.6.4 - Block limits VPD page. There are five limits of interest: - MAXIMUM ATOMIC TRANSFER LENGTH - ATOMIC ALIGNMENT - ATOMIC TRANSFER LENGTH GRANULARITY - MAXIMUM ATOMIC TRANSFER LENGTH WITH BOUNDARY - MAXIMUM ATOMIC BOUNDARY SIZE MAXIMUM ATOMIC TRANSFER LENGTH is the maximum length for a WRITE ATOMIC (16) command. It will not be greater than the device MAXIMUM TRANSFER LENGTH. ATOMIC ALIGNMENT and ATOMIC TRANSFER LENGTH GRANULARITY are the minimum alignment and length values for an atomic write in terms of logical blocks. Unlike NVMe, SCSI does not specify an LBA space boundary, but does specify a per-IO boundary granularity. The maximum boundary size is specified in MAXIMUM ATOMIC BOUNDARY SIZE. When used, this boundary value is set in the WRITE ATOMIC (16) ATOMIC BOUNDARY field - layout for the WRITE_ATOMIC_16 command can be found in sbc4r22 section 5.48. This boundary value is the granularity size at which the device may atomically write the data. A value of zero in WRITE ATOMIC (16) ATOMIC BOUNDARY field means that all data must be atomically written together. MAXIMUM ATOMIC TRANSFER LENGTH WITH BOUNDARY is the maximum atomic write length if a non-zero boundary value is set. For atomic write support, the WRITE ATOMIC (16) boundary is not of much interest, as the block layer expects each request submitted to be executed atomically. However, the SCSI spec does leave itself open to a quirky scenario where MAXIMUM ATOMIC TRANSFER LENGTH is zero, yet MAXIMUM ATOMIC TRANSFER LENGTH WITH BOUNDARY and MAXIMUM ATOMIC BOUNDARY SIZE are both non-zero. This case will be supported. To set the block layer request_queue atomic write capabilities, sanitize the VPD page limits and set limits as follows: - atomic_write_unit_min is derived from granularity and alignment values. If no granularity value is not set, use physical block size - atomic_write_unit_max is derived from MAXIMUM ATOMIC TRANSFER LENGTH. In the scenario where MAXIMUM ATOMIC TRANSFER LENGTH is zero and boundary limits are non-zero, use MAXIMUM ATOMIC BOUNDARY SIZE for atomic_write_unit_max. New flag scsi_disk.use_atomic_write_boundary is set for this scenario. - atomic_write_boundary_bytes is set to zero always SCSI also supports a WRITE ATOMIC (32) command, which is for type 2 protection enabled. This is not going to be supported now, so check for T10_PI_TYPE2_PROTECTION when setting any request_queue limits. To handle an atomic write request, add support for WRITE ATOMIC (16) command in handler sd_setup_atomic_cmnd(). Flag use_atomic_write_boundary is checked here for encoding ATOMIC BOUNDARY field. Trace info is also added for WRITE_ATOMIC_16 command. Reviewed-by: Martin K. Petersen Signed-off-by: John Garry Acked-by: Darrick J. Wong Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20240620125359.2684798-9-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- drivers/scsi/scsi_trace.c | 22 +++++++++ drivers/scsi/sd.c | 93 ++++++++++++++++++++++++++++++++++++- drivers/scsi/sd.h | 8 ++++ include/scsi/scsi_proto.h | 1 + include/trace/events/scsi.h | 1 + 5 files changed, 124 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_trace.c b/drivers/scsi/scsi_trace.c index 41a9500759130f..3e47c4472a80e7 100644 --- a/drivers/scsi/scsi_trace.c +++ b/drivers/scsi/scsi_trace.c @@ -325,6 +325,26 @@ scsi_trace_zbc_out(struct trace_seq *p, unsigned char *cdb, int len) return ret; } +static const char * +scsi_trace_atomic_write16_out(struct trace_seq *p, unsigned char *cdb, int len) +{ + const char *ret = trace_seq_buffer_ptr(p); + unsigned int boundary_size; + unsigned int nr_blocks; + sector_t lba; + + lba = get_unaligned_be64(&cdb[2]); + boundary_size = get_unaligned_be16(&cdb[10]); + nr_blocks = get_unaligned_be16(&cdb[12]); + + trace_seq_printf(p, "lba=%llu txlen=%u boundary_size=%u", + lba, nr_blocks, boundary_size); + + trace_seq_putc(p, 0); + + return ret; +} + static const char * scsi_trace_varlen(struct trace_seq *p, unsigned char *cdb, int len) { @@ -385,6 +405,8 @@ scsi_trace_parse_cdb(struct trace_seq *p, unsigned char *cdb, int len) return scsi_trace_zbc_in(p, cdb, len); case ZBC_OUT: return scsi_trace_zbc_out(p, cdb, len); + case WRITE_ATOMIC_16: + return scsi_trace_atomic_write16_out(p, cdb, len); default: return scsi_trace_misc(p, cdb, len); } diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index a27f1c7f1b61d5..525f48c97f5e15 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -939,6 +939,64 @@ static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) return scsi_alloc_sgtables(cmd); } +static void sd_config_atomic(struct scsi_disk *sdkp, struct queue_limits *lim) +{ + unsigned int logical_block_size = sdkp->device->sector_size, + physical_block_size_sectors, max_atomic, unit_min, unit_max; + + if ((!sdkp->max_atomic && !sdkp->max_atomic_with_boundary) || + sdkp->protection_type == T10_PI_TYPE2_PROTECTION) + return; + + physical_block_size_sectors = sdkp->physical_block_size / + sdkp->device->sector_size; + + unit_min = rounddown_pow_of_two(sdkp->atomic_granularity ? + sdkp->atomic_granularity : + physical_block_size_sectors); + + /* + * Only use atomic boundary when we have the odd scenario of + * sdkp->max_atomic == 0, which the spec does permit. + */ + if (sdkp->max_atomic) { + max_atomic = sdkp->max_atomic; + unit_max = rounddown_pow_of_two(sdkp->max_atomic); + sdkp->use_atomic_write_boundary = 0; + } else { + max_atomic = sdkp->max_atomic_with_boundary; + unit_max = rounddown_pow_of_two(sdkp->max_atomic_boundary); + sdkp->use_atomic_write_boundary = 1; + } + + /* + * Ensure compliance with granularity and alignment. For now, keep it + * simple and just don't support atomic writes for values mismatched + * with max_{boundary}atomic, physical block size, and + * atomic_granularity itself. + * + * We're really being distrustful by checking unit_max also... + */ + if (sdkp->atomic_granularity > 1) { + if (unit_min > 1 && unit_min % sdkp->atomic_granularity) + return; + if (unit_max > 1 && unit_max % sdkp->atomic_granularity) + return; + } + + if (sdkp->atomic_alignment > 1) { + if (unit_min > 1 && unit_min % sdkp->atomic_alignment) + return; + if (unit_max > 1 && unit_max % sdkp->atomic_alignment) + return; + } + + lim->atomic_write_hw_max = max_atomic * logical_block_size; + lim->atomic_write_hw_boundary = 0; + lim->atomic_write_hw_unit_min = unit_min * logical_block_size; + lim->atomic_write_hw_unit_max = unit_max * logical_block_size; +} + static blk_status_t sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap) { @@ -1237,6 +1295,26 @@ static int sd_cdl_dld(struct scsi_disk *sdkp, struct scsi_cmnd *scmd) return (hint - IOPRIO_HINT_DEV_DURATION_LIMIT_1) + 1; } +static blk_status_t sd_setup_atomic_cmnd(struct scsi_cmnd *cmd, + sector_t lba, unsigned int nr_blocks, + bool boundary, unsigned char flags) +{ + cmd->cmd_len = 16; + cmd->cmnd[0] = WRITE_ATOMIC_16; + cmd->cmnd[1] = flags; + put_unaligned_be64(lba, &cmd->cmnd[2]); + put_unaligned_be16(nr_blocks, &cmd->cmnd[12]); + if (boundary) + put_unaligned_be16(nr_blocks, &cmd->cmnd[10]); + else + put_unaligned_be16(0, &cmd->cmnd[10]); + put_unaligned_be16(nr_blocks, &cmd->cmnd[12]); + cmd->cmnd[14] = 0; + cmd->cmnd[15] = 0; + + return BLK_STS_OK; +} + static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); @@ -1302,6 +1380,10 @@ static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) if (protect && sdkp->protection_type == T10_PI_TYPE2_PROTECTION) { ret = sd_setup_rw32_cmnd(cmd, write, lba, nr_blocks, protect | fua, dld); + } else if (rq->cmd_flags & REQ_ATOMIC && write) { + ret = sd_setup_atomic_cmnd(cmd, lba, nr_blocks, + sdkp->use_atomic_write_boundary, + protect | fua); } else if (sdp->use_16_for_rw || (nr_blocks > 0xffff)) { ret = sd_setup_rw16_cmnd(cmd, write, lba, nr_blocks, protect | fua, dld); @@ -3264,7 +3346,7 @@ static void sd_read_block_limits(struct scsi_disk *sdkp, sdkp->max_ws_blocks = (u32)get_unaligned_be64(&vpd->data[36]); if (!sdkp->lbpme) - goto out; + goto config_atomic; lba_count = get_unaligned_be32(&vpd->data[20]); desc_count = get_unaligned_be32(&vpd->data[24]); @@ -3279,6 +3361,15 @@ static void sd_read_block_limits(struct scsi_disk *sdkp, get_unaligned_be32(&vpd->data[32]) & ~(1 << 31); sd_config_discard(sdkp, lim, sd_discard_mode(sdkp)); + +config_atomic: + sdkp->max_atomic = get_unaligned_be32(&vpd->data[44]); + sdkp->atomic_alignment = get_unaligned_be32(&vpd->data[48]); + sdkp->atomic_granularity = get_unaligned_be32(&vpd->data[52]); + sdkp->max_atomic_with_boundary = get_unaligned_be32(&vpd->data[56]); + sdkp->max_atomic_boundary = get_unaligned_be32(&vpd->data[60]); + + sd_config_atomic(sdkp, lim); } out: diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 7603b3c67b233f..36382eca941ce2 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -115,6 +115,13 @@ struct scsi_disk { u32 max_unmap_blocks; u32 unmap_granularity; u32 unmap_alignment; + + u32 max_atomic; + u32 atomic_alignment; + u32 atomic_granularity; + u32 max_atomic_with_boundary; + u32 max_atomic_boundary; + u32 index; unsigned int physical_block_size; unsigned int max_medium_access_timeouts; @@ -148,6 +155,7 @@ struct scsi_disk { unsigned security : 1; unsigned ignore_medium_access_errors : 1; unsigned rscs : 1; /* reduced stream control support */ + unsigned use_atomic_write_boundary : 1; }; #define to_scsi_disk(obj) container_of(obj, struct scsi_disk, disk_dev) diff --git a/include/scsi/scsi_proto.h b/include/scsi/scsi_proto.h index 843106e1109f43..70e1262b2e202e 100644 --- a/include/scsi/scsi_proto.h +++ b/include/scsi/scsi_proto.h @@ -120,6 +120,7 @@ #define WRITE_SAME_16 0x93 #define ZBC_OUT 0x94 #define ZBC_IN 0x95 +#define WRITE_ATOMIC_16 0x9c #define SERVICE_ACTION_BIDIRECTIONAL 0x9d #define SERVICE_ACTION_IN_16 0x9e #define SERVICE_ACTION_OUT_16 0x9f diff --git a/include/trace/events/scsi.h b/include/trace/events/scsi.h index 8e2d9b1b0e77ab..05f1945ed204ec 100644 --- a/include/trace/events/scsi.h +++ b/include/trace/events/scsi.h @@ -102,6 +102,7 @@ scsi_opcode_name(WRITE_32), \ scsi_opcode_name(WRITE_SAME_32), \ scsi_opcode_name(ATA_16), \ + scsi_opcode_name(WRITE_ATOMIC_16), \ scsi_opcode_name(ATA_12)) #define scsi_hostbyte_name(result) { result, #result } From 84f3a3c01d70efba736bc42155cf32722067b327 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 20 Jun 2024 12:53:58 +0000 Subject: [PATCH 1246/1578] scsi: scsi_debug: Atomic write support Add initial support for atomic writes. As is standard method, feed device properties via modules param, those being: - atomic_max_size_blks - atomic_alignment_blks - atomic_granularity_blks - atomic_max_size_with_boundary_blks - atomic_max_boundary_blks These just match sbc4r22 section 6.6.4 - Block limits VPD page. We just support ATOMIC WRITE (16). The major change in the driver is how we lock the device for RW accesses. Currently the driver uses a per-device lock for accessing device metadata and "media" data (calls to do_device_access()) atomically for the duration of the whole read/write command. This should not suit verifying atomic writes. Reason being that currently all reads/writes are atomic, so using atomic writes does not prove anything. Change device access model to basis that regular writes only atomic on a per-sector basis, while reads and atomic writes are fully atomic. As mentioned, since accessing metadata and device media is atomic, continue to have regular writes involving metadata - like discard or PI - as atomic. We can improve this later. Currently we only support model where overlapping going reads or writes wait for current access to complete before commencing an atomic write. This is described in 4.29.3.2 section of the SBC. However, we simplify, things and wait for all accesses to complete (when issuing an atomic write). Reviewed-by: Martin K. Petersen Signed-off-by: John Garry Acked-by: Darrick J. Wong Reviewed-by: Darrick J. Wong Reviewed-by: Luis Chamberlain Link: https://lore.kernel.org/r/20240620125359.2684798-10-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- drivers/scsi/scsi_debug.c | 588 +++++++++++++++++++++++++++++--------- 1 file changed, 454 insertions(+), 134 deletions(-) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index acf0592d63dae4..6b8397b3a55f59 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -69,6 +69,8 @@ static const char *sdebug_version_date = "20210520"; /* Additional Sense Code (ASC) */ #define NO_ADDITIONAL_SENSE 0x0 +#define OVERLAP_ATOMIC_COMMAND_ASC 0x0 +#define OVERLAP_ATOMIC_COMMAND_ASCQ 0x23 #define LOGICAL_UNIT_NOT_READY 0x4 #define LOGICAL_UNIT_COMMUNICATION_FAILURE 0x8 #define UNRECOVERED_READ_ERR 0x11 @@ -103,6 +105,7 @@ static const char *sdebug_version_date = "20210520"; #define READ_BOUNDARY_ASCQ 0x7 #define ATTEMPT_ACCESS_GAP 0x9 #define INSUFF_ZONE_ASCQ 0xe +/* see drivers/scsi/sense_codes.h */ /* Additional Sense Code Qualifier (ASCQ) */ #define ACK_NAK_TO 0x3 @@ -152,6 +155,12 @@ static const char *sdebug_version_date = "20210520"; #define DEF_VIRTUAL_GB 0 #define DEF_VPD_USE_HOSTNO 1 #define DEF_WRITESAME_LENGTH 0xFFFF +#define DEF_ATOMIC_WR 0 +#define DEF_ATOMIC_WR_MAX_LENGTH 8192 +#define DEF_ATOMIC_WR_ALIGN 2 +#define DEF_ATOMIC_WR_GRAN 2 +#define DEF_ATOMIC_WR_MAX_LENGTH_BNDRY (DEF_ATOMIC_WR_MAX_LENGTH) +#define DEF_ATOMIC_WR_MAX_BNDRY 128 #define DEF_STRICT 0 #define DEF_STATISTICS false #define DEF_SUBMIT_QUEUES 1 @@ -374,7 +383,9 @@ struct sdebug_host_info { /* There is an xarray of pointers to this struct's objects, one per host */ struct sdeb_store_info { - rwlock_t macc_lck; /* for atomic media access on this store */ + rwlock_t macc_data_lck; /* for media data access on this store */ + rwlock_t macc_meta_lck; /* for atomic media meta access on this store */ + rwlock_t macc_sector_lck; /* per-sector media data access on this store */ u8 *storep; /* user data storage (ram) */ struct t10_pi_tuple *dif_storep; /* protection info */ void *map_storep; /* provisioning map */ @@ -398,12 +409,20 @@ struct sdebug_defer { enum sdeb_defer_type defer_t; }; +struct sdebug_device_access_info { + bool atomic_write; + u64 lba; + u32 num; + struct scsi_cmnd *self; +}; + struct sdebug_queued_cmd { /* corresponding bit set in in_use_bm[] in owning struct sdebug_queue * instance indicates this slot is in use. */ struct sdebug_defer sd_dp; struct scsi_cmnd *scmd; + struct sdebug_device_access_info *i; }; struct sdebug_scsi_cmd { @@ -463,7 +482,8 @@ enum sdeb_opcode_index { SDEB_I_PRE_FETCH = 29, /* 10, 16 */ SDEB_I_ZONE_OUT = 30, /* 0x94+SA; includes no data xfer */ SDEB_I_ZONE_IN = 31, /* 0x95+SA; all have data-in */ - SDEB_I_LAST_ELEM_P1 = 32, /* keep this last (previous + 1) */ + SDEB_I_ATOMIC_WRITE_16 = 32, + SDEB_I_LAST_ELEM_P1 = 33, /* keep this last (previous + 1) */ }; @@ -497,7 +517,8 @@ static const unsigned char opcode_ind_arr[256] = { 0, 0, 0, SDEB_I_VERIFY, SDEB_I_PRE_FETCH, SDEB_I_SYNC_CACHE, 0, SDEB_I_WRITE_SAME, SDEB_I_ZONE_OUT, SDEB_I_ZONE_IN, 0, 0, - 0, 0, 0, 0, 0, 0, SDEB_I_SERV_ACT_IN_16, SDEB_I_SERV_ACT_OUT_16, + 0, 0, 0, 0, + SDEB_I_ATOMIC_WRITE_16, 0, SDEB_I_SERV_ACT_IN_16, SDEB_I_SERV_ACT_OUT_16, /* 0xa0; 0xa0->0xbf: 12 byte cdbs */ SDEB_I_REPORT_LUNS, SDEB_I_ATA_PT, 0, SDEB_I_MAINT_IN, SDEB_I_MAINT_OUT, 0, 0, 0, @@ -547,6 +568,7 @@ static int resp_write_buffer(struct scsi_cmnd *, struct sdebug_dev_info *); static int resp_sync_cache(struct scsi_cmnd *, struct sdebug_dev_info *); static int resp_pre_fetch(struct scsi_cmnd *, struct sdebug_dev_info *); static int resp_report_zones(struct scsi_cmnd *, struct sdebug_dev_info *); +static int resp_atomic_write(struct scsi_cmnd *, struct sdebug_dev_info *); static int resp_open_zone(struct scsi_cmnd *, struct sdebug_dev_info *); static int resp_close_zone(struct scsi_cmnd *, struct sdebug_dev_info *); static int resp_finish_zone(struct scsi_cmnd *, struct sdebug_dev_info *); @@ -788,6 +810,11 @@ static const struct opcode_info_t opcode_info_arr[SDEB_I_LAST_ELEM_P1 + 1] = { resp_report_zones, zone_in_iarr, /* ZONE_IN(16), REPORT ZONES) */ {16, 0x0 /* SA */, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xbf, 0xc7} }, +/* 31 */ + {0, 0x0, 0x0, F_D_OUT | FF_MEDIA_IO, + resp_atomic_write, NULL, /* ATOMIC WRITE 16 */ + {16, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff} }, /* sentinel */ {0xff, 0, 0, 0, NULL, NULL, /* terminating element */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }, @@ -835,6 +862,13 @@ static unsigned int sdebug_unmap_granularity = DEF_UNMAP_GRANULARITY; static unsigned int sdebug_unmap_max_blocks = DEF_UNMAP_MAX_BLOCKS; static unsigned int sdebug_unmap_max_desc = DEF_UNMAP_MAX_DESC; static unsigned int sdebug_write_same_length = DEF_WRITESAME_LENGTH; +static unsigned int sdebug_atomic_wr = DEF_ATOMIC_WR; +static unsigned int sdebug_atomic_wr_max_length = DEF_ATOMIC_WR_MAX_LENGTH; +static unsigned int sdebug_atomic_wr_align = DEF_ATOMIC_WR_ALIGN; +static unsigned int sdebug_atomic_wr_gran = DEF_ATOMIC_WR_GRAN; +static unsigned int sdebug_atomic_wr_max_length_bndry = + DEF_ATOMIC_WR_MAX_LENGTH_BNDRY; +static unsigned int sdebug_atomic_wr_max_bndry = DEF_ATOMIC_WR_MAX_BNDRY; static int sdebug_uuid_ctl = DEF_UUID_CTL; static bool sdebug_random = DEF_RANDOM; static bool sdebug_per_host_store = DEF_PER_HOST_STORE; @@ -1188,6 +1222,11 @@ static inline bool scsi_debug_lbp(void) (sdebug_lbpu || sdebug_lbpws || sdebug_lbpws10); } +static inline bool scsi_debug_atomic_write(void) +{ + return sdebug_fake_rw == 0 && sdebug_atomic_wr; +} + static void *lba2fake_store(struct sdeb_store_info *sip, unsigned long long lba) { @@ -1815,6 +1854,14 @@ static int inquiry_vpd_b0(unsigned char *arr) /* Maximum WRITE SAME Length */ put_unaligned_be64(sdebug_write_same_length, &arr[32]); + if (sdebug_atomic_wr) { + put_unaligned_be32(sdebug_atomic_wr_max_length, &arr[40]); + put_unaligned_be32(sdebug_atomic_wr_align, &arr[44]); + put_unaligned_be32(sdebug_atomic_wr_gran, &arr[48]); + put_unaligned_be32(sdebug_atomic_wr_max_length_bndry, &arr[52]); + put_unaligned_be32(sdebug_atomic_wr_max_bndry, &arr[56]); + } + return 0x3c; /* Mandatory page length for Logical Block Provisioning */ } @@ -3377,16 +3424,238 @@ static inline struct sdeb_store_info *devip2sip(struct sdebug_dev_info *devip, return xa_load(per_store_ap, devip->sdbg_host->si_idx); } +static inline void +sdeb_read_lock(rwlock_t *lock) +{ + if (sdebug_no_rwlock) + __acquire(lock); + else + read_lock(lock); +} + +static inline void +sdeb_read_unlock(rwlock_t *lock) +{ + if (sdebug_no_rwlock) + __release(lock); + else + read_unlock(lock); +} + +static inline void +sdeb_write_lock(rwlock_t *lock) +{ + if (sdebug_no_rwlock) + __acquire(lock); + else + write_lock(lock); +} + +static inline void +sdeb_write_unlock(rwlock_t *lock) +{ + if (sdebug_no_rwlock) + __release(lock); + else + write_unlock(lock); +} + +static inline void +sdeb_data_read_lock(struct sdeb_store_info *sip) +{ + BUG_ON(!sip); + + sdeb_read_lock(&sip->macc_data_lck); +} + +static inline void +sdeb_data_read_unlock(struct sdeb_store_info *sip) +{ + BUG_ON(!sip); + + sdeb_read_unlock(&sip->macc_data_lck); +} + +static inline void +sdeb_data_write_lock(struct sdeb_store_info *sip) +{ + BUG_ON(!sip); + + sdeb_write_lock(&sip->macc_data_lck); +} + +static inline void +sdeb_data_write_unlock(struct sdeb_store_info *sip) +{ + BUG_ON(!sip); + + sdeb_write_unlock(&sip->macc_data_lck); +} + +static inline void +sdeb_data_sector_read_lock(struct sdeb_store_info *sip) +{ + BUG_ON(!sip); + + sdeb_read_lock(&sip->macc_sector_lck); +} + +static inline void +sdeb_data_sector_read_unlock(struct sdeb_store_info *sip) +{ + BUG_ON(!sip); + + sdeb_read_unlock(&sip->macc_sector_lck); +} + +static inline void +sdeb_data_sector_write_lock(struct sdeb_store_info *sip) +{ + BUG_ON(!sip); + + sdeb_write_lock(&sip->macc_sector_lck); +} + +static inline void +sdeb_data_sector_write_unlock(struct sdeb_store_info *sip) +{ + BUG_ON(!sip); + + sdeb_write_unlock(&sip->macc_sector_lck); +} + +/* + * Atomic locking: + * We simplify the atomic model to allow only 1x atomic write and many non- + * atomic reads or writes for all LBAs. + + * A RW lock has a similar bahaviour: + * Only 1x writer and many readers. + + * So use a RW lock for per-device read and write locking: + * An atomic access grabs the lock as a writer and non-atomic grabs the lock + * as a reader. + */ + +static inline void +sdeb_data_lock(struct sdeb_store_info *sip, bool atomic) +{ + if (atomic) + sdeb_data_write_lock(sip); + else + sdeb_data_read_lock(sip); +} + +static inline void +sdeb_data_unlock(struct sdeb_store_info *sip, bool atomic) +{ + if (atomic) + sdeb_data_write_unlock(sip); + else + sdeb_data_read_unlock(sip); +} + +/* Allow many reads but only 1x write per sector */ +static inline void +sdeb_data_sector_lock(struct sdeb_store_info *sip, bool do_write) +{ + if (do_write) + sdeb_data_sector_write_lock(sip); + else + sdeb_data_sector_read_lock(sip); +} + +static inline void +sdeb_data_sector_unlock(struct sdeb_store_info *sip, bool do_write) +{ + if (do_write) + sdeb_data_sector_write_unlock(sip); + else + sdeb_data_sector_read_unlock(sip); +} + +static inline void +sdeb_meta_read_lock(struct sdeb_store_info *sip) +{ + if (sdebug_no_rwlock) { + if (sip) + __acquire(&sip->macc_meta_lck); + else + __acquire(&sdeb_fake_rw_lck); + } else { + if (sip) + read_lock(&sip->macc_meta_lck); + else + read_lock(&sdeb_fake_rw_lck); + } +} + +static inline void +sdeb_meta_read_unlock(struct sdeb_store_info *sip) +{ + if (sdebug_no_rwlock) { + if (sip) + __release(&sip->macc_meta_lck); + else + __release(&sdeb_fake_rw_lck); + } else { + if (sip) + read_unlock(&sip->macc_meta_lck); + else + read_unlock(&sdeb_fake_rw_lck); + } +} + +static inline void +sdeb_meta_write_lock(struct sdeb_store_info *sip) +{ + if (sdebug_no_rwlock) { + if (sip) + __acquire(&sip->macc_meta_lck); + else + __acquire(&sdeb_fake_rw_lck); + } else { + if (sip) + write_lock(&sip->macc_meta_lck); + else + write_lock(&sdeb_fake_rw_lck); + } +} + +static inline void +sdeb_meta_write_unlock(struct sdeb_store_info *sip) +{ + if (sdebug_no_rwlock) { + if (sip) + __release(&sip->macc_meta_lck); + else + __release(&sdeb_fake_rw_lck); + } else { + if (sip) + write_unlock(&sip->macc_meta_lck); + else + write_unlock(&sdeb_fake_rw_lck); + } +} + /* Returns number of bytes copied or -1 if error. */ static int do_device_access(struct sdeb_store_info *sip, struct scsi_cmnd *scp, - u32 sg_skip, u64 lba, u32 num, bool do_write, - u8 group_number) + u32 sg_skip, u64 lba, u32 num, u8 group_number, + bool do_write, bool atomic) { int ret; - u64 block, rest = 0; + u64 block; enum dma_data_direction dir; struct scsi_data_buffer *sdb = &scp->sdb; u8 *fsp; + int i; + + /* + * Even though reads are inherently atomic (in this driver), we expect + * the atomic flag only for writes. + */ + if (!do_write && atomic) + return -1; if (do_write) { dir = DMA_TO_DEVICE; @@ -3406,21 +3675,26 @@ static int do_device_access(struct sdeb_store_info *sip, struct scsi_cmnd *scp, fsp = sip->storep; block = do_div(lba, sdebug_store_sectors); - if (block + num > sdebug_store_sectors) - rest = block + num - sdebug_store_sectors; - ret = sg_copy_buffer(sdb->table.sgl, sdb->table.nents, + /* Only allow 1x atomic write or multiple non-atomic writes at any given time */ + sdeb_data_lock(sip, atomic); + for (i = 0; i < num; i++) { + /* We shouldn't need to lock for atomic writes, but do it anyway */ + sdeb_data_sector_lock(sip, do_write); + ret = sg_copy_buffer(sdb->table.sgl, sdb->table.nents, fsp + (block * sdebug_sector_size), - (num - rest) * sdebug_sector_size, sg_skip, do_write); - if (ret != (num - rest) * sdebug_sector_size) - return ret; - - if (rest) { - ret += sg_copy_buffer(sdb->table.sgl, sdb->table.nents, - fsp, rest * sdebug_sector_size, - sg_skip + ((num - rest) * sdebug_sector_size), - do_write); + sdebug_sector_size, sg_skip, do_write); + sdeb_data_sector_unlock(sip, do_write); + if (ret != sdebug_sector_size) { + ret += (i * sdebug_sector_size); + break; + } + sg_skip += sdebug_sector_size; + if (++block >= sdebug_store_sectors) + block = 0; } + ret = num * sdebug_sector_size; + sdeb_data_unlock(sip, atomic); return ret; } @@ -3596,70 +3870,6 @@ static int prot_verify_read(struct scsi_cmnd *scp, sector_t start_sec, return ret; } -static inline void -sdeb_read_lock(struct sdeb_store_info *sip) -{ - if (sdebug_no_rwlock) { - if (sip) - __acquire(&sip->macc_lck); - else - __acquire(&sdeb_fake_rw_lck); - } else { - if (sip) - read_lock(&sip->macc_lck); - else - read_lock(&sdeb_fake_rw_lck); - } -} - -static inline void -sdeb_read_unlock(struct sdeb_store_info *sip) -{ - if (sdebug_no_rwlock) { - if (sip) - __release(&sip->macc_lck); - else - __release(&sdeb_fake_rw_lck); - } else { - if (sip) - read_unlock(&sip->macc_lck); - else - read_unlock(&sdeb_fake_rw_lck); - } -} - -static inline void -sdeb_write_lock(struct sdeb_store_info *sip) -{ - if (sdebug_no_rwlock) { - if (sip) - __acquire(&sip->macc_lck); - else - __acquire(&sdeb_fake_rw_lck); - } else { - if (sip) - write_lock(&sip->macc_lck); - else - write_lock(&sdeb_fake_rw_lck); - } -} - -static inline void -sdeb_write_unlock(struct sdeb_store_info *sip) -{ - if (sdebug_no_rwlock) { - if (sip) - __release(&sip->macc_lck); - else - __release(&sdeb_fake_rw_lck); - } else { - if (sip) - write_unlock(&sip->macc_lck); - else - write_unlock(&sdeb_fake_rw_lck); - } -} - static int resp_read_dt0(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) { bool check_prot; @@ -3669,6 +3879,7 @@ static int resp_read_dt0(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) u64 lba; struct sdeb_store_info *sip = devip2sip(devip, true); u8 *cmd = scp->cmnd; + bool meta_data_locked = false; switch (cmd[0]) { case READ_16: @@ -3727,6 +3938,10 @@ static int resp_read_dt0(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) atomic_set(&sdeb_inject_pending, 0); } + /* + * When checking device access params, for reads we only check data + * versus what is set at init time, so no need to lock. + */ ret = check_device_access_params(scp, lba, num, false); if (ret) return ret; @@ -3746,29 +3961,33 @@ static int resp_read_dt0(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) return check_condition_result; } - sdeb_read_lock(sip); + if (sdebug_dev_is_zoned(devip) || + (sdebug_dix && scsi_prot_sg_count(scp))) { + sdeb_meta_read_lock(sip); + meta_data_locked = true; + } /* DIX + T10 DIF */ if (unlikely(sdebug_dix && scsi_prot_sg_count(scp))) { switch (prot_verify_read(scp, lba, num, ei_lba)) { case 1: /* Guard tag error */ if (cmd[1] >> 5 != 3) { /* RDPROTECT != 3 */ - sdeb_read_unlock(sip); + sdeb_meta_read_unlock(sip); mk_sense_buffer(scp, ABORTED_COMMAND, 0x10, 1); return check_condition_result; } else if (scp->prot_flags & SCSI_PROT_GUARD_CHECK) { - sdeb_read_unlock(sip); + sdeb_meta_read_unlock(sip); mk_sense_buffer(scp, ILLEGAL_REQUEST, 0x10, 1); return illegal_condition_result; } break; case 3: /* Reference tag error */ if (cmd[1] >> 5 != 3) { /* RDPROTECT != 3 */ - sdeb_read_unlock(sip); + sdeb_meta_read_unlock(sip); mk_sense_buffer(scp, ABORTED_COMMAND, 0x10, 3); return check_condition_result; } else if (scp->prot_flags & SCSI_PROT_REF_CHECK) { - sdeb_read_unlock(sip); + sdeb_meta_read_unlock(sip); mk_sense_buffer(scp, ILLEGAL_REQUEST, 0x10, 3); return illegal_condition_result; } @@ -3776,8 +3995,9 @@ static int resp_read_dt0(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) } } - ret = do_device_access(sip, scp, 0, lba, num, false, 0); - sdeb_read_unlock(sip); + ret = do_device_access(sip, scp, 0, lba, num, 0, false, false); + if (meta_data_locked) + sdeb_meta_read_unlock(sip); if (unlikely(ret == -1)) return DID_ERROR << 16; @@ -3967,6 +4187,7 @@ static int resp_write_dt0(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) u64 lba; struct sdeb_store_info *sip = devip2sip(devip, true); u8 *cmd = scp->cmnd; + bool meta_data_locked = false; switch (cmd[0]) { case WRITE_16: @@ -4025,10 +4246,17 @@ static int resp_write_dt0(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) "to DIF device\n"); } - sdeb_write_lock(sip); + if (sdebug_dev_is_zoned(devip) || + (sdebug_dix && scsi_prot_sg_count(scp)) || + scsi_debug_lbp()) { + sdeb_meta_write_lock(sip); + meta_data_locked = true; + } + ret = check_device_access_params(scp, lba, num, true); if (ret) { - sdeb_write_unlock(sip); + if (meta_data_locked) + sdeb_meta_write_unlock(sip); return ret; } @@ -4037,22 +4265,22 @@ static int resp_write_dt0(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) switch (prot_verify_write(scp, lba, num, ei_lba)) { case 1: /* Guard tag error */ if (scp->prot_flags & SCSI_PROT_GUARD_CHECK) { - sdeb_write_unlock(sip); + sdeb_meta_write_unlock(sip); mk_sense_buffer(scp, ILLEGAL_REQUEST, 0x10, 1); return illegal_condition_result; } else if (scp->cmnd[1] >> 5 != 3) { /* WRPROTECT != 3 */ - sdeb_write_unlock(sip); + sdeb_meta_write_unlock(sip); mk_sense_buffer(scp, ABORTED_COMMAND, 0x10, 1); return check_condition_result; } break; case 3: /* Reference tag error */ if (scp->prot_flags & SCSI_PROT_REF_CHECK) { - sdeb_write_unlock(sip); + sdeb_meta_write_unlock(sip); mk_sense_buffer(scp, ILLEGAL_REQUEST, 0x10, 3); return illegal_condition_result; } else if (scp->cmnd[1] >> 5 != 3) { /* WRPROTECT != 3 */ - sdeb_write_unlock(sip); + sdeb_meta_write_unlock(sip); mk_sense_buffer(scp, ABORTED_COMMAND, 0x10, 3); return check_condition_result; } @@ -4060,13 +4288,16 @@ static int resp_write_dt0(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) } } - ret = do_device_access(sip, scp, 0, lba, num, true, group); + ret = do_device_access(sip, scp, 0, lba, num, group, true, false); if (unlikely(scsi_debug_lbp())) map_region(sip, lba, num); + /* If ZBC zone then bump its write pointer */ if (sdebug_dev_is_zoned(devip)) zbc_inc_wp(devip, lba, num); - sdeb_write_unlock(sip); + if (meta_data_locked) + sdeb_meta_write_unlock(sip); + if (unlikely(-1 == ret)) return DID_ERROR << 16; else if (unlikely(sdebug_verbose && @@ -4176,7 +4407,8 @@ static int resp_write_scat(struct scsi_cmnd *scp, goto err_out; } - sdeb_write_lock(sip); + /* Just keep it simple and always lock for now */ + sdeb_meta_write_lock(sip); sg_off = lbdof_blen; /* Spec says Buffer xfer Length field in number of LBs in dout */ cum_lb = 0; @@ -4219,7 +4451,11 @@ static int resp_write_scat(struct scsi_cmnd *scp, } } - ret = do_device_access(sip, scp, sg_off, lba, num, true, group); + /* + * Write ranges atomically to keep as close to pre-atomic + * writes behaviour as possible. + */ + ret = do_device_access(sip, scp, sg_off, lba, num, group, true, true); /* If ZBC zone then bump its write pointer */ if (sdebug_dev_is_zoned(devip)) zbc_inc_wp(devip, lba, num); @@ -4258,7 +4494,7 @@ static int resp_write_scat(struct scsi_cmnd *scp, } ret = 0; err_out_unlock: - sdeb_write_unlock(sip); + sdeb_meta_write_unlock(sip); err_out: kfree(lrdp); return ret; @@ -4277,14 +4513,16 @@ static int resp_write_same(struct scsi_cmnd *scp, u64 lba, u32 num, scp->device->hostdata, true); u8 *fs1p; u8 *fsp; + bool meta_data_locked = false; - sdeb_write_lock(sip); + if (sdebug_dev_is_zoned(devip) || scsi_debug_lbp()) { + sdeb_meta_write_lock(sip); + meta_data_locked = true; + } ret = check_device_access_params(scp, lba, num, true); - if (ret) { - sdeb_write_unlock(sip); - return ret; - } + if (ret) + goto out; if (unmap && scsi_debug_lbp()) { unmap_region(sip, lba, num); @@ -4295,6 +4533,7 @@ static int resp_write_same(struct scsi_cmnd *scp, u64 lba, u32 num, /* if ndob then zero 1 logical block, else fetch 1 logical block */ fsp = sip->storep; fs1p = fsp + (block * lb_size); + sdeb_data_write_lock(sip); if (ndob) { memset(fs1p, 0, lb_size); ret = 0; @@ -4302,8 +4541,8 @@ static int resp_write_same(struct scsi_cmnd *scp, u64 lba, u32 num, ret = fetch_to_dev_buffer(scp, fs1p, lb_size); if (-1 == ret) { - sdeb_write_unlock(sip); - return DID_ERROR << 16; + ret = DID_ERROR << 16; + goto out; } else if (sdebug_verbose && !ndob && (ret < lb_size)) sdev_printk(KERN_INFO, scp->device, "%s: %s: lb size=%u, IO sent=%d bytes\n", @@ -4320,10 +4559,12 @@ static int resp_write_same(struct scsi_cmnd *scp, u64 lba, u32 num, /* If ZBC zone then bump its write pointer */ if (sdebug_dev_is_zoned(devip)) zbc_inc_wp(devip, lba, num); + sdeb_data_write_unlock(sip); + ret = 0; out: - sdeb_write_unlock(sip); - - return 0; + if (meta_data_locked) + sdeb_meta_write_unlock(sip); + return ret; } static int resp_write_same_10(struct scsi_cmnd *scp, @@ -4466,25 +4707,30 @@ static int resp_comp_write(struct scsi_cmnd *scp, return check_condition_result; } - sdeb_write_lock(sip); - ret = do_dout_fetch(scp, dnum, arr); if (ret == -1) { retval = DID_ERROR << 16; - goto cleanup; + goto cleanup_free; } else if (sdebug_verbose && (ret < (dnum * lb_size))) sdev_printk(KERN_INFO, scp->device, "%s: compare_write: cdb " "indicated=%u, IO sent=%d bytes\n", my_name, dnum * lb_size, ret); + + sdeb_data_write_lock(sip); + sdeb_meta_write_lock(sip); if (!comp_write_worker(sip, lba, num, arr, false)) { mk_sense_buffer(scp, MISCOMPARE, MISCOMPARE_VERIFY_ASC, 0); retval = check_condition_result; - goto cleanup; + goto cleanup_unlock; } + + /* Cover sip->map_storep (which map_region()) sets with data lock */ if (scsi_debug_lbp()) map_region(sip, lba, num); -cleanup: - sdeb_write_unlock(sip); +cleanup_unlock: + sdeb_meta_write_unlock(sip); + sdeb_data_write_unlock(sip); +cleanup_free: kfree(arr); return retval; } @@ -4528,7 +4774,7 @@ static int resp_unmap(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) desc = (void *)&buf[8]; - sdeb_write_lock(sip); + sdeb_meta_write_lock(sip); for (i = 0 ; i < descriptors ; i++) { unsigned long long lba = get_unaligned_be64(&desc[i].lba); @@ -4544,7 +4790,7 @@ static int resp_unmap(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) ret = 0; out: - sdeb_write_unlock(sip); + sdeb_meta_write_unlock(sip); kfree(buf); return ret; @@ -4702,12 +4948,13 @@ static int resp_pre_fetch(struct scsi_cmnd *scp, rest = block + nblks - sdebug_store_sectors; /* Try to bring the PRE-FETCH range into CPU's cache */ - sdeb_read_lock(sip); + sdeb_data_read_lock(sip); prefetch_range(fsp + (sdebug_sector_size * block), (nblks - rest) * sdebug_sector_size); if (rest) prefetch_range(fsp, rest * sdebug_sector_size); - sdeb_read_unlock(sip); + + sdeb_data_read_unlock(sip); fini: if (cmd[1] & 0x2) res = SDEG_RES_IMMED_MASK; @@ -4866,7 +5113,7 @@ static int resp_verify(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) return check_condition_result; } /* Not changing store, so only need read access */ - sdeb_read_lock(sip); + sdeb_data_read_lock(sip); ret = do_dout_fetch(scp, a_num, arr); if (ret == -1) { @@ -4888,7 +5135,7 @@ static int resp_verify(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) goto cleanup; } cleanup: - sdeb_read_unlock(sip); + sdeb_data_read_unlock(sip); kfree(arr); return ret; } @@ -4934,7 +5181,7 @@ static int resp_report_zones(struct scsi_cmnd *scp, return check_condition_result; } - sdeb_read_lock(sip); + sdeb_meta_read_lock(sip); desc = arr + 64; for (lba = zs_lba; lba < sdebug_capacity; @@ -5032,11 +5279,70 @@ static int resp_report_zones(struct scsi_cmnd *scp, ret = fill_from_dev_buffer(scp, arr, min_t(u32, alloc_len, rep_len)); fini: - sdeb_read_unlock(sip); + sdeb_meta_read_unlock(sip); kfree(arr); return ret; } +static int resp_atomic_write(struct scsi_cmnd *scp, + struct sdebug_dev_info *devip) +{ + struct sdeb_store_info *sip; + u8 *cmd = scp->cmnd; + u16 boundary, len; + u64 lba, lba_tmp; + int ret; + + if (!scsi_debug_atomic_write()) { + mk_sense_invalid_opcode(scp); + return check_condition_result; + } + + sip = devip2sip(devip, true); + + lba = get_unaligned_be64(cmd + 2); + boundary = get_unaligned_be16(cmd + 10); + len = get_unaligned_be16(cmd + 12); + + lba_tmp = lba; + if (sdebug_atomic_wr_align && + do_div(lba_tmp, sdebug_atomic_wr_align)) { + /* Does not meet alignment requirement */ + mk_sense_buffer(scp, ILLEGAL_REQUEST, INVALID_FIELD_IN_CDB, 0); + return check_condition_result; + } + + if (sdebug_atomic_wr_gran && len % sdebug_atomic_wr_gran) { + /* Does not meet alignment requirement */ + mk_sense_buffer(scp, ILLEGAL_REQUEST, INVALID_FIELD_IN_CDB, 0); + return check_condition_result; + } + + if (boundary > 0) { + if (boundary > sdebug_atomic_wr_max_bndry) { + mk_sense_invalid_fld(scp, SDEB_IN_CDB, 12, -1); + return check_condition_result; + } + + if (len > sdebug_atomic_wr_max_length_bndry) { + mk_sense_invalid_fld(scp, SDEB_IN_CDB, 12, -1); + return check_condition_result; + } + } else { + if (len > sdebug_atomic_wr_max_length) { + mk_sense_invalid_fld(scp, SDEB_IN_CDB, 12, -1); + return check_condition_result; + } + } + + ret = do_device_access(sip, scp, 0, lba, len, 0, true, true); + if (unlikely(ret == -1)) + return DID_ERROR << 16; + if (unlikely(ret != len * sdebug_sector_size)) + return DID_ERROR << 16; + return 0; +} + /* Logic transplanted from tcmu-runner, file_zbc.c */ static void zbc_open_all(struct sdebug_dev_info *devip) { @@ -5063,8 +5369,7 @@ static int resp_open_zone(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) mk_sense_invalid_opcode(scp); return check_condition_result; } - - sdeb_write_lock(sip); + sdeb_meta_write_lock(sip); if (all) { /* Check if all closed zones can be open */ @@ -5113,7 +5418,7 @@ static int resp_open_zone(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) zbc_open_zone(devip, zsp, true); fini: - sdeb_write_unlock(sip); + sdeb_meta_write_unlock(sip); return res; } @@ -5140,7 +5445,7 @@ static int resp_close_zone(struct scsi_cmnd *scp, return check_condition_result; } - sdeb_write_lock(sip); + sdeb_meta_write_lock(sip); if (all) { zbc_close_all(devip); @@ -5169,7 +5474,7 @@ static int resp_close_zone(struct scsi_cmnd *scp, zbc_close_zone(devip, zsp); fini: - sdeb_write_unlock(sip); + sdeb_meta_write_unlock(sip); return res; } @@ -5212,7 +5517,7 @@ static int resp_finish_zone(struct scsi_cmnd *scp, return check_condition_result; } - sdeb_write_lock(sip); + sdeb_meta_write_lock(sip); if (all) { zbc_finish_all(devip); @@ -5241,7 +5546,7 @@ static int resp_finish_zone(struct scsi_cmnd *scp, zbc_finish_zone(devip, zsp, true); fini: - sdeb_write_unlock(sip); + sdeb_meta_write_unlock(sip); return res; } @@ -5292,7 +5597,7 @@ static int resp_rwp_zone(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) return check_condition_result; } - sdeb_write_lock(sip); + sdeb_meta_write_lock(sip); if (all) { zbc_rwp_all(devip); @@ -5320,7 +5625,7 @@ static int resp_rwp_zone(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) zbc_rwp_zone(devip, zsp); fini: - sdeb_write_unlock(sip); + sdeb_meta_write_unlock(sip); return res; } @@ -6284,6 +6589,7 @@ module_param_named(lbprz, sdebug_lbprz, int, S_IRUGO); module_param_named(lbpu, sdebug_lbpu, int, S_IRUGO); module_param_named(lbpws, sdebug_lbpws, int, S_IRUGO); module_param_named(lbpws10, sdebug_lbpws10, int, S_IRUGO); +module_param_named(atomic_wr, sdebug_atomic_wr, int, S_IRUGO); module_param_named(lowest_aligned, sdebug_lowest_aligned, int, S_IRUGO); module_param_named(lun_format, sdebug_lun_am_i, int, S_IRUGO | S_IWUSR); module_param_named(max_luns, sdebug_max_luns, int, S_IRUGO | S_IWUSR); @@ -6318,6 +6624,11 @@ module_param_named(unmap_alignment, sdebug_unmap_alignment, int, S_IRUGO); module_param_named(unmap_granularity, sdebug_unmap_granularity, int, S_IRUGO); module_param_named(unmap_max_blocks, sdebug_unmap_max_blocks, int, S_IRUGO); module_param_named(unmap_max_desc, sdebug_unmap_max_desc, int, S_IRUGO); +module_param_named(atomic_wr_max_length, sdebug_atomic_wr_max_length, int, S_IRUGO); +module_param_named(atomic_wr_align, sdebug_atomic_wr_align, int, S_IRUGO); +module_param_named(atomic_wr_gran, sdebug_atomic_wr_gran, int, S_IRUGO); +module_param_named(atomic_wr_max_length_bndry, sdebug_atomic_wr_max_length_bndry, int, S_IRUGO); +module_param_named(atomic_wr_max_bndry, sdebug_atomic_wr_max_bndry, int, S_IRUGO); module_param_named(uuid_ctl, sdebug_uuid_ctl, int, S_IRUGO); module_param_named(virtual_gb, sdebug_virtual_gb, int, S_IRUGO | S_IWUSR); module_param_named(vpd_use_hostno, sdebug_vpd_use_hostno, int, @@ -6361,6 +6672,7 @@ MODULE_PARM_DESC(lbprz, MODULE_PARM_DESC(lbpu, "enable LBP, support UNMAP command (def=0)"); MODULE_PARM_DESC(lbpws, "enable LBP, support WRITE SAME(16) with UNMAP bit (def=0)"); MODULE_PARM_DESC(lbpws10, "enable LBP, support WRITE SAME(10) with UNMAP bit (def=0)"); +MODULE_PARM_DESC(atomic_write, "enable ATOMIC WRITE support, support WRITE ATOMIC(16) (def=0)"); MODULE_PARM_DESC(lowest_aligned, "lowest aligned lba (def=0)"); MODULE_PARM_DESC(lun_format, "LUN format: 0->peripheral (def); 1 --> flat address method"); MODULE_PARM_DESC(max_luns, "number of LUNs per target to simulate(def=1)"); @@ -6392,6 +6704,11 @@ MODULE_PARM_DESC(unmap_alignment, "lowest aligned thin provisioning lba (def=0)" MODULE_PARM_DESC(unmap_granularity, "thin provisioning granularity in blocks (def=1)"); MODULE_PARM_DESC(unmap_max_blocks, "max # of blocks can be unmapped in one cmd (def=0xffffffff)"); MODULE_PARM_DESC(unmap_max_desc, "max # of ranges that can be unmapped in one cmd (def=256)"); +MODULE_PARM_DESC(atomic_wr_max_length, "max # of blocks can be atomically written in one cmd (def=8192)"); +MODULE_PARM_DESC(atomic_wr_align, "minimum alignment of atomic write in blocks (def=2)"); +MODULE_PARM_DESC(atomic_wr_gran, "minimum granularity of atomic write in blocks (def=2)"); +MODULE_PARM_DESC(atomic_wr_max_length_bndry, "max # of blocks can be atomically written in one cmd with boundary set (def=8192)"); +MODULE_PARM_DESC(atomic_wr_max_bndry, "max # boundaries per atomic write (def=128)"); MODULE_PARM_DESC(uuid_ctl, "1->use uuid for lu name, 0->don't, 2->all use same (def=0)"); MODULE_PARM_DESC(virtual_gb, "virtual gigabyte (GiB) size (def=0 -> use dev_size_mb)"); @@ -7563,6 +7880,7 @@ static int __init scsi_debug_init(void) return -EINVAL; } } + xa_init_flags(per_store_ap, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ); if (want_store) { idx = sdebug_add_store(); @@ -7770,7 +8088,9 @@ static int sdebug_add_store(void) map_region(sip, 0, 2); } - rwlock_init(&sip->macc_lck); + rwlock_init(&sip->macc_data_lck); + rwlock_init(&sip->macc_meta_lck); + rwlock_init(&sip->macc_sector_lck); return (int)n_idx; err: sdebug_erase_store((int)n_idx, sip); From 5f9bbea02f06110ec5cf95a3327019b3194b2d80 Mon Sep 17 00:00:00 2001 From: Alan Adamson Date: Thu, 20 Jun 2024 12:53:59 +0000 Subject: [PATCH 1247/1578] nvme: Atomic write support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support to set block layer request_queue atomic write limits. The limits will be derived from either the namespace or controller atomic parameters. NVMe atomic-related parameters are grouped into "normal" and "power-fail" (or PF) class of parameter. For atomic write support, only PF parameters are of interest. The "normal" parameters are concerned with racing reads and writes (which also applies to PF). See NVM Command Set Specification Revision 1.0d section 2.1.4 for reference. Whether to use per namespace or controller atomic parameters is decided by NSFEAT bit 1 - see Figure 97: Identify – Identify Namespace Data Structure, NVM Command Set. NVMe namespaces may define an atomic boundary, whereby no atomic guarantees are provided for a write which straddles this per-lba space boundary. The block layer merging policy is such that no merges may occur in which the resultant request would straddle such a boundary. Unlike SCSI, NVMe specifies no granularity or alignment rules, apart from atomic boundary rule. In addition, again unlike SCSI, there is no dedicated atomic write command - a write which adheres to the atomic size limit and boundary is implicitly atomic. If NSFEAT bit 1 is set, the following parameters are of interest: - NAWUPF (Namespace Atomic Write Unit Power Fail) - NABSPF (Namespace Atomic Boundary Size Power Fail) - NABO (Namespace Atomic Boundary Offset) and we set request_queue limits as follows: - atomic_write_unit_max = rounddown_pow_of_two(NAWUPF) - atomic_write_max_bytes = NAWUPF - atomic_write_boundary = NABSPF If in the unlikely scenario that NABO is non-zero, then atomic writes will not be supported at all as dealing with this adds extra complexity. This policy may change in future. In all cases, atomic_write_unit_min is set to the logical block size. If NSFEAT bit 1 is unset, the following parameter is of interest: - AWUPF (Atomic Write Unit Power Fail) and we set request_queue limits as follows: - atomic_write_unit_max = rounddown_pow_of_two(AWUPF) - atomic_write_max_bytes = AWUPF - atomic_write_boundary = 0 A new function, nvme_valid_atomic_write(), is also called from submission path to verify that a request has been submitted to the driver will actually be executed atomically. As mentioned, there is no dedicated NVMe atomic write command (which may error for a command which exceeds the controller atomic write limits). Note on NABSPF: There seems to be some vagueness in the spec as to whether NABSPF applies for NSFEAT bit 1 being unset. Figure 97 does not explicitly mention NABSPF and how it is affected by bit 1. However Figure 4 does tell to check Figure 97 for info about per-namespace parameters, which NABSPF is, so it is implied. However currently nvme_update_disk_info() does check namespace parameter NABO regardless of this bit. Signed-off-by: Alan Adamson Reviewed-by: Keith Busch Reviewed-by: Martin K. Petersen jpg: total rewrite Signed-off-by: John Garry Link: https://lore.kernel.org/r/20240620125359.2684798-11-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 52 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index bf410d10b12006..89ebfa89613ee6 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -927,6 +927,36 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns, return BLK_STS_OK; } +/* + * NVMe does not support a dedicated command to issue an atomic write. A write + * which does adhere to the device atomic limits will silently be executed + * non-atomically. The request issuer should ensure that the write is within + * the queue atomic writes limits, but just validate this in case it is not. + */ +static bool nvme_valid_atomic_write(struct request *req) +{ + struct request_queue *q = req->q; + u32 boundary_bytes = queue_atomic_write_boundary_bytes(q); + + if (blk_rq_bytes(req) > queue_atomic_write_unit_max_bytes(q)) + return false; + + if (boundary_bytes) { + u64 mask = boundary_bytes - 1, imask = ~mask; + u64 start = blk_rq_pos(req) << SECTOR_SHIFT; + u64 end = start + blk_rq_bytes(req) - 1; + + /* If greater then must be crossing a boundary */ + if (blk_rq_bytes(req) > boundary_bytes) + return false; + + if ((start & imask) != (end & imask)) + return false; + } + + return true; +} + static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, struct request *req, struct nvme_command *cmnd, enum nvme_opcode op) @@ -942,6 +972,9 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, if (req->cmd_flags & REQ_RAHEAD) dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; + if (req->cmd_flags & REQ_ATOMIC && !nvme_valid_atomic_write(req)) + return BLK_STS_INVAL; + cmnd->rw.opcode = op; cmnd->rw.flags = 0; cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); @@ -1920,6 +1953,23 @@ static void nvme_configure_metadata(struct nvme_ctrl *ctrl, } } + +static void nvme_update_atomic_write_disk_info(struct nvme_ns *ns, + struct nvme_id_ns *id, struct queue_limits *lim, + u32 bs, u32 atomic_bs) +{ + unsigned int boundary = 0; + + if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) { + if (le16_to_cpu(id->nabspf)) + boundary = (le16_to_cpu(id->nabspf) + 1) * bs; + } + lim->atomic_write_hw_max = atomic_bs; + lim->atomic_write_hw_boundary = boundary; + lim->atomic_write_hw_unit_min = bs; + lim->atomic_write_hw_unit_max = rounddown_pow_of_two(atomic_bs); +} + static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl) { return ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> SECTOR_SHIFT) + 1; @@ -1966,6 +2016,8 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id, atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; else atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs; + + nvme_update_atomic_write_disk_info(ns, id, lim, bs, atomic_bs); } if (id->nsfeat & NVME_NS_FEAT_IO_OPT) { From 676f819c3e982db3695a371f336a05086585ea4f Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Thu, 13 Jun 2024 20:28:03 +0800 Subject: [PATCH 1248/1578] KVM: Discard zero mask with function kvm_dirty_ring_reset Function kvm_reset_dirty_gfn may be called with parameters cur_slot / cur_offset / mask are all zero, it does not represent real dirty page. It is not necessary to clear dirty page in this condition. Also return value of macro __fls() is undefined if mask is zero which is called in funciton kvm_reset_dirty_gfn(). Here just return. Signed-off-by: Bibo Mao Message-ID: <20240613122803.1031511-1-maobibo@loongson.cn> [Move the conditional inside kvm_reset_dirty_gfn; suggested by Sean Christopherson. - Paolo] Signed-off-by: Paolo Bonzini --- virt/kvm/dirty_ring.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c index 86d267db87bb13..7bc74969a819af 100644 --- a/virt/kvm/dirty_ring.c +++ b/virt/kvm/dirty_ring.c @@ -55,6 +55,9 @@ static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask) struct kvm_memory_slot *memslot; int as_id, id; + if (!mask) + return; + as_id = slot >> 16; id = (u16)slot; From d4e001ffeccfc128c715057e866f301ac9b95728 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 20 Jun 2024 13:34:49 +0200 Subject: [PATCH 1249/1578] dt-bindings: i2c: atmel,at91sam: correct path to i2c-controller schema The referenced i2c-controller.yaml schema is provided by dtschema package (outside of Linux kernel), so use full path to reference it. Cc: stable@vger.kernel.org Fixes: 7ea75dd386be ("dt-bindings: i2c: convert i2c-at91 to json-schema") Signed-off-by: Krzysztof Kozlowski Reviewed-by: Conor Dooley Signed-off-by: Andi Shyti --- Documentation/devicetree/bindings/i2c/atmel,at91sam-i2c.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/i2c/atmel,at91sam-i2c.yaml b/Documentation/devicetree/bindings/i2c/atmel,at91sam-i2c.yaml index b1c13bab24722f..b2d19cfb87addc 100644 --- a/Documentation/devicetree/bindings/i2c/atmel,at91sam-i2c.yaml +++ b/Documentation/devicetree/bindings/i2c/atmel,at91sam-i2c.yaml @@ -77,7 +77,7 @@ required: - clocks allOf: - - $ref: i2c-controller.yaml + - $ref: /schemas/i2c/i2c-controller.yaml# - if: properties: compatible: From 5c8cfd592bb7632200b4edac8f2c7ec892ed9d81 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 20 Jun 2024 13:34:50 +0200 Subject: [PATCH 1250/1578] dt-bindings: i2c: google,cros-ec-i2c-tunnel: correct path to i2c-controller schema The referenced i2c-controller.yaml schema is provided by dtschema package (outside of Linux kernel), so use full path to reference it. Cc: stable@vger.kernel.org Fixes: 1acd4577a66f ("dt-bindings: i2c: convert i2c-cros-ec-tunnel to json-schema") Signed-off-by: Krzysztof Kozlowski Reviewed-by: Conor Dooley Signed-off-by: Andi Shyti --- .../devicetree/bindings/i2c/google,cros-ec-i2c-tunnel.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/i2c/google,cros-ec-i2c-tunnel.yaml b/Documentation/devicetree/bindings/i2c/google,cros-ec-i2c-tunnel.yaml index ab151c9db21913..580003cdfff59e 100644 --- a/Documentation/devicetree/bindings/i2c/google,cros-ec-i2c-tunnel.yaml +++ b/Documentation/devicetree/bindings/i2c/google,cros-ec-i2c-tunnel.yaml @@ -21,7 +21,7 @@ description: | google,cros-ec-spi or google,cros-ec-i2c. allOf: - - $ref: i2c-controller.yaml# + - $ref: /schemas/i2c/i2c-controller.yaml# properties: compatible: From 5a72477273066b5b357801ab2d315ef14949d402 Mon Sep 17 00:00:00 2001 From: Grygorii Tertychnyi Date: Mon, 20 May 2024 17:39:32 +0200 Subject: [PATCH 1251/1578] i2c: ocores: set IACK bit after core is enabled Setting IACK bit when core is disabled does not clear the "Interrupt Flag" bit in the status register, and the interrupt remains pending. Sometimes it causes failure for the very first message transfer, that is usually a device probe. Hence, set IACK bit after core is enabled to clear pending interrupt. Fixes: 18f98b1e3147 ("[PATCH] i2c: New bus driver for the OpenCores I2C controller") Signed-off-by: Grygorii Tertychnyi Acked-by: Peter Korsgaard Cc: stable@vger.kernel.org Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-ocores.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-ocores.c b/drivers/i2c/busses/i2c-ocores.c index 56a4dabf5a3887..4ad670a80a63af 100644 --- a/drivers/i2c/busses/i2c-ocores.c +++ b/drivers/i2c/busses/i2c-ocores.c @@ -431,8 +431,8 @@ static int ocores_init(struct device *dev, struct ocores_i2c *i2c) oc_setreg(i2c, OCI2C_PREHIGH, prescale >> 8); /* Init the device */ - oc_setreg(i2c, OCI2C_CMD, OCI2C_CMD_IACK); oc_setreg(i2c, OCI2C_CONTROL, ctrl | OCI2C_CTRL_EN); + oc_setreg(i2c, OCI2C_CMD, OCI2C_CMD_IACK); return 0; } From 120dd4118e58dbda2ddb1dcf55f3c56cdfe8cee0 Mon Sep 17 00:00:00 2001 From: Xi Ruoyao Date: Fri, 21 Jun 2024 10:18:40 +0800 Subject: [PATCH 1252/1578] LoongArch: Only allow OBJTOOL & ORC unwinder if toolchain supports -mthin-add-sub GAS <= 2.41 does not support generating R_LARCH_{32,64}_PCREL for "label - ." and it generates R_LARCH_{ADD,SUB}{32,64} pairs instead. Objtool cannot handle R_LARCH_{ADD,SUB}{32,64} pair in __jump_table (static key implementation) and etc. so it will produce some warnings. This is causing the kernel CI systems to complain everywhere. For GAS we can check if -mthin-add-sub option is available to know if R_LARCH_{32,64}_PCREL are supported. For Clang, we require Clang >= 18 and Clang >= 17 already supports R_LARCH_{32,64}_PCREL. But unfortunately Clang has some other issues, so we disable objtool for Clang at present. Note that __jump_table here is not generated by the compiler, so -fno-jump-table is completely irrelevant for this issue. Fixes: cb8a2ef0848c ("LoongArch: Add ORC stack unwinder support") Closes: https://lore.kernel.org/loongarch/Zl5m1ZlVmGKitAof@yujie-X299/ Closes: https://lore.kernel.org/loongarch/ZlY1gDDPi_mNrwJ1@slm.duckdns.org/ Closes: https://lore.kernel.org/loongarch/1717478006.038663-1-hengqi@linux.alibaba.com/ Link: https://sourceware.org/git/?p=binutils-gdb.git;a=commitdiff;h=816029e06768 Link: https://github.com/llvm/llvm-project/commit/42cb3c6346fc Signed-off-by: Xi Ruoyao Signed-off-by: Huacai Chen --- arch/loongarch/Kconfig | 5 ++++- arch/loongarch/Kconfig.debug | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index e38139c576ee90..ddc042895d0114 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -143,7 +143,7 @@ config LOONGARCH select HAVE_LIVEPATCH select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI - select HAVE_OBJTOOL if AS_HAS_EXPLICIT_RELOCS + select HAVE_OBJTOOL if AS_HAS_EXPLICIT_RELOCS && AS_HAS_THIN_ADD_SUB && !CC_IS_CLANG select HAVE_PCI select HAVE_PERF_EVENTS select HAVE_PERF_REGS @@ -261,6 +261,9 @@ config AS_HAS_EXPLICIT_RELOCS config AS_HAS_FCSR_CLASS def_bool $(as-instr,movfcsr2gr \$t0$(comma)\$fcsr0) +config AS_HAS_THIN_ADD_SUB + def_bool $(cc-option,-Wa$(comma)-mthin-add-sub) + config AS_HAS_LSX_EXTENSION def_bool $(as-instr,vld \$vr0$(comma)\$a0$(comma)0) diff --git a/arch/loongarch/Kconfig.debug b/arch/loongarch/Kconfig.debug index 98d60630c3d4b4..8b2ce5b5d43e8f 100644 --- a/arch/loongarch/Kconfig.debug +++ b/arch/loongarch/Kconfig.debug @@ -28,6 +28,7 @@ config UNWINDER_PROLOGUE config UNWINDER_ORC bool "ORC unwinder" + depends on HAVE_OBJTOOL select OBJTOOL help This option enables the ORC (Oops Rewind Capability) unwinder for From f63a47b34b140ed1ca39d7e4bd4f1cdc617fc316 Mon Sep 17 00:00:00 2001 From: Hui Li Date: Fri, 21 Jun 2024 10:18:40 +0800 Subject: [PATCH 1253/1578] LoongArch: Fix watchpoint setting error In the current code, when debugging the following code using gdb, "invalid argument ..." message will be displayed. lihui@bogon:~$ cat test.c #include int a = 0; int main() { a = 1; return 0; } lihui@bogon:~$ gcc -g test.c -o test lihui@bogon:~$ gdb test ... (gdb) watch a Hardware watchpoint 1: a (gdb) r ... Invalid argument setting hardware debug registers There are mainly two types of issues. 1. Some incorrect judgment condition existed in user_watch_state argument parsing, causing -EINVAL to be returned. When setting up a watchpoint, gdb uses the ptrace interface, ptrace(PTRACE_SETREGSET, tid, NT_LOONGARCH_HW_WATCH, (void *) &iov)). Register values in user_watch_state as follows: addr[0] = 0x0, mask[0] = 0x0, ctrl[0] = 0x0 addr[1] = 0x0, mask[1] = 0x0, ctrl[1] = 0x0 addr[2] = 0x0, mask[2] = 0x0, ctrl[2] = 0x0 addr[3] = 0x0, mask[3] = 0x0, ctrl[3] = 0x0 addr[4] = 0x0, mask[4] = 0x0, ctrl[4] = 0x0 addr[5] = 0x0, mask[5] = 0x0, ctrl[5] = 0x0 addr[6] = 0x0, mask[6] = 0x0, ctrl[6] = 0x0 addr[7] = 0x12000803c, mask[7] = 0x0, ctrl[7] = 0x610 In arch_bp_generic_fields(), return -EINVAL when ctrl.len is LOONGARCH_BREAKPOINT_LEN_8(0b00). So delete the incorrect judgment here. In ptrace_hbp_fill_attr_ctrl(), when note_type is NT_LOONGARCH_HW_WATCH and ctrl[0] == 0x0, if ((type & HW_BREAKPOINT_RW) != type) will return -EINVAL. Here ctrl.type should be set based on note_type, and unnecessary judgments can be removed. 2. The watchpoint argument was not set correctly due to unnecessary offset and alignment_mask. Modify ptrace_hbp_fill_attr_ctrl() and hw_breakpoint_arch_parse(), which ensure the watchpont argument is set correctly. All changes according to the LoongArch Reference Manual: https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#control-and-status-registers-related-to-watchpoints Cc: stable@vger.kernel.org Signed-off-by: Hui Li Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/hw_breakpoint.h | 2 +- arch/loongarch/kernel/hw_breakpoint.c | 19 ++++--------- arch/loongarch/kernel/ptrace.c | 32 ++++++++++------------ 3 files changed, 21 insertions(+), 32 deletions(-) diff --git a/arch/loongarch/include/asm/hw_breakpoint.h b/arch/loongarch/include/asm/hw_breakpoint.h index 21447fb1efc778..a8ce580f4fc6ff 100644 --- a/arch/loongarch/include/asm/hw_breakpoint.h +++ b/arch/loongarch/include/asm/hw_breakpoint.h @@ -101,7 +101,7 @@ struct perf_event; struct perf_event_attr; extern int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, - int *gen_len, int *gen_type, int *offset); + int *gen_len, int *gen_type); extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw); extern int hw_breakpoint_arch_parse(struct perf_event *bp, const struct perf_event_attr *attr, diff --git a/arch/loongarch/kernel/hw_breakpoint.c b/arch/loongarch/kernel/hw_breakpoint.c index fc55c4de2a11ff..950b2b8a82ee07 100644 --- a/arch/loongarch/kernel/hw_breakpoint.c +++ b/arch/loongarch/kernel/hw_breakpoint.c @@ -283,7 +283,7 @@ int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw) * to generic breakpoint descriptions. */ int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, - int *gen_len, int *gen_type, int *offset) + int *gen_len, int *gen_type) { /* Type */ switch (ctrl.type) { @@ -303,11 +303,6 @@ int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, return -EINVAL; } - if (!ctrl.len) - return -EINVAL; - - *offset = __ffs(ctrl.len); - /* Len */ switch (ctrl.len) { case LOONGARCH_BREAKPOINT_LEN_1: @@ -386,21 +381,17 @@ int hw_breakpoint_arch_parse(struct perf_event *bp, struct arch_hw_breakpoint *hw) { int ret; - u64 alignment_mask, offset; + u64 alignment_mask; /* Build the arch_hw_breakpoint. */ ret = arch_build_bp_info(bp, attr, hw); if (ret) return ret; - if (hw->ctrl.type != LOONGARCH_BREAKPOINT_EXECUTE) - alignment_mask = 0x7; - else + if (hw->ctrl.type == LOONGARCH_BREAKPOINT_EXECUTE) { alignment_mask = 0x3; - offset = hw->address & alignment_mask; - - hw->address &= ~alignment_mask; - hw->ctrl.len <<= offset; + hw->address &= ~alignment_mask; + } return 0; } diff --git a/arch/loongarch/kernel/ptrace.c b/arch/loongarch/kernel/ptrace.c index c114c5ef13325a..16b756c6049bc8 100644 --- a/arch/loongarch/kernel/ptrace.c +++ b/arch/loongarch/kernel/ptrace.c @@ -494,28 +494,14 @@ static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type, struct arch_hw_breakpoint_ctrl ctrl, struct perf_event_attr *attr) { - int err, len, type, offset; + int err, len, type; - err = arch_bp_generic_fields(ctrl, &len, &type, &offset); + err = arch_bp_generic_fields(ctrl, &len, &type); if (err) return err; - switch (note_type) { - case NT_LOONGARCH_HW_BREAK: - if ((type & HW_BREAKPOINT_X) != type) - return -EINVAL; - break; - case NT_LOONGARCH_HW_WATCH: - if ((type & HW_BREAKPOINT_RW) != type) - return -EINVAL; - break; - default: - return -EINVAL; - } - attr->bp_len = len; attr->bp_type = type; - attr->bp_addr += offset; return 0; } @@ -609,7 +595,19 @@ static int ptrace_hbp_set_ctrl(unsigned int note_type, return PTR_ERR(bp); attr = bp->attr; - decode_ctrl_reg(uctrl, &ctrl); + + switch (note_type) { + case NT_LOONGARCH_HW_BREAK: + ctrl.type = LOONGARCH_BREAKPOINT_EXECUTE; + ctrl.len = LOONGARCH_BREAKPOINT_LEN_4; + break; + case NT_LOONGARCH_HW_WATCH: + decode_ctrl_reg(uctrl, &ctrl); + break; + default: + return -EINVAL; + } + err = ptrace_hbp_fill_attr_ctrl(note_type, ctrl, &attr); if (err) return err; From c8e57ab0995c5b443d3c81c8a36b588776dcd0c3 Mon Sep 17 00:00:00 2001 From: Hui Li Date: Fri, 21 Jun 2024 10:18:40 +0800 Subject: [PATCH 1254/1578] LoongArch: Trigger user-space watchpoints correctly In the current code, gdb can set the watchpoint successfully through ptrace interface, but watchpoint will not be triggered. When debugging the following code using gdb. lihui@bogon:~$ cat test.c #include int a = 0; int main() { a = 1; printf("a = %d\n", a); return 0; } lihui@bogon:~$ gcc -g test.c -o test lihui@bogon:~$ gdb test ... (gdb) watch a ... (gdb) r ... a = 1 [Inferior 1 (process 4650) exited normally] No watchpoints were triggered, the root causes are: 1. Kernel uses perf_event and hw_breakpoint framework to control watchpoint, but the perf_event corresponding to watchpoint is not enabled. So it needs to be enabled according to MWPnCFG3 or FWPnCFG3 PLV bit field in ptrace_hbp_set_ctrl(), and privilege is set according to the monitored addr in hw_breakpoint_control(). Furthermore, add a judgment in ptrace_hbp_set_addr() to ensure kernel-space addr cannot be monitored in user mode. 2. The global enable control for all watchpoints is the WE bit of CSR.CRMD, and hardware sets the value to 0 when an exception is triggered. When the ERTN instruction is executed to return, the hardware restores the value of the PWE field of CSR.PRMD here. So, before a thread containing watchpoints be scheduled, the PWE field of CSR.PRMD needs to be set to 1. Add this modification in hw_breakpoint_control(). All changes according to the LoongArch Reference Manual: https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#control-and-status-registers-related-to-watchpoints https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#basic-control-and-status-registers With this patch: lihui@bogon:~$ gdb test ... (gdb) watch a Hardware watchpoint 1: a (gdb) r ... Hardware watchpoint 1: a Old value = 0 New value = 1 main () at test.c:6 6 printf("a = %d\n", a); (gdb) c Continuing. a = 1 [Inferior 1 (process 775) exited normally] Cc: stable@vger.kernel.org Signed-off-by: Hui Li Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/hw_breakpoint.h | 2 ++ arch/loongarch/kernel/hw_breakpoint.c | 20 +++++++++++++++++--- arch/loongarch/kernel/ptrace.c | 15 ++++++++++++--- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/arch/loongarch/include/asm/hw_breakpoint.h b/arch/loongarch/include/asm/hw_breakpoint.h index a8ce580f4fc6ff..d78330916bd18a 100644 --- a/arch/loongarch/include/asm/hw_breakpoint.h +++ b/arch/loongarch/include/asm/hw_breakpoint.h @@ -75,6 +75,8 @@ do { \ #define CSR_MWPC_NUM 0x3f #define CTRL_PLV_ENABLE 0x1e +#define CTRL_PLV0_ENABLE 0x02 +#define CTRL_PLV3_ENABLE 0x10 #define MWPnCFG3_LoadEn 8 #define MWPnCFG3_StoreEn 9 diff --git a/arch/loongarch/kernel/hw_breakpoint.c b/arch/loongarch/kernel/hw_breakpoint.c index 950b2b8a82ee07..e882df1f72db81 100644 --- a/arch/loongarch/kernel/hw_breakpoint.c +++ b/arch/loongarch/kernel/hw_breakpoint.c @@ -174,11 +174,21 @@ void flush_ptrace_hw_breakpoint(struct task_struct *tsk) static int hw_breakpoint_control(struct perf_event *bp, enum hw_breakpoint_ops ops) { - u32 ctrl; + u32 ctrl, privilege; int i, max_slots, enable; + struct pt_regs *regs; struct perf_event **slots; struct arch_hw_breakpoint *info = counter_arch_bp(bp); + if (arch_check_bp_in_kernelspace(info)) + privilege = CTRL_PLV0_ENABLE; + else + privilege = CTRL_PLV3_ENABLE; + + /* Whether bp belongs to a task. */ + if (bp->hw.target) + regs = task_pt_regs(bp->hw.target); + if (info->ctrl.type == LOONGARCH_BREAKPOINT_EXECUTE) { /* Breakpoint */ slots = this_cpu_ptr(bp_on_reg); @@ -204,13 +214,15 @@ static int hw_breakpoint_control(struct perf_event *bp, write_wb_reg(CSR_CFG_ASID, i, 0, 0); write_wb_reg(CSR_CFG_ASID, i, 1, 0); if (info->ctrl.type == LOONGARCH_BREAKPOINT_EXECUTE) { - write_wb_reg(CSR_CFG_CTRL, i, 0, CTRL_PLV_ENABLE); + write_wb_reg(CSR_CFG_CTRL, i, 0, privilege); } else { ctrl = encode_ctrl_reg(info->ctrl); - write_wb_reg(CSR_CFG_CTRL, i, 1, ctrl | CTRL_PLV_ENABLE); + write_wb_reg(CSR_CFG_CTRL, i, 1, ctrl | privilege); } enable = csr_read64(LOONGARCH_CSR_CRMD); csr_write64(CSR_CRMD_WE | enable, LOONGARCH_CSR_CRMD); + if (bp->hw.target) + regs->csr_prmd |= CSR_PRMD_PWE; break; case HW_BREAKPOINT_UNINSTALL: /* Reset the FWPnCFG/MWPnCFG 1~4 register. */ @@ -222,6 +234,8 @@ static int hw_breakpoint_control(struct perf_event *bp, write_wb_reg(CSR_CFG_CTRL, i, 1, 0); write_wb_reg(CSR_CFG_ASID, i, 0, 0); write_wb_reg(CSR_CFG_ASID, i, 1, 0); + if (bp->hw.target) + regs->csr_prmd &= ~CSR_PRMD_PWE; break; } diff --git a/arch/loongarch/kernel/ptrace.c b/arch/loongarch/kernel/ptrace.c index 16b756c6049bc8..200109de1971ad 100644 --- a/arch/loongarch/kernel/ptrace.c +++ b/arch/loongarch/kernel/ptrace.c @@ -608,9 +608,14 @@ static int ptrace_hbp_set_ctrl(unsigned int note_type, return -EINVAL; } - err = ptrace_hbp_fill_attr_ctrl(note_type, ctrl, &attr); - if (err) - return err; + if (uctrl & CTRL_PLV_ENABLE) { + err = ptrace_hbp_fill_attr_ctrl(note_type, ctrl, &attr); + if (err) + return err; + attr.disabled = 0; + } else { + attr.disabled = 1; + } return modify_user_hw_breakpoint(bp, &attr); } @@ -641,6 +646,10 @@ static int ptrace_hbp_set_addr(unsigned int note_type, struct perf_event *bp; struct perf_event_attr attr; + /* Kernel-space address cannot be monitored by user-space */ + if ((unsigned long)addr >= XKPRANGE) + return -EINVAL; + bp = ptrace_hbp_get_initialised_bp(note_type, tsk, idx); if (IS_ERR(bp)) return PTR_ERR(bp); From 3eb2a8b23598e90fda43abb0f23cb267bd5018ba Mon Sep 17 00:00:00 2001 From: Hui Li Date: Fri, 21 Jun 2024 10:18:40 +0800 Subject: [PATCH 1255/1578] LoongArch: Fix multiple hardware watchpoint issues In the current code, if multiple hardware breakpoints/watchpoints in a user-space thread, some of them will not be triggered. When debugging the following code using gdb. lihui@bogon:~$ cat test.c #include int a = 0; int main() { printf("start test\n"); a = 1; printf("a = %d\n", a); printf("end test\n"); return 0; } lihui@bogon:~$ gcc -g test.c -o test lihui@bogon:~$ gdb test ... (gdb) start ... Temporary breakpoint 1, main () at test.c:5 5 printf("start test\n"); (gdb) watch a Hardware watchpoint 2: a (gdb) hbreak 8 Hardware assisted breakpoint 3 at 0x1200006ec: file test.c, line 8. (gdb) c Continuing. start test a = 1 Breakpoint 3, main () at test.c:8 8 printf("end test\n"); ... The first hardware watchpoint is not triggered, the root causes are: 1. In hw_breakpoint_control(), The FWPnCFG1.2.4/MWPnCFG1.2.4 register settings are not distinguished. They should be set based on hardware watchpoint functions (fetch or load/store operations). 2. In breakpoint_handler() and watchpoint_handler(), it doesn't identify which watchpoint is triggered. So, all watchpoint-related perf_event callbacks are called and siginfo is sent to the user space. This will cause user-space unable to determine which watchpoint is triggered. The kernel need to identity which watchpoint is triggered via MWPS/ FWPS registers, and then call the corresponding perf event callbacks to report siginfo to the user-space. Modify the relevant code to solve above issues. All changes according to the LoongArch Reference Manual: https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#control-and-status-registers-related-to-watchpoints With this patch: lihui@bogon:~$ gdb test ... (gdb) start ... Temporary breakpoint 1, main () at test.c:5 5 printf("start test\n"); (gdb) watch a Hardware watchpoint 2: a (gdb) hbreak 8 Hardware assisted breakpoint 3 at 0x1200006ec: file test.c, line 8. (gdb) c Continuing. start test Hardware watchpoint 2: a Old value = 0 New value = 1 main () at test.c:7 7 printf("a = %d\n", a); (gdb) c Continuing. a = 1 Breakpoint 3, main () at test.c:8 8 printf("end test\n"); (gdb) c Continuing. end test [Inferior 1 (process 778) exited normally] Cc: stable@vger.kernel.org Signed-off-by: Hui Li Signed-off-by: Huacai Chen --- arch/loongarch/kernel/hw_breakpoint.c | 57 ++++++++++++++++----------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/arch/loongarch/kernel/hw_breakpoint.c b/arch/loongarch/kernel/hw_breakpoint.c index e882df1f72db81..621ad7634df718 100644 --- a/arch/loongarch/kernel/hw_breakpoint.c +++ b/arch/loongarch/kernel/hw_breakpoint.c @@ -207,15 +207,15 @@ static int hw_breakpoint_control(struct perf_event *bp, switch (ops) { case HW_BREAKPOINT_INSTALL: /* Set the FWPnCFG/MWPnCFG 1~4 register. */ - write_wb_reg(CSR_CFG_ADDR, i, 0, info->address); - write_wb_reg(CSR_CFG_ADDR, i, 1, info->address); - write_wb_reg(CSR_CFG_MASK, i, 0, info->mask); - write_wb_reg(CSR_CFG_MASK, i, 1, info->mask); - write_wb_reg(CSR_CFG_ASID, i, 0, 0); - write_wb_reg(CSR_CFG_ASID, i, 1, 0); if (info->ctrl.type == LOONGARCH_BREAKPOINT_EXECUTE) { + write_wb_reg(CSR_CFG_ADDR, i, 0, info->address); + write_wb_reg(CSR_CFG_MASK, i, 0, info->mask); + write_wb_reg(CSR_CFG_ASID, i, 0, 0); write_wb_reg(CSR_CFG_CTRL, i, 0, privilege); } else { + write_wb_reg(CSR_CFG_ADDR, i, 1, info->address); + write_wb_reg(CSR_CFG_MASK, i, 1, info->mask); + write_wb_reg(CSR_CFG_ASID, i, 1, 0); ctrl = encode_ctrl_reg(info->ctrl); write_wb_reg(CSR_CFG_CTRL, i, 1, ctrl | privilege); } @@ -226,14 +226,17 @@ static int hw_breakpoint_control(struct perf_event *bp, break; case HW_BREAKPOINT_UNINSTALL: /* Reset the FWPnCFG/MWPnCFG 1~4 register. */ - write_wb_reg(CSR_CFG_ADDR, i, 0, 0); - write_wb_reg(CSR_CFG_ADDR, i, 1, 0); - write_wb_reg(CSR_CFG_MASK, i, 0, 0); - write_wb_reg(CSR_CFG_MASK, i, 1, 0); - write_wb_reg(CSR_CFG_CTRL, i, 0, 0); - write_wb_reg(CSR_CFG_CTRL, i, 1, 0); - write_wb_reg(CSR_CFG_ASID, i, 0, 0); - write_wb_reg(CSR_CFG_ASID, i, 1, 0); + if (info->ctrl.type == LOONGARCH_BREAKPOINT_EXECUTE) { + write_wb_reg(CSR_CFG_ADDR, i, 0, 0); + write_wb_reg(CSR_CFG_MASK, i, 0, 0); + write_wb_reg(CSR_CFG_CTRL, i, 0, 0); + write_wb_reg(CSR_CFG_ASID, i, 0, 0); + } else { + write_wb_reg(CSR_CFG_ADDR, i, 1, 0); + write_wb_reg(CSR_CFG_MASK, i, 1, 0); + write_wb_reg(CSR_CFG_CTRL, i, 1, 0); + write_wb_reg(CSR_CFG_ASID, i, 1, 0); + } if (bp->hw.target) regs->csr_prmd &= ~CSR_PRMD_PWE; break; @@ -476,12 +479,15 @@ void breakpoint_handler(struct pt_regs *regs) slots = this_cpu_ptr(bp_on_reg); for (i = 0; i < boot_cpu_data.watch_ireg_count; ++i) { - bp = slots[i]; - if (bp == NULL) - continue; - perf_bp_event(bp, regs); + if ((csr_read32(LOONGARCH_CSR_FWPS) & (0x1 << i))) { + bp = slots[i]; + if (bp == NULL) + continue; + perf_bp_event(bp, regs); + csr_write32(0x1 << i, LOONGARCH_CSR_FWPS); + update_bp_registers(regs, 0, 0); + } } - update_bp_registers(regs, 0, 0); } NOKPROBE_SYMBOL(breakpoint_handler); @@ -493,12 +499,15 @@ void watchpoint_handler(struct pt_regs *regs) slots = this_cpu_ptr(wp_on_reg); for (i = 0; i < boot_cpu_data.watch_dreg_count; ++i) { - wp = slots[i]; - if (wp == NULL) - continue; - perf_bp_event(wp, regs); + if ((csr_read32(LOONGARCH_CSR_MWPS) & (0x1 << i))) { + wp = slots[i]; + if (wp == NULL) + continue; + perf_bp_event(wp, regs); + csr_write32(0x1 << i, LOONGARCH_CSR_MWPS); + update_bp_registers(regs, 0, 1); + } } - update_bp_registers(regs, 0, 1); } NOKPROBE_SYMBOL(watchpoint_handler); From d0a1c07739e1b7f74683fe061545669156d102f2 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 21 Jun 2024 10:18:40 +0800 Subject: [PATCH 1256/1578] LoongArch: KVM: Remove an unneeded semicolon Remove an unneeded semicolon to avoid build warnings: ./arch/loongarch/kvm/exit.c:764:2-3: Unneeded semicolon Cc: stable@vger.kernel.org Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=9343 Signed-off-by: Yang Li Signed-off-by: Huacai Chen --- arch/loongarch/kvm/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c index c86e099af5cad0..a68573e091c013 100644 --- a/arch/loongarch/kvm/exit.c +++ b/arch/loongarch/kvm/exit.c @@ -761,7 +761,7 @@ static void kvm_handle_service(struct kvm_vcpu *vcpu) default: ret = KVM_HCALL_INVALID_CODE; break; - }; + } kvm_write_reg(vcpu, LOONGARCH_GPR_A0, ret); } From ad53f5f54f351e967128edbc431f0f26427172cf Mon Sep 17 00:00:00 2001 From: Tristram Ha Date: Tue, 18 Jun 2024 17:16:42 -0700 Subject: [PATCH 1257/1578] net: dsa: microchip: fix initial port flush problem The very first flush in any port will flush all learned addresses in all ports. This can be observed by unplugging the cable from one port while additional ports are connected and dumping the fdb entries. This problem is caused by the initially wrong value programmed to the REG_SW_LUE_CTRL_1 register. Setting SW_FLUSH_STP_TABLE and SW_FLUSH_MSTP_TABLE bits does not have an immediate effect. It is when ksz9477_flush_dyn_mac_table() is called then the SW_FLUSH_STP_TABLE bit takes effect and flushes all learned entries. After that call both bits are reset and so the next port flush will not cause such problem again. Fixes: b987e98e50ab ("dsa: add DSA switch driver for Microchip KSZ9477") Signed-off-by: Tristram Ha Link: https://patch.msgid.link/1718756202-2731-1-git-send-email-Tristram.Ha@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/ksz9477.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/microchip/ksz9477.c b/drivers/net/dsa/microchip/ksz9477.c index f8ad7833f5d9df..2231128eef8b8f 100644 --- a/drivers/net/dsa/microchip/ksz9477.c +++ b/drivers/net/dsa/microchip/ksz9477.c @@ -355,10 +355,8 @@ int ksz9477_reset_switch(struct ksz_device *dev) SPI_AUTO_EDGE_DETECTION, 0); /* default configuration */ - ksz_read8(dev, REG_SW_LUE_CTRL_1, &data8); - data8 = SW_AGING_ENABLE | SW_LINK_AUTO_AGING | - SW_SRC_ADDR_FILTER | SW_FLUSH_STP_TABLE | SW_FLUSH_MSTP_TABLE; - ksz_write8(dev, REG_SW_LUE_CTRL_1, data8); + ksz_write8(dev, REG_SW_LUE_CTRL_1, + SW_AGING_ENABLE | SW_LINK_AUTO_AGING | SW_SRC_ADDR_FILTER); /* disable interrupts */ ksz_write32(dev, REG_SW_INT_MASK__4, SWITCH_INT_MASK); From ad22051afdad962b6012f3823d0ed1a735935386 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Ca=C3=B1o?= Date: Thu, 20 Jun 2024 17:25:33 +0200 Subject: [PATCH 1258/1578] ALSA: hda/realtek: Add quirk for Lenovo Yoga Pro 7 14AHP9 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lenovo Yoga Pro 7 14AHP9 (PCI SSID 17aa:3891) seems requiring a similar workaround like Yoga 9 model and Yoga 7 Pro 14APH8 for the bass speaker. Cc: Link: https://lore.kernel.org/all/20231207182035.30248-1-tiwai@suse.de/ Signed-off-by: Pablo Caño Link: https://patch.msgid.link/20240620152533.76712-1-pablocpascual@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index e2dbcf8f5bcfb3..f4454abadc8d9e 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10555,6 +10555,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x3882, "Lenovo Yoga Pro 7 14APH8", ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK_PIN), SND_PCI_QUIRK(0x17aa, 0x3884, "Y780 YG DUAL", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x3886, "Y780 VECO DUAL", ALC287_FIXUP_TAS2781_I2C), + SND_PCI_QUIRK(0x17aa, 0x3891, "Lenovo Yoga Pro 7 14AHP9", ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK_PIN), SND_PCI_QUIRK(0x17aa, 0x38a7, "Y780P AMD YG dual", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38a8, "Y780P AMD VECO dual", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38a9, "Thinkbook 16P", ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD), From 4a3e37b3caea817199757a0b13aa53dd7c9376c8 Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Sun, 16 Jun 2024 14:25:02 +0100 Subject: [PATCH 1259/1578] MIPS: mipsmtregs: Fix target register for MFTC0 Target register of mftc0 should be __res instead of $1, this is a leftover from old .insn code. Fixes: dd6d29a61489 ("MIPS: Implement microMIPS MT ASE helpers") Cc: stable@vger.kernel.org Signed-off-by: Jiaxun Yang Signed-off-by: Thomas Bogendoerfer --- arch/mips/include/asm/mipsmtregs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/include/asm/mipsmtregs.h b/arch/mips/include/asm/mipsmtregs.h index 30e86861c206ce..b1ee3c48e84baf 100644 --- a/arch/mips/include/asm/mipsmtregs.h +++ b/arch/mips/include/asm/mipsmtregs.h @@ -322,7 +322,7 @@ static inline void ehb(void) " .set push \n" \ " .set "MIPS_ISA_LEVEL" \n" \ _ASM_SET_MFTC0 \ - " mftc0 $1, " #rt ", " #sel " \n" \ + " mftc0 %0, " #rt ", " #sel " \n" \ _ASM_UNSET_MFTC0 \ " .set pop \n" \ : "=r" (__res)); \ From 0d5679a0aae2d8cda72169452c32e5cb88a7ab33 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 20 Jun 2024 18:23:04 +0200 Subject: [PATCH 1260/1578] mips: fix compat_sys_lseek syscall This is almost compatible, but passing a negative offset should result in a EINVAL error, but on mips o32 compat mode would seek to a large 32-bit byte offset. Use compat_sys_lseek() to correctly sign-extend the argument. Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/syscalls/syscall_o32.tbl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 008ebe60263e37..81428a2eb66048 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -27,7 +27,7 @@ 17 o32 break sys_ni_syscall # 18 was sys_stat 18 o32 unused18 sys_ni_syscall -19 o32 lseek sys_lseek +19 o32 lseek sys_lseek compat_sys_lseek 20 o32 getpid sys_getpid 21 o32 mount sys_mount 22 o32 umount sys_oldumount From d3e2904f71ea0fe7eaff1d68a2b0363c888ea0fb Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 17 Nov 2023 13:49:59 +0100 Subject: [PATCH 1261/1578] net: can: j1939: enhanced error handling for tightly received RTS messages in xtp_rx_rts_session_new This patch enhances error handling in scenarios with RTS (Request to Send) messages arriving closely. It replaces the less informative WARN_ON_ONCE backtraces with a new error handling method. This provides clearer error messages and allows for the early termination of problematic sessions. Previously, sessions were only released at the end of j1939_xtp_rx_rts(). Potentially this could be reproduced with something like: testj1939 -r vcan0:0x80 & while true; do # send first RTS cansend vcan0 18EC8090#1014000303002301; # send second RTS cansend vcan0 18EC8090#1014000303002301; # send abort cansend vcan0 18EC8090#ff00000000002301; done Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Reported-by: syzbot+daa36413a5cedf799ae4@syzkaller.appspotmail.com Cc: stable@vger.kernel.org Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/all/20231117124959.961171-1-o.rempel@pengutronix.de Signed-off-by: Marc Kleine-Budde --- net/can/j1939/transport.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index fe3df23a259578..c6569f98d25144 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -1593,8 +1593,8 @@ j1939_session *j1939_xtp_rx_rts_session_new(struct j1939_priv *priv, struct j1939_sk_buff_cb skcb = *j1939_skb_to_cb(skb); struct j1939_session *session; const u8 *dat; + int len, ret; pgn_t pgn; - int len; netdev_dbg(priv->ndev, "%s\n", __func__); @@ -1653,7 +1653,22 @@ j1939_session *j1939_xtp_rx_rts_session_new(struct j1939_priv *priv, session->tskey = priv->rx_tskey++; j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_RTS); - WARN_ON_ONCE(j1939_session_activate(session)); + ret = j1939_session_activate(session); + if (ret) { + /* Entering this scope indicates an issue with the J1939 bus. + * Possible scenarios include: + * - A time lapse occurred, and a new session was initiated + * due to another packet being sent correctly. This could + * have been caused by too long interrupt, debugger, or being + * out-scheduled by another task. + * - The bus is receiving numerous erroneous packets, either + * from a malfunctioning device or during a test scenario. + */ + netdev_alert(priv->ndev, "%s: 0x%p: concurrent session with same addr (%02x %02x) is already active.\n", + __func__, session, skcb.addr.sa, skcb.addr.da); + j1939_session_put(session); + return NULL; + } return session; } From b7cdf1dd5d2a2d8200efd98d1893684db48fe134 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Fri, 17 May 2024 12:59:53 +0900 Subject: [PATCH 1262/1578] net: can: j1939: Initialize unused data in j1939_send_one() syzbot reported kernel-infoleak in raw_recvmsg() [1]. j1939_send_one() creates full frame including unused data, but it doesn't initialize it. This causes the kernel-infoleak issue. Fix this by initializing unused data. [1] BUG: KMSAN: kernel-infoleak in instrument_copy_to_user include/linux/instrumented.h:114 [inline] BUG: KMSAN: kernel-infoleak in copy_to_user_iter lib/iov_iter.c:24 [inline] BUG: KMSAN: kernel-infoleak in iterate_ubuf include/linux/iov_iter.h:29 [inline] BUG: KMSAN: kernel-infoleak in iterate_and_advance2 include/linux/iov_iter.h:245 [inline] BUG: KMSAN: kernel-infoleak in iterate_and_advance include/linux/iov_iter.h:271 [inline] BUG: KMSAN: kernel-infoleak in _copy_to_iter+0x366/0x2520 lib/iov_iter.c:185 instrument_copy_to_user include/linux/instrumented.h:114 [inline] copy_to_user_iter lib/iov_iter.c:24 [inline] iterate_ubuf include/linux/iov_iter.h:29 [inline] iterate_and_advance2 include/linux/iov_iter.h:245 [inline] iterate_and_advance include/linux/iov_iter.h:271 [inline] _copy_to_iter+0x366/0x2520 lib/iov_iter.c:185 copy_to_iter include/linux/uio.h:196 [inline] memcpy_to_msg include/linux/skbuff.h:4113 [inline] raw_recvmsg+0x2b8/0x9e0 net/can/raw.c:1008 sock_recvmsg_nosec net/socket.c:1046 [inline] sock_recvmsg+0x2c4/0x340 net/socket.c:1068 ____sys_recvmsg+0x18a/0x620 net/socket.c:2803 ___sys_recvmsg+0x223/0x840 net/socket.c:2845 do_recvmmsg+0x4fc/0xfd0 net/socket.c:2939 __sys_recvmmsg net/socket.c:3018 [inline] __do_sys_recvmmsg net/socket.c:3041 [inline] __se_sys_recvmmsg net/socket.c:3034 [inline] __x64_sys_recvmmsg+0x397/0x490 net/socket.c:3034 x64_sys_call+0xf6c/0x3b50 arch/x86/include/generated/asm/syscalls_64.h:300 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcf/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Uninit was created at: slab_post_alloc_hook mm/slub.c:3804 [inline] slab_alloc_node mm/slub.c:3845 [inline] kmem_cache_alloc_node+0x613/0xc50 mm/slub.c:3888 kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:577 __alloc_skb+0x35b/0x7a0 net/core/skbuff.c:668 alloc_skb include/linux/skbuff.h:1313 [inline] alloc_skb_with_frags+0xc8/0xbf0 net/core/skbuff.c:6504 sock_alloc_send_pskb+0xa81/0xbf0 net/core/sock.c:2795 sock_alloc_send_skb include/net/sock.h:1842 [inline] j1939_sk_alloc_skb net/can/j1939/socket.c:878 [inline] j1939_sk_send_loop net/can/j1939/socket.c:1142 [inline] j1939_sk_sendmsg+0xc0a/0x2730 net/can/j1939/socket.c:1277 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x30f/0x380 net/socket.c:745 ____sys_sendmsg+0x877/0xb60 net/socket.c:2584 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2638 __sys_sendmsg net/socket.c:2667 [inline] __do_sys_sendmsg net/socket.c:2676 [inline] __se_sys_sendmsg net/socket.c:2674 [inline] __x64_sys_sendmsg+0x307/0x4a0 net/socket.c:2674 x64_sys_call+0xc4b/0x3b50 arch/x86/include/generated/asm/syscalls_64.h:47 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcf/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Bytes 12-15 of 16 are uninitialized Memory access of size 16 starts at ffff888120969690 Data copied to user address 00000000200017c0 CPU: 1 PID: 5050 Comm: syz-executor198 Not tainted 6.9.0-rc5-syzkaller-00031-g71b1543c83d6 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024 Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Reported-and-tested-by: syzbot+5681e40d297b30f5b513@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=5681e40d297b30f5b513 Acked-by: Oleksij Rempel Signed-off-by: Shigeru Yoshida Link: https://lore.kernel.org/all/20240517035953.2617090-1-syoshida@redhat.com Cc: stable@vger.kernel.org Signed-off-by: Marc Kleine-Budde --- net/can/j1939/main.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index a6fb89fa627851..7e8a20f2fc42b5 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -30,10 +30,6 @@ MODULE_ALIAS("can-proto-" __stringify(CAN_J1939)); /* CAN_HDR: #bytes before can_frame data part */ #define J1939_CAN_HDR (offsetof(struct can_frame, data)) -/* CAN_FTR: #bytes beyond data part */ -#define J1939_CAN_FTR (sizeof(struct can_frame) - J1939_CAN_HDR - \ - sizeof(((struct can_frame *)0)->data)) - /* lowest layer */ static void j1939_can_recv(struct sk_buff *iskb, void *data) { @@ -342,7 +338,7 @@ int j1939_send_one(struct j1939_priv *priv, struct sk_buff *skb) memset(cf, 0, J1939_CAN_HDR); /* make it a full can frame again */ - skb_put(skb, J1939_CAN_FTR + (8 - dlc)); + skb_put_zero(skb, 8 - dlc); canid = CAN_EFF_FLAG | (skcb->priority << 26) | From 9ad1da14ab3bf23087ae45fe399d84a109ddb81a Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Tue, 28 May 2024 09:06:48 +0200 Subject: [PATCH 1263/1578] net: can: j1939: recover socket queue on CAN bus error during BAM transmission MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses an issue where a CAN bus error during a BAM transmission could stall the socket queue, preventing further transmissions even after the bus error is resolved. The fix activates the next queued session after the error recovery, allowing communication to continue. Fixes: 9d71dd0c70099 ("can: add support of SAE J1939 protocol") Cc: stable@vger.kernel.org Reported-by: Alexander Hölzl Tested-by: Alexander Hölzl Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/all/20240528070648.1947203-1-o.rempel@pengutronix.de Cc: stable@vger.kernel.org Signed-off-by: Marc Kleine-Budde --- net/can/j1939/transport.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index c6569f98d25144..4be73de5033cb7 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -1696,6 +1696,8 @@ static int j1939_xtp_rx_rts_session_active(struct j1939_session *session, j1939_session_timers_cancel(session); j1939_session_cancel(session, J1939_XTP_ABORT_BUSY); + if (session->transmission) + j1939_session_deactivate_activate_next(session); return -EBUSY; } From 0d34d8163fd87978a6abd792e2d8ad849f4c3d57 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Tue, 21 May 2024 12:10:20 +0800 Subject: [PATCH 1264/1578] can: kvaser_usb: fix return value for hif_usb_send_regout As the potential failure of usb_submit_urb(), it should be better to return the err variable to catch the error. Signed-off-by: Chen Ni Link: https://lore.kernel.org/all/20240521041020.1519416-1-nichen@iscas.ac.cn Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c index 8faf8a462c0550..7292c81fc0cdc0 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c @@ -294,7 +294,7 @@ int kvaser_usb_send_cmd_async(struct kvaser_usb_net_priv *priv, void *cmd, } usb_free_urb(urb); - return 0; + return err; } int kvaser_usb_can_rx_over_error(struct net_device *netdev) From d8fb63e46c884c898a38f061c2330f7729e75510 Mon Sep 17 00:00:00 2001 From: Vitor Soares Date: Fri, 17 May 2024 14:43:55 +0100 Subject: [PATCH 1265/1578] can: mcp251xfd: fix infinite loop when xmit fails When the mcp251xfd_start_xmit() function fails, the driver stops processing messages, and the interrupt routine does not return, running indefinitely even after killing the running application. Error messages: [ 441.298819] mcp251xfd spi2.0 can0: ERROR in mcp251xfd_start_xmit: -16 [ 441.306498] mcp251xfd spi2.0 can0: Transmit Event FIFO buffer not empty. (seq=0x000017c7, tef_tail=0x000017cf, tef_head=0x000017d0, tx_head=0x000017d3). ... and repeat forever. The issue can be triggered when multiple devices share the same SPI interface. And there is concurrent access to the bus. The problem occurs because tx_ring->head increments even if mcp251xfd_start_xmit() fails. Consequently, the driver skips one TX package while still expecting a response in mcp251xfd_handle_tefif_one(). Resolve the issue by starting a workqueue to write the tx obj synchronously if err = -EBUSY. In case of another error, decrement tx_ring->head, remove skb from the echo stack, and drop the message. Fixes: 55e5b97f003e ("can: mcp25xxfd: add driver for Microchip MCP25xxFD SPI CAN") Cc: stable@vger.kernel.org Signed-off-by: Vitor Soares Link: https://lore.kernel.org/all/20240517134355.770777-1-ivitro@gmail.com [mkl: use more imperative wording in patch description] Signed-off-by: Marc Kleine-Budde --- .../net/can/spi/mcp251xfd/mcp251xfd-core.c | 14 ++++- drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c | 55 ++++++++++++++++--- drivers/net/can/spi/mcp251xfd/mcp251xfd.h | 5 ++ 3 files changed, 65 insertions(+), 9 deletions(-) diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index 1d9057dc44f25e..bf1589aef1fc6b 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -1618,11 +1618,20 @@ static int mcp251xfd_open(struct net_device *ndev) clear_bit(MCP251XFD_FLAGS_DOWN, priv->flags); can_rx_offload_enable(&priv->offload); + priv->wq = alloc_ordered_workqueue("%s-mcp251xfd_wq", + WQ_FREEZABLE | WQ_MEM_RECLAIM, + dev_name(&spi->dev)); + if (!priv->wq) { + err = -ENOMEM; + goto out_can_rx_offload_disable; + } + INIT_WORK(&priv->tx_work, mcp251xfd_tx_obj_write_sync); + err = request_threaded_irq(spi->irq, NULL, mcp251xfd_irq, IRQF_SHARED | IRQF_ONESHOT, dev_name(&spi->dev), priv); if (err) - goto out_can_rx_offload_disable; + goto out_destroy_workqueue; err = mcp251xfd_chip_interrupts_enable(priv); if (err) @@ -1634,6 +1643,8 @@ static int mcp251xfd_open(struct net_device *ndev) out_free_irq: free_irq(spi->irq, priv); + out_destroy_workqueue: + destroy_workqueue(priv->wq); out_can_rx_offload_disable: can_rx_offload_disable(&priv->offload); set_bit(MCP251XFD_FLAGS_DOWN, priv->flags); @@ -1661,6 +1672,7 @@ static int mcp251xfd_stop(struct net_device *ndev) hrtimer_cancel(&priv->tx_irq_timer); mcp251xfd_chip_interrupts_disable(priv); free_irq(ndev->irq, priv); + destroy_workqueue(priv->wq); can_rx_offload_disable(&priv->offload); mcp251xfd_timestamp_stop(priv); mcp251xfd_chip_stop(priv, CAN_STATE_STOPPED); diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c index 160528d3cc26b1..b1de8052a45ccb 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c @@ -131,6 +131,39 @@ mcp251xfd_tx_obj_from_skb(const struct mcp251xfd_priv *priv, tx_obj->xfer[0].len = len; } +static void mcp251xfd_tx_failure_drop(const struct mcp251xfd_priv *priv, + struct mcp251xfd_tx_ring *tx_ring, + int err) +{ + struct net_device *ndev = priv->ndev; + struct net_device_stats *stats = &ndev->stats; + unsigned int frame_len = 0; + u8 tx_head; + + tx_ring->head--; + stats->tx_dropped++; + tx_head = mcp251xfd_get_tx_head(tx_ring); + can_free_echo_skb(ndev, tx_head, &frame_len); + netdev_completed_queue(ndev, 1, frame_len); + netif_wake_queue(ndev); + + if (net_ratelimit()) + netdev_err(priv->ndev, "ERROR in %s: %d\n", __func__, err); +} + +void mcp251xfd_tx_obj_write_sync(struct work_struct *work) +{ + struct mcp251xfd_priv *priv = container_of(work, struct mcp251xfd_priv, + tx_work); + struct mcp251xfd_tx_obj *tx_obj = priv->tx_work_obj; + struct mcp251xfd_tx_ring *tx_ring = priv->tx; + int err; + + err = spi_sync(priv->spi, &tx_obj->msg); + if (err) + mcp251xfd_tx_failure_drop(priv, tx_ring, err); +} + static int mcp251xfd_tx_obj_write(const struct mcp251xfd_priv *priv, struct mcp251xfd_tx_obj *tx_obj) { @@ -162,6 +195,11 @@ static bool mcp251xfd_tx_busy(const struct mcp251xfd_priv *priv, return false; } +static bool mcp251xfd_work_busy(struct work_struct *work) +{ + return work_busy(work); +} + netdev_tx_t mcp251xfd_start_xmit(struct sk_buff *skb, struct net_device *ndev) { @@ -175,7 +213,8 @@ netdev_tx_t mcp251xfd_start_xmit(struct sk_buff *skb, if (can_dev_dropped_skb(ndev, skb)) return NETDEV_TX_OK; - if (mcp251xfd_tx_busy(priv, tx_ring)) + if (mcp251xfd_tx_busy(priv, tx_ring) || + mcp251xfd_work_busy(&priv->tx_work)) return NETDEV_TX_BUSY; tx_obj = mcp251xfd_get_tx_obj_next(tx_ring); @@ -193,13 +232,13 @@ netdev_tx_t mcp251xfd_start_xmit(struct sk_buff *skb, netdev_sent_queue(priv->ndev, frame_len); err = mcp251xfd_tx_obj_write(priv, tx_obj); - if (err) - goto out_err; - - return NETDEV_TX_OK; - - out_err: - netdev_err(priv->ndev, "ERROR in %s: %d\n", __func__, err); + if (err == -EBUSY) { + netif_stop_queue(ndev); + priv->tx_work_obj = tx_obj; + queue_work(priv->wq, &priv->tx_work); + } else if (err) { + mcp251xfd_tx_failure_drop(priv, tx_ring, err); + } return NETDEV_TX_OK; } diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd.h b/drivers/net/can/spi/mcp251xfd/mcp251xfd.h index 24510b3b80203e..b35bfebd23f292 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd.h +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd.h @@ -633,6 +633,10 @@ struct mcp251xfd_priv { struct mcp251xfd_rx_ring *rx[MCP251XFD_FIFO_RX_NUM]; struct mcp251xfd_tx_ring tx[MCP251XFD_FIFO_TX_NUM]; + struct workqueue_struct *wq; + struct work_struct tx_work; + struct mcp251xfd_tx_obj *tx_work_obj; + DECLARE_BITMAP(flags, __MCP251XFD_FLAGS_SIZE__); u8 rx_ring_num; @@ -952,6 +956,7 @@ void mcp251xfd_skb_set_timestamp(const struct mcp251xfd_priv *priv, void mcp251xfd_timestamp_init(struct mcp251xfd_priv *priv); void mcp251xfd_timestamp_stop(struct mcp251xfd_priv *priv); +void mcp251xfd_tx_obj_write_sync(struct work_struct *work); netdev_tx_t mcp251xfd_start_xmit(struct sk_buff *skb, struct net_device *ndev); From a23ac973f67f37e77b3c634e8b1ad5b0164fcc1f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 19 Jun 2024 18:08:56 -0400 Subject: [PATCH 1266/1578] openvswitch: get related ct labels from its master if it is not confirmed Ilya found a failure in running check-kernel tests with at_groups=144 (144: conntrack - FTP SNAT orig tuple) in OVS repo. After his further investigation, the root cause is that the labels sent to userspace for related ct are incorrect. The labels for unconfirmed related ct should use its master's labels. However, the changes made in commit 8c8b73320805 ("openvswitch: set IPS_CONFIRMED in tmpl status only when commit is set in conntrack") led to getting labels from this related ct. So fix it in ovs_ct_get_labels() by changing to copy labels from its master ct if it is a unconfirmed related ct. Note that there is no fix needed for ct->mark, as it was already copied from its master ct for related ct in init_conntrack(). Fixes: 8c8b73320805 ("openvswitch: set IPS_CONFIRMED in tmpl status only when commit is set in conntrack") Reported-by: Ilya Maximets Signed-off-by: Xin Long Reviewed-by: Ilya Maximets Tested-by: Ilya Maximets Reviewed-by: Aaron Conole Signed-off-by: David S. Miller --- net/openvswitch/conntrack.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 2928c142a2ddb3..3b980bf2770bb2 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -168,8 +168,13 @@ static u32 ovs_ct_get_mark(const struct nf_conn *ct) static void ovs_ct_get_labels(const struct nf_conn *ct, struct ovs_key_ct_labels *labels) { - struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL; + struct nf_conn_labels *cl = NULL; + if (ct) { + if (ct->master && !nf_ct_is_confirmed(ct)) + ct = ct->master; + cl = nf_ct_labels_find(ct); + } if (cl) memcpy(labels, cl->bits, OVS_CT_LABELS_LEN); else From 17563b4a19d1844bdbccc7a82d2f31c28ca9cfae Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 21 Jun 2024 09:39:09 +0200 Subject: [PATCH 1267/1578] ALSA: hda: Use imply for suggesting CONFIG_SERIAL_MULTI_INSTANTIATE The recent fix introduced a reverse selection of CONFIG_SERIAL_MULTI_INSTANTIATE, but its condition isn't always met. Use a weak reverse selection to suggest the config for avoiding such inconsistencies, instead. Fixes: 9b1effff19cd ("ALSA: hda: cs35l56: Select SERIAL_MULTI_INSTANTIATE") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202406210732.ozgk8IMK-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202406211244.oLhoF3My-lkp@intel.com/ Reviewed-by: Richard Fitzgerald Link: https://patch.msgid.link/20240621073915.19576-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/Kconfig b/sound/pci/hda/Kconfig index e59df40a0007a8..a3cf0725fc43b0 100644 --- a/sound/pci/hda/Kconfig +++ b/sound/pci/hda/Kconfig @@ -162,7 +162,7 @@ config SND_HDA_SCODEC_CS35L56_I2C depends on ACPI || COMPILE_TEST depends on SND_SOC select FW_CS_DSP - select SERIAL_MULTI_INSTANTIATE + imply SERIAL_MULTI_INSTANTIATE select SND_HDA_GENERIC select SND_SOC_CS35L56_SHARED select SND_HDA_SCODEC_CS35L56 @@ -179,7 +179,7 @@ config SND_HDA_SCODEC_CS35L56_SPI depends on ACPI || COMPILE_TEST depends on SND_SOC select FW_CS_DSP - select SERIAL_MULTI_INSTANTIATE + imply SERIAL_MULTI_INSTANTIATE select SND_HDA_GENERIC select SND_SOC_CS35L56_SHARED select SND_HDA_SCODEC_CS35L56 From 00418d5530ca1f42d8721fe0a3e73d1ae477c223 Mon Sep 17 00:00:00 2001 From: Aryan Srivastava Date: Thu, 20 Jun 2024 16:12:02 +1200 Subject: [PATCH 1268/1578] net: mvpp2: fill-in dev_port attribute Fill this in so user-space can identify multiple ports on the same CP unit. Signed-off-by: Aryan Srivastava Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 671368d2c77e6a..9adf4301c9b1d7 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -6907,6 +6907,7 @@ static int mvpp2_port_probe(struct platform_device *pdev, /* 9704 == 9728 - 20 and rounding to 8 */ dev->max_mtu = MVPP2_BM_JUMBO_PKT_SIZE; device_set_node(&dev->dev, port_fwnode); + dev->dev_port = port->id; port->pcs_gmac.ops = &mvpp2_phylink_gmac_pcs_ops; port->pcs_gmac.neg_mode = true; From a95b031c6796bf9972da2d4b4b524a57734f3a0a Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 20 Jun 2024 16:56:26 +0800 Subject: [PATCH 1269/1578] bonding: fix incorrect software timestamping report The __ethtool_get_ts_info function returns directly if the device has a get_ts_info() method. For bonding with an active slave, this works correctly as we simply return the real device's timestamping information. However, when there is no active slave, we only check the slave's TX software timestamp information. We still need to set the phc index and RX timestamp information manually. Otherwise, the result will be look like: Time stamping parameters for bond0: Capabilities: software-transmit PTP Hardware Clock: 0 Hardware Transmit Timestamp Modes: none Hardware Receive Filter Modes: none This issue does not affect VLAN or MACVLAN devices, as they only have one downlink and can directly use the downlink's timestamping information. Fixes: b8768dc40777 ("net: ethtool: Refactor identical get_ts_info implementations.") Reported-by: Liang Li Closes: https://issues.redhat.com/browse/RHEL-42409 Signed-off-by: Hangbin Liu Acked-by: Kory Maincent Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 3c3fcce4acd48c..d19aabf5d4fba7 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -5773,6 +5773,9 @@ static int bond_ethtool_get_ts_info(struct net_device *bond_dev, if (real_dev) { ret = ethtool_get_ts_info_by_layer(real_dev, info); } else { + info->phc_index = -1; + info->so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE | + SOF_TIMESTAMPING_SOFTWARE; /* Check if all slaves support software tx timestamping */ rcu_read_lock(); bond_for_each_slave_rcu(bond, slave, iter) { From 7eadf50095bc35c0e17e66cab617a934888edc20 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Thu, 20 Jun 2024 11:57:50 +0200 Subject: [PATCH 1270/1578] net: pse-pd: Kconfig: Fix missing firmware loader config select Selecting FW_UPLOAD is not sufficient as it allows the firmware loader API to be built as a module alongside the pd692x0 driver built as builtin. Add select FW_LOADER to fix this issue. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202406200632.hSChnX0g-lkp@intel.com/ Fixes: 9a9938451890 ("net: pse-pd: Add PD692x0 PSE controller driver") Signed-off-by: Kory Maincent Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/pse-pd/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/pse-pd/Kconfig b/drivers/net/pse-pd/Kconfig index 577ea904b3d919..7fab916a7f46ab 100644 --- a/drivers/net/pse-pd/Kconfig +++ b/drivers/net/pse-pd/Kconfig @@ -23,6 +23,7 @@ config PSE_REGULATOR config PSE_PD692X0 tristate "PD692X0 PSE controller" depends on I2C + select FW_LOADER select FW_UPLOAD help This module provides support for PD692x0 regulator based Ethernet From e3f02f32a05009a688a87f5799e049ed6b55bab5 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 20 Jun 2024 10:58:08 +0000 Subject: [PATCH 1271/1578] ionic: fix kernel panic due to multi-buffer handling Currently, the ionic_run_xdp() doesn't handle multi-buffer packets properly for XDP_TX and XDP_REDIRECT. When a jumbo frame is received, the ionic_run_xdp() first makes xdp frame with all necessary pages in the rx descriptor. And if the action is either XDP_TX or XDP_REDIRECT, it should unmap dma-mapping and reset page pointer to NULL for all pages, not only the first page. But it doesn't for SG pages. So, SG pages unexpectedly will be reused. It eventually causes kernel panic. Oops: general protection fault, probably for non-canonical address 0x504f4e4dbebc64ff: 0000 [#1] PREEMPT SMP NOPTI CPU: 3 PID: 0 Comm: swapper/3 Not tainted 6.10.0-rc3+ #25 RIP: 0010:xdp_return_frame+0x42/0x90 Code: 01 75 12 5b 4c 89 e6 5d 31 c9 41 5c 31 d2 41 5d e9 73 fd ff ff 44 8b 6b 20 0f b7 43 0a 49 81 ed 68 01 00 00 49 29 c5 49 01 fd <41> 80 7d0 RSP: 0018:ffff99d00122ce08 EFLAGS: 00010202 RAX: 0000000000005453 RBX: ffff8d325f904000 RCX: 0000000000000001 RDX: 00000000670e1000 RSI: 000000011f90d000 RDI: 504f4e4d4c4b4a49 RBP: ffff99d003907740 R08: 0000000000000000 R09: 0000000000000000 R10: 000000011f90d000 R11: 0000000000000000 R12: ffff8d325f904010 R13: 504f4e4dbebc64fd R14: ffff8d3242b070c8 R15: ffff99d0039077c0 FS: 0000000000000000(0000) GS:ffff8d399f780000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f41f6c85e38 CR3: 000000037ac30000 CR4: 00000000007506f0 PKRU: 55555554 Call Trace: ? die_addr+0x33/0x90 ? exc_general_protection+0x251/0x2f0 ? asm_exc_general_protection+0x22/0x30 ? xdp_return_frame+0x42/0x90 ionic_tx_clean+0x211/0x280 [ionic 15881354510e6a9c655c59c54812b319ed2cd015] ionic_tx_cq_service+0xd3/0x210 [ionic 15881354510e6a9c655c59c54812b319ed2cd015] ionic_txrx_napi+0x41/0x1b0 [ionic 15881354510e6a9c655c59c54812b319ed2cd015] __napi_poll.constprop.0+0x29/0x1b0 net_rx_action+0x2c4/0x350 handle_softirqs+0xf4/0x320 irq_exit_rcu+0x78/0xa0 common_interrupt+0x77/0x90 Fixes: 5377805dc1c0 ("ionic: implement xdp frags support") Signed-off-by: Taehee Yoo Reviewed-by: Shannon Nelson Signed-off-by: David S. Miller --- .../net/ethernet/pensando/ionic/ionic_txrx.c | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c index 2427610f4306d9..aed7d9cbce038f 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c @@ -480,6 +480,20 @@ int ionic_xdp_xmit(struct net_device *netdev, int n, return nxmit; } +static void ionic_xdp_rx_put_bufs(struct ionic_queue *q, + struct ionic_buf_info *buf_info, + int nbufs) +{ + int i; + + for (i = 0; i < nbufs; i++) { + dma_unmap_page(q->dev, buf_info->dma_addr, + IONIC_PAGE_SIZE, DMA_FROM_DEVICE); + buf_info->page = NULL; + buf_info++; + } +} + static bool ionic_run_xdp(struct ionic_rx_stats *stats, struct net_device *netdev, struct bpf_prog *xdp_prog, @@ -493,6 +507,7 @@ static bool ionic_run_xdp(struct ionic_rx_stats *stats, struct netdev_queue *nq; struct xdp_frame *xdpf; int remain_len; + int nbufs = 1; int frag_len; int err = 0; @@ -542,6 +557,7 @@ static bool ionic_run_xdp(struct ionic_rx_stats *stats, if (page_is_pfmemalloc(bi->page)) xdp_buff_set_frag_pfmemalloc(&xdp_buf); } while (remain_len > 0); + nbufs += sinfo->nr_frags; } xdp_action = bpf_prog_run_xdp(xdp_prog, &xdp_buf); @@ -574,9 +590,6 @@ static bool ionic_run_xdp(struct ionic_rx_stats *stats, goto out_xdp_abort; } - dma_unmap_page(rxq->dev, buf_info->dma_addr, - IONIC_PAGE_SIZE, DMA_FROM_DEVICE); - err = ionic_xdp_post_frame(txq, xdpf, XDP_TX, buf_info->page, buf_info->page_offset, @@ -586,23 +599,19 @@ static bool ionic_run_xdp(struct ionic_rx_stats *stats, netdev_dbg(netdev, "tx ionic_xdp_post_frame err %d\n", err); goto out_xdp_abort; } - buf_info->page = NULL; + ionic_xdp_rx_put_bufs(rxq, buf_info, nbufs); stats->xdp_tx++; /* the Tx completion will free the buffers */ break; case XDP_REDIRECT: - /* unmap the pages before handing them to a different device */ - dma_unmap_page(rxq->dev, buf_info->dma_addr, - IONIC_PAGE_SIZE, DMA_FROM_DEVICE); - err = xdp_do_redirect(netdev, &xdp_buf, xdp_prog); if (err) { netdev_dbg(netdev, "xdp_do_redirect err %d\n", err); goto out_xdp_abort; } - buf_info->page = NULL; + ionic_xdp_rx_put_bufs(rxq, buf_info, nbufs); rxq->xdp_flush = true; stats->xdp_redirect++; break; From cf6d9d2d243f242f51ee0666ca88e61d9408752f Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Tue, 4 Jun 2024 18:35:10 -0500 Subject: [PATCH 1272/1578] KVM: SEV-ES: Fix svm_get_msr()/svm_set_msr() for KVM_SEV_ES_INIT guests With commit 27bd5fdc24c0 ("KVM: SEV-ES: Prevent MSR access post VMSA encryption"), older VMMs like QEMU 9.0 and older will fail when booting SEV-ES guests with something like the following error: qemu-system-x86_64: error: failed to get MSR 0x174 qemu-system-x86_64: ../qemu.git/target/i386/kvm/kvm.c:3950: kvm_get_msrs: Assertion `ret == cpu->kvm_msr_buf->nmsrs' failed. This is because older VMMs that might still call svm_get_msr()/svm_set_msr() for SEV-ES guests after guest boot even if those interfaces were essentially just noops because of the vCPU state being encrypted and stored separately in the VMSA. Now those VMMs will get an -EINVAL and generally crash. Newer VMMs that are aware of KVM_SEV_INIT2 however are already aware of the stricter limitations of what vCPU state can be sync'd during guest run-time, so newer QEMU for instance will work both for legacy KVM_SEV_ES_INIT interface as well as KVM_SEV_INIT2. So when using KVM_SEV_INIT2 it's okay to assume userspace can deal with -EINVAL, whereas for legacy KVM_SEV_ES_INIT the kernel might be dealing with either an older VMM and so it needs to assume that returning -EINVAL might break the VMM. Address this by only returning -EINVAL if the guest was started with KVM_SEV_INIT2. Otherwise, just silently return. Cc: Ravi Bangoria Cc: Nikunj A Dadhania Reported-by: Srikanth Aithal Closes: https://lore.kernel.org/lkml/37usuu4yu4ok7be2hqexhmcyopluuiqj3k266z4gajc2rcj4yo@eujb23qc3zcm/ Fixes: 27bd5fdc24c0 ("KVM: SEV-ES: Prevent MSR access post VMSA encryption") Signed-off-by: Michael Roth Message-ID: <20240604233510.764949-1-michael.roth@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 296c524988f953..c95d3900fe5640 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2843,7 +2843,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (sev_es_prevent_msr_access(vcpu, msr_info)) { msr_info->data = 0; - return -EINVAL; + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; } switch (msr_info->index) { @@ -2998,7 +2998,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) u64 data = msr->data; if (sev_es_prevent_msr_access(vcpu, msr)) - return -EINVAL; + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; switch (ecx) { case MSR_AMD64_TSC_RATIO: From 62e58ddb146502faff1dd23164a20688624eaaed Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 20 Jun 2024 13:31:19 +0000 Subject: [PATCH 1273/1578] net: add softirq safety to netdev_rename_lock syzbot reported a lockdep violation involving bridge driver [1] Make sure netdev_rename_lock is softirq safe to fix this issue. [1] WARNING: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected 6.10.0-rc2-syzkaller-00249-gbe27b8965297 #0 Not tainted ----------------------------------------------------- syz-executor.2/9449 [HC0[0]:SC0[2]:HE0:SE0] is trying to acquire: ffffffff8f5de668 (netdev_rename_lock.seqcount){+.+.}-{0:0}, at: rtnl_fill_ifinfo+0x38e/0x2270 net/core/rtnetlink.c:1839 and this task is already holding: ffff888060c64cb8 (&br->lock){+.-.}-{2:2}, at: spin_lock_bh include/linux/spinlock.h:356 [inline] ffff888060c64cb8 (&br->lock){+.-.}-{2:2}, at: br_port_slave_changelink+0x3d/0x150 net/bridge/br_netlink.c:1212 which would create a new lock dependency: (&br->lock){+.-.}-{2:2} -> (netdev_rename_lock.seqcount){+.+.}-{0:0} but this new dependency connects a SOFTIRQ-irq-safe lock: (&br->lock){+.-.}-{2:2} ... which became SOFTIRQ-irq-safe at: lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754 __raw_spin_lock include/linux/spinlock_api_smp.h:133 [inline] _raw_spin_lock+0x2e/0x40 kernel/locking/spinlock.c:154 spin_lock include/linux/spinlock.h:351 [inline] br_forward_delay_timer_expired+0x50/0x440 net/bridge/br_stp_timer.c:86 call_timer_fn+0x18e/0x650 kernel/time/timer.c:1792 expire_timers kernel/time/timer.c:1843 [inline] __run_timers kernel/time/timer.c:2417 [inline] __run_timer_base+0x66a/0x8e0 kernel/time/timer.c:2428 run_timer_base kernel/time/timer.c:2437 [inline] run_timer_softirq+0xb7/0x170 kernel/time/timer.c:2447 handle_softirqs+0x2c4/0x970 kernel/softirq.c:554 __do_softirq kernel/softirq.c:588 [inline] invoke_softirq kernel/softirq.c:428 [inline] __irq_exit_rcu+0xf4/0x1c0 kernel/softirq.c:637 irq_exit_rcu+0x9/0x30 kernel/softirq.c:649 instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1043 [inline] sysvec_apic_timer_interrupt+0xa6/0xc0 arch/x86/kernel/apic/apic.c:1043 asm_sysvec_apic_timer_interrupt+0x1a/0x20 arch/x86/include/asm/idtentry.h:702 lock_acquire+0x264/0x550 kernel/locking/lockdep.c:5758 fs_reclaim_acquire+0xaf/0x140 mm/page_alloc.c:3800 might_alloc include/linux/sched/mm.h:334 [inline] slab_pre_alloc_hook mm/slub.c:3890 [inline] slab_alloc_node mm/slub.c:3980 [inline] kmalloc_trace_noprof+0x3d/0x2c0 mm/slub.c:4147 kmalloc_noprof include/linux/slab.h:660 [inline] kzalloc_noprof include/linux/slab.h:778 [inline] class_dir_create_and_add drivers/base/core.c:3255 [inline] get_device_parent+0x2a7/0x410 drivers/base/core.c:3315 device_add+0x325/0xbf0 drivers/base/core.c:3645 netdev_register_kobject+0x17e/0x320 net/core/net-sysfs.c:2136 register_netdevice+0x11d5/0x19e0 net/core/dev.c:10375 nsim_init_netdevsim drivers/net/netdevsim/netdev.c:690 [inline] nsim_create+0x647/0x890 drivers/net/netdevsim/netdev.c:750 __nsim_dev_port_add+0x6c0/0xae0 drivers/net/netdevsim/dev.c:1390 nsim_dev_port_add_all drivers/net/netdevsim/dev.c:1446 [inline] nsim_dev_reload_create drivers/net/netdevsim/dev.c:1498 [inline] nsim_dev_reload_up+0x69b/0x8e0 drivers/net/netdevsim/dev.c:985 devlink_reload+0x478/0x870 net/devlink/dev.c:474 devlink_nl_reload_doit+0xbd6/0xe50 net/devlink/dev.c:586 genl_family_rcv_msg_doit net/netlink/genetlink.c:1115 [inline] genl_family_rcv_msg net/netlink/genetlink.c:1195 [inline] genl_rcv_msg+0xb14/0xec0 net/netlink/genetlink.c:1210 netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2564 genl_rcv+0x28/0x40 net/netlink/genetlink.c:1219 netlink_unicast_kernel net/netlink/af_netlink.c:1335 [inline] netlink_unicast+0x7ea/0x980 net/netlink/af_netlink.c:1361 netlink_sendmsg+0x8db/0xcb0 net/netlink/af_netlink.c:1905 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:745 ____sys_sendmsg+0x525/0x7d0 net/socket.c:2585 ___sys_sendmsg net/socket.c:2639 [inline] __sys_sendmsg+0x2b0/0x3a0 net/socket.c:2668 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f to a SOFTIRQ-irq-unsafe lock: (netdev_rename_lock.seqcount){+.+.}-{0:0} ... which became SOFTIRQ-irq-unsafe at: ... lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754 do_write_seqcount_begin_nested include/linux/seqlock.h:469 [inline] do_write_seqcount_begin include/linux/seqlock.h:495 [inline] write_seqlock include/linux/seqlock.h:823 [inline] dev_change_name+0x184/0x920 net/core/dev.c:1229 do_setlink+0xa4b/0x41f0 net/core/rtnetlink.c:2880 __rtnl_newlink net/core/rtnetlink.c:3696 [inline] rtnl_newlink+0x180b/0x20a0 net/core/rtnetlink.c:3743 rtnetlink_rcv_msg+0x89b/0x1180 net/core/rtnetlink.c:6635 netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2564 netlink_unicast_kernel net/netlink/af_netlink.c:1335 [inline] netlink_unicast+0x7ea/0x980 net/netlink/af_netlink.c:1361 netlink_sendmsg+0x8db/0xcb0 net/netlink/af_netlink.c:1905 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:745 __sys_sendto+0x3a4/0x4f0 net/socket.c:2192 __do_sys_sendto net/socket.c:2204 [inline] __se_sys_sendto net/socket.c:2200 [inline] __x64_sys_sendto+0xde/0x100 net/socket.c:2200 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f other info that might help us debug this: Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(netdev_rename_lock.seqcount); local_irq_disable(); lock(&br->lock); lock(netdev_rename_lock.seqcount); lock(&br->lock); *** DEADLOCK *** 3 locks held by syz-executor.2/9449: #0: ffffffff8f5e7448 (rtnl_mutex){+.+.}-{3:3}, at: rtnl_lock net/core/rtnetlink.c:79 [inline] #0: ffffffff8f5e7448 (rtnl_mutex){+.+.}-{3:3}, at: rtnetlink_rcv_msg+0x842/0x1180 net/core/rtnetlink.c:6632 #1: ffff888060c64cb8 (&br->lock){+.-.}-{2:2}, at: spin_lock_bh include/linux/spinlock.h:356 [inline] #1: ffff888060c64cb8 (&br->lock){+.-.}-{2:2}, at: br_port_slave_changelink+0x3d/0x150 net/bridge/br_netlink.c:1212 #2: ffffffff8e333fa0 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:329 [inline] #2: ffffffff8e333fa0 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:781 [inline] #2: ffffffff8e333fa0 (rcu_read_lock){....}-{1:2}, at: team_change_rx_flags+0x29/0x330 drivers/net/team/team_core.c:1767 the dependencies between SOFTIRQ-irq-safe lock and the holding lock: -> (&br->lock){+.-.}-{2:2} { HARDIRQ-ON-W at: lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754 __raw_spin_lock_bh include/linux/spinlock_api_smp.h:126 [inline] _raw_spin_lock_bh+0x35/0x50 kernel/locking/spinlock.c:178 spin_lock_bh include/linux/spinlock.h:356 [inline] br_add_if+0xb34/0xef0 net/bridge/br_if.c:682 do_set_master net/core/rtnetlink.c:2701 [inline] do_setlink+0xe70/0x41f0 net/core/rtnetlink.c:2907 __rtnl_newlink net/core/rtnetlink.c:3696 [inline] rtnl_newlink+0x180b/0x20a0 net/core/rtnetlink.c:3743 rtnetlink_rcv_msg+0x89b/0x1180 net/core/rtnetlink.c:6635 netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2564 netlink_unicast_kernel net/netlink/af_netlink.c:1335 [inline] netlink_unicast+0x7ea/0x980 net/netlink/af_netlink.c:1361 netlink_sendmsg+0x8db/0xcb0 net/netlink/af_netlink.c:1905 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:745 __sys_sendto+0x3a4/0x4f0 net/socket.c:2192 __do_sys_sendto net/socket.c:2204 [inline] __se_sys_sendto net/socket.c:2200 [inline] __x64_sys_sendto+0xde/0x100 net/socket.c:2200 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f IN-SOFTIRQ-W at: lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754 __raw_spin_lock include/linux/spinlock_api_smp.h:133 [inline] _raw_spin_lock+0x2e/0x40 kernel/locking/spinlock.c:154 spin_lock include/linux/spinlock.h:351 [inline] br_forward_delay_timer_expired+0x50/0x440 net/bridge/br_stp_timer.c:86 call_timer_fn+0x18e/0x650 kernel/time/timer.c:1792 expire_timers kernel/time/timer.c:1843 [inline] __run_timers kernel/time/timer.c:2417 [inline] __run_timer_base+0x66a/0x8e0 kernel/time/timer.c:2428 run_timer_base kernel/time/timer.c:2437 [inline] run_timer_softirq+0xb7/0x170 kernel/time/timer.c:2447 handle_softirqs+0x2c4/0x970 kernel/softirq.c:554 __do_softirq kernel/softirq.c:588 [inline] invoke_softirq kernel/softirq.c:428 [inline] __irq_exit_rcu+0xf4/0x1c0 kernel/softirq.c:637 irq_exit_rcu+0x9/0x30 kernel/softirq.c:649 instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1043 [inline] sysvec_apic_timer_interrupt+0xa6/0xc0 arch/x86/kernel/apic/apic.c:1043 asm_sysvec_apic_timer_interrupt+0x1a/0x20 arch/x86/include/asm/idtentry.h:702 lock_acquire+0x264/0x550 kernel/locking/lockdep.c:5758 fs_reclaim_acquire+0xaf/0x140 mm/page_alloc.c:3800 might_alloc include/linux/sched/mm.h:334 [inline] slab_pre_alloc_hook mm/slub.c:3890 [inline] slab_alloc_node mm/slub.c:3980 [inline] kmalloc_trace_noprof+0x3d/0x2c0 mm/slub.c:4147 kmalloc_noprof include/linux/slab.h:660 [inline] kzalloc_noprof include/linux/slab.h:778 [inline] class_dir_create_and_add drivers/base/core.c:3255 [inline] get_device_parent+0x2a7/0x410 drivers/base/core.c:3315 device_add+0x325/0xbf0 drivers/base/core.c:3645 netdev_register_kobject+0x17e/0x320 net/core/net-sysfs.c:2136 register_netdevice+0x11d5/0x19e0 net/core/dev.c:10375 nsim_init_netdevsim drivers/net/netdevsim/netdev.c:690 [inline] nsim_create+0x647/0x890 drivers/net/netdevsim/netdev.c:750 __nsim_dev_port_add+0x6c0/0xae0 drivers/net/netdevsim/dev.c:1390 nsim_dev_port_add_all drivers/net/netdevsim/dev.c:1446 [inline] nsim_dev_reload_create drivers/net/netdevsim/dev.c:1498 [inline] nsim_dev_reload_up+0x69b/0x8e0 drivers/net/netdevsim/dev.c:985 devlink_reload+0x478/0x870 net/devlink/dev.c:474 devlink_nl_reload_doit+0xbd6/0xe50 net/devlink/dev.c:586 genl_family_rcv_msg_doit net/netlink/genetlink.c:1115 [inline] genl_family_rcv_msg net/netlink/genetlink.c:1195 [inline] genl_rcv_msg+0xb14/0xec0 net/netlink/genetlink.c:1210 netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2564 genl_rcv+0x28/0x40 net/netlink/genetlink.c:1219 netlink_unicast_kernel net/netlink/af_netlink.c:1335 [inline] netlink_unicast+0x7ea/0x980 net/netlink/af_netlink.c:1361 netlink_sendmsg+0x8db/0xcb0 net/netlink/af_netlink.c:1905 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:745 ____sys_sendmsg+0x525/0x7d0 net/socket.c:2585 ___sys_sendmsg net/socket.c:2639 [inline] __sys_sendmsg+0x2b0/0x3a0 net/socket.c:2668 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f INITIAL USE at: lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754 __raw_spin_lock_bh include/linux/spinlock_api_smp.h:126 [inline] _raw_spin_lock_bh+0x35/0x50 kernel/locking/spinlock.c:178 spin_lock_bh include/linux/spinlock.h:356 [inline] br_add_if+0xb34/0xef0 net/bridge/br_if.c:682 do_set_master net/core/rtnetlink.c:2701 [inline] do_setlink+0xe70/0x41f0 net/core/rtnetlink.c:2907 __rtnl_newlink net/core/rtnetlink.c:3696 [inline] rtnl_newlink+0x180b/0x20a0 net/core/rtnetlink.c:3743 rtnetlink_rcv_msg+0x89b/0x1180 net/core/rtnetlink.c:6635 netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2564 netlink_unicast_kernel net/netlink/af_netlink.c:1335 [inline] netlink_unicast+0x7ea/0x980 net/netlink/af_netlink.c:1361 netlink_sendmsg+0x8db/0xcb0 net/netlink/af_netlink.c:1905 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:745 __sys_sendto+0x3a4/0x4f0 net/socket.c:2192 __do_sys_sendto net/socket.c:2204 [inline] __se_sys_sendto net/socket.c:2200 [inline] __x64_sys_sendto+0xde/0x100 net/socket.c:2200 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f } ... key at: [] br_dev_setup.__key+0x0/0x20 the dependencies between the lock to be acquired and SOFTIRQ-irq-unsafe lock: -> (netdev_rename_lock.seqcount){+.+.}-{0:0} { HARDIRQ-ON-W at: lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754 do_write_seqcount_begin_nested include/linux/seqlock.h:469 [inline] do_write_seqcount_begin include/linux/seqlock.h:495 [inline] write_seqlock include/linux/seqlock.h:823 [inline] dev_change_name+0x184/0x920 net/core/dev.c:1229 do_setlink+0xa4b/0x41f0 net/core/rtnetlink.c:2880 __rtnl_newlink net/core/rtnetlink.c:3696 [inline] rtnl_newlink+0x180b/0x20a0 net/core/rtnetlink.c:3743 rtnetlink_rcv_msg+0x89b/0x1180 net/core/rtnetlink.c:6635 netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2564 netlink_unicast_kernel net/netlink/af_netlink.c:1335 [inline] netlink_unicast+0x7ea/0x980 net/netlink/af_netlink.c:1361 netlink_sendmsg+0x8db/0xcb0 net/netlink/af_netlink.c:1905 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:745 __sys_sendto+0x3a4/0x4f0 net/socket.c:2192 __do_sys_sendto net/socket.c:2204 [inline] __se_sys_sendto net/socket.c:2200 [inline] __x64_sys_sendto+0xde/0x100 net/socket.c:2200 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f SOFTIRQ-ON-W at: lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754 do_write_seqcount_begin_nested include/linux/seqlock.h:469 [inline] do_write_seqcount_begin include/linux/seqlock.h:495 [inline] write_seqlock include/linux/seqlock.h:823 [inline] dev_change_name+0x184/0x920 net/core/dev.c:1229 do_setlink+0xa4b/0x41f0 net/core/rtnetlink.c:2880 __rtnl_newlink net/core/rtnetlink.c:3696 [inline] rtnl_newlink+0x180b/0x20a0 net/core/rtnetlink.c:3743 rtnetlink_rcv_msg+0x89b/0x1180 net/core/rtnetlink.c:6635 netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2564 netlink_unicast_kernel net/netlink/af_netlink.c:1335 [inline] netlink_unicast+0x7ea/0x980 net/netlink/af_netlink.c:1361 netlink_sendmsg+0x8db/0xcb0 net/netlink/af_netlink.c:1905 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:745 __sys_sendto+0x3a4/0x4f0 net/socket.c:2192 __do_sys_sendto net/socket.c:2204 [inline] __se_sys_sendto net/socket.c:2200 [inline] __x64_sys_sendto+0xde/0x100 net/socket.c:2200 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f INITIAL USE at: lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754 do_write_seqcount_begin_nested include/linux/seqlock.h:469 [inline] do_write_seqcount_begin include/linux/seqlock.h:495 [inline] write_seqlock include/linux/seqlock.h:823 [inline] dev_change_name+0x184/0x920 net/core/dev.c:1229 do_setlink+0xa4b/0x41f0 net/core/rtnetlink.c:2880 __rtnl_newlink net/core/rtnetlink.c:3696 [inline] rtnl_newlink+0x180b/0x20a0 net/core/rtnetlink.c:3743 rtnetlink_rcv_msg+0x89b/0x1180 net/core/rtnetlink.c:6635 netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2564 netlink_unicast_kernel net/netlink/af_netlink.c:1335 [inline] netlink_unicast+0x7ea/0x980 net/netlink/af_netlink.c:1361 netlink_sendmsg+0x8db/0xcb0 net/netlink/af_netlink.c:1905 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:745 __sys_sendto+0x3a4/0x4f0 net/socket.c:2192 __do_sys_sendto net/socket.c:2204 [inline] __se_sys_sendto net/socket.c:2200 [inline] __x64_sys_sendto+0xde/0x100 net/socket.c:2200 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f INITIAL READ USE at: lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754 seqcount_lockdep_reader_access include/linux/seqlock.h:72 [inline] read_seqbegin include/linux/seqlock.h:772 [inline] netdev_copy_name+0x168/0x2c0 net/core/dev.c:949 rtnl_fill_ifinfo+0x38e/0x2270 net/core/rtnetlink.c:1839 rtmsg_ifinfo_build_skb+0x18a/0x260 net/core/rtnetlink.c:4073 rtmsg_ifinfo_event net/core/rtnetlink.c:4107 [inline] rtmsg_ifinfo+0x91/0x1b0 net/core/rtnetlink.c:4116 register_netdevice+0x1665/0x19e0 net/core/dev.c:10422 register_netdev+0x3b/0x50 net/core/dev.c:10512 loopback_net_init+0x73/0x150 drivers/net/loopback.c:217 ops_init+0x359/0x610 net/core/net_namespace.c:139 __register_pernet_operations net/core/net_namespace.c:1247 [inline] register_pernet_operations+0x2cb/0x660 net/core/net_namespace.c:1320 register_pernet_device+0x33/0x80 net/core/net_namespace.c:1407 net_dev_init+0xfcd/0x10d0 net/core/dev.c:11956 do_one_initcall+0x248/0x880 init/main.c:1267 do_initcall_level+0x157/0x210 init/main.c:1329 do_initcalls+0x3f/0x80 init/main.c:1345 kernel_init_freeable+0x435/0x5d0 init/main.c:1578 kernel_init+0x1d/0x2b0 init/main.c:1467 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 } ... key at: [] netdev_rename_lock+0x8/0xa0 ... acquired at: lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754 seqcount_lockdep_reader_access include/linux/seqlock.h:72 [inline] read_seqbegin include/linux/seqlock.h:772 [inline] netdev_copy_name+0x168/0x2c0 net/core/dev.c:949 rtnl_fill_ifinfo+0x38e/0x2270 net/core/rtnetlink.c:1839 rtmsg_ifinfo_build_skb+0x18a/0x260 net/core/rtnetlink.c:4073 rtmsg_ifinfo_event net/core/rtnetlink.c:4107 [inline] rtmsg_ifinfo+0x91/0x1b0 net/core/rtnetlink.c:4116 __dev_notify_flags+0xf7/0x400 net/core/dev.c:8816 __dev_set_promiscuity+0x152/0x5a0 net/core/dev.c:8588 dev_set_promiscuity+0x51/0xe0 net/core/dev.c:8608 team_change_rx_flags+0x203/0x330 drivers/net/team/team_core.c:1771 dev_change_rx_flags net/core/dev.c:8541 [inline] __dev_set_promiscuity+0x406/0x5a0 net/core/dev.c:8585 dev_set_promiscuity+0x51/0xe0 net/core/dev.c:8608 br_port_clear_promisc net/bridge/br_if.c:135 [inline] br_manage_promisc+0x505/0x590 net/bridge/br_if.c:172 nbp_update_port_count net/bridge/br_if.c:242 [inline] br_port_flags_change+0x161/0x1f0 net/bridge/br_if.c:761 br_setport+0xcb5/0x16d0 net/bridge/br_netlink.c:1000 br_port_slave_changelink+0x135/0x150 net/bridge/br_netlink.c:1213 __rtnl_newlink net/core/rtnetlink.c:3689 [inline] rtnl_newlink+0x169f/0x20a0 net/core/rtnetlink.c:3743 rtnetlink_rcv_msg+0x89b/0x1180 net/core/rtnetlink.c:6635 netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2564 netlink_unicast_kernel net/netlink/af_netlink.c:1335 [inline] netlink_unicast+0x7ea/0x980 net/netlink/af_netlink.c:1361 netlink_sendmsg+0x8db/0xcb0 net/netlink/af_netlink.c:1905 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:745 ____sys_sendmsg+0x525/0x7d0 net/socket.c:2585 ___sys_sendmsg net/socket.c:2639 [inline] __sys_sendmsg+0x2b0/0x3a0 net/socket.c:2668 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f stack backtrace: CPU: 0 PID: 9449 Comm: syz-executor.2 Not tainted 6.10.0-rc2-syzkaller-00249-gbe27b8965297 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 06/07/2024 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114 print_bad_irq_dependency kernel/locking/lockdep.c:2626 [inline] check_irq_usage kernel/locking/lockdep.c:2865 [inline] check_prev_add kernel/locking/lockdep.c:3138 [inline] check_prevs_add kernel/locking/lockdep.c:3253 [inline] validate_chain+0x4de0/0x5900 kernel/locking/lockdep.c:3869 __lock_acquire+0x1346/0x1fd0 kernel/locking/lockdep.c:5137 lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754 seqcount_lockdep_reader_access include/linux/seqlock.h:72 [inline] read_seqbegin include/linux/seqlock.h:772 [inline] netdev_copy_name+0x168/0x2c0 net/core/dev.c:949 rtnl_fill_ifinfo+0x38e/0x2270 net/core/rtnetlink.c:1839 rtmsg_ifinfo_build_skb+0x18a/0x260 net/core/rtnetlink.c:4073 rtmsg_ifinfo_event net/core/rtnetlink.c:4107 [inline] rtmsg_ifinfo+0x91/0x1b0 net/core/rtnetlink.c:4116 __dev_notify_flags+0xf7/0x400 net/core/dev.c:8816 __dev_set_promiscuity+0x152/0x5a0 net/core/dev.c:8588 dev_set_promiscuity+0x51/0xe0 net/core/dev.c:8608 team_change_rx_flags+0x203/0x330 drivers/net/team/team_core.c:1771 dev_change_rx_flags net/core/dev.c:8541 [inline] __dev_set_promiscuity+0x406/0x5a0 net/core/dev.c:8585 dev_set_promiscuity+0x51/0xe0 net/core/dev.c:8608 br_port_clear_promisc net/bridge/br_if.c:135 [inline] br_manage_promisc+0x505/0x590 net/bridge/br_if.c:172 nbp_update_port_count net/bridge/br_if.c:242 [inline] br_port_flags_change+0x161/0x1f0 net/bridge/br_if.c:761 br_setport+0xcb5/0x16d0 net/bridge/br_netlink.c:1000 br_port_slave_changelink+0x135/0x150 net/bridge/br_netlink.c:1213 __rtnl_newlink net/core/rtnetlink.c:3689 [inline] rtnl_newlink+0x169f/0x20a0 net/core/rtnetlink.c:3743 rtnetlink_rcv_msg+0x89b/0x1180 net/core/rtnetlink.c:6635 netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2564 netlink_unicast_kernel net/netlink/af_netlink.c:1335 [inline] netlink_unicast+0x7ea/0x980 net/netlink/af_netlink.c:1361 netlink_sendmsg+0x8db/0xcb0 net/netlink/af_netlink.c:1905 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:745 ____sys_sendmsg+0x525/0x7d0 net/socket.c:2585 ___sys_sendmsg net/socket.c:2639 [inline] __sys_sendmsg+0x2b0/0x3a0 net/socket.c:2668 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f3b3047cf29 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 e1 20 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f3b311740c8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f3b305b4050 RCX: 00007f3b3047cf29 RDX: 0000000000000000 RSI: 0000000020000000 RDI: 0000000000000008 RBP: 00007f3b304ec074 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 000000000000006e R14: 00007f3b305b4050 R15: 00007ffca2f3dc68 Fixes: 0840556e5a3a ("net: Protect dev->name by seqlock.") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Kuniyuki Iwashima Reviewed-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/core/dev.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 4d4de9008f6f3b..2b4819b610b8a1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1226,9 +1226,9 @@ int dev_change_name(struct net_device *dev, const char *newname) memcpy(oldname, dev->name, IFNAMSIZ); - write_seqlock(&netdev_rename_lock); + write_seqlock_bh(&netdev_rename_lock); err = dev_get_valid_name(net, dev, newname); - write_sequnlock(&netdev_rename_lock); + write_sequnlock_bh(&netdev_rename_lock); if (err < 0) { up_write(&devnet_rename_sem); @@ -1269,9 +1269,9 @@ int dev_change_name(struct net_device *dev, const char *newname) if (err >= 0) { err = ret; down_write(&devnet_rename_sem); - write_seqlock(&netdev_rename_lock); + write_seqlock_bh(&netdev_rename_lock); memcpy(dev->name, oldname, IFNAMSIZ); - write_sequnlock(&netdev_rename_lock); + write_sequnlock_bh(&netdev_rename_lock); memcpy(oldname, newname, IFNAMSIZ); WRITE_ONCE(dev->name_assign_type, old_assign_type); old_assign_type = NET_NAME_RENAMED; @@ -11419,9 +11419,9 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, if (new_name[0]) { /* Rename the netdev to prepared name */ - write_seqlock(&netdev_rename_lock); + write_seqlock_bh(&netdev_rename_lock); strscpy(dev->name, new_name, IFNAMSIZ); - write_sequnlock(&netdev_rename_lock); + write_sequnlock_bh(&netdev_rename_lock); } /* Fixup kobjects */ From 1cbf347288702af0fe8667c0ce760afbe982a2f1 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 14 Jun 2024 11:14:18 +0300 Subject: [PATCH 1274/1578] i2c: Add nop fwnode operations Add nop variants of i2c_find_device_by_fwnode(), i2c_find_adapter_by_fwnode() and i2c_get_adapter_by_fwnode() for use without CONFIG_I2C. Signed-off-by: Sakari Ailus Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 9709537370ee9b..424acb98c7c265 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -960,8 +960,6 @@ int i2c_handle_smbus_host_notify(struct i2c_adapter *adap, unsigned short addr); #define builtin_i2c_driver(__i2c_driver) \ builtin_driver(__i2c_driver, i2c_add_driver) -#endif /* I2C */ - /* must call put_device() when done with returned i2c_client device */ struct i2c_client *i2c_find_device_by_fwnode(struct fwnode_handle *fwnode); @@ -971,6 +969,28 @@ struct i2c_adapter *i2c_find_adapter_by_fwnode(struct fwnode_handle *fwnode); /* must call i2c_put_adapter() when done with returned i2c_adapter device */ struct i2c_adapter *i2c_get_adapter_by_fwnode(struct fwnode_handle *fwnode); +#else /* I2C */ + +static inline struct i2c_client * +i2c_find_device_by_fwnode(struct fwnode_handle *fwnode) +{ + return NULL; +} + +static inline struct i2c_adapter * +i2c_find_adapter_by_fwnode(struct fwnode_handle *fwnode) +{ + return NULL; +} + +static inline struct i2c_adapter * +i2c_get_adapter_by_fwnode(struct fwnode_handle *fwnode) +{ + return NULL; +} + +#endif /* !I2C */ + #if IS_ENABLED(CONFIG_OF) /* must call put_device() when done with returned i2c_client device */ static inline struct i2c_client *of_find_i2c_device_by_node(struct device_node *node) From 2490785ee778f7498afa0350aea5651a3472d0a7 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 20 Jun 2024 14:52:57 -0700 Subject: [PATCH 1275/1578] net: remove drivers@pensando.io from MAINTAINERS Our corporate overlords have been changing the domains around again and this mailing list has gone away. Signed-off-by: Shannon Nelson Reviewed-by: Martin Habets Signed-off-by: David S. Miller --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index a39c237edb95db..972e1c8fec8631 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17532,7 +17532,6 @@ F: include/linux/peci.h PENSANDO ETHERNET DRIVERS M: Shannon Nelson M: Brett Creeley -M: drivers@pensando.io L: netdev@vger.kernel.org S: Supported F: Documentation/networking/device_drivers/ethernet/pensando/ionic.rst From a5d8922ab2aec39336ebc78d7cefe3b84647b058 Mon Sep 17 00:00:00 2001 From: Xin Zeng Date: Mon, 10 Jun 2024 22:37:56 +0800 Subject: [PATCH 1276/1578] crypto: qat - fix linking errors when PCI_IOV is disabled When CONFIG_PCI_IOV=n, the build of the QAT vfio pci variant driver fails reporting the following linking errors: ERROR: modpost: "qat_vfmig_open" [drivers/vfio/pci/qat/qat_vfio_pci.ko] undefined! ERROR: modpost: "qat_vfmig_resume" [drivers/vfio/pci/qat/qat_vfio_pci.ko] undefined! ERROR: modpost: "qat_vfmig_save_state" [drivers/vfio/pci/qat/qat_vfio_pci.ko] undefined! ERROR: modpost: "qat_vfmig_suspend" [drivers/vfio/pci/qat/qat_vfio_pci.ko] undefined! ERROR: modpost: "qat_vfmig_load_state" [drivers/vfio/pci/qat/qat_vfio_pci.ko] undefined! ERROR: modpost: "qat_vfmig_reset" [drivers/vfio/pci/qat/qat_vfio_pci.ko] undefined! ERROR: modpost: "qat_vfmig_save_setup" [drivers/vfio/pci/qat/qat_vfio_pci.ko] undefined! ERROR: modpost: "qat_vfmig_destroy" [drivers/vfio/pci/qat/qat_vfio_pci.ko] undefined! ERROR: modpost: "qat_vfmig_close" [drivers/vfio/pci/qat/qat_vfio_pci.ko] undefined! ERROR: modpost: "qat_vfmig_cleanup" [drivers/vfio/pci/qat/qat_vfio_pci.ko] undefined! WARNING: modpost: suppressed 1 unresolved symbol warnings because there were too many) Make live migration helpers provided by QAT PF driver always available even if CONFIG_PCI_IOV is not selected. This does not cause any side effect. Reported-by: Arnd Bergmann Closes: https://lore.kernel.org/lkml/20240607153406.60355e6c.alex.williamson@redhat.com/T/ Fixes: bb208810b1ab ("vfio/qat: Add vfio_pci driver for Intel QAT SR-IOV VF devices") Signed-off-by: Xin Zeng Reviewed-by: Giovanni Cabiddu Reviewed-by: Kevin Tian Signed-off-by: Herbert Xu --- drivers/crypto/intel/qat/qat_common/Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/intel/qat/qat_common/Makefile b/drivers/crypto/intel/qat/qat_common/Makefile index 6f9266edc9f17e..eac73cbfdd38e2 100644 --- a/drivers/crypto/intel/qat/qat_common/Makefile +++ b/drivers/crypto/intel/qat/qat_common/Makefile @@ -39,7 +39,8 @@ intel_qat-objs := adf_cfg.o \ adf_sysfs_rl.o \ qat_uclo.o \ qat_hal.o \ - qat_bl.o + qat_bl.o \ + qat_mig_dev.o intel_qat-$(CONFIG_DEBUG_FS) += adf_transport_debug.o \ adf_fw_counters.o \ @@ -56,6 +57,6 @@ intel_qat-$(CONFIG_DEBUG_FS) += adf_transport_debug.o \ intel_qat-$(CONFIG_PCI_IOV) += adf_sriov.o adf_vf_isr.o adf_pfvf_utils.o \ adf_pfvf_pf_msg.o adf_pfvf_pf_proto.o \ adf_pfvf_vf_msg.o adf_pfvf_vf_proto.o \ - adf_gen2_pfvf.o adf_gen4_pfvf.o qat_mig_dev.o + adf_gen2_pfvf.o adf_gen4_pfvf.o intel_qat-$(CONFIG_CRYPTO_DEV_QAT_ERROR_INJECTION) += adf_heartbeat_inject.o From c1eb2512596fb3542357bb6c34c286f5e0374538 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 May 2024 15:52:52 +0300 Subject: [PATCH 1277/1578] RDMA/mlx5: Remove extra unlock on error path The below commit lifted the locking out of this function but left this error path unlock behind resulting in unbalanced locking. Remove the missed unlock too. Cc: stable@vger.kernel.org Fixes: 627122280c87 ("RDMA/mlx5: Add work to remove temporary entries from the cache") Signed-off-by: Jason Gunthorpe Reviewed-by: Michael Guralnik Link: https://lore.kernel.org/r/78090c210c750f47219b95248f9f782f34548bb1.1716900410.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mr.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index ecc111ed5d86ea..38d2c743db877b 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -641,10 +641,8 @@ static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache, new = &((*new)->rb_left); if (cmp < 0) new = &((*new)->rb_right); - if (cmp == 0) { - mutex_unlock(&cache->rb_lock); + if (cmp == 0) return -EEXIST; - } } /* Add new node and rebalance tree. */ From f637040c3339a2ed8c12d65ad03f9552386e2fe7 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 May 2024 15:52:53 +0300 Subject: [PATCH 1278/1578] RDMA/mlx5: Follow rb_key.ats when creating new mkeys When a cache ent already exists but doesn't have any mkeys in it the cache will automatically create a new one based on the specification in the ent->rb_key. ent->ats was missed when creating the new key and so ma_translation_mode was not being set even though the ent requires it. Cc: stable@vger.kernel.org Fixes: 73d09b2fe833 ("RDMA/mlx5: Introduce mlx5r_cache_rb_key") Signed-off-by: Jason Gunthorpe Reviewed-by: Michael Guralnik Link: https://lore.kernel.org/r/7c5613458ecb89fbe5606b7aa4c8d990bdea5b9a.1716900410.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 38d2c743db877b..35dcb9d9e12af2 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -246,6 +246,7 @@ static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc) MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3); MLX5_SET(mkc, mkc, access_mode_4_2, (ent->rb_key.access_mode >> 2) & 0x7); + MLX5_SET(mkc, mkc, ma_translation_mode, !!ent->rb_key.ats); MLX5_SET(mkc, mkc, translations_octword_size, get_mkc_octo_size(ent->rb_key.access_mode, From 2e4c02fdecf2f6f55cefe48cb82d93fa4f8e2204 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 May 2024 15:52:54 +0300 Subject: [PATCH 1279/1578] RDMA/mlx5: Ensure created mkeys always have a populated rb_key cachable and mmkey.rb_key together are used by mlx5_revoke_mr() to put the MR/mkey back into the cache. In all cases they should be set correctly. alloc_cacheable_mr() was setting cachable but not filling rb_key, resulting in cache_ent_find_and_store() bucketing them all into a 0 length entry. implicit_get_child_mr()/mlx5_ib_alloc_implicit_mr() failed to set cachable or rb_key at all, so the cache was not working at all for implicit ODP. Cc: stable@vger.kernel.org Fixes: 8c1185fef68c ("RDMA/mlx5: Change check for cacheable mkeys") Fixes: dd1b913fb0d0 ("RDMA/mlx5: Cache all user cacheable mkeys on dereg MR flow") Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/7778c02dfa0999a30d6746c79a23dd7140a9c729.1716900410.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 35dcb9d9e12af2..d3c1f63791a2b6 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -718,6 +718,8 @@ static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, } mr->mmkey.cache_ent = ent; mr->mmkey.type = MLX5_MKEY_MR; + mr->mmkey.rb_key = ent->rb_key; + mr->mmkey.cacheable = true; init_waitqueue_head(&mr->mmkey.wait); return mr; } @@ -1168,7 +1170,6 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, mr->ibmr.pd = pd; mr->umem = umem; mr->page_shift = order_base_2(page_size); - mr->mmkey.cacheable = true; set_mr_fields(dev, mr, umem->length, access_flags, iova); return mr; From 81497c148b7a2e4a4fbda93aee585439f7323e2e Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 28 May 2024 15:52:55 +0300 Subject: [PATCH 1280/1578] RDMA/mlx5: Fix unwind flow as part of mlx5_ib_stage_init_init Fix unwind flow as part of mlx5_ib_stage_init_init to use the correct goto upon an error. Fixes: 758ce14aee82 ("RDMA/mlx5: Implement MACsec gid addition and deletion") Signed-off-by: Yishai Hadas Reviewed-by: Patrisious Haddad Link: https://lore.kernel.org/r/aa40615116eda14ec9eca21d52017d632ea89188.1716900410.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 2366c46eebc871..43660c831b22cf 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3759,10 +3759,10 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) spin_lock_init(&dev->dm.lock); dev->dm.dev = mdev; return 0; -err: - mlx5r_macsec_dealloc_gids(dev); err_mp: mlx5_ib_cleanup_multiport_master(dev); +err: + mlx5r_macsec_dealloc_gids(dev); return err; } From 36ab7ada64caf08f10ee5a114d39964d1f91e81d Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Tue, 28 May 2024 15:52:56 +0300 Subject: [PATCH 1281/1578] RDMA/mlx5: Add check for srq max_sge attribute max_sge attribute is passed by the user, and is inserted and used unchecked, so verify that the value doesn't exceed maximum allowed value before using it. Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters") Signed-off-by: Patrisious Haddad Link: https://lore.kernel.org/r/277ccc29e8d57bfd53ddeb2ac633f2760cf8cdd0.1716900410.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/srq.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index a056ea835da549..84be0c3d569959 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -199,17 +199,20 @@ int mlx5_ib_create_srq(struct ib_srq *ib_srq, int err; struct mlx5_srq_attr in = {}; __u32 max_srq_wqes = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz); + __u32 max_sge_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq) / + sizeof(struct mlx5_wqe_data_seg); if (init_attr->srq_type != IB_SRQT_BASIC && init_attr->srq_type != IB_SRQT_XRC && init_attr->srq_type != IB_SRQT_TM) return -EOPNOTSUPP; - /* Sanity check SRQ size before proceeding */ - if (init_attr->attr.max_wr >= max_srq_wqes) { - mlx5_ib_dbg(dev, "max_wr %d, cap %d\n", - init_attr->attr.max_wr, - max_srq_wqes); + /* Sanity check SRQ and sge size before proceeding */ + if (init_attr->attr.max_wr >= max_srq_wqes || + init_attr->attr.max_sge > max_sge_sz) { + mlx5_ib_dbg(dev, "max_wr %d,wr_cap %d,max_sge %d, sge_cap:%d\n", + init_attr->attr.max_wr, max_srq_wqes, + init_attr->attr.max_sge, max_sge_sz); return -EINVAL; } From 82a5cc783d49b86afd2f60e297ecd85223c39f88 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Wed, 5 Jun 2024 01:16:08 -0700 Subject: [PATCH 1282/1578] RDMA/mana_ib: Ignore optional access flags for MRs Ignore optional ib_access_flags when an MR is created. Fixes: 0266a177631d ("RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter") Signed-off-by: Konstantin Taranov Link: https://lore.kernel.org/r/1717575368-14879-1-git-send-email-kotaranov@linux.microsoft.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/mr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c index 4f13423ecdbdf1..887b09dd86e784 100644 --- a/drivers/infiniband/hw/mana/mr.c +++ b/drivers/infiniband/hw/mana/mr.c @@ -112,6 +112,7 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, "start 0x%llx, iova 0x%llx length 0x%llx access_flags 0x%x", start, iova, length, access_flags); + access_flags &= ~IB_ACCESS_OPTIONAL; if (access_flags & ~VALID_MR_FLAGS) return ERR_PTR(-EINVAL); From 11b006d6896c0471ad29c6f1fb1af606e7ba278f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 20 Jun 2024 19:10:51 -0700 Subject: [PATCH 1283/1578] selftest: af_unix: Add Kconfig file. diag_uid selftest failed on NIPA where the received nlmsg_type is NLMSG_ERROR [0] because CONFIG_UNIX_DIAG is not set [1] by default and sock_diag_lock_handler() failed to load the module. # # Starting 2 tests from 2 test cases. # # RUN diag_uid.uid.1 ... # # diag_uid.c:159:1:Expected nlh->nlmsg_type (2) == SOCK_DIAG_BY_FAMILY (20) # # 1: Test terminated by assertion # # FAIL diag_uid.uid.1 # not ok 1 diag_uid.uid.1 Let's add all AF_UNIX Kconfig to the config file under af_unix dir so that NIPA consumes it. Fixes: ac011361bd4f ("af_unix: Add test for sock_diag and UDIAG_SHOW_UID.") Link: https://netdev-3.bots.linux.dev/vmksft-net/results/644841/104-diag-uid/stdout [0] Link: https://netdev-3.bots.linux.dev/vmksft-net/results/644841/config [1] Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/netdev/20240617073033.0cbb829d@kernel.org/ Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- tools/testing/selftests/net/af_unix/config | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 tools/testing/selftests/net/af_unix/config diff --git a/tools/testing/selftests/net/af_unix/config b/tools/testing/selftests/net/af_unix/config new file mode 100644 index 00000000000000..37368567768cc0 --- /dev/null +++ b/tools/testing/selftests/net/af_unix/config @@ -0,0 +1,3 @@ +CONFIG_UNIX=y +CONFIG_AF_UNIX_OOB=y +CONFIG_UNIX_DIAG=m From 0602697d6f4d72b0bc5edbc76afabf6aaa029a69 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 21 Jun 2024 09:19:13 +0200 Subject: [PATCH 1284/1578] mlxsw: pci: Fix driver initialization with Spectrum-4 Cited commit added support for a new reset flow ("all reset") which is deeper than the existing reset flow ("software reset") and allows the device's PCI firmware to be upgraded. In the new flow the driver first tells the firmware that "all reset" is required by issuing a new reset command (i.e., MRSR.command=6) and then triggers the reset by having the PCI core issue a secondary bus reset (SBR). However, due to a race condition in the device's firmware the device is not always able to recover from this reset, resulting in initialization failures [1]. New firmware versions include a fix for the bug and advertise it using a new capability bit in the Management Capabilities Mask (MCAM) register. Avoid initialization failures by reading the new capability bit and triggering the new reset flow only if the bit is set. If the bit is not set, trigger a normal PCI hot reset by skipping the call to the Management Reset and Shutdown Register (MRSR). Normal PCI hot reset is weaker than "all reset", but it results in a fully operational driver and allows users to flash a new firmware, if they want to. [1] mlxsw_spectrum4 0000:01:00.0: not ready 1023ms after bus reset; waiting mlxsw_spectrum4 0000:01:00.0: not ready 2047ms after bus reset; waiting mlxsw_spectrum4 0000:01:00.0: not ready 4095ms after bus reset; waiting mlxsw_spectrum4 0000:01:00.0: not ready 8191ms after bus reset; waiting mlxsw_spectrum4 0000:01:00.0: not ready 16383ms after bus reset; waiting mlxsw_spectrum4 0000:01:00.0: not ready 32767ms after bus reset; waiting mlxsw_spectrum4 0000:01:00.0: not ready 65535ms after bus reset; giving up mlxsw_spectrum4 0000:01:00.0: PCI function reset failed with -25 mlxsw_spectrum4 0000:01:00.0: cannot register bus device mlxsw_spectrum4: probe of 0000:01:00.0 failed with error -25 Fixes: f257c73e5356 ("mlxsw: pci: Add support for new reset flow") Reported-by: Maksym Yaremchuk Signed-off-by: Ido Schimmel Tested-by: Maksym Yaremchuk Reviewed-by: Simon Horman Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/pci.c | 18 +++++++++++++++--- drivers/net/ethernet/mellanox/mlxsw/reg.h | 2 ++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c index bf66d996e32e72..c0ced4d315f3d7 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/pci.c +++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c @@ -1594,18 +1594,25 @@ static int mlxsw_pci_sys_ready_wait(struct mlxsw_pci *mlxsw_pci, return -EBUSY; } -static int mlxsw_pci_reset_at_pci_disable(struct mlxsw_pci *mlxsw_pci) +static int mlxsw_pci_reset_at_pci_disable(struct mlxsw_pci *mlxsw_pci, + bool pci_reset_sbr_supported) { struct pci_dev *pdev = mlxsw_pci->pdev; char mrsr_pl[MLXSW_REG_MRSR_LEN]; int err; + if (!pci_reset_sbr_supported) { + pci_dbg(pdev, "Performing PCI hot reset instead of \"all reset\"\n"); + goto sbr; + } + mlxsw_reg_mrsr_pack(mrsr_pl, MLXSW_REG_MRSR_COMMAND_RESET_AT_PCI_DISABLE); err = mlxsw_reg_write(mlxsw_pci->core, MLXSW_REG(mrsr), mrsr_pl); if (err) return err; +sbr: device_lock_assert(&pdev->dev); pci_cfg_access_lock(pdev); @@ -1633,6 +1640,7 @@ static int mlxsw_pci_reset(struct mlxsw_pci *mlxsw_pci, const struct pci_device_id *id) { struct pci_dev *pdev = mlxsw_pci->pdev; + bool pci_reset_sbr_supported = false; char mcam_pl[MLXSW_REG_MCAM_LEN]; bool pci_reset_supported = false; u32 sys_status; @@ -1652,13 +1660,17 @@ mlxsw_pci_reset(struct mlxsw_pci *mlxsw_pci, const struct pci_device_id *id) mlxsw_reg_mcam_pack(mcam_pl, MLXSW_REG_MCAM_FEATURE_GROUP_ENHANCED_FEATURES); err = mlxsw_reg_query(mlxsw_pci->core, MLXSW_REG(mcam), mcam_pl); - if (!err) + if (!err) { mlxsw_reg_mcam_unpack(mcam_pl, MLXSW_REG_MCAM_PCI_RESET, &pci_reset_supported); + mlxsw_reg_mcam_unpack(mcam_pl, MLXSW_REG_MCAM_PCI_RESET_SBR, + &pci_reset_sbr_supported); + } if (pci_reset_supported) { pci_dbg(pdev, "Starting PCI reset flow\n"); - err = mlxsw_pci_reset_at_pci_disable(mlxsw_pci); + err = mlxsw_pci_reset_at_pci_disable(mlxsw_pci, + pci_reset_sbr_supported); } else { pci_dbg(pdev, "Starting software reset flow\n"); err = mlxsw_pci_reset_sw(mlxsw_pci); diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index 8adf86a6f5ccce..3bb89045eaf5e0 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -10671,6 +10671,8 @@ enum mlxsw_reg_mcam_mng_feature_cap_mask_bits { MLXSW_REG_MCAM_MCIA_128B = 34, /* If set, MRSR.command=6 is supported. */ MLXSW_REG_MCAM_PCI_RESET = 48, + /* If set, MRSR.command=6 is supported with Secondary Bus Reset. */ + MLXSW_REG_MCAM_PCI_RESET_SBR = 67, }; #define MLXSW_REG_BYTES_PER_DWORD 0x4 From c28947de2bed40217cf256c5d0d16880054fcf13 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 21 Jun 2024 09:19:14 +0200 Subject: [PATCH 1285/1578] mlxsw: spectrum_buffers: Fix memory corruptions on Spectrum-4 systems The following two shared buffer operations make use of the Shared Buffer Status Register (SBSR): # devlink sb occupancy snapshot pci/0000:01:00.0 # devlink sb occupancy clearmax pci/0000:01:00.0 The register has two masks of 256 bits to denote on which ingress / egress ports the register should operate on. Spectrum-4 has more than 256 ports, so the register was extended by cited commit with a new 'port_page' field. However, when filling the register's payload, the driver specifies the ports as absolute numbers and not relative to the first port of the port page, resulting in memory corruptions [1]. Fix by specifying the ports relative to the first port of the port page. [1] BUG: KASAN: slab-use-after-free in mlxsw_sp_sb_occ_snapshot+0xb6d/0xbc0 Read of size 1 at addr ffff8881068cb00f by task devlink/1566 [...] Call Trace: dump_stack_lvl+0xc6/0x120 print_report+0xce/0x670 kasan_report+0xd7/0x110 mlxsw_sp_sb_occ_snapshot+0xb6d/0xbc0 mlxsw_devlink_sb_occ_snapshot+0x75/0xb0 devlink_nl_sb_occ_snapshot_doit+0x1f9/0x2a0 genl_family_rcv_msg_doit+0x20c/0x300 genl_rcv_msg+0x567/0x800 netlink_rcv_skb+0x170/0x450 genl_rcv+0x2d/0x40 netlink_unicast+0x547/0x830 netlink_sendmsg+0x8d4/0xdb0 __sys_sendto+0x49b/0x510 __x64_sys_sendto+0xe5/0x1c0 do_syscall_64+0xc1/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f [...] Allocated by task 1: kasan_save_stack+0x33/0x60 kasan_save_track+0x14/0x30 __kasan_kmalloc+0x8f/0xa0 copy_verifier_state+0xbc2/0xfb0 do_check_common+0x2c51/0xc7e0 bpf_check+0x5107/0x9960 bpf_prog_load+0xf0e/0x2690 __sys_bpf+0x1a61/0x49d0 __x64_sys_bpf+0x7d/0xc0 do_syscall_64+0xc1/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 1: kasan_save_stack+0x33/0x60 kasan_save_track+0x14/0x30 kasan_save_free_info+0x3b/0x60 poison_slab_object+0x109/0x170 __kasan_slab_free+0x14/0x30 kfree+0xca/0x2b0 free_verifier_state+0xce/0x270 do_check_common+0x4828/0xc7e0 bpf_check+0x5107/0x9960 bpf_prog_load+0xf0e/0x2690 __sys_bpf+0x1a61/0x49d0 __x64_sys_bpf+0x7d/0xc0 do_syscall_64+0xc1/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: f8538aec88b4 ("mlxsw: Add support for more than 256 ports in SBSR register") Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Reviewed-by: Simon Horman Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- .../mellanox/mlxsw/spectrum_buffers.c | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c index c9f1c79f3f9d07..ba090262e27ef8 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c @@ -1607,8 +1607,8 @@ static void mlxsw_sp_sb_sr_occ_query_cb(struct mlxsw_core *mlxsw_core, int mlxsw_sp_sb_occ_snapshot(struct mlxsw_core *mlxsw_core, unsigned int sb_index) { + u16 local_port, local_port_1, first_local_port, last_local_port; struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core); - u16 local_port, local_port_1, last_local_port; struct mlxsw_sp_sb_sr_occ_query_cb_ctx cb_ctx; u8 masked_count, current_page = 0; unsigned long cb_priv = 0; @@ -1628,6 +1628,7 @@ int mlxsw_sp_sb_occ_snapshot(struct mlxsw_core *mlxsw_core, masked_count = 0; mlxsw_reg_sbsr_pack(sbsr_pl, false); mlxsw_reg_sbsr_port_page_set(sbsr_pl, current_page); + first_local_port = current_page * MLXSW_REG_SBSR_NUM_PORTS_IN_PAGE; last_local_port = current_page * MLXSW_REG_SBSR_NUM_PORTS_IN_PAGE + MLXSW_REG_SBSR_NUM_PORTS_IN_PAGE - 1; @@ -1645,9 +1646,12 @@ int mlxsw_sp_sb_occ_snapshot(struct mlxsw_core *mlxsw_core, if (local_port != MLXSW_PORT_CPU_PORT) { /* Ingress quotas are not supported for the CPU port */ mlxsw_reg_sbsr_ingress_port_mask_set(sbsr_pl, - local_port, 1); + local_port - first_local_port, + 1); } - mlxsw_reg_sbsr_egress_port_mask_set(sbsr_pl, local_port, 1); + mlxsw_reg_sbsr_egress_port_mask_set(sbsr_pl, + local_port - first_local_port, + 1); for (i = 0; i < mlxsw_sp->sb_vals->pool_count; i++) { err = mlxsw_sp_sb_pm_occ_query(mlxsw_sp, local_port, i, &bulk_list); @@ -1684,7 +1688,7 @@ int mlxsw_sp_sb_occ_max_clear(struct mlxsw_core *mlxsw_core, unsigned int sb_index) { struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core); - u16 local_port, last_local_port; + u16 local_port, first_local_port, last_local_port; LIST_HEAD(bulk_list); unsigned int masked_count; u8 current_page = 0; @@ -1702,6 +1706,7 @@ int mlxsw_sp_sb_occ_max_clear(struct mlxsw_core *mlxsw_core, masked_count = 0; mlxsw_reg_sbsr_pack(sbsr_pl, true); mlxsw_reg_sbsr_port_page_set(sbsr_pl, current_page); + first_local_port = current_page * MLXSW_REG_SBSR_NUM_PORTS_IN_PAGE; last_local_port = current_page * MLXSW_REG_SBSR_NUM_PORTS_IN_PAGE + MLXSW_REG_SBSR_NUM_PORTS_IN_PAGE - 1; @@ -1719,9 +1724,12 @@ int mlxsw_sp_sb_occ_max_clear(struct mlxsw_core *mlxsw_core, if (local_port != MLXSW_PORT_CPU_PORT) { /* Ingress quotas are not supported for the CPU port */ mlxsw_reg_sbsr_ingress_port_mask_set(sbsr_pl, - local_port, 1); + local_port - first_local_port, + 1); } - mlxsw_reg_sbsr_egress_port_mask_set(sbsr_pl, local_port, 1); + mlxsw_reg_sbsr_egress_port_mask_set(sbsr_pl, + local_port - first_local_port, + 1); for (i = 0; i < mlxsw_sp->sb_vals->pool_count; i++) { err = mlxsw_sp_sb_pm_occ_clear(mlxsw_sp, local_port, i, &bulk_list); From 339b84ab6b1d66900c27bd999271cb2ae40ce812 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Jun 2024 09:45:09 -0400 Subject: [PATCH 1286/1578] closures: Change BUG_ON() to WARN_ON() If a BUG_ON() can be hit in the wild, it shouldn't be a BUG_ON() For reference, this has popped up once in the CI, and we'll need more info to debug it: 03240 ------------[ cut here ]------------ 03240 kernel BUG at lib/closure.c:21! 03240 kernel BUG at lib/closure.c:21! 03240 Internal error: Oops - BUG: 00000000f2000800 [#1] SMP 03240 Modules linked in: 03240 CPU: 15 PID: 40534 Comm: kworker/u80:1 Not tainted 6.10.0-rc4-ktest-ga56da69799bd #25570 03240 Hardware name: linux,dummy-virt (DT) 03240 Workqueue: btree_update btree_interior_update_work 03240 pstate: 00001005 (nzcv daif -PAN -UAO -TCO -DIT +SSBS BTYPE=--) 03240 pc : closure_put+0x224/0x2a0 03240 lr : closure_put+0x24/0x2a0 03240 sp : ffff0000d12071c0 03240 x29: ffff0000d12071c0 x28: dfff800000000000 x27: ffff0000d1207360 03240 x26: 0000000000000040 x25: 0000000000000040 x24: 0000000000000040 03240 x23: ffff0000c1f20180 x22: 0000000000000000 x21: ffff0000c1f20168 03240 x20: 0000000040000000 x19: ffff0000c1f20140 x18: 0000000000000001 03240 x17: 0000000000003aa0 x16: 0000000000003ad0 x15: 1fffe0001c326974 03240 x14: 0000000000000a1e x13: 0000000000000000 x12: 1fffe000183e402d 03240 x11: ffff6000183e402d x10: dfff800000000000 x9 : ffff6000183e402e 03240 x8 : 0000000000000001 x7 : 00009fffe7c1bfd3 x6 : ffff0000c1f2016b 03240 x5 : ffff0000c1f20168 x4 : ffff6000183e402e x3 : ffff800081391954 03240 x2 : 0000000000000001 x1 : 0000000000000000 x0 : 00000000a8000000 03240 Call trace: 03240 closure_put+0x224/0x2a0 03240 bch2_check_for_deadlock+0x910/0x1028 03240 bch2_six_check_for_deadlock+0x1c/0x30 03240 six_lock_slowpath.isra.0+0x29c/0xed0 03240 six_lock_ip_waiter+0xa8/0xf8 03240 __bch2_btree_node_lock_write+0x14c/0x298 03240 bch2_trans_lock_write+0x6d4/0xb10 03240 __bch2_trans_commit+0x135c/0x5520 03240 btree_interior_update_work+0x1248/0x1c10 03240 process_scheduled_works+0x53c/0xd90 03240 worker_thread+0x370/0x8c8 03240 kthread+0x258/0x2e8 03240 ret_from_fork+0x10/0x20 03240 Code: aa1303e0 d63f0020 a94363f7 17ffff8c (d4210000) 03240 ---[ end trace 0000000000000000 ]--- 03240 Kernel panic - not syncing: Oops - BUG: Fatal exception 03240 SMP: stopping secondary CPUs 03241 SMP: failed to stop secondary CPUs 13,15 03241 Kernel Offset: disabled 03241 CPU features: 0x00,00000003,80000008,4240500b 03241 Memory Limit: none 03241 ---[ end Kernel panic - not syncing: Oops - BUG: Fatal exception ]--- 03246 ========= FAILED TIMEOUT copygc_torture_no_checksum in 7200s Signed-off-by: Kent Overstreet --- lib/closure.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/lib/closure.c b/lib/closure.c index 07409e9e35a53e..2e1ee9fdec081b 100644 --- a/lib/closure.c +++ b/lib/closure.c @@ -17,12 +17,18 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) { int r = flags & CLOSURE_REMAINING_MASK; - BUG_ON(flags & CLOSURE_GUARD_MASK); - BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); + if (WARN(flags & CLOSURE_GUARD_MASK, + "closure has guard bits set: %x (%u)", + flags & CLOSURE_GUARD_MASK, (unsigned) __fls(r))) + r &= ~CLOSURE_GUARD_MASK; if (!r) { smp_acquire__after_ctrl_dep(); + WARN(flags & ~CLOSURE_DESTRUCTOR, + "closure ref hit 0 with incorrect flags set: %x (%u)", + flags & ~CLOSURE_DESTRUCTOR, (unsigned) __fls(flags)); + cl->closure_get_happened = false; if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { From f648b6c12b70af9d24a293617102729cee6b7862 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Jun 2024 10:04:35 -0400 Subject: [PATCH 1287/1578] bcachefs: Fix missing alloc_data_type_set() Incorrect bucket state transition in the discard path; when incrementing a bucket's generation number that had already been discarded, we were forgetting to check if it should be need_gc_gens, not free. This was caught by the .invalid checks in the transaction commit path, causing us to go emergency read only. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 7b5909764d1481..e5e7d33f4a5ef9 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -776,6 +776,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) { new_a->gen++; SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); + alloc_data_type_set(new_a, new_a->data_type); } if (old_a->data_type != new_a->data_type || @@ -1796,8 +1797,9 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, } SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); - alloc_data_type_set(&a->v, a->v.data_type); write: + alloc_data_type_set(&a->v, a->v.data_type); + ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, BCH_WATERMARK_btree| From 504794067fc266be5ac170777a94a927a72ac846 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 26 May 2024 22:52:22 -0400 Subject: [PATCH 1288/1578] bcachefs: Replace bare EEXIST with private error codes Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- fs/bcachefs/errcode.h | 3 +++ fs/bcachefs/fs-ioctl.c | 2 +- fs/bcachefs/str_hash.h | 2 +- fs/bcachefs/super.c | 11 ++++++----- 5 files changed, 12 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index e5e7d33f4a5ef9..8dec2c6cbb7eb0 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1643,7 +1643,7 @@ static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket) mutex_lock(&c->discard_buckets_in_flight_lock); darray_for_each(c->discard_buckets_in_flight, i) if (bkey_eq(*i, bucket)) { - ret = -EEXIST; + ret = -BCH_ERR_EEXIST_discard_in_flight_add; goto out; } diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index dbe35b80bc0b89..58612abf7927af 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -116,6 +116,9 @@ x(ENOENT, ENOENT_dev_idx_not_found) \ x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \ x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \ + x(EEXIST, EEXIST_str_hash_set) \ + x(EEXIST, EEXIST_discard_in_flight_add) \ + x(EEXIST, EEXIST_subvolume_create) \ x(0, open_buckets_empty) \ x(0, freelist_empty) \ x(BCH_ERR_freelist_empty, no_buckets_found) \ diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 3551a737181b2e..79a0c8732bced9 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -373,7 +373,7 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, } if (dst_dentry->d_inode) { - error = -EEXIST; + error = -BCH_ERR_EEXIST_subvolume_create; goto err3; } diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index cbad9b27874fed..c8c266cb579726 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -300,7 +300,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans, if (!found && (flags & STR_HASH_must_replace)) { ret = -BCH_ERR_ENOENT_str_hash_set_must_replace; } else if (found && (flags & STR_HASH_must_create)) { - ret = -EEXIST; + ret = -BCH_ERR_EEXIST_str_hash_set; } else { if (!found && slot.path) swap(iter, slot); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 635da5b3439cf2..9083df82073a56 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -931,12 +931,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (ret) goto err; - for (i = 0; i < c->sb.nr_devices; i++) - if (bch2_member_exists(c->disk_sb.sb, i) && - bch2_dev_alloc(c, i)) { - ret = -EEXIST; + for (i = 0; i < c->sb.nr_devices; i++) { + if (!bch2_member_exists(c->disk_sb.sb, i)) + continue; + ret = bch2_dev_alloc(c, i); + if (ret) goto err; - } + } bch2_journal_entry_res_resize(&c->journal, &c->btree_root_journal_res, From dd9086487c1bb38641bcfbe765422c7f0a1a8d95 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Jun 2024 13:20:49 -0400 Subject: [PATCH 1289/1578] bcachefs: Fix I_NEW warning in race path in bch2_inode_insert() discard_new_inode() is the correct interface for tearing down an indoe that was fully created but not made visible to other threads, but it expects I_NEW to be set, which we don't use. Reported-by: https://github.com/koverstreet/bcachefs/issues/690 Fixes: bcachefs: Fix race path in bch2_inode_insert() Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 8314d3e1582d31..615ef8305c6eb4 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -188,6 +188,12 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino BUG_ON(!old); if (unlikely(old != inode)) { + /* + * bcachefs doesn't use I_NEW; we have no use for it since we + * only insert fully created inodes in the inode hash table. But + * discard_new_inode() expects it to be set... + */ + inode->v.i_flags |= I_NEW; discard_new_inode(&inode->v); inode = old; } else { @@ -195,8 +201,10 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); mutex_unlock(&c->vfs_inodes_lock); /* - * we really don't want insert_inode_locked2() to be setting - * I_NEW... + * Again, I_NEW makes no sense for bcachefs. This is only needed + * for clearing I_NEW, but since the inode was already fully + * created and initialized we didn't actually want + * inode_insert5() to set it for us. */ unlock_new_inode(&inode->v); } From e6b3a655ac7ba5282b1504851488236865804cb8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Jun 2024 13:10:34 -0400 Subject: [PATCH 1290/1578] bcachefs: Use bch2_print_string_as_lines for long err printk strings get truncated to 1024 bytes; if we have a long error message (journal debug info) we need to use a helper. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index cdcb1ad49af42e..492426c8d869ac 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1967,7 +1967,6 @@ CLOSURE_CALLBACK(bch2_journal_write) struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_replicas_padded replicas; - struct printbuf journal_debug_buf = PRINTBUF; unsigned nr_rw_members = 0; int ret; @@ -2011,11 +2010,15 @@ CLOSURE_CALLBACK(bch2_journal_write) } if (ret) { - __bch2_journal_debug_to_text(&journal_debug_buf, j); + struct printbuf buf = PRINTBUF; + buf.atomic++; + + prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write: %s"), + bch2_err_str(ret)); + __bch2_journal_debug_to_text(&buf, j); spin_unlock(&j->lock); - bch_err(c, "Unable to allocate journal write:\n%s", - journal_debug_buf.buf); - printbuf_exit(&journal_debug_buf); + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); goto err; } From 2fe79ce7d1e8ec5059e7dfc15f3c769ae9679569 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Jun 2024 19:42:39 -0400 Subject: [PATCH 1291/1578] bcachefs: Fix a UAF after write_super() write_super() may reallocate the superblock buffer - but bch_sb_field_ext was referencing it; don't use it after the write_super call. Reported-by: syzbot+8992fc10a192067b8d8a@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index e632da69196ccf..1f9d044ed92077 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -664,10 +664,10 @@ int bch2_fs_recovery(struct bch_fs *c) if (check_version_upgrade(c)) write_sb = true; + c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + if (write_sb) bch2_write_super(c); - - c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); mutex_unlock(&c->sb_lock); if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) From bd4da0462ea7bf26b2a5df5528ec20c550f7ec41 Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Tue, 4 Jun 2024 16:46:10 +0800 Subject: [PATCH 1292/1578] bcachefs: Move the ei_flags setting to after initialization `inode->ei_flags` setting and cleaning should be done after initialization, otherwise the operation is invalid. Fixes: 9ca4853b98af ("bcachefs: Fix quota support for snapshots") Signed-off-by: Youling Tang Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 615ef8305c6eb4..f9c9a95d7d4cad 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1497,11 +1497,6 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, bch2_iget5_set(&inode->v, &inum); bch2_inode_update_after_write(trans, inode, bi, ~0); - if (BCH_SUBVOLUME_SNAP(subvol)) - set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); - else - clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); - inode->v.i_blocks = bi->bi_sectors; inode->v.i_ino = bi->bi_inum; inode->v.i_rdev = bi->bi_dev; @@ -1513,6 +1508,9 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, inode->ei_qid = bch_qid(bi); inode->ei_subvol = inum.subvol; + if (BCH_SUBVOLUME_SNAP(subvol)) + set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); + inode->v.i_mapping->a_ops = &bch_address_space_operations; switch (inode->v.i_mode & S_IFMT) { From bfc6444b57dc7186b6acc964705d7516cbaf3904 Mon Sep 17 00:00:00 2001 From: Ian Ray Date: Thu, 20 Jun 2024 07:29:15 +0300 Subject: [PATCH 1293/1578] gpio: pca953x: fix pca953x_irq_bus_sync_unlock race Ensure that `i2c_lock' is held when setting interrupt latch and mask in pca953x_irq_bus_sync_unlock() in order to avoid races. The other (non-probe) call site pca953x_gpio_set_multiple() ensures the lock is held before calling pca953x_write_regs(). The problem occurred when a request raced against irq_bus_sync_unlock() approximately once per thousand reboots on an i.MX8MP based system. * Normal case 0-0022: write register AI|3a {03,02,00,00,01} Input latch P0 0-0022: write register AI|49 {fc,fd,ff,ff,fe} Interrupt mask P0 0-0022: write register AI|08 {ff,00,00,00,00} Output P3 0-0022: write register AI|12 {fc,00,00,00,00} Config P3 * Race case 0-0022: write register AI|08 {ff,00,00,00,00} Output P3 0-0022: write register AI|08 {03,02,00,00,01} *** Wrong register *** 0-0022: write register AI|12 {fc,00,00,00,00} Config P3 0-0022: write register AI|49 {fc,fd,ff,ff,fe} Interrupt mask P0 Signed-off-by: Ian Ray Link: https://lore.kernel.org/r/20240620042915.2173-1-ian.ray@gehealthcare.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-pca953x.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpio/gpio-pca953x.c b/drivers/gpio/gpio-pca953x.c index 77a2812f297461..732a6964748c1b 100644 --- a/drivers/gpio/gpio-pca953x.c +++ b/drivers/gpio/gpio-pca953x.c @@ -758,6 +758,8 @@ static void pca953x_irq_bus_sync_unlock(struct irq_data *d) int level; if (chip->driver_data & PCA_PCAL) { + guard(mutex)(&chip->i2c_lock); + /* Enable latch on interrupt-enabled inputs */ pca953x_write_regs(chip, PCAL953X_IN_LATCH, chip->irq_mask); From 4ac9056e4bd787f1ba2001167c5acc2b5a75ddf9 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 21 Jun 2024 12:15:04 +0900 Subject: [PATCH 1294/1578] null_blk: Do not set disk->nr_zones In null_register_zoned_dev(), there is no need to set disk->nr_zones as the now uncoditional call to blk_revalidate_disk_zones() will do that. So remove the assignment using bdev_nr_zones(). Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240621031506.759397-2-dlemoal@kernel.org Signed-off-by: Jens Axboe --- drivers/block/null_blk/zoned.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index b42c00f1313254..9f7151ad93cfc4 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -171,8 +171,6 @@ int null_register_zoned_dev(struct nullb *nullb) struct request_queue *q = nullb->q; struct gendisk *disk = nullb->disk; - disk->nr_zones = bdev_nr_zones(disk->part0); - pr_info("%s: using %s zone append\n", disk->disk_name, queue_emulates_zone_append(q) ? "emulated" : "native"); From b6cfe2287df6e26d685af8a8a96ed1bf87bdde28 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 21 Jun 2024 12:15:05 +0900 Subject: [PATCH 1295/1578] block: Define bdev_nr_zones() as an inline function There is no need for bdev_nr_zones() to be an exported function calculating the number of zones of a block device. Instead, given that all callers use this helper with a fully initialized block device that has a gendisk, we can redefine this function as an inline helper in blkdev.h. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240621031506.759397-3-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 18 ------------------ include/linux/blkdev.h | 6 +++++- 2 files changed, 5 insertions(+), 19 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 8f89705f5e1c50..601c21a224c93e 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -115,24 +115,6 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) } EXPORT_SYMBOL_GPL(blk_zone_cond_str); -/** - * bdev_nr_zones - Get number of zones - * @bdev: Target device - * - * Return the total number of zones of a zoned block device. For a block - * device without zone capabilities, the number of zones is always 0. - */ -unsigned int bdev_nr_zones(struct block_device *bdev) -{ - sector_t zone_sectors = bdev_zone_sectors(bdev); - - if (!bdev_is_zoned(bdev)) - return 0; - return (bdev_nr_sectors(bdev) + zone_sectors - 1) >> - ilog2(zone_sectors); -} -EXPORT_SYMBOL_GPL(bdev_nr_zones); - /** * blkdev_report_zones - Get zones information * @bdev: Target block device diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2800231046a98b..1078a7d5129541 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -673,7 +673,6 @@ static inline bool blk_queue_is_zoned(struct request_queue *q) } #ifdef CONFIG_BLK_DEV_ZONED -unsigned int bdev_nr_zones(struct block_device *bdev); static inline unsigned int disk_nr_zones(struct gendisk *disk) { @@ -687,6 +686,11 @@ static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) return sector >> ilog2(disk->queue->limits.chunk_sectors); } +static inline unsigned int bdev_nr_zones(struct block_device *bdev) +{ + return disk_nr_zones(bdev->bd_disk); +} + static inline unsigned int bdev_max_open_zones(struct block_device *bdev) { return bdev->bd_disk->queue->limits.max_open_zones; From caaf7101c01a91a882d3da2f566579dda692367d Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 21 Jun 2024 12:15:06 +0900 Subject: [PATCH 1296/1578] block: Cleanup block device zone helpers There is no need to conditionally define on CONFIG_BLK_DEV_ZONED the inline helper functions bdev_nr_zones(), bdev_max_open_zones(), bdev_max_active_zones() and disk_zone_no() as these function will return the correct valu in all cases (zoned device or not, including when CONFIG_BLK_DEV_ZONED is not set). Furthermore, disk_nr_zones() definition can be simplified as disk->nr_zones is always 0 for regular block devices. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240621031506.759397-4-dlemoal@kernel.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 44 ++++++++++++------------------------------ 1 file changed, 12 insertions(+), 32 deletions(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1078a7d5129541..e89003360c17c7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -673,11 +673,21 @@ static inline bool blk_queue_is_zoned(struct request_queue *q) } #ifdef CONFIG_BLK_DEV_ZONED - static inline unsigned int disk_nr_zones(struct gendisk *disk) { - return blk_queue_is_zoned(disk->queue) ? disk->nr_zones : 0; + return disk->nr_zones; +} +bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs); +#else /* CONFIG_BLK_DEV_ZONED */ +static inline unsigned int disk_nr_zones(struct gendisk *disk) +{ + return 0; +} +static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) +{ + return false; } +#endif /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) { @@ -701,36 +711,6 @@ static inline unsigned int bdev_max_active_zones(struct block_device *bdev) return bdev->bd_disk->queue->limits.max_active_zones; } -bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs); -#else /* CONFIG_BLK_DEV_ZONED */ -static inline unsigned int bdev_nr_zones(struct block_device *bdev) -{ - return 0; -} - -static inline unsigned int disk_nr_zones(struct gendisk *disk) -{ - return 0; -} -static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) -{ - return 0; -} -static inline unsigned int bdev_max_open_zones(struct block_device *bdev) -{ - return 0; -} - -static inline unsigned int bdev_max_active_zones(struct block_device *bdev) -{ - return 0; -} -static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) -{ - return false; -} -#endif /* CONFIG_BLK_DEV_ZONED */ - static inline unsigned int blk_queue_depth(struct request_queue *q) { if (q->queue_depth) From c45fcf46ca2368dafe7e5c513a711a6f0f974308 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 21 Jun 2024 16:37:12 +0200 Subject: [PATCH 1297/1578] pwm: stm32: Refuse too small period requests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If period_ns is small, prd might well become 0. Catch that case because otherwise with regmap_write(priv->regmap, TIM_ARR, prd - 1); a few lines down quite a big period is configured. Fixes: 7edf7369205b ("pwm: Add driver for STM32 plaftorm") Cc: stable@vger.kernel.org Reviewed-by: Trevor Gamblin Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/b86f62f099983646f97eeb6bfc0117bb2d0c340d.1718979150.git.u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-stm32.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pwm/pwm-stm32.c b/drivers/pwm/pwm-stm32.c index a2f231d13a9f7c..3e7b2a8e34e7dd 100644 --- a/drivers/pwm/pwm-stm32.c +++ b/drivers/pwm/pwm-stm32.c @@ -337,6 +337,8 @@ static int stm32_pwm_config(struct stm32_pwm *priv, unsigned int ch, prd = mul_u64_u64_div_u64(period_ns, clk_get_rate(priv->clk), (u64)NSEC_PER_SEC * (prescaler + 1)); + if (!prd) + return -EINVAL; /* * All channels share the same prescaler and counter so when two From f80a55fa90fa76d01e3fffaa5d0413e522ab9a00 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 17 Jun 2024 09:27:27 +0200 Subject: [PATCH 1298/1578] nvme: fixup comment for nvme RDMA Provider Type PRTYPE is the provider type, not the QP service type. Fixes: eb793e2c9286 ("nvme.h: add NVMe over Fabrics definitions") Signed-off-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 42557320229535..69ac2abf8acfee 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -87,8 +87,8 @@ enum { NVMF_RDMA_QPTYPE_DATAGRAM = 2, /* Reliable Datagram */ }; -/* RDMA QP Service Type codes for Discovery Log Page entry TSAS - * RDMA_QPTYPE field +/* RDMA Provider Type codes for Discovery Log Page entry TSAS + * RDMA_PRTYPE field */ enum { NVMF_RDMA_PRTYPE_NOT_SPECIFIED = 1, /* No Provider Specified */ From 0f1f5803920d2a6b88bee950914fd37421e17170 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 17 Jun 2024 09:27:28 +0200 Subject: [PATCH 1299/1578] nvmet: make 'tsas' attribute idempotent for RDMA The RDMA transport defines values for TSAS, but it cannot be changed as we only support the 'connected' mode. So to avoid errors during reconfiguration we should allow to write the current value. Fixes: 3f123494db72 ("nvmet: make TCP sectype settable via configfs") Signed-off-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/configfs.c | 39 ++++++++++++++++++++++++++-------- include/linux/nvme.h | 2 ++ 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index e60224356048a4..685e89b35d330d 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -413,25 +413,46 @@ static ssize_t nvmet_addr_tsas_show(struct config_item *item, return sprintf(page, "\n"); } +static u8 nvmet_addr_tsas_rdma_store(const char *page) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(nvmet_addr_tsas_rdma); i++) { + if (sysfs_streq(page, nvmet_addr_tsas_rdma[i].name)) + return nvmet_addr_tsas_rdma[i].type; + } + return NVMF_RDMA_QPTYPE_INVALID; +} + +static u8 nvmet_addr_tsas_tcp_store(const char *page) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(nvmet_addr_tsas_tcp); i++) { + if (sysfs_streq(page, nvmet_addr_tsas_tcp[i].name)) + return nvmet_addr_tsas_tcp[i].type; + } + return NVMF_TCP_SECTYPE_INVALID; +} + static ssize_t nvmet_addr_tsas_store(struct config_item *item, const char *page, size_t count) { struct nvmet_port *port = to_nvmet_port(item); u8 treq = nvmet_port_disc_addr_treq_mask(port); - u8 sectype; - int i; + u8 sectype, qptype; if (nvmet_is_port_enabled(port, __func__)) return -EACCES; - if (port->disc_addr.trtype != NVMF_TRTYPE_TCP) - return -EINVAL; - - for (i = 0; i < ARRAY_SIZE(nvmet_addr_tsas_tcp); i++) { - if (sysfs_streq(page, nvmet_addr_tsas_tcp[i].name)) { - sectype = nvmet_addr_tsas_tcp[i].type; + if (port->disc_addr.trtype == NVMF_TRTYPE_RDMA) { + qptype = nvmet_addr_tsas_rdma_store(page); + if (qptype == port->disc_addr.tsas.rdma.qptype) + return count; + } else if (port->disc_addr.trtype == NVMF_TRTYPE_TCP) { + sectype = nvmet_addr_tsas_tcp_store(page); + if (sectype != NVMF_TCP_SECTYPE_INVALID) goto found; - } } pr_err("Invalid value '%s' for tsas\n", page); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 69ac2abf8acfee..c693ac344ec05c 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -85,6 +85,7 @@ enum { enum { NVMF_RDMA_QPTYPE_CONNECTED = 1, /* Reliable Connected */ NVMF_RDMA_QPTYPE_DATAGRAM = 2, /* Reliable Datagram */ + NVMF_RDMA_QPTYPE_INVALID = 0xff, }; /* RDMA Provider Type codes for Discovery Log Page entry TSAS @@ -110,6 +111,7 @@ enum { NVMF_TCP_SECTYPE_NONE = 0, /* No Security */ NVMF_TCP_SECTYPE_TLS12 = 1, /* TLSv1.2, NVMe-oF 1.1 and NVMe-TCP 3.6.1.1 */ NVMF_TCP_SECTYPE_TLS13 = 2, /* TLSv1.3, NVMe-oF 1.1 and NVMe-TCP 3.6.1.1 */ + NVMF_TCP_SECTYPE_INVALID = 0xff, }; #define NVME_AQ_DEPTH 32 From 5337ac4c9b807bc46baa0713121a0afa8beacd70 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 18 Jun 2024 18:18:58 -0700 Subject: [PATCH 1300/1578] bpf: Fix the corner case with may_goto and jump to the 1st insn. When the following program is processed by the verifier: L1: may_goto L2 goto L1 L2: w0 = 0 exit the may_goto insn is first converted to: L1: r11 = *(u64 *)(r10 -8) if r11 == 0x0 goto L2 r11 -= 1 *(u64 *)(r10 -8) = r11 goto L1 L2: w0 = 0 exit then later as the last step the verifier inserts: *(u64 *)(r10 -8) = BPF_MAX_LOOPS as the first insn of the program to initialize loop count. When the first insn happens to be a branch target of some jmp the bpf_patch_insn_data() logic will produce: L1: *(u64 *)(r10 -8) = BPF_MAX_LOOPS r11 = *(u64 *)(r10 -8) if r11 == 0x0 goto L2 r11 -= 1 *(u64 *)(r10 -8) = r11 goto L1 L2: w0 = 0 exit because instruction patching adjusts all jmps and calls, but for this particular corner case it's incorrect and the L1 label should be one instruction down, like: *(u64 *)(r10 -8) = BPF_MAX_LOOPS L1: r11 = *(u64 *)(r10 -8) if r11 == 0x0 goto L2 r11 -= 1 *(u64 *)(r10 -8) = r11 goto L1 L2: w0 = 0 exit and that's what this patch is fixing. After bpf_patch_insn_data() call adjust_jmp_off() to adjust all jmps that point to newly insert BPF_ST insn to point to insn after. Note that bpf_patch_insn_data() cannot easily be changed to accommodate this logic, since jumps that point before or after a sequence of patched instructions have to be adjusted with the full length of the patch. Conceptually it's somewhat similar to "insert" of instructions between other instructions with weird semantics. Like "insert" before 1st insn would require adjustment of CALL insns to point to newly inserted 1st insn, but not an adjustment JMP insns that point to 1st, yet still adjusting JMP insns that cross over 1st insn (point to insn before or insn after), hence use simple adjust_jmp_off() logic to fix this corner case. Ideally bpf_patch_insn_data() would have an auxiliary info to say where 'the start of newly inserted patch is', but it would be too complex for backport. Fixes: 011832b97b31 ("bpf: Introduce may_goto instruction") Reported-by: Zac Ecob Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Eduard Zingerman Closes: https://lore.kernel.org/bpf/CAADnVQJ_WWx8w4b=6Gc2EpzAjgv+6A0ridnMz2TvS2egj4r3Gw@mail.gmail.com/ Link: https://lore.kernel.org/bpf/20240619011859.79334-1-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 50 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e0a398a97d32e4..5586a571bf551e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12721,6 +12721,16 @@ static bool signed_add32_overflows(s32 a, s32 b) return res < a; } +static bool signed_add16_overflows(s16 a, s16 b) +{ + /* Do the add in u16, where overflow is well-defined */ + s16 res = (s16)((u16)a + (u16)b); + + if (b < 0) + return res > a; + return res < a; +} + static bool signed_sub_overflows(s64 a, s64 b) { /* Do the sub in u64, where overflow is well-defined */ @@ -18732,6 +18742,39 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of return new_prog; } +/* + * For all jmp insns in a given 'prog' that point to 'tgt_idx' insn adjust the + * jump offset by 'delta'. + */ +static int adjust_jmp_off(struct bpf_prog *prog, u32 tgt_idx, u32 delta) +{ + struct bpf_insn *insn = prog->insnsi; + u32 insn_cnt = prog->len, i; + + for (i = 0; i < insn_cnt; i++, insn++) { + u8 code = insn->code; + + if ((BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) || + BPF_OP(code) == BPF_CALL || BPF_OP(code) == BPF_EXIT) + continue; + + if (insn->code == (BPF_JMP32 | BPF_JA)) { + if (i + 1 + insn->imm != tgt_idx) + continue; + if (signed_add32_overflows(insn->imm, delta)) + return -ERANGE; + insn->imm += delta; + } else { + if (i + 1 + insn->off != tgt_idx) + continue; + if (signed_add16_overflows(insn->imm, delta)) + return -ERANGE; + insn->off += delta; + } + } + return 0; +} + static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env, u32 off, u32 cnt) { @@ -20548,6 +20591,13 @@ static int do_misc_fixups(struct bpf_verifier_env *env) if (!new_prog) return -ENOMEM; env->prog = prog = new_prog; + /* + * If may_goto is a first insn of a prog there could be a jmp + * insn that points to it, hence adjust all such jmps to point + * to insn after BPF_ST that inits may_goto count. + * Adjustment will succeed because bpf_patch_insn_data() didn't fail. + */ + WARN_ON(adjust_jmp_off(env->prog, subprog_start, 1)); } /* Since poke tab is now finalized, publish aux to tracker. */ From 2673315947c9f3890ad34a8196f62142e4ddef5a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 18 Jun 2024 18:18:59 -0700 Subject: [PATCH 1301/1578] selftests/bpf: Tests with may_goto and jumps to the 1st insn Add few tests with may_goto and jumps to the 1st insn. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20240619011859.79334-2-alexei.starovoitov@gmail.com --- .../bpf/progs/verifier_iterating_callbacks.c | 94 +++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c index bd676d7e615fd5..8885e5239d6b73 100644 --- a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c +++ b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c @@ -307,6 +307,100 @@ int iter_limit_bug(struct __sk_buff *skb) return 0; } +SEC("socket") +__success __retval(0) +__naked void ja_and_may_goto(void) +{ + asm volatile (" \ +l0_%=: .byte 0xe5; /* may_goto */ \ + .byte 0; /* regs */ \ + .short 1; /* off 1 */ \ + .long 0; /* imm */ \ + goto l0_%=; \ + r0 = 0; \ + exit; \ +" ::: __clobber_common); +} + +SEC("socket") +__success __retval(0) +__naked void ja_and_may_goto2(void) +{ + asm volatile (" \ +l0_%=: r0 = 0; \ + .byte 0xe5; /* may_goto */ \ + .byte 0; /* regs */ \ + .short 1; /* off 1 */ \ + .long 0; /* imm */ \ + goto l0_%=; \ + r0 = 0; \ + exit; \ +" ::: __clobber_common); +} + +SEC("socket") +__success __retval(0) +__naked void jlt_and_may_goto(void) +{ + asm volatile (" \ +l0_%=: call %[bpf_jiffies64]; \ + .byte 0xe5; /* may_goto */ \ + .byte 0; /* regs */ \ + .short 1; /* off 1 */ \ + .long 0; /* imm */ \ + if r0 < 10 goto l0_%=; \ + r0 = 0; \ + exit; \ +" :: __imm(bpf_jiffies64) + : __clobber_all); +} + +#if (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ + (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64) || \ + defined(__TARGET_ARCH_arm) || defined(__TARGET_ARCH_s390) || \ + defined(__TARGET_ARCH_loongarch)) && \ + __clang_major__ >= 18 +SEC("socket") +__success __retval(0) +__naked void gotol_and_may_goto(void) +{ + asm volatile (" \ +l0_%=: r0 = 0; \ + .byte 0xe5; /* may_goto */ \ + .byte 0; /* regs */ \ + .short 1; /* off 1 */ \ + .long 0; /* imm */ \ + gotol l0_%=; \ + r0 = 0; \ + exit; \ +" ::: __clobber_common); +} +#endif + +SEC("socket") +__success __retval(0) +__naked void ja_and_may_goto_subprog(void) +{ + asm volatile (" \ + call subprog_with_may_goto; \ + exit; \ +" ::: __clobber_all); +} + +static __naked __noinline __used +void subprog_with_may_goto(void) +{ + asm volatile (" \ +l0_%=: .byte 0xe5; /* may_goto */ \ + .byte 0; /* regs */ \ + .short 1; /* off 1 */ \ + .long 0; /* imm */ \ + goto l0_%=; \ + r0 = 0; \ + exit; \ +" ::: __clobber_all); +} + #define ARR_SZ 1000000 int zero; char arr[ARR_SZ]; From 8324bb755a80d463ff53379e5d64991656512069 Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 21 Jun 2024 18:30:16 +0000 Subject: [PATCH 1302/1578] block: Fix blk_validate_atomic_write_limits() build for arm32 For arm32, we get the following build warning: In file included from /tmp/next/build/include/linux/printk.h:10, from /tmp/next/build/include/linux/kernel.h:31, from /tmp/next/build/block/blk-settings.c:5: /tmp/next/build/block/blk-settings.c: In function 'blk_validate_atomic_write_limits': /tmp/next/build/include/asm-generic/div64.h:222:35: warning: comparison of distinct pointer types lacks a cast 222 | (void)(((typeof((n)) *)0) == ((uint64_t *)0)); \ | ^~ The divident for do_div() should be 64b, which it is not. Since we want to check 2x unsigned ints, just use % operator. This allows us to drop the chunk_sectors variable. Fixes: 9da3d1e912f3 ("block: Add core atomic write support") Reported-by: Mark Brown Closes: https://lore.kernel.org/linux-next/b765d200-4e0f-48b1-a962-7dfa1c4aef9c@kernel.dk/T/#mbf067b1edd89c7f9d7dac6e258c516199953a108 Signed-off-by: John Garry Link: https://lore.kernel.org/r/20240621183016.3092518-1-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- block/blk-settings.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 37fe4c8f6b6ba1..ec7dbe93d5c324 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -175,7 +175,6 @@ static void blk_atomic_writes_update_limits(struct queue_limits *lim) static void blk_validate_atomic_write_limits(struct queue_limits *lim) { - unsigned int chunk_sectors = lim->chunk_sectors; unsigned int boundary_sectors; if (!lim->atomic_write_hw_max) @@ -197,7 +196,7 @@ static void blk_validate_atomic_write_limits(struct queue_limits *lim) * Devices which do not conform to these rules can be dealt * with if and when they show up. */ - if (WARN_ON_ONCE(do_div(chunk_sectors, boundary_sectors))) + if (WARN_ON_ONCE(lim->chunk_sectors % boundary_sectors)) goto unsupported; /* From cfa1a2329a691ffd991fcf7248a57d752e712881 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 21 Jun 2024 16:08:27 +0200 Subject: [PATCH 1303/1578] bpf: Fix overrunning reservations in ringbuf The BPF ring buffer internally is implemented as a power-of-2 sized circular buffer, with two logical and ever-increasing counters: consumer_pos is the consumer counter to show which logical position the consumer consumed the data, and producer_pos which is the producer counter denoting the amount of data reserved by all producers. Each time a record is reserved, the producer that "owns" the record will successfully advance producer counter. In user space each time a record is read, the consumer of the data advanced the consumer counter once it finished processing. Both counters are stored in separate pages so that from user space, the producer counter is read-only and the consumer counter is read-write. One aspect that simplifies and thus speeds up the implementation of both producers and consumers is how the data area is mapped twice contiguously back-to-back in the virtual memory, allowing to not take any special measures for samples that have to wrap around at the end of the circular buffer data area, because the next page after the last data page would be first data page again, and thus the sample will still appear completely contiguous in virtual memory. Each record has a struct bpf_ringbuf_hdr { u32 len; u32 pg_off; } header for book-keeping the length and offset, and is inaccessible to the BPF program. Helpers like bpf_ringbuf_reserve() return `(void *)hdr + BPF_RINGBUF_HDR_SZ` for the BPF program to use. Bing-Jhong and Muhammad reported that it is however possible to make a second allocated memory chunk overlapping with the first chunk and as a result, the BPF program is now able to edit first chunk's header. For example, consider the creation of a BPF_MAP_TYPE_RINGBUF map with size of 0x4000. Next, the consumer_pos is modified to 0x3000 /before/ a call to bpf_ringbuf_reserve() is made. This will allocate a chunk A, which is in [0x0,0x3008], and the BPF program is able to edit [0x8,0x3008]. Now, lets allocate a chunk B with size 0x3000. This will succeed because consumer_pos was edited ahead of time to pass the `new_prod_pos - cons_pos > rb->mask` check. Chunk B will be in range [0x3008,0x6010], and the BPF program is able to edit [0x3010,0x6010]. Due to the ring buffer memory layout mentioned earlier, the ranges [0x0,0x4000] and [0x4000,0x8000] point to the same data pages. This means that chunk B at [0x4000,0x4008] is chunk A's header. bpf_ringbuf_submit() / bpf_ringbuf_discard() use the header's pg_off to then locate the bpf_ringbuf itself via bpf_ringbuf_restore_from_rec(). Once chunk B modified chunk A's header, then bpf_ringbuf_commit() refers to the wrong page and could cause a crash. Fix it by calculating the oldest pending_pos and check whether the range from the oldest outstanding record to the newest would span beyond the ring buffer size. If that is the case, then reject the request. We've tested with the ring buffer benchmark in BPF selftests (./benchs/run_bench_ringbufs.sh) before/after the fix and while it seems a bit slower on some benchmarks, it is still not significantly enough to matter. Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Reported-by: Bing-Jhong Billy Jheng Reported-by: Muhammad Ramdhan Co-developed-by: Bing-Jhong Billy Jheng Co-developed-by: Andrii Nakryiko Signed-off-by: Bing-Jhong Billy Jheng Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240621140828.18238-1-daniel@iogearbox.net --- kernel/bpf/ringbuf.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 0ee653a936ea00..e20b90c3613169 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -51,7 +51,8 @@ struct bpf_ringbuf { * This prevents a user-space application from modifying the * position and ruining in-kernel tracking. The permissions of the * pages depend on who is producing samples: user-space or the - * kernel. + * kernel. Note that the pending counter is placed in the same + * page as the producer, so that it shares the same cache line. * * Kernel-producer * --------------- @@ -70,6 +71,7 @@ struct bpf_ringbuf { */ unsigned long consumer_pos __aligned(PAGE_SIZE); unsigned long producer_pos __aligned(PAGE_SIZE); + unsigned long pending_pos; char data[] __aligned(PAGE_SIZE); }; @@ -179,6 +181,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) rb->mask = data_sz - 1; rb->consumer_pos = 0; rb->producer_pos = 0; + rb->pending_pos = 0; return rb; } @@ -404,9 +407,9 @@ bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr) static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) { - unsigned long cons_pos, prod_pos, new_prod_pos, flags; - u32 len, pg_off; + unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, flags; struct bpf_ringbuf_hdr *hdr; + u32 len, pg_off, tmp_size, hdr_len; if (unlikely(size > RINGBUF_MAX_RECORD_SZ)) return NULL; @@ -424,13 +427,29 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) spin_lock_irqsave(&rb->spinlock, flags); } + pend_pos = rb->pending_pos; prod_pos = rb->producer_pos; new_prod_pos = prod_pos + len; - /* check for out of ringbuf space by ensuring producer position - * doesn't advance more than (ringbuf_size - 1) ahead + while (pend_pos < prod_pos) { + hdr = (void *)rb->data + (pend_pos & rb->mask); + hdr_len = READ_ONCE(hdr->len); + if (hdr_len & BPF_RINGBUF_BUSY_BIT) + break; + tmp_size = hdr_len & ~BPF_RINGBUF_DISCARD_BIT; + tmp_size = round_up(tmp_size + BPF_RINGBUF_HDR_SZ, 8); + pend_pos += tmp_size; + } + rb->pending_pos = pend_pos; + + /* check for out of ringbuf space: + * - by ensuring producer position doesn't advance more than + * (ringbuf_size - 1) ahead + * - by ensuring oldest not yet committed record until newest + * record does not span more than (ringbuf_size - 1) */ - if (new_prod_pos - cons_pos > rb->mask) { + if (new_prod_pos - cons_pos > rb->mask || + new_prod_pos - pend_pos > rb->mask) { spin_unlock_irqrestore(&rb->spinlock, flags); return NULL; } From 6ddc9deacc1312762c2edd9de00ce76b00f69f7c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 19 Jun 2024 09:51:08 -0400 Subject: [PATCH 1304/1578] SUNRPC: Fix backchannel reply, again I still see "RPC: Could not send backchannel reply error: -110" quite often, along with slow-running tests. Debugging shows that the backchannel is still stumbling when it has to queue a callback reply on a busy transport. Note that every one of these timeouts causes a connection loss by virtue of the xprt_conditional_disconnect() call in that arm of call_cb_transmit_status(). I found that setting to_maxval is necessary to get the RPC timeout logic to behave whenever to_exponential is not set. Fixes: 57331a59ac0d ("NFSv4.1: Use the nfs_client's rpc timeouts for backchannel") Signed-off-by: Chuck Lever Reviewed-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- net/sunrpc/svc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 2b4b1276d4e867..d9cda1e53a017a 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1557,9 +1557,11 @@ void svc_process(struct svc_rqst *rqstp) */ void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp) { + struct rpc_timeout timeout = { + .to_increment = 0, + }; struct rpc_task *task; int proc_error; - struct rpc_timeout timeout; /* Build the svc_rqst used by the common processing routine */ rqstp->rq_xid = req->rq_xid; @@ -1612,6 +1614,7 @@ void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp) timeout.to_initval = req->rq_xprt->timeout->to_initval; timeout.to_retries = req->rq_xprt->timeout->to_retries; } + timeout.to_maxval = timeout.to_initval; memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf)); task = rpc_run_bc_task(req, &timeout); From 31392048f55f98cb01ca709d32d06d926ab9760a Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 19 Jun 2024 15:34:57 +0200 Subject: [PATCH 1305/1578] vxlan: Pull inner IP header in vxlan_xmit_one(). Ensure the inner IP header is part of the skb's linear data before setting old_iph. Otherwise, on a non-linear skb, old_iph could point outside of the packet data. Unlike classical VXLAN, which always encapsulates Ethernet packets, VXLAN-GPE can transport IP packets directly. In that case, we need to look at skb->protocol to figure out if an Ethernet header is present. Fixes: d342894c5d2f ("vxlan: virtual extensible lan") Signed-off-by: Guillaume Nault Link: https://patch.msgid.link/2aa75f6fa62ac9dbe4f16ad5ba75dd04a51d4b99.1718804000.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 567cb3faab709c..ba59e92ab941de 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2339,7 +2339,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, struct ip_tunnel_key *pkey; struct ip_tunnel_key key; struct vxlan_dev *vxlan = netdev_priv(dev); - const struct iphdr *old_iph = ip_hdr(skb); + const struct iphdr *old_iph; struct vxlan_metadata _md; struct vxlan_metadata *md = &_md; unsigned int pkt_len = skb->len; @@ -2353,8 +2353,15 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, bool use_cache; bool udp_sum = false; bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev)); + bool no_eth_encap; __be32 vni = 0; + no_eth_encap = flags & VXLAN_F_GPE && skb->protocol != htons(ETH_P_TEB); + if (!skb_vlan_inet_prepare(skb, no_eth_encap)) + goto drop; + + old_iph = ip_hdr(skb); + info = skb_tunnel_info(skb); use_cache = ip_tunnel_dst_cache_usable(skb, info); From 0cf81c73e4c6a4861128a8f27861176ec312af4e Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 21 Jun 2024 17:22:40 -0500 Subject: [PATCH 1306/1578] counter: ti-eqep: enable clock at probe The TI eQEP clock is both a functional and interface clock. Since it is required for the device to function, we should be enabling it at probe. Up to now, we've just been lucky that the clock was enabled by something else on the system already. Fixes: f213729f6796 ("counter: new TI eQEP driver") Reviewed-by: Judith Mendez Signed-off-by: David Lechner Link: https://lore.kernel.org/r/20240621-ti-eqep-enable-clock-v2-1-edd3421b54d4@baylibre.com Signed-off-by: William Breathitt Gray --- drivers/counter/ti-eqep.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/counter/ti-eqep.c b/drivers/counter/ti-eqep.c index 072b11fd6b32e8..825ae22c3ebc70 100644 --- a/drivers/counter/ti-eqep.c +++ b/drivers/counter/ti-eqep.c @@ -6,6 +6,7 @@ */ #include +#include #include #include #include @@ -376,6 +377,7 @@ static int ti_eqep_probe(struct platform_device *pdev) struct counter_device *counter; struct ti_eqep_cnt *priv; void __iomem *base; + struct clk *clk; int err; counter = devm_counter_alloc(dev, sizeof(*priv)); @@ -415,6 +417,10 @@ static int ti_eqep_probe(struct platform_device *pdev) pm_runtime_enable(dev); pm_runtime_get_sync(dev); + clk = devm_clk_get_enabled(dev, NULL); + if (IS_ERR(clk)) + return dev_err_probe(dev, PTR_ERR(clk), "failed to enable clock\n"); + err = counter_add(counter); if (err < 0) { pm_runtime_put_sync(dev); From d18b822c8f622ed37af7130088a0b7f1eb0b16e6 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 21 Jun 2024 09:30:08 +0200 Subject: [PATCH 1307/1578] docs: i2c: summary: start sentences consistently. Change the first paragraphs to contain only one space after the end of the previous sentence like in the rest of the document. Signed-off-by: Wolfram Sang Reviewed-by: Easwar Hariharan Signed-off-by: Wolfram Sang --- Documentation/i2c/summary.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/i2c/summary.rst b/Documentation/i2c/summary.rst index 786c618ba3befb..28ff80a2302bea 100644 --- a/Documentation/i2c/summary.rst +++ b/Documentation/i2c/summary.rst @@ -4,10 +4,10 @@ Introduction to I2C and SMBus I²C (pronounce: I squared C and written I2C in the kernel documentation) is a protocol developed by Philips. It is a slow two-wire protocol (variable -speed, up to 400 kHz), with a high speed extension (3.4 MHz). It provides +speed, up to 400 kHz), with a high speed extension (3.4 MHz). It provides an inexpensive bus for connecting many types of devices with infrequent or -low bandwidth communications needs. I2C is widely used with embedded -systems. Some systems use variants that don't meet branding requirements, +low bandwidth communications needs. I2C is widely used with embedded +systems. Some systems use variants that don't meet branding requirements, and so are not advertised as being I2C but come under different names, e.g. TWI (Two Wire Interface), IIC. @@ -18,14 +18,14 @@ access the PDF. An older version of the specification (revision 6) is archived `here `_. SMBus (System Management Bus) is based on the I2C protocol, and is mostly -a subset of I2C protocols and signaling. Many I2C devices will work on an +a subset of I2C protocols and signaling. Many I2C devices will work on an SMBus, but some SMBus protocols add semantics beyond what is required to -achieve I2C branding. Modern PC mainboards rely on SMBus. The most common +achieve I2C branding. Modern PC mainboards rely on SMBus. The most common devices connected through SMBus are RAM modules configured using I2C EEPROMs, and hardware monitoring chips. Because the SMBus is mostly a subset of the generalized I2C bus, we can -use its protocols on many I2C systems. However, there are systems that don't +use its protocols on many I2C systems. However, there are systems that don't meet both SMBus and I2C electrical constraints; and others which can't implement all the common SMBus protocol semantics or messages. From 75d148c90a34b94a3e3e7e7b2f30a689d8fbb7c8 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 21 Jun 2024 09:30:09 +0200 Subject: [PATCH 1308/1578] docs: i2c: summary: update I2C specification link MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Luckily, the specs are directly downloadable again, so update the link. Also update its title to the original name "I²C". Signed-off-by: Wolfram Sang Reviewed-by: Easwar Hariharan Signed-off-by: Wolfram Sang --- Documentation/i2c/summary.rst | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/Documentation/i2c/summary.rst b/Documentation/i2c/summary.rst index 28ff80a2302bea..e3ab1d414014d2 100644 --- a/Documentation/i2c/summary.rst +++ b/Documentation/i2c/summary.rst @@ -11,11 +11,9 @@ systems. Some systems use variants that don't meet branding requirements, and so are not advertised as being I2C but come under different names, e.g. TWI (Two Wire Interface), IIC. -The latest official I2C specification is the `"I2C-bus specification and user -manual" (UM10204) `_ -published by NXP Semiconductors. However, you need to log-in to the site to -access the PDF. An older version of the specification (revision 6) is archived -`here `_. +The latest official I2C specification is the `"I²C-bus specification and user +manual" (UM10204) `_ +published by NXP Semiconductors, version 7 as of this writing. SMBus (System Management Bus) is based on the I2C protocol, and is mostly a subset of I2C protocols and signaling. Many I2C devices will work on an From a5b88cb9fdff337a2867f0dff7c5cd23d4bd6663 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 21 Jun 2024 09:30:10 +0200 Subject: [PATCH 1309/1578] docs: i2c: summary: update speed mode description Fastest I2C mode is 5 MHz. Update the docs and reword the paragraph slightly. Signed-off-by: Wolfram Sang Reviewed-by: Easwar Hariharan Signed-off-by: Wolfram Sang --- Documentation/i2c/summary.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/i2c/summary.rst b/Documentation/i2c/summary.rst index e3ab1d414014d2..a1e5c0715f8b48 100644 --- a/Documentation/i2c/summary.rst +++ b/Documentation/i2c/summary.rst @@ -3,8 +3,8 @@ Introduction to I2C and SMBus ============================= I²C (pronounce: I squared C and written I2C in the kernel documentation) is -a protocol developed by Philips. It is a slow two-wire protocol (variable -speed, up to 400 kHz), with a high speed extension (3.4 MHz). It provides +a protocol developed by Philips. It is a two-wire protocol with variable +speed (typically up to 400 kHz, high speed modes up to 5 MHz). It provides an inexpensive bus for connecting many types of devices with infrequent or low bandwidth communications needs. I2C is widely used with embedded systems. Some systems use variants that don't meet branding requirements, From d77367fff7c0d67e20393a8236b519d5c48ee875 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 21 Jun 2024 09:30:11 +0200 Subject: [PATCH 1310/1578] docs: i2c: summary: document use of inclusive language We now have the updated I2C specs and our own Code of Conduct, so we have all we need to switch over to the inclusive terminology. Define them here. Signed-off-by: Wolfram Sang Reviewed-by: Easwar Hariharan Signed-off-by: Wolfram Sang --- Documentation/i2c/i2c_bus.svg | 15 ++++++++------- Documentation/i2c/summary.rst | 23 +++++++++++++++++------ 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/Documentation/i2c/i2c_bus.svg b/Documentation/i2c/i2c_bus.svg index 3170de976373cb..45801de4af7d9e 100644 --- a/Documentation/i2c/i2c_bus.svg +++ b/Documentation/i2c/i2c_bus.svg @@ -1,5 +1,6 @@ + I2CMaster + id="tspan1285">Controller Slave + id="tspan1287">Target Slave + id="tspan1287-6">Target Slave + id="tspan1287-9">Target Date: Fri, 21 Jun 2024 09:30:12 +0200 Subject: [PATCH 1311/1578] docs: i2c: summary: document 'local' and 'remote' targets Because Linux can be a target as well, add terminology to differentiate between Linux being the target and Linux accessing targets. Signed-off-by: Wolfram Sang Reviewed-by: Easwar Hariharan Signed-off-by: Wolfram Sang --- Documentation/i2c/summary.rst | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/Documentation/i2c/summary.rst b/Documentation/i2c/summary.rst index a6da1032fa0651..ff8bda32b9c3d2 100644 --- a/Documentation/i2c/summary.rst +++ b/Documentation/i2c/summary.rst @@ -49,10 +49,15 @@ whole class of I2C adapters. Each specific adapter driver either depends on an algorithm driver in the ``drivers/i2c/algos/`` subdirectory, or includes its own implementation. -A **target** chip is a node that responds to communications when addressed -by the controller. In Linux it is called a **client**. Client drivers are kept -in a directory specific to the feature they provide, for example -``drivers/media/gpio/`` for GPIO expanders and ``drivers/media/i2c/`` for +A **target** chip is a node that responds to communications when addressed by a +controller. In the Linux kernel implementation it is called a **client**. While +targets are usually separate external chips, Linux can also act as a target +(needs hardware support) and respond to another controller on the bus. This is +then called a **local target**. In contrast, an external chip is called a +**remote target**. + +Target drivers are kept in a directory specific to the feature they provide, +for example ``drivers/gpio/`` for GPIO expanders and ``drivers/media/i2c/`` for video-related chips. For the example configuration in figure, you will need a driver for your From 20738cb9fa7ad74c4f374c5b49c8189277df3a9d Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 21 Jun 2024 09:30:13 +0200 Subject: [PATCH 1312/1578] docs: i2c: summary: be clearer with 'controller/target' and 'adapter/client' pairs This not only includes rewording, but also where to put which emphasis on terms in this document. Signed-off-by: Wolfram Sang Reviewed-by: Easwar Hariharan Signed-off-by: Wolfram Sang --- Documentation/i2c/summary.rst | 41 ++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/Documentation/i2c/summary.rst b/Documentation/i2c/summary.rst index ff8bda32b9c3d2..579a1c7df200ee 100644 --- a/Documentation/i2c/summary.rst +++ b/Documentation/i2c/summary.rst @@ -31,9 +31,7 @@ implement all the common SMBus protocol semantics or messages. Terminology =========== -The I2C bus connects one or more *controller* chips and one or more *target* -chips. - +The I2C bus connects one or more controller chips and one or more target chips. .. kernel-figure:: i2c_bus.svg :alt: Simple I2C bus with one controller and 3 targets @@ -41,28 +39,37 @@ chips. Simple I2C bus A **controller** chip is a node that starts communications with targets. In the -Linux kernel implementation it is called an **adapter** or bus. Adapter -drivers are in the ``drivers/i2c/busses/`` subdirectory. +Linux kernel implementation it is also called an "adapter" or "bus". Controller +drivers are usually in the ``drivers/i2c/busses/`` subdirectory. -An **algorithm** contains general code that can be used to implement a -whole class of I2C adapters. Each specific adapter driver either depends on -an algorithm driver in the ``drivers/i2c/algos/`` subdirectory, or includes -its own implementation. +An **algorithm** contains general code that can be used to implement a whole +class of I2C controllers. Each specific controller driver either depends on an +algorithm driver in the ``drivers/i2c/algos/`` subdirectory, or includes its +own implementation. A **target** chip is a node that responds to communications when addressed by a -controller. In the Linux kernel implementation it is called a **client**. While -targets are usually separate external chips, Linux can also act as a target -(needs hardware support) and respond to another controller on the bus. This is -then called a **local target**. In contrast, an external chip is called a -**remote target**. +controller. In the Linux kernel implementation it is also called a "client". +While targets are usually separate external chips, Linux can also act as a +target (needs hardware support) and respond to another controller on the bus. +This is then called a **local target**. In contrast, an external chip is called +a **remote target**. Target drivers are kept in a directory specific to the feature they provide, for example ``drivers/gpio/`` for GPIO expanders and ``drivers/media/i2c/`` for video-related chips. -For the example configuration in figure, you will need a driver for your -I2C adapter, and drivers for your I2C devices (usually one driver for each -device). +For the example configuration in the figure above, you will need one driver for +the I2C controller, and drivers for your I2C targets. Usually one driver for +each target. + +Synonyms +-------- + +As mentioned above, the Linux I2C implementation historically uses the terms +"adapter" for controller and "client" for target. A number of data structures +have these synonyms in their name. So, when discussing implementation details, +you should be aware of these terms as well. The official wording is preferred, +though. Outdated terminology -------------------- From 49bbeb5719c2f56907d3a9623b47c6c15c2c431d Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 20 Jun 2024 10:23:12 -0500 Subject: [PATCH 1313/1578] ibmvnic: Free any outstanding tx skbs during scrq reset There are 2 types of outstanding tx skb's: Type 1: Packets that are sitting in the drivers ind_buff that are waiting to be batch sent to the NIC. During a device reset, these are freed with a call to ibmvnic_tx_scrq_clean_buffer() Type 2: Packets that have been sent to the NIC and are awaiting a TX completion IRQ. These are free'd during a reset with a call to clean_tx_pools() During any reset which requires us to free the tx irq, ensure that the Type 2 skb references are freed. Since the irq is released, it is impossible for the NIC to inform of any completions. Furthermore, later in the reset process is a call to init_tx_pools() which marks every entry in the tx pool as free (ie not outstanding). So if the driver is to make a call to init_tx_pools(), it must first be sure that the tx pool is empty of skb references. This issue was discovered by observing the following in the logs during EEH testing: TX free map points to untracked skb (tso_pool 0 idx=4) TX free map points to untracked skb (tso_pool 0 idx=5) TX free map points to untracked skb (tso_pool 1 idx=36) Fixes: 65d6470d139a ("ibmvnic: clean pending indirect buffs during reset") Signed-off-by: Nick Child Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 5e9a93bdb51886..5490f0f9c112b6 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -4061,6 +4061,12 @@ static void release_sub_crqs(struct ibmvnic_adapter *adapter, bool do_h_free) adapter->num_active_tx_scrqs = 0; } + /* Clean any remaining outstanding SKBs + * we freed the irq so we won't be hearing + * from them + */ + clean_tx_pools(adapter); + if (adapter->rx_scrq) { for (i = 0; i < adapter->num_active_rx_scrqs; i++) { if (!adapter->rx_scrq[i]) From dab8f9f0fe3aada61c0eb013dcf7d3ff75a2c336 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 21 Jun 2024 16:37:13 +0200 Subject: [PATCH 1314/1578] pwm: stm32: Fix calculation of prescaler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A small prescaler is beneficial, as this improves the resolution of the duty_cycle configuration. However if the prescaler is too small, the maximal possible period becomes considerably smaller than the requested value. One situation where this goes wrong is the following: With a parent clock rate of 208877930 Hz and max_arr = 0xffff = 65535, a request for period = 941243 ns currently results in PSC = 1. The value for ARR is then calculated to ARR = 941243 * 208877930 / (1000000000 * 2) - 1 = 98301 This value is bigger than 65535 however and so doesn't fit into the respective register field. In this particular case the PWM was configured for a period of 313733.4806027616 ns (with ARR = 98301 & 0xffff). Even if ARR was configured to its maximal value, only period = 627495.6861167669 ns would be achievable. Fix the calculation accordingly and adapt the comment to match the new algorithm. With the calculation fixed the above case results in PSC = 2 and so an actual period of 941229.1667195285 ns. Fixes: 8002fbeef1e4 ("pwm: stm32: Calculate prescaler with a division instead of a loop") Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/b4d96b79917617434a540df45f20cb5de4142f88.1718979150.git.u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-stm32.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/pwm/pwm-stm32.c b/drivers/pwm/pwm-stm32.c index 3e7b2a8e34e7dd..97d3de24f312fd 100644 --- a/drivers/pwm/pwm-stm32.c +++ b/drivers/pwm/pwm-stm32.c @@ -321,17 +321,23 @@ static int stm32_pwm_config(struct stm32_pwm *priv, unsigned int ch, * First we need to find the minimal value for prescaler such that * * period_ns * clkrate - * ------------------------------ + * ------------------------------ < max_arr + 1 * NSEC_PER_SEC * (prescaler + 1) * - * isn't bigger than max_arr. + * This equation is equivalent to + * + * period_ns * clkrate + * ---------------------------- < prescaler + 1 + * NSEC_PER_SEC * (max_arr + 1) + * + * Using integer division and knowing that the right hand side is + * integer, this is further equivalent to + * + * (period_ns * clkrate) // (NSEC_PER_SEC * (max_arr + 1)) ≤ prescaler */ prescaler = mul_u64_u64_div_u64(period_ns, clk_get_rate(priv->clk), - (u64)NSEC_PER_SEC * priv->max_arr); - if (prescaler > 0) - prescaler -= 1; - + (u64)NSEC_PER_SEC * ((u64)priv->max_arr + 1)); if (prescaler > MAX_TIM_PSC) return -EINVAL; From f01af3022d4a46362c5dda3d35dea939f3246d10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 21 Jun 2024 16:37:14 +0200 Subject: [PATCH 1315/1578] pwm: stm32: Fix error message to not describe the previous error path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "Failed to lock the clock" is an appropriate error message for clk_rate_exclusive_get() failing, but not for the clock running too fast for the driver's calculations. Adapt the error message accordingly. Fixes: d44d635635a7 ("pwm: stm32: Fix for settings using period > UINT32_MAX") Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/285182163211203fc823a65b180761f46e828dcb.1718979150.git.u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-stm32.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pwm/pwm-stm32.c b/drivers/pwm/pwm-stm32.c index 97d3de24f312fd..8bae3fd2b33065 100644 --- a/drivers/pwm/pwm-stm32.c +++ b/drivers/pwm/pwm-stm32.c @@ -681,7 +681,8 @@ static int stm32_pwm_probe(struct platform_device *pdev) * .apply() won't overflow. */ if (clk_get_rate(priv->clk) > 1000000000) - return dev_err_probe(dev, -EINVAL, "Failed to lock clock\n"); + return dev_err_probe(dev, -EINVAL, "Clock freq too high (%lu)\n", + clk_get_rate(priv->clk)); chip->ops = &stm32pwm_ops; From 9bd01500e4d8c3c3387076581c19b3987776d7af Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 22 Jun 2024 17:22:24 -0400 Subject: [PATCH 1316/1578] bcachefs: Fix freeing of error pointers This fixes incorrect/missign checking of strndup_user() returns. Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 9e54323f0f5fce..6d82e1165adc2d 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -216,7 +216,8 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a ret = PTR_ERR_OR_ZERO(optstr) ?: bch2_parse_mount_opts(NULL, &thr->opts, optstr); - kfree(optstr); + if (!IS_ERR(optstr)) + kfree(optstr); if (ret) goto err; @@ -319,7 +320,8 @@ static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) return ret; ret = bch2_dev_add(c, path); - kfree(path); + if (!IS_ERR(path)) + kfree(path); return ret; } @@ -850,7 +852,8 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c, ret = PTR_ERR_OR_ZERO(optstr) ?: bch2_parse_mount_opts(c, &thr->opts, optstr); - kfree(optstr); + if (!IS_ERR(optstr)) + kfree(optstr); if (ret) goto err; From 85f86c5ede7697162c54744258908e657e456f57 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Sat, 1 Jun 2024 23:18:16 +0100 Subject: [PATCH 1317/1578] cdrom: Add missing MODULE_DESCRIPTION() make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/cdrom/cdrom.o Add the missing MODULE_DESCRIPTION() macro invocation. Signed-off-by: Jeff Johnson Link: https://lore.kernel.org/lkml/20240530-cdrom-v1-1-51579c5c240a@quicinc.com Reviewed-by: Phillip Potter Link: https://lore.kernel.org/lkml/ZluYQbvrJkRlhnJC@KernelVM Signed-off-by: Phillip Potter Link: https://lore.kernel.org/r/20240601221816.136977-2-phil@philpotter.co.uk Signed-off-by: Jens Axboe --- drivers/cdrom/cdrom.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 20c90ebb3a3f61..49e4829b726479 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -3708,4 +3708,5 @@ static void __exit cdrom_exit(void) module_init(cdrom_init); module_exit(cdrom_exit); +MODULE_DESCRIPTION("Uniform CD-ROM driver"); MODULE_LICENSE("GPL"); From f44cc269a1c148ad83332d85fe54607e8874ca79 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 22 Jun 2024 20:52:39 -0400 Subject: [PATCH 1318/1578] bcachefs: fix seqmutex_relock() We were grabbing the sequence number before unlock incremented it - fix this by moving the increment to seqmutex_lock() (so the seqmutex_relock() failure path skips the mutex_trylock()), and returning the sequence number from unlock(), to make the API simpler and safer. Signed-off-by: Kent Overstreet --- fs/bcachefs/debug.c | 8 ++------ fs/bcachefs/seqmutex.h | 11 ++++------- 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 51cbf39283612c..8ec2d44e4956ed 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -575,7 +575,6 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, struct bch_fs *c = i->c; struct btree_trans *trans; ssize_t ret = 0; - u32 seq; i->ubuf = buf; i->size = size; @@ -589,8 +588,7 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, continue; closure_get(&trans->ref); - seq = seqmutex_seq(&c->btree_trans_lock); - seqmutex_unlock(&c->btree_trans_lock); + u32 seq = seqmutex_unlock(&c->btree_trans_lock); ret = flush_buf(i); if (ret) { @@ -811,7 +809,6 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, struct bch_fs *c = i->c; struct btree_trans *trans; ssize_t ret = 0; - u32 seq; i->ubuf = buf; i->size = size; @@ -828,8 +825,7 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, continue; closure_get(&trans->ref); - seq = seqmutex_seq(&c->btree_trans_lock); - seqmutex_unlock(&c->btree_trans_lock); + u32 seq = seqmutex_unlock(&c->btree_trans_lock); ret = flush_buf(i); if (ret) { diff --git a/fs/bcachefs/seqmutex.h b/fs/bcachefs/seqmutex.h index c1860d8163fb14..c4b3d8d3f4149c 100644 --- a/fs/bcachefs/seqmutex.h +++ b/fs/bcachefs/seqmutex.h @@ -19,17 +19,14 @@ static inline bool seqmutex_trylock(struct seqmutex *lock) static inline void seqmutex_lock(struct seqmutex *lock) { mutex_lock(&lock->lock); -} - -static inline void seqmutex_unlock(struct seqmutex *lock) -{ lock->seq++; - mutex_unlock(&lock->lock); } -static inline u32 seqmutex_seq(struct seqmutex *lock) +static inline u32 seqmutex_unlock(struct seqmutex *lock) { - return lock->seq; + u32 seq = lock->seq; + mutex_unlock(&lock->lock); + return seq; } static inline bool seqmutex_relock(struct seqmutex *lock, u32 seq) From 18e92841e87bc548fcb91530115a66e72eecb10c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 22 Jun 2024 20:59:09 -0400 Subject: [PATCH 1319/1578] bcachefs: Make btree_deadlock_to_text() clearer btree_deadlock_to_text() searches the list of btree transactions to find a deadlock - when it finds one it's done; it's not like other *_read() functions that's printing each object. Factor out btree_deadlock_to_text() to make this clearer. Signed-off-by: Kent Overstreet --- fs/bcachefs/debug.c | 52 +++++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 8ec2d44e4956ed..ecfdb21ebade9f 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -802,48 +802,54 @@ static const struct file_operations btree_transaction_stats_op = { .read = btree_transaction_stats_read, }; -static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) +/* walk btree transactions until we find a deadlock and print it */ +static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c) { - struct dump_iter *i = file->private_data; - struct bch_fs *c = i->c; struct btree_trans *trans; - ssize_t ret = 0; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - if (i->iter) - goto out; + pid_t iter = 0; restart: seqmutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { struct task_struct *task = READ_ONCE(trans->locking_wait.task); - if (!task || task->pid <= i->iter) + if (!task || task->pid <= iter) continue; + iter = task->pid; + closure_get(&trans->ref); - u32 seq = seqmutex_unlock(&c->btree_trans_lock); - ret = flush_buf(i); - if (ret) { - closure_put(&trans->ref); - goto out; - } + u32 seq = seqmutex_unlock(&c->btree_trans_lock); - bch2_check_for_deadlock(trans, &i->buf); - - i->iter = task->pid; + bool found = bch2_check_for_deadlock(trans, out) != 0; closure_put(&trans->ref); + if (found) + return; + if (!seqmutex_relock(&c->btree_trans_lock, seq)) goto restart; } seqmutex_unlock(&c->btree_trans_lock); -out: +} + +static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + ssize_t ret = 0; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + if (!i->iter) { + btree_deadlock_to_text(&i->buf, c); + i->iter++; + } + if (i->buf.allocation_failure) ret = -ENOMEM; From 06efa5f30c28eaf237247ca8c4cb46eb62cb6bd9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 22 Jun 2024 21:38:58 -0400 Subject: [PATCH 1320/1578] closures: closure_get_not_zero(), closure_return_sync() Provide new primitives for solving a lifetime issue with bcachefs btree_trans objects. closure_sync_return(): like closure_sync(), wait synchronously for any outstanding gets. like closure_return, the closure is considered "finished" and the ref left at 0. closure_get_not_zero(): get a ref on a closure if it's alive, i.e. the ref is not zero. Signed-off-by: Kent Overstreet --- include/linux/closure.h | 23 ++++++++++++++++++ lib/closure.c | 52 ++++++++++++++++++++++++++++++++++++----- 2 files changed, 69 insertions(+), 6 deletions(-) diff --git a/include/linux/closure.h b/include/linux/closure.h index 99155df162d03c..59b8c06b11ff33 100644 --- a/include/linux/closure.h +++ b/include/linux/closure.h @@ -284,6 +284,21 @@ static inline void closure_get(struct closure *cl) #endif } +/** + * closure_get_not_zero + */ +static inline bool closure_get_not_zero(struct closure *cl) +{ + unsigned old = atomic_read(&cl->remaining); + do { + if (!(old & CLOSURE_REMAINING_MASK)) + return false; + + } while (!atomic_try_cmpxchg_acquire(&cl->remaining, &old, old + 1)); + + return true; +} + /** * closure_init - Initialize a closure, setting the refcount to 1 * @cl: closure to initialize @@ -310,6 +325,12 @@ static inline void closure_init_stack(struct closure *cl) atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); } +static inline void closure_init_stack_release(struct closure *cl) +{ + memset(cl, 0, sizeof(struct closure)); + atomic_set_release(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); +} + /** * closure_wake_up - wake up all closures on a wait list, * with memory barrier @@ -355,6 +376,8 @@ do { \ */ #define closure_return(_cl) continue_at((_cl), NULL, NULL) +void closure_return_sync(struct closure *cl); + /** * continue_at_nobarrier - jump to another function without barrier * diff --git a/lib/closure.c b/lib/closure.c index 2e1ee9fdec081b..c971216d9d7742 100644 --- a/lib/closure.c +++ b/lib/closure.c @@ -13,7 +13,7 @@ #include #include -static inline void closure_put_after_sub(struct closure *cl, int flags) +static inline void closure_put_after_sub_checks(int flags) { int r = flags & CLOSURE_REMAINING_MASK; @@ -22,12 +22,17 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) flags & CLOSURE_GUARD_MASK, (unsigned) __fls(r))) r &= ~CLOSURE_GUARD_MASK; - if (!r) { - smp_acquire__after_ctrl_dep(); + WARN(!r && (flags & ~CLOSURE_DESTRUCTOR), + "closure ref hit 0 with incorrect flags set: %x (%u)", + flags & ~CLOSURE_DESTRUCTOR, (unsigned) __fls(flags)); +} + +static inline void closure_put_after_sub(struct closure *cl, int flags) +{ + closure_put_after_sub_checks(flags); - WARN(flags & ~CLOSURE_DESTRUCTOR, - "closure ref hit 0 with incorrect flags set: %x (%u)", - flags & ~CLOSURE_DESTRUCTOR, (unsigned) __fls(flags)); + if (!(flags & CLOSURE_REMAINING_MASK)) { + smp_acquire__after_ctrl_dep(); cl->closure_get_happened = false; @@ -145,6 +150,41 @@ void __sched __closure_sync(struct closure *cl) } EXPORT_SYMBOL(__closure_sync); +/* + * closure_return_sync - finish running a closure, synchronously (i.e. waiting + * for outstanding get()s to finish) and returning once closure refcount is 0. + * + * Unlike closure_sync() this doesn't reinit the ref to 1; subsequent + * closure_get_not_zero() calls waill fail. + */ +void __sched closure_return_sync(struct closure *cl) +{ + struct closure_syncer s = { .task = current }; + + cl->s = &s; + set_closure_fn(cl, closure_sync_fn, NULL); + + unsigned flags = atomic_sub_return_release(1 + CLOSURE_RUNNING - CLOSURE_DESTRUCTOR, + &cl->remaining); + + closure_put_after_sub_checks(flags); + + if (unlikely(flags & CLOSURE_REMAINING_MASK)) { + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (s.done) + break; + schedule(); + } + + __set_current_state(TASK_RUNNING); + } + + if (cl->parent) + closure_put(cl->parent); +} +EXPORT_SYMBOL(closure_return_sync); + int __sched __closure_sync_timeout(struct closure *cl, unsigned long timeout) { struct closure_syncer s = { .task = current }; From de611ab6fc5ed0d68dd46319b9913353e3b459e9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 22 Jun 2024 22:02:09 -0400 Subject: [PATCH 1321/1578] bcachefs: Fix race between trans_put() and btree_transactions_read() debug.c was using closure_get() on a different thread's closure where the we don't know if the object being refcounted is alive. We keep btree_trans objects on a list so they can be printed by debug code, and because it is cost prohibitive to touch the btree_trans list every time we allocate and free btree_trans objects, cached objects are also on this list. However, we do not want the debug code to see cached but not in use btree_trans objects - critically because the btree_paths array will have been freed (if it was reallocated). closure_get() is also incorrect to use when that get may race with it hitting zero, i.e. we must already have a ref on the object or know the ref can't currently hit 0 for other reasons (as used in the cycle detector). to fix this, use the previously introduced closure_get_not_zero(), closure_return_sync(), and closure_init_stack_release(); the debug code now can only take a ref on a trans object if it's alive and in use. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 10 ++++------ fs/bcachefs/debug.c | 19 +++++++++---------- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 3a1419d1788856..15c1c7cfefe604 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3130,7 +3130,6 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS); memset(trans, 0, sizeof(*trans)); - closure_init_stack(&trans->ref); seqmutex_lock(&c->btree_trans_lock); if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { @@ -3161,7 +3160,6 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) list_add_done: seqmutex_unlock(&c->btree_trans_lock); got_trans: - trans->ref.closure_get_happened = false; trans->c = c; trans->last_begin_time = local_clock(); trans->fn_idx = fn_idx; @@ -3200,6 +3198,8 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); trans->srcu_lock_time = jiffies; trans->srcu_held = true; + + closure_init_stack_release(&trans->ref); return trans; } @@ -3257,10 +3257,10 @@ void bch2_trans_put(struct btree_trans *trans) bch2_journal_keys_put(c); /* - * trans->ref protects trans->locking_wait.task, btree_paths arary; used + * trans->ref protects trans->locking_wait.task, btree_paths array; used * by cycle detector */ - closure_sync(&trans->ref); + closure_return_sync(&trans->ref); trans->locking_wait.task = NULL; unsigned long *paths_allocated = trans->paths_allocated; @@ -3385,8 +3385,6 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) per_cpu_ptr(c->btree_trans_bufs, cpu)->trans; if (trans) { - closure_sync(&trans->ref); - seqmutex_lock(&c->btree_trans_lock); list_del(&trans->list); seqmutex_unlock(&c->btree_trans_lock); diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index ecfdb21ebade9f..61c50522abb955 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -587,14 +587,10 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, if (!task || task->pid <= i->iter) continue; - closure_get(&trans->ref); - u32 seq = seqmutex_unlock(&c->btree_trans_lock); + if (!closure_get_not_zero(&trans->ref)) + continue; - ret = flush_buf(i); - if (ret) { - closure_put(&trans->ref); - goto unlocked; - } + u32 seq = seqmutex_unlock(&c->btree_trans_lock); bch2_btree_trans_to_text(&i->buf, trans); @@ -604,10 +600,12 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, printbuf_indent_sub(&i->buf, 2); prt_newline(&i->buf); - i->iter = task->pid; - closure_put(&trans->ref); + ret = flush_buf(i); + if (ret) + goto unlocked; + if (!seqmutex_relock(&c->btree_trans_lock, seq)) goto restart; } @@ -817,7 +815,8 @@ static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c) iter = task->pid; - closure_get(&trans->ref); + if (!closure_get_not_zero(&trans->ref)) + continue; u32 seq = seqmutex_unlock(&c->btree_trans_lock); From 1aaf5cb41b8e92dcd3ac7e047124cb0e3e27f1c1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 22 Jun 2024 22:11:01 -0400 Subject: [PATCH 1322/1578] bcachefs: Fix btree_trans list ordering The debug code relies on btree_trans_list being ordered so that it can resume on subsequent calls or lock restarts. However, it was using trans->locknig_wait.task.pid, which is incorrect since btree_trans objects are cached and reused - typically by different tasks. Fix this by switching to pointer order, and also sort them lazily when required - speeding up the btree_trans_get() fastpath. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 9 ++------- fs/bcachefs/debug.c | 36 ++++++++++++++++++++++++++++++++---- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 15c1c7cfefe604..0ed9e6574fcd0d 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3149,15 +3149,10 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) BUG_ON(pos_task && pid == pos_task->pid && pos->locked); - - if (pos_task && pid < pos_task->pid) { - list_add_tail(&trans->list, &pos->list); - goto list_add_done; - } } } - list_add_tail(&trans->list, &c->btree_trans_list); -list_add_done: + + list_add(&trans->list, &c->btree_trans_list); seqmutex_unlock(&c->btree_trans_lock); got_trans: trans->c = c; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 61c50522abb955..f0d4727c4dc290 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -568,6 +568,32 @@ static const struct file_operations cached_btree_nodes_ops = { .read = bch2_cached_btree_nodes_read, }; +typedef int (*list_cmp_fn)(const struct list_head *l, const struct list_head *r); + +static void list_sort(struct list_head *head, list_cmp_fn cmp) +{ + struct list_head *pos; + + list_for_each(pos, head) + while (!list_is_last(pos, head) && + cmp(pos, pos->next) > 0) { + struct list_head *pos2, *next = pos->next; + + list_del(next); + list_for_each(pos2, head) + if (cmp(next, pos2) < 0) + goto pos_found; + BUG(); +pos_found: + list_add_tail(next, pos2); + } +} + +static int list_ptr_order_cmp(const struct list_head *l, const struct list_head *r) +{ + return cmp_int(l, r); +} + static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { @@ -581,12 +607,14 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, i->ret = 0; restart: seqmutex_lock(&c->btree_trans_lock); - list_for_each_entry(trans, &c->btree_trans_list, list) { - struct task_struct *task = READ_ONCE(trans->locking_wait.task); + list_sort(&c->btree_trans_list, list_ptr_order_cmp); - if (!task || task->pid <= i->iter) + list_for_each_entry(trans, &c->btree_trans_list, list) { + if ((ulong) trans < i->iter) continue; + i->iter = (ulong) trans; + if (!closure_get_not_zero(&trans->ref)) continue; @@ -596,7 +624,7 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, prt_printf(&i->buf, "backtrace:\n"); printbuf_indent_add(&i->buf, 2); - bch2_prt_task_backtrace(&i->buf, task, 0, GFP_KERNEL); + bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task, 0, GFP_KERNEL); printbuf_indent_sub(&i->buf, 2); prt_newline(&i->buf); From 42354e3c3150cf886457f3354af0acefc56b53a2 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Fri, 21 Jun 2024 15:00:59 +0200 Subject: [PATCH 1323/1578] netlink: specs: Fix pse-set command attributes Not all PSE attributes are used for the pse-set netlink command. Select only the ones used by ethtool. Fixes: f8586411e40e ("netlink: specs: Expand the pse netlink command with PoE interface") Signed-off-by: Kory Maincent Reviewed-by: Donald Hunter Signed-off-by: David S. Miller --- Documentation/netlink/specs/ethtool.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 00dc61358be8c5..4510e8d1adcb8f 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -1603,7 +1603,7 @@ operations: attributes: - header reply: - attributes: &pse + attributes: - header - podl-pse-admin-state - podl-pse-admin-control @@ -1620,7 +1620,10 @@ operations: do: request: - attributes: *pse + attributes: + - header + - podl-pse-admin-control + - c33-pse-admin-control - name: rss-get doc: Get RSS params. From 54a4e5c16382e871c01dd82b47e930fdce30406b Mon Sep 17 00:00:00 2001 From: Enguerrand de Ribaucourt Date: Fri, 21 Jun 2024 16:43:20 +0200 Subject: [PATCH 1324/1578] net: phy: micrel: add Microchip KSZ 9477 to the device table PHY_ID_KSZ9477 was supported but not added to the device table passed to MODULE_DEVICE_TABLE. Fixes: fc3973a1fa09 ("phy: micrel: add Microchip KSZ 9477 Switch PHY support") Signed-off-by: Enguerrand de Ribaucourt Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/micrel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 5aada7cf3da726..ebafedde0ab745 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -5607,6 +5607,7 @@ static struct mdio_device_id __maybe_unused micrel_tbl[] = { { PHY_ID_KSZ8081, MICREL_PHY_ID_MASK }, { PHY_ID_KSZ8873MLL, MICREL_PHY_ID_MASK }, { PHY_ID_KSZ886X, MICREL_PHY_ID_MASK }, + { PHY_ID_KSZ9477, MICREL_PHY_ID_MASK }, { PHY_ID_LAN8814, MICREL_PHY_ID_MASK }, { PHY_ID_LAN8804, MICREL_PHY_ID_MASK }, { PHY_ID_LAN8841, MICREL_PHY_ID_MASK }, From d963c95bc9840d070a788c35e41b715a648717f7 Mon Sep 17 00:00:00 2001 From: Enguerrand de Ribaucourt Date: Fri, 21 Jun 2024 16:43:21 +0200 Subject: [PATCH 1325/1578] net: dsa: microchip: use collision based back pressure mode Errata DS80000758 states that carrier sense back pressure mode can cause link down issues in 100BASE-TX half duplex mode. The datasheet also recommends to always use the collision based back pressure mode. Fixes: b987e98e50ab ("dsa: add DSA switch driver for Microchip KSZ9477") Signed-off-by: Enguerrand de Ribaucourt Reviewed-by: Woojung Huh Acked-by: Arun Ramadoss Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/microchip/ksz9477.c | 4 ++++ drivers/net/dsa/microchip/ksz9477_reg.h | 1 + 2 files changed, 5 insertions(+) diff --git a/drivers/net/dsa/microchip/ksz9477.c b/drivers/net/dsa/microchip/ksz9477.c index 2231128eef8b8f..cbaca4140cda3b 100644 --- a/drivers/net/dsa/microchip/ksz9477.c +++ b/drivers/net/dsa/microchip/ksz9477.c @@ -1297,6 +1297,10 @@ int ksz9477_setup(struct dsa_switch *ds) /* Enable REG_SW_MTU__2 reg by setting SW_JUMBO_PACKET */ ksz_cfg(dev, REG_SW_MAC_CTRL_1, SW_JUMBO_PACKET, true); + /* Use collision based back pressure mode. */ + ksz_cfg(dev, REG_SW_MAC_CTRL_1, SW_BACK_PRESSURE, + SW_BACK_PRESSURE_COLLISION); + /* Now we can configure default MTU value */ ret = regmap_update_bits(ksz_regmap_16(dev), REG_SW_MTU__2, REG_SW_MTU_MASK, VLAN_ETH_FRAME_LEN + ETH_FCS_LEN); diff --git a/drivers/net/dsa/microchip/ksz9477_reg.h b/drivers/net/dsa/microchip/ksz9477_reg.h index f3a205ee483f21..fb124be8edd30d 100644 --- a/drivers/net/dsa/microchip/ksz9477_reg.h +++ b/drivers/net/dsa/microchip/ksz9477_reg.h @@ -247,6 +247,7 @@ #define REG_SW_MAC_CTRL_1 0x0331 #define SW_BACK_PRESSURE BIT(5) +#define SW_BACK_PRESSURE_COLLISION 0 #define FAIR_FLOW_CTRL BIT(4) #define NO_EXC_COLLISION_DROP BIT(3) #define SW_JUMBO_PACKET BIT(2) From bf1bff11e497a01b0cc6cb2afcff734340ae95f8 Mon Sep 17 00:00:00 2001 From: Enguerrand de Ribaucourt Date: Fri, 21 Jun 2024 16:43:22 +0200 Subject: [PATCH 1326/1578] net: dsa: microchip: monitor potential faults in half-duplex mode The errata DS80000754 recommends monitoring potential faults in half-duplex mode for the KSZ9477 family. half-duplex is not very common so I just added a critical message when the fault conditions are detected. The switch can be expected to be unable to communicate anymore in these states and a software reset of the switch would be required which I did not implement. Fixes: b987e98e50ab ("dsa: add DSA switch driver for Microchip KSZ9477") Signed-off-by: Enguerrand de Ribaucourt Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/microchip/ksz9477.c | 51 +++++++++++++++++++++++++ drivers/net/dsa/microchip/ksz9477.h | 2 + drivers/net/dsa/microchip/ksz9477_reg.h | 10 ++++- drivers/net/dsa/microchip/ksz_common.c | 11 ++++++ drivers/net/dsa/microchip/ksz_common.h | 1 + 5 files changed, 73 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/microchip/ksz9477.c b/drivers/net/dsa/microchip/ksz9477.c index cbaca4140cda3b..425e20daf1e92d 100644 --- a/drivers/net/dsa/microchip/ksz9477.c +++ b/drivers/net/dsa/microchip/ksz9477.c @@ -427,6 +427,57 @@ void ksz9477_freeze_mib(struct ksz_device *dev, int port, bool freeze) mutex_unlock(&p->mib.cnt_mutex); } +int ksz9477_errata_monitor(struct ksz_device *dev, int port, + u64 tx_late_col) +{ + u32 pmavbc; + u8 status; + u16 pqm; + int ret; + + ret = ksz_pread8(dev, port, REG_PORT_STATUS_0, &status); + if (ret) + return ret; + if (!(FIELD_GET(PORT_INTF_SPEED_MASK, status) == PORT_INTF_SPEED_NONE) && + !(status & PORT_INTF_FULL_DUPLEX)) { + /* Errata DS80000754 recommends monitoring potential faults in + * half-duplex mode. The switch might not be able to communicate anymore + * in these states. + * If you see this message, please read the errata-sheet for more information: + * https://ww1.microchip.com/downloads/aemDocuments/documents/UNG/ProductDocuments/Errata/KSZ9477S-Errata-DS80000754.pdf + * To workaround this issue, half-duplex mode should be avoided. + * A software reset could be implemented to recover from this state. + */ + dev_warn_once(dev->dev, + "Half-duplex detected on port %d, transmission halt may occur\n", + port); + if (tx_late_col != 0) { + /* Transmission halt with late collisions */ + dev_crit_once(dev->dev, + "TX late collisions detected, transmission may be halted on port %d\n", + port); + } + ret = ksz_read8(dev, REG_SW_LUE_CTRL_0, &status); + if (ret) + return ret; + if (status & SW_VLAN_ENABLE) { + ret = ksz_pread16(dev, port, REG_PORT_QM_TX_CNT_0__4, &pqm); + if (ret) + return ret; + ret = ksz_read32(dev, REG_PMAVBC, &pmavbc); + if (ret) + return ret; + if ((FIELD_GET(PMAVBC_MASK, pmavbc) <= PMAVBC_MIN) || + (FIELD_GET(PORT_QM_TX_CNT_M, pqm) >= PORT_QM_TX_CNT_MAX)) { + /* Transmission halt with Half-Duplex and VLAN */ + dev_crit_once(dev->dev, + "resources out of limits, transmission may be halted\n"); + } + } + } + return ret; +} + void ksz9477_port_init_cnt(struct ksz_device *dev, int port) { struct ksz_port_mib *mib = &dev->ports[port].mib; diff --git a/drivers/net/dsa/microchip/ksz9477.h b/drivers/net/dsa/microchip/ksz9477.h index ce1e656b800b15..239a281da10b3f 100644 --- a/drivers/net/dsa/microchip/ksz9477.h +++ b/drivers/net/dsa/microchip/ksz9477.h @@ -36,6 +36,8 @@ int ksz9477_port_mirror_add(struct ksz_device *dev, int port, bool ingress, struct netlink_ext_ack *extack); void ksz9477_port_mirror_del(struct ksz_device *dev, int port, struct dsa_mall_mirror_tc_entry *mirror); +int ksz9477_errata_monitor(struct ksz_device *dev, int port, + u64 tx_late_col); void ksz9477_get_caps(struct ksz_device *dev, int port, struct phylink_config *config); int ksz9477_fdb_dump(struct ksz_device *dev, int port, diff --git a/drivers/net/dsa/microchip/ksz9477_reg.h b/drivers/net/dsa/microchip/ksz9477_reg.h index fb124be8edd30d..d5354c600ea1e9 100644 --- a/drivers/net/dsa/microchip/ksz9477_reg.h +++ b/drivers/net/dsa/microchip/ksz9477_reg.h @@ -843,8 +843,8 @@ #define REG_PORT_STATUS_0 0x0030 -#define PORT_INTF_SPEED_M 0x3 -#define PORT_INTF_SPEED_S 3 +#define PORT_INTF_SPEED_MASK GENMASK(4, 3) +#define PORT_INTF_SPEED_NONE GENMASK(1, 0) #define PORT_INTF_FULL_DUPLEX BIT(2) #define PORT_TX_FLOW_CTRL BIT(1) #define PORT_RX_FLOW_CTRL BIT(0) @@ -1168,6 +1168,11 @@ #define PORT_RMII_CLK_SEL BIT(7) #define PORT_MII_SEL_EDGE BIT(5) +#define REG_PMAVBC 0x03AC + +#define PMAVBC_MASK GENMASK(26, 16) +#define PMAVBC_MIN 0x580 + /* 4 - MAC */ #define REG_PORT_MAC_CTRL_0 0x0400 @@ -1495,6 +1500,7 @@ #define PORT_QM_TX_CNT_USED_S 0 #define PORT_QM_TX_CNT_M (BIT(11) - 1) +#define PORT_QM_TX_CNT_MAX 0x200 #define REG_PORT_QM_TX_CNT_1__4 0x0A14 diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 2818e24e2a51dd..0433109b42e570 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -1382,6 +1382,7 @@ const struct ksz_chip_data ksz_switch_chips[] = { .tc_cbs_supported = true, .ops = &ksz9477_dev_ops, .phylink_mac_ops = &ksz9477_phylink_mac_ops, + .phy_errata_9477 = true, .mib_names = ksz9477_mib_names, .mib_cnt = ARRAY_SIZE(ksz9477_mib_names), .reg_mib_cnt = MIB_COUNTER_NUM, @@ -1416,6 +1417,7 @@ const struct ksz_chip_data ksz_switch_chips[] = { .num_ipms = 8, .ops = &ksz9477_dev_ops, .phylink_mac_ops = &ksz9477_phylink_mac_ops, + .phy_errata_9477 = true, .mib_names = ksz9477_mib_names, .mib_cnt = ARRAY_SIZE(ksz9477_mib_names), .reg_mib_cnt = MIB_COUNTER_NUM, @@ -1450,6 +1452,7 @@ const struct ksz_chip_data ksz_switch_chips[] = { .num_ipms = 8, .ops = &ksz9477_dev_ops, .phylink_mac_ops = &ksz9477_phylink_mac_ops, + .phy_errata_9477 = true, .mib_names = ksz9477_mib_names, .mib_cnt = ARRAY_SIZE(ksz9477_mib_names), .reg_mib_cnt = MIB_COUNTER_NUM, @@ -1540,6 +1543,7 @@ const struct ksz_chip_data ksz_switch_chips[] = { .tc_cbs_supported = true, .ops = &ksz9477_dev_ops, .phylink_mac_ops = &ksz9477_phylink_mac_ops, + .phy_errata_9477 = true, .mib_names = ksz9477_mib_names, .mib_cnt = ARRAY_SIZE(ksz9477_mib_names), .reg_mib_cnt = MIB_COUNTER_NUM, @@ -1820,6 +1824,7 @@ void ksz_r_mib_stats64(struct ksz_device *dev, int port) struct rtnl_link_stats64 *stats; struct ksz_stats_raw *raw; struct ksz_port_mib *mib; + int ret; mib = &dev->ports[port].mib; stats = &mib->stats64; @@ -1861,6 +1866,12 @@ void ksz_r_mib_stats64(struct ksz_device *dev, int port) pstats->rx_pause_frames = raw->rx_pause; spin_unlock(&mib->stats64_lock); + + if (dev->info->phy_errata_9477) { + ret = ksz9477_errata_monitor(dev, port, raw->tx_late_col); + if (ret) + dev_err(dev->dev, "Failed to monitor transmission halt\n"); + } } void ksz88xx_r_mib_stats64(struct ksz_device *dev, int port) diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h index c784fd23a99372..ee7db46e469d59 100644 --- a/drivers/net/dsa/microchip/ksz_common.h +++ b/drivers/net/dsa/microchip/ksz_common.h @@ -66,6 +66,7 @@ struct ksz_chip_data { bool tc_cbs_supported; const struct ksz_dev_ops *ops; const struct phylink_mac_ops *phylink_mac_ops; + bool phy_errata_9477; bool ksz87xx_eee_link_erratum; const struct ksz_mib_names *mib_names; int mib_cnt; From 8a67cbd47bf431a1f531ba73e952ce4c114a33a5 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Fri, 21 Jun 2024 13:00:00 -0400 Subject: [PATCH 1327/1578] dt-bindings: net: fman: remove ptp-timer from required list IEEE1588(ptp) is optional feature for network. Remove it from required list to fix below CHECK_DTBS warning. arch/arm64/boot/dts/freescale/fsl-ls1043a-qds.dtb: ethernet@f0000: 'ptp-timer' is a required property Signed-off-by: Frank Li Acked-by: Krzysztof Kozlowski Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/fsl,fman-dtsec.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/devicetree/bindings/net/fsl,fman-dtsec.yaml b/Documentation/devicetree/bindings/net/fsl,fman-dtsec.yaml index c80c880a9dab4e..60aaf30d68edf5 100644 --- a/Documentation/devicetree/bindings/net/fsl,fman-dtsec.yaml +++ b/Documentation/devicetree/bindings/net/fsl,fman-dtsec.yaml @@ -128,7 +128,6 @@ required: - cell-index - reg - fsl,fman-ports - - ptp-timer dependencies: pcs-handle-names: From f4b91c1d17c676b8ad4c6bd674da874f3f7d5701 Mon Sep 17 00:00:00 2001 From: Jan Sokolowski Date: Fri, 21 Jun 2024 10:54:19 -0700 Subject: [PATCH 1328/1578] ice: Rebuild TC queues on VSI queue reconfiguration TC queues needs to be correctly updated when the number of queues on a VSI is reconfigured, so netdev's queue and TC settings will be dynamically adjusted and could accurately represent the underlying hardware state after changes to the VSI queue counts. Fixes: 0754d65bd4be ("ice: Add infrastructure for mqprio support via ndo_setup_tc") Reviewed-by: Wojciech Drewek Signed-off-by: Jan Sokolowski Signed-off-by: Karen Ostrowska Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/ice/ice_main.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 1766230abfff65..55a42aad92a51a 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -4139,7 +4139,7 @@ bool ice_is_wol_supported(struct ice_hw *hw) int ice_vsi_recfg_qs(struct ice_vsi *vsi, int new_rx, int new_tx, bool locked) { struct ice_pf *pf = vsi->back; - int err = 0, timeout = 50; + int i, err = 0, timeout = 50; if (!new_rx && !new_tx) return -EINVAL; @@ -4165,6 +4165,14 @@ int ice_vsi_recfg_qs(struct ice_vsi *vsi, int new_rx, int new_tx, bool locked) ice_vsi_close(vsi); ice_vsi_rebuild(vsi, ICE_VSI_FLAG_NO_INIT); + + ice_for_each_traffic_class(i) { + if (vsi->tc_cfg.ena_tc & BIT(i)) + netdev_set_tc_queue(vsi->netdev, + vsi->tc_cfg.tc_info[i].netdev_tc, + vsi->tc_cfg.tc_info[i].qcount_tx, + vsi->tc_cfg.tc_info[i].qoffset); + } ice_pf_dcb_recfg(pf, locked); ice_vsi_open(vsi); done: From 36da8e387b0632d4c43d67849a5b506fa79fcadd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 23 Jun 2024 10:10:43 -0400 Subject: [PATCH 1329/1578] bcachefs: Add missing recalc_capacity() call This fixes filesystem size not changing on device removal. Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 9083df82073a56..641f2975177b61 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1529,6 +1529,7 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) * The allocator thread itself allocates btree nodes, so stop it first: */ bch2_dev_allocator_remove(c, ca); + bch2_recalc_capacity(c); bch2_dev_journal_stop(&c->journal, ca); } From 2d64eaeeeda5659d52da1af79d237269ba3c2d2c Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Sun, 23 Jun 2024 11:41:13 +0800 Subject: [PATCH 1330/1578] irqchip/loongson-eiointc: Use early_cpu_to_node() instead of cpu_to_node() Multi-bridge machines required that all eiointc controllers in the system are initialized, otherwise the system does not boot. The initialization happens on the boot CPU during early boot and relies on cpu_to_node() for identifying the individual nodes. That works when the number of possible CPUs is large enough, but with a command line limit, e.g. "nr_cpus=$N" for kdump, but fails when the CPUs of the secondary nodes are not covered. During early ACPI enumeration all CPU to node mappings are recorded up to CONFIG_NR_CPUS. These are accessible via early_cpu_to_node() even in the case that "nr_cpus=N" truncates the number of possible CPUs and only provides the possible CPUs via cpu_to_node() translation. Change the node lookup in the driver to use early_cpu_to_node() so that even with a limitation on the number of possible CPUs all eointc instances are initialized. This can't obviously cure the case where CONFIG_NR_CPUS is too small. [ tglx: Massaged changelog ] Fixes: 64cc451e45e1 ("irqchip/loongson-eiointc: Fix incorrect use of acpi_get_vec_parent") Signed-off-by: Huacai Chen Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240623034113.1808727-1-chenhuacai@loongson.cn --- drivers/irqchip/irq-loongson-eiointc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-loongson-eiointc.c b/drivers/irqchip/irq-loongson-eiointc.c index c7ddebf312ad2c..b1f2080be2beb2 100644 --- a/drivers/irqchip/irq-loongson-eiointc.c +++ b/drivers/irqchip/irq-loongson-eiointc.c @@ -15,6 +15,7 @@ #include #include #include +#include #define EIOINTC_REG_NODEMAP 0x14a0 #define EIOINTC_REG_IPMAP 0x14c0 @@ -339,7 +340,7 @@ static int __init pch_msi_parse_madt(union acpi_subtable_headers *header, int node; if (cpu_has_flatmode) - node = cpu_to_node(eiointc_priv[nr_pics - 1]->node * CORES_PER_EIO_NODE); + node = early_cpu_to_node(eiointc_priv[nr_pics - 1]->node * CORES_PER_EIO_NODE); else node = eiointc_priv[nr_pics - 1]->node; @@ -431,7 +432,7 @@ int __init eiointc_acpi_init(struct irq_domain *parent, goto out_free_handle; if (cpu_has_flatmode) - node = cpu_to_node(acpi_eiointc->node * CORES_PER_EIO_NODE); + node = early_cpu_to_node(acpi_eiointc->node * CORES_PER_EIO_NODE); else node = acpi_eiointc->node; acpi_set_vec_parent(node, priv->eiointc_domain, pch_group); From a9c3ee5d0fdb069b54902300df6ac822027f3b0a Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Sat, 22 Jun 2024 12:33:38 +0800 Subject: [PATCH 1331/1578] irqchip/loongson-liointc: Set different ISRs for different cores The liointc hardware provides separate Interrupt Status Registers (ISR) for each core. The current code uses always the ISR of core #0, which works during boot because by default all interrupts are routed to core #0. When the interrupt routing changes in the firmware configuration then this causes interrupts to be lost because they are not configured in the corresponding core. Use the core index to access the correct ISR instead of a hardcoded 0. [ tglx: Massaged changelog ] Fixes: 0858ed035a85 ("irqchip/loongson-liointc: Add ACPI init support") Co-developed-by: Tianli Xiong Signed-off-by: Tianli Xiong Signed-off-by: Huacai Chen Signed-off-by: Thomas Gleixner Cc: Link: https://lore.kernel.org/r/20240622043338.1566945-1-chenhuacai@loongson.cn --- drivers/irqchip/irq-loongson-liointc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-loongson-liointc.c b/drivers/irqchip/irq-loongson-liointc.c index e4b33aed1c97b3..7c4fe7ab4b830e 100644 --- a/drivers/irqchip/irq-loongson-liointc.c +++ b/drivers/irqchip/irq-loongson-liointc.c @@ -28,7 +28,7 @@ #define LIOINTC_INTC_CHIP_START 0x20 -#define LIOINTC_REG_INTC_STATUS (LIOINTC_INTC_CHIP_START + 0x20) +#define LIOINTC_REG_INTC_STATUS(core) (LIOINTC_INTC_CHIP_START + 0x20 + (core) * 8) #define LIOINTC_REG_INTC_EN_STATUS (LIOINTC_INTC_CHIP_START + 0x04) #define LIOINTC_REG_INTC_ENABLE (LIOINTC_INTC_CHIP_START + 0x08) #define LIOINTC_REG_INTC_DISABLE (LIOINTC_INTC_CHIP_START + 0x0c) @@ -217,7 +217,7 @@ static int liointc_init(phys_addr_t addr, unsigned long size, int revision, goto out_free_priv; for (i = 0; i < LIOINTC_NUM_CORES; i++) - priv->core_isr[i] = base + LIOINTC_REG_INTC_STATUS; + priv->core_isr[i] = base + LIOINTC_REG_INTC_STATUS(i); for (i = 0; i < LIOINTC_NUM_PARENT; i++) priv->handler[i].parent_int_map = parent_int_map[i]; From d6b52f6828e6d9bb1fe35f889e1a9d0dcff0e21d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 23 Jun 2024 12:07:07 -0400 Subject: [PATCH 1332/1578] bcachefs: Fix null ptr deref in journal_pins_to_text() Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index dac2f498ae8b61..13669dd0e37561 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1521,6 +1521,11 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 struct journal_entry_pin *pin; spin_lock(&j->lock); + if (!test_bit(JOURNAL_running, &j->flags)) { + spin_unlock(&j->lock); + return true; + } + *seq = max(*seq, j->pin.front); if (*seq >= j->pin.back) { From 89d21b69b4f88e7a04b66bec38a01470cd40d703 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 23 Jun 2024 12:55:16 -0400 Subject: [PATCH 1333/1578] bcachefs: Add missing bch2_journal_do_writes() call This fixes a rare deadlock when we're doing an emergency shutdown due to failure to do a journal write. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 492426c8d869ac..db24ce21b2acfa 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1677,6 +1677,13 @@ static CLOSURE_CALLBACK(journal_write_done) mod_delayed_work(j->wq, &j->write_work, max(0L, delta)); } + /* + * We don't typically trigger journal writes from her - the next journal + * write will be triggered immediately after the previous one is + * allocated, in bch2_journal_write() - but the journal write error path + * is special: + */ + bch2_journal_do_writes(j); spin_unlock(&j->lock); } From 02ea312055da84e08e3e5bce2539c1ff11c8b5f2 Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Sat, 22 Jun 2024 12:14:37 +0530 Subject: [PATCH 1334/1578] octeontx2-pf: Fix coverity and klockwork issues in octeon PF driver Fix unintended sign extension and klockwork issues. These are not real issue but for sanity checks. Signed-off-by: Ratheesh Kannoth Signed-off-by: Suman Ghosh Signed-off-by: David S. Miller --- .../marvell/octeontx2/nic/otx2_common.c | 10 ++-- .../ethernet/marvell/octeontx2/nic/otx2_reg.h | 55 ++++++++++--------- .../marvell/octeontx2/nic/otx2_txrx.c | 2 +- .../net/ethernet/marvell/octeontx2/nic/qos.c | 3 +- 4 files changed, 35 insertions(+), 35 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c index a85ac039d779be..87d5776e3b88e9 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c @@ -648,14 +648,14 @@ int otx2_txschq_config(struct otx2_nic *pfvf, int lvl, int prio, bool txschq_for } else if (lvl == NIX_TXSCH_LVL_TL4) { parent = schq_list[NIX_TXSCH_LVL_TL3][prio]; req->reg[0] = NIX_AF_TL4X_PARENT(schq); - req->regval[0] = parent << 16; + req->regval[0] = (u64)parent << 16; req->num_regs++; req->reg[1] = NIX_AF_TL4X_SCHEDULE(schq); req->regval[1] = dwrr_val; } else if (lvl == NIX_TXSCH_LVL_TL3) { parent = schq_list[NIX_TXSCH_LVL_TL2][prio]; req->reg[0] = NIX_AF_TL3X_PARENT(schq); - req->regval[0] = parent << 16; + req->regval[0] = (u64)parent << 16; req->num_regs++; req->reg[1] = NIX_AF_TL3X_SCHEDULE(schq); req->regval[1] = dwrr_val; @@ -670,11 +670,11 @@ int otx2_txschq_config(struct otx2_nic *pfvf, int lvl, int prio, bool txschq_for } else if (lvl == NIX_TXSCH_LVL_TL2) { parent = schq_list[NIX_TXSCH_LVL_TL1][prio]; req->reg[0] = NIX_AF_TL2X_PARENT(schq); - req->regval[0] = parent << 16; + req->regval[0] = (u64)parent << 16; req->num_regs++; req->reg[1] = NIX_AF_TL2X_SCHEDULE(schq); - req->regval[1] = TXSCH_TL1_DFLT_RR_PRIO << 24 | dwrr_val; + req->regval[1] = (u64)hw->txschq_aggr_lvl_rr_prio << 24 | dwrr_val; if (lvl == hw->txschq_link_cfg_lvl) { req->num_regs++; @@ -698,7 +698,7 @@ int otx2_txschq_config(struct otx2_nic *pfvf, int lvl, int prio, bool txschq_for req->num_regs++; req->reg[1] = NIX_AF_TL1X_TOPOLOGY(schq); - req->regval[1] = (TXSCH_TL1_DFLT_RR_PRIO << 1); + req->regval[1] = hw->txschq_aggr_lvl_rr_prio << 1; req->num_regs++; req->reg[2] = NIX_AF_TL1X_CIR(schq); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_reg.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_reg.h index 45a32e4b49d1cb..e3aee6e3621517 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_reg.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_reg.h @@ -139,33 +139,34 @@ #define NIX_LF_CINTX_ENA_W1C(a) (NIX_LFBASE | 0xD50 | (a) << 12) /* NIX AF transmit scheduler registers */ -#define NIX_AF_SMQX_CFG(a) (0x700 | (a) << 16) -#define NIX_AF_TL1X_SCHEDULE(a) (0xC00 | (a) << 16) -#define NIX_AF_TL1X_CIR(a) (0xC20 | (a) << 16) -#define NIX_AF_TL1X_TOPOLOGY(a) (0xC80 | (a) << 16) -#define NIX_AF_TL2X_PARENT(a) (0xE88 | (a) << 16) -#define NIX_AF_TL2X_SCHEDULE(a) (0xE00 | (a) << 16) -#define NIX_AF_TL2X_TOPOLOGY(a) (0xE80 | (a) << 16) -#define NIX_AF_TL2X_CIR(a) (0xE20 | (a) << 16) -#define NIX_AF_TL2X_PIR(a) (0xE30 | (a) << 16) -#define NIX_AF_TL3X_PARENT(a) (0x1088 | (a) << 16) -#define NIX_AF_TL3X_SCHEDULE(a) (0x1000 | (a) << 16) -#define NIX_AF_TL3X_SHAPE(a) (0x1010 | (a) << 16) -#define NIX_AF_TL3X_CIR(a) (0x1020 | (a) << 16) -#define NIX_AF_TL3X_PIR(a) (0x1030 | (a) << 16) -#define NIX_AF_TL3X_TOPOLOGY(a) (0x1080 | (a) << 16) -#define NIX_AF_TL4X_PARENT(a) (0x1288 | (a) << 16) -#define NIX_AF_TL4X_SCHEDULE(a) (0x1200 | (a) << 16) -#define NIX_AF_TL4X_SHAPE(a) (0x1210 | (a) << 16) -#define NIX_AF_TL4X_CIR(a) (0x1220 | (a) << 16) -#define NIX_AF_TL4X_PIR(a) (0x1230 | (a) << 16) -#define NIX_AF_TL4X_TOPOLOGY(a) (0x1280 | (a) << 16) -#define NIX_AF_MDQX_SCHEDULE(a) (0x1400 | (a) << 16) -#define NIX_AF_MDQX_SHAPE(a) (0x1410 | (a) << 16) -#define NIX_AF_MDQX_CIR(a) (0x1420 | (a) << 16) -#define NIX_AF_MDQX_PIR(a) (0x1430 | (a) << 16) -#define NIX_AF_MDQX_PARENT(a) (0x1480 | (a) << 16) -#define NIX_AF_TL3_TL2X_LINKX_CFG(a, b) (0x1700 | (a) << 16 | (b) << 3) +#define NIX_AF_SMQX_CFG(a) (0x700 | (u64)(a) << 16) +#define NIX_AF_TL4X_SDP_LINK_CFG(a) (0xB10 | (u64)(a) << 16) +#define NIX_AF_TL1X_SCHEDULE(a) (0xC00 | (u64)(a) << 16) +#define NIX_AF_TL1X_CIR(a) (0xC20 | (u64)(a) << 16) +#define NIX_AF_TL1X_TOPOLOGY(a) (0xC80 | (u64)(a) << 16) +#define NIX_AF_TL2X_PARENT(a) (0xE88 | (u64)(a) << 16) +#define NIX_AF_TL2X_SCHEDULE(a) (0xE00 | (u64)(a) << 16) +#define NIX_AF_TL2X_TOPOLOGY(a) (0xE80 | (u64)(a) << 16) +#define NIX_AF_TL2X_CIR(a) (0xE20 | (u64)(a) << 16) +#define NIX_AF_TL2X_PIR(a) (0xE30 | (u64)(a) << 16) +#define NIX_AF_TL3X_PARENT(a) (0x1088 | (u64)(a) << 16) +#define NIX_AF_TL3X_SCHEDULE(a) (0x1000 | (u64)(a) << 16) +#define NIX_AF_TL3X_SHAPE(a) (0x1010 | (u64)(a) << 16) +#define NIX_AF_TL3X_CIR(a) (0x1020 | (u64)(a) << 16) +#define NIX_AF_TL3X_PIR(a) (0x1030 | (u64)(a) << 16) +#define NIX_AF_TL3X_TOPOLOGY(a) (0x1080 | (u64)(a) << 16) +#define NIX_AF_TL4X_PARENT(a) (0x1288 | (u64)(a) << 16) +#define NIX_AF_TL4X_SCHEDULE(a) (0x1200 | (u64)(a) << 16) +#define NIX_AF_TL4X_SHAPE(a) (0x1210 | (u64)(a) << 16) +#define NIX_AF_TL4X_CIR(a) (0x1220 | (u64)(a) << 16) +#define NIX_AF_TL4X_PIR(a) (0x1230 | (u64)(a) << 16) +#define NIX_AF_TL4X_TOPOLOGY(a) (0x1280 | (u64)(a) << 16) +#define NIX_AF_MDQX_SCHEDULE(a) (0x1400 | (u64)(a) << 16) +#define NIX_AF_MDQX_SHAPE(a) (0x1410 | (u64)(a) << 16) +#define NIX_AF_MDQX_CIR(a) (0x1420 | (u64)(a) << 16) +#define NIX_AF_MDQX_PIR(a) (0x1430 | (u64)(a) << 16) +#define NIX_AF_MDQX_PARENT(a) (0x1480 | (u64)(a) << 16) +#define NIX_AF_TL3_TL2X_LINKX_CFG(a, b) (0x1700 | (u64)(a) << 16 | (b) << 3) /* LMT LF registers */ #define LMT_LFBASE BIT_ULL(RVU_FUNC_BLKADDR_SHIFT) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c index 929b4eac25d974..3eb85949677ad2 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c @@ -513,7 +513,7 @@ static int otx2_tx_napi_handler(struct otx2_nic *pfvf, static void otx2_adjust_adaptive_coalese(struct otx2_nic *pfvf, struct otx2_cq_poll *cq_poll) { - struct dim_sample dim_sample; + struct dim_sample dim_sample = { 0 }; u64 rx_frames, rx_bytes; u64 tx_frames, tx_bytes; diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/qos.c b/drivers/net/ethernet/marvell/octeontx2/nic/qos.c index edac008099c0c4..0f844c14485a0e 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/qos.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/qos.c @@ -153,7 +153,6 @@ static void __otx2_qos_txschq_cfg(struct otx2_nic *pfvf, num_regs++; otx2_config_sched_shaping(pfvf, node, cfg, &num_regs); - } else if (level == NIX_TXSCH_LVL_TL4) { otx2_config_sched_shaping(pfvf, node, cfg, &num_regs); } else if (level == NIX_TXSCH_LVL_TL3) { @@ -176,7 +175,7 @@ static void __otx2_qos_txschq_cfg(struct otx2_nic *pfvf, /* check if node is root */ if (node->qid == OTX2_QOS_QID_INNER && !node->parent) { cfg->reg[num_regs] = NIX_AF_TL2X_SCHEDULE(node->schq); - cfg->regval[num_regs] = TXSCH_TL1_DFLT_RR_PRIO << 24 | + cfg->regval[num_regs] = (u64)hw->txschq_aggr_lvl_rr_prio << 24 | mtu_to_dwrr_weight(pfvf, pfvf->tx_max_pktlen); num_regs++; From 6ef8eb5125722c241fd60d7b0c872d5c2e5dd4ca Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 18 Jun 2024 16:13:36 +0800 Subject: [PATCH 1335/1578] cpu: Fix broken cmdline "nosmp" and "maxcpus=0" After the rework of "Parallel CPU bringup", the cmdline "nosmp" and "maxcpus=0" parameters are not working anymore. These parameters set setup_max_cpus to zero and that's handed to bringup_nonboot_cpus(). The code there does a decrement before checking for zero, which brings it into the negative space and brings up all CPUs. Add a zero check at the beginning of the function to prevent this. [ tglx: Massaged change log ] Fixes: 18415f33e2ac4ab382 ("cpu/hotplug: Allow "parallel" bringup up to CPUHP_BP_KICK_AP_STATE") Fixes: 06c6796e0304234da6 ("cpu/hotplug: Fix off by one in cpuhp_bringup_mask()") Signed-off-by: Huacai Chen Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240618081336.3996825-1-chenhuacai@loongson.cn --- kernel/cpu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/cpu.c b/kernel/cpu.c index 74cfdb66a9bdd2..3d2bf1d50a0c4a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1859,6 +1859,9 @@ static inline bool cpuhp_bringup_cpus_parallel(unsigned int ncpus) { return fals void __init bringup_nonboot_cpus(unsigned int max_cpus) { + if (!max_cpus) + return; + /* Try parallel bringup optimization if enabled */ if (cpuhp_bringup_cpus_parallel(max_cpus)) return; From f2661062f16b2de5d7b6a5c42a9a5c96326b8454 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 23 Jun 2024 17:08:54 -0400 Subject: [PATCH 1336/1578] Linux 6.10-rc5 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 14427547dc1eb5..4d36f943b3b1f4 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 10 SUBLEVEL = 0 -EXTRAVERSION = -rc4 +EXTRAVERSION = -rc5 NAME = Baby Opossum Posse # *DOCUMENTATION* From 37ce99b77762256ec9fda58d58fd613230151456 Mon Sep 17 00:00:00 2001 From: Liu Ying Date: Mon, 24 Jun 2024 09:56:12 +0800 Subject: [PATCH 1337/1578] drm/panel: simple: Add missing display timing flags for KOE TX26D202VM0BWA KOE TX26D202VM0BWA panel spec indicates the DE signal is active high in timing chart, so add DISPLAY_FLAGS_DE_HIGH flag in display timing flags. This aligns display_timing with panel_desc. Fixes: 8a07052440c2 ("drm/panel: simple: Add support for KOE TX26D202VM0BWA panel") Signed-off-by: Liu Ying Reviewed-by: Neil Armstrong Link: https://lore.kernel.org/r/20240624015612.341983-1-victor.liu@nxp.com Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20240624015612.341983-1-victor.liu@nxp.com --- drivers/gpu/drm/panel/panel-simple.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/panel/panel-simple.c b/drivers/gpu/drm/panel/panel-simple.c index dcb6d0b6ced069..c8cdc8356c58c1 100644 --- a/drivers/gpu/drm/panel/panel-simple.c +++ b/drivers/gpu/drm/panel/panel-simple.c @@ -2752,6 +2752,7 @@ static const struct display_timing koe_tx26d202vm0bwa_timing = { .vfront_porch = { 3, 5, 10 }, .vback_porch = { 2, 5, 10 }, .vsync_len = { 5, 5, 5 }, + .flags = DISPLAY_FLAGS_DE_HIGH, }; static const struct panel_desc koe_tx26d202vm0bwa = { From 6434b33faaa063df500af355ee6c3942e0f8d982 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 14 Jun 2024 18:09:01 +0200 Subject: [PATCH 1338/1578] s390/sclp: Fix sclp_init() cleanup on failure If sclp_init() fails it only partially cleans up: if there are multiple failing calls to sclp_init() sclp_state_change_event will be added several times to sclp_reg_list, which results in the following warning: ------------[ cut here ]------------ list_add double add: new=000003ffe1598c10, prev=000003ffe1598bf0, next=000003ffe1598c10. WARNING: CPU: 0 PID: 1 at lib/list_debug.c:35 __list_add_valid_or_report+0xde/0xf8 CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.10.0-rc3 Krnl PSW : 0404c00180000000 000003ffe0d6076a (__list_add_valid_or_report+0xe2/0xf8) R:0 T:1 IO:0 EX:0 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3 ... Call Trace: [<000003ffe0d6076a>] __list_add_valid_or_report+0xe2/0xf8 ([<000003ffe0d60766>] __list_add_valid_or_report+0xde/0xf8) [<000003ffe0a8d37e>] sclp_init+0x40e/0x450 [<000003ffe00009f2>] do_one_initcall+0x42/0x1e0 [<000003ffe15b77a6>] do_initcalls+0x126/0x150 [<000003ffe15b7a0a>] kernel_init_freeable+0x1ba/0x1f8 [<000003ffe0d6650e>] kernel_init+0x2e/0x180 [<000003ffe000301c>] __ret_from_fork+0x3c/0x60 [<000003ffe0d759ca>] ret_from_fork+0xa/0x30 Fix this by removing sclp_state_change_event from sclp_reg_list when sclp_init() fails. Reviewed-by: Peter Oberparleiter Signed-off-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- drivers/s390/char/sclp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c index d53ee34d398f67..fbe29cabcbb83b 100644 --- a/drivers/s390/char/sclp.c +++ b/drivers/s390/char/sclp.c @@ -1293,6 +1293,7 @@ sclp_init(void) fail_unregister_reboot_notifier: unregister_reboot_notifier(&sclp_reboot_notifier); fail_init_state_uninitialized: + list_del(&sclp_state_change_event.list); sclp_init_state = sclp_init_state_uninitialized; free_page((unsigned long) sclp_read_sccb); free_page((unsigned long) sclp_init_sccb); From 058722ee350c0bdd664e467156feb2bf5d9cc271 Mon Sep 17 00:00:00 2001 From: Jose Ignacio Tornos Martinez Date: Thu, 20 Jun 2024 15:34:31 +0200 Subject: [PATCH 1339/1578] net: usb: ax88179_178a: improve link status logs Avoid spurious link status logs that may ultimately be wrong; for example, if the link is set to down with the cable plugged, then the cable is unplugged and after this the link is set to up, the last new log that is appearing is incorrectly telling that the link is up. In order to avoid errors, show link status logs after link_reset processing, and in order to avoid spurious as much as possible, only show the link loss when some link status change is detected. cc: stable@vger.kernel.org Fixes: e2ca90c276e1 ("ax88179_178a: ASIX AX88179_178A USB 3.0/2.0 to gigabit ethernet adapter driver") Signed-off-by: Jose Ignacio Tornos Martinez Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/usb/ax88179_178a.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c index c2fb736f78b26d..b034ef8a73ea7a 100644 --- a/drivers/net/usb/ax88179_178a.c +++ b/drivers/net/usb/ax88179_178a.c @@ -326,7 +326,8 @@ static void ax88179_status(struct usbnet *dev, struct urb *urb) if (netif_carrier_ok(dev->net) != link) { usbnet_link_change(dev, link, 1); - netdev_info(dev->net, "ax88179 - Link status is: %d\n", link); + if (!link) + netdev_info(dev->net, "ax88179 - Link status is: 0\n"); } } @@ -1542,6 +1543,7 @@ static int ax88179_link_reset(struct usbnet *dev) GMII_PHY_PHYSR, 2, &tmp16); if (!(tmp16 & GMII_PHY_PHYSR_LINK)) { + netdev_info(dev->net, "ax88179 - Link status is: 0\n"); return 0; } else if (GMII_PHY_PHYSR_GIGA == (tmp16 & GMII_PHY_PHYSR_SMASK)) { mode |= AX_MEDIUM_GIGAMODE | AX_MEDIUM_EN_125MHZ; @@ -1579,6 +1581,8 @@ static int ax88179_link_reset(struct usbnet *dev) netif_carrier_on(dev->net); + netdev_info(dev->net, "ax88179 - Link status is: 1\n"); + return 0; } From 996c3412a06578e9d779a16b9e79ace18125ab50 Mon Sep 17 00:00:00 2001 From: Janusz Krzysztofik Date: Mon, 3 Jun 2024 21:54:45 +0200 Subject: [PATCH 1340/1578] drm/i915/gt: Fix potential UAF by revoke of fence registers CI has been sporadically reporting the following issue triggered by igt@i915_selftest@live@hangcheck on ADL-P and similar machines: <6> [414.049203] i915: Running intel_hangcheck_live_selftests/igt_reset_evict_fence ... <6> [414.068804] i915 0000:00:02.0: [drm] GT0: GUC: submission enabled <6> [414.068812] i915 0000:00:02.0: [drm] GT0: GUC: SLPC enabled <3> [414.070354] Unable to pin Y-tiled fence; err:-4 <3> [414.071282] i915_vma_revoke_fence:301 GEM_BUG_ON(!i915_active_is_idle(&fence->active)) ... <4>[ 609.603992] ------------[ cut here ]------------ <2>[ 609.603995] kernel BUG at drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c:301! <4>[ 609.604003] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI <4>[ 609.604006] CPU: 0 PID: 268 Comm: kworker/u64:3 Tainted: G U W 6.9.0-CI_DRM_14785-g1ba62f8cea9c+ #1 <4>[ 609.604008] Hardware name: Intel Corporation Alder Lake Client Platform/AlderLake-P DDR4 RVP, BIOS RPLPFWI1.R00.4035.A00.2301200723 01/20/2023 <4>[ 609.604010] Workqueue: i915 __i915_gem_free_work [i915] <4>[ 609.604149] RIP: 0010:i915_vma_revoke_fence+0x187/0x1f0 [i915] ... <4>[ 609.604271] Call Trace: <4>[ 609.604273] ... <4>[ 609.604716] __i915_vma_evict+0x2e9/0x550 [i915] <4>[ 609.604852] __i915_vma_unbind+0x7c/0x160 [i915] <4>[ 609.604977] force_unbind+0x24/0xa0 [i915] <4>[ 609.605098] i915_vma_destroy+0x2f/0xa0 [i915] <4>[ 609.605210] __i915_gem_object_pages_fini+0x51/0x2f0 [i915] <4>[ 609.605330] __i915_gem_free_objects.isra.0+0x6a/0xc0 [i915] <4>[ 609.605440] process_scheduled_works+0x351/0x690 ... In the past, there were similar failures reported by CI from other IGT tests, observed on other platforms. Before commit 63baf4f3d587 ("drm/i915/gt: Only wait for GPU activity before unbinding a GGTT fence"), i915_vma_revoke_fence() was waiting for idleness of vma->active via fence_update(). That commit introduced vma->fence->active in order for the fence_update() to be able to wait selectively on that one instead of vma->active since only idleness of fence registers was needed. But then, another commit 0d86ee35097a ("drm/i915/gt: Make fence revocation unequivocal") replaced the call to fence_update() in i915_vma_revoke_fence() with only fence_write(), and also added that GEM_BUG_ON(!i915_active_is_idle(&fence->active)) in front. No justification was provided on why we might then expect idleness of vma->fence->active without first waiting on it. The issue can be potentially caused by a race among revocation of fence registers on one side and sequential execution of signal callbacks invoked on completion of a request that was using them on the other, still processed in parallel to revocation of those fence registers. Fix it by waiting for idleness of vma->fence->active in i915_vma_revoke_fence(). Fixes: 0d86ee35097a ("drm/i915/gt: Make fence revocation unequivocal") Closes: https://gitlab.freedesktop.org/drm/intel/issues/10021 Signed-off-by: Janusz Krzysztofik Cc: stable@vger.kernel.org # v5.8+ Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/20240603195446.297690-2-janusz.krzysztofik@linux.intel.com (cherry picked from commit 24bb052d3dd499c5956abad5f7d8e4fd07da7fb1) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c b/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c index 40371b8a9bbbd7..93bc1cc1ee7e64 100644 --- a/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c +++ b/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c @@ -298,6 +298,7 @@ void i915_vma_revoke_fence(struct i915_vma *vma) return; GEM_BUG_ON(fence->vma != vma); + i915_active_wait(&fence->active); GEM_BUG_ON(!i915_active_is_idle(&fence->active)); GEM_BUG_ON(atomic_read(&fence->pin_count)); From d56fbfbaf592a115b2e11c1044829afba34069d2 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Wed, 5 Jun 2024 11:27:45 +0800 Subject: [PATCH 1341/1578] platform/mellanox: nvsw-sn2201: Add check for platform_device_add_resources Add check for the return value of platform_device_add_resources() and return the error if it fails in order to catch the error. Signed-off-by: Chen Ni Link: https://lore.kernel.org/r/20240605032745.2916183-1-nichen@iscas.ac.cn Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/mellanox/nvsw-sn2201.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/platform/mellanox/nvsw-sn2201.c b/drivers/platform/mellanox/nvsw-sn2201.c index 3ef655591424c2..abe7be602f846e 100644 --- a/drivers/platform/mellanox/nvsw-sn2201.c +++ b/drivers/platform/mellanox/nvsw-sn2201.c @@ -1198,6 +1198,7 @@ static int nvsw_sn2201_config_pre_init(struct nvsw_sn2201 *nvsw_sn2201) static int nvsw_sn2201_probe(struct platform_device *pdev) { struct nvsw_sn2201 *nvsw_sn2201; + int ret; nvsw_sn2201 = devm_kzalloc(&pdev->dev, sizeof(*nvsw_sn2201), GFP_KERNEL); if (!nvsw_sn2201) @@ -1205,8 +1206,10 @@ static int nvsw_sn2201_probe(struct platform_device *pdev) nvsw_sn2201->dev = &pdev->dev; platform_set_drvdata(pdev, nvsw_sn2201); - platform_device_add_resources(pdev, nvsw_sn2201_lpc_io_resources, + ret = platform_device_add_resources(pdev, nvsw_sn2201_lpc_io_resources, ARRAY_SIZE(nvsw_sn2201_lpc_io_resources)); + if (ret) + return ret; nvsw_sn2201->main_mux_deferred_nr = NVSW_SN2201_MAIN_MUX_DEFER_NR; nvsw_sn2201->main_mux_devs = nvsw_sn2201_main_mux_brdinfo; From 7aa9b96e9a73e4ec1771492d0527bd5fc5ef9164 Mon Sep 17 00:00:00 2001 From: Aleksandr Mishin Date: Tue, 18 Jun 2024 17:43:44 +0300 Subject: [PATCH 1342/1578] gpio: davinci: Validate the obtained number of IRQs Value of pdata->gpio_unbanked is taken from Device Tree. In case of broken DT due to any error this value can be any. Without this value validation there can be out of chips->irqs array boundaries access in davinci_gpio_probe(). Validate the obtained nirq value so that it won't exceed the maximum number of IRQs per bank. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: eb3744a2dd01 ("gpio: davinci: Do not assume continuous IRQ numbering") Signed-off-by: Aleksandr Mishin Link: https://lore.kernel.org/r/20240618144344.16943-1-amishin@t-argos.ru Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-davinci.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpio/gpio-davinci.c b/drivers/gpio/gpio-davinci.c index bb499e36291258..1d0175d6350b78 100644 --- a/drivers/gpio/gpio-davinci.c +++ b/drivers/gpio/gpio-davinci.c @@ -225,6 +225,11 @@ static int davinci_gpio_probe(struct platform_device *pdev) else nirq = DIV_ROUND_UP(ngpio, 16); + if (nirq > MAX_INT_PER_BANK) { + dev_err(dev, "Too many IRQs!\n"); + return -EINVAL; + } + chips = devm_kzalloc(dev, sizeof(*chips), GFP_KERNEL); if (!chips) return -ENOMEM; From 151e78a0b89ee6dec93382dbdf5b1ef83f9c4716 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Fri, 7 Jun 2024 01:35:37 +0200 Subject: [PATCH 1343/1578] platform/x86: wireless-hotkey: Add support for LG Airplane Button MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The LGEX0815 ACPI device is used by the "LG Airplane Mode Button" Windows driver for handling rfkill requests. When the ACPI device receives an 0x80 ACPI notification, an rfkill event is to be send to userspace. Add support for the LGEX0815 ACPI device to the driver. Tested-by: Agathe Boutmy Signed-off-by: Armin Wolf Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20240606233540.9774-2-W_Armin@gmx.de Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/wireless-hotkey.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/platform/x86/wireless-hotkey.c b/drivers/platform/x86/wireless-hotkey.c index e95cdbbfb70899..ab46164cbe1367 100644 --- a/drivers/platform/x86/wireless-hotkey.c +++ b/drivers/platform/x86/wireless-hotkey.c @@ -19,6 +19,7 @@ MODULE_AUTHOR("Alex Hung"); MODULE_ALIAS("acpi*:HPQ6001:*"); MODULE_ALIAS("acpi*:WSTADEF:*"); MODULE_ALIAS("acpi*:AMDI0051:*"); +MODULE_ALIAS("acpi*:LGEX0815:*"); struct wl_button { struct input_dev *input_dev; @@ -29,6 +30,7 @@ static const struct acpi_device_id wl_ids[] = { {"HPQ6001", 0}, {"WSTADEF", 0}, {"AMDI0051", 0}, + {"LGEX0815", 0}, {"", 0}, }; From 413c204595ca98a4f33414a948c18d7314087342 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Fri, 7 Jun 2024 01:35:38 +0200 Subject: [PATCH 1344/1578] platform/x86: lg-laptop: Remove LGEX0815 hotkey handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The rfkill hotkey handling is already provided by the wireless-hotkey driver. Remove the now unnecessary rfkill hotkey handling to avoid duplicating functionality. The ACPI notify handler still prints debugging information when receiving ACPI notifications to aid in reverse-engineering. Tested-by: Agathe Boutmy Signed-off-by: Armin Wolf Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20240606233540.9774-3-W_Armin@gmx.de Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/lg-laptop.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/platform/x86/lg-laptop.c b/drivers/platform/x86/lg-laptop.c index d0fee5d375d77b..ea83630106e836 100644 --- a/drivers/platform/x86/lg-laptop.c +++ b/drivers/platform/x86/lg-laptop.c @@ -84,7 +84,6 @@ static const struct key_entry wmi_keymap[] = { * this key both sends an event and * changes backlight level. */ - {KE_KEY, 0x80, {KEY_RFKILL} }, {KE_END, 0} }; @@ -272,14 +271,7 @@ static void wmi_input_setup(void) static void acpi_notify(struct acpi_device *device, u32 event) { - struct key_entry *key; - acpi_handle_debug(device->handle, "notify: %d\n", event); - if (inited & INIT_SPARSE_KEYMAP) { - key = sparse_keymap_entry_from_scancode(wmi_input_dev, 0x80); - if (key && key->type == KE_KEY) - sparse_keymap_report_entry(wmi_input_dev, key, 1, true); - } } static ssize_t fan_mode_store(struct device *dev, From 58a54f27a0dac81f7fd3514be01012635219a53c Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Fri, 7 Jun 2024 01:35:39 +0200 Subject: [PATCH 1345/1578] platform/x86: lg-laptop: Change ACPI device id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The LGEX0815 ACPI device id is used for handling hotkey events, but this functionality is already handled by the wireless-hotkey driver. The LGEX0820 ACPI device id however is used to manage various platform features using the WMAB/WMBB ACPI methods. Use this ACPI device id to avoid blocking the wireless-hotkey driver from probing. Tested-by: Agathe Boutmy Signed-off-by: Armin Wolf Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20240606233540.9774-4-W_Armin@gmx.de Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/lg-laptop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/lg-laptop.c b/drivers/platform/x86/lg-laptop.c index ea83630106e836..db8a2f79bf0aed 100644 --- a/drivers/platform/x86/lg-laptop.c +++ b/drivers/platform/x86/lg-laptop.c @@ -768,7 +768,7 @@ static void acpi_remove(struct acpi_device *device) } static const struct acpi_device_id device_ids[] = { - {"LGEX0815", 0}, + {"LGEX0820", 0}, {"", 0} }; MODULE_DEVICE_TABLE(acpi, device_ids); From b27ea279556121b54d3f45d0529706cf100cdb3a Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Fri, 7 Jun 2024 01:35:40 +0200 Subject: [PATCH 1346/1578] platform/x86: lg-laptop: Use ACPI device handle when evaluating WMAB/WMBB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On the LG Gram 16Z90S, the WMAB and WMBB ACPI methods are not mapped under \XINI, but instead are mapped under \_SB.XINI. The reason for this is that the LGEX0820 ACPI device used by this driver is mapped at \_SB.XINI, so the ACPI methods where moved as well to appear below the LGEX0820 ACPI device. Fix this by using the ACPI handle from the ACPI device when evaluating both methods. Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218901 Tested-by: Agathe Boutmy Signed-off-by: Armin Wolf Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20240606233540.9774-5-W_Armin@gmx.de Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/lg-laptop.c | 79 +++++++++++++------------------- 1 file changed, 33 insertions(+), 46 deletions(-) diff --git a/drivers/platform/x86/lg-laptop.c b/drivers/platform/x86/lg-laptop.c index db8a2f79bf0aed..9c7857842caf4c 100644 --- a/drivers/platform/x86/lg-laptop.c +++ b/drivers/platform/x86/lg-laptop.c @@ -39,8 +39,6 @@ MODULE_LICENSE("GPL"); #define WMI_METHOD_WMBB "2B4F501A-BD3C-4394-8DCF-00A7D2BC8210" #define WMI_EVENT_GUID WMI_EVENT_GUID0 -#define WMAB_METHOD "\\XINI.WMAB" -#define WMBB_METHOD "\\XINI.WMBB" #define SB_GGOV_METHOD "\\_SB.GGOV" #define GOV_TLED 0x2020008 #define WM_GET 1 @@ -74,7 +72,7 @@ static u32 inited; static int battery_limit_use_wmbb; static struct led_classdev kbd_backlight; -static enum led_brightness get_kbd_backlight_level(void); +static enum led_brightness get_kbd_backlight_level(struct device *dev); static const struct key_entry wmi_keymap[] = { {KE_KEY, 0x70, {KEY_F15} }, /* LG control panel (F1) */ @@ -127,11 +125,10 @@ static int ggov(u32 arg0) return res; } -static union acpi_object *lg_wmab(u32 method, u32 arg1, u32 arg2) +static union acpi_object *lg_wmab(struct device *dev, u32 method, u32 arg1, u32 arg2) { union acpi_object args[3]; acpi_status status; - acpi_handle handle; struct acpi_object_list arg; struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; @@ -142,29 +139,22 @@ static union acpi_object *lg_wmab(u32 method, u32 arg1, u32 arg2) args[2].type = ACPI_TYPE_INTEGER; args[2].integer.value = arg2; - status = acpi_get_handle(NULL, (acpi_string) WMAB_METHOD, &handle); - if (ACPI_FAILURE(status)) { - pr_err("Cannot get handle"); - return NULL; - } - arg.count = 3; arg.pointer = args; - status = acpi_evaluate_object(handle, NULL, &arg, &buffer); + status = acpi_evaluate_object(ACPI_HANDLE(dev), "WMAB", &arg, &buffer); if (ACPI_FAILURE(status)) { - acpi_handle_err(handle, "WMAB: call failed.\n"); + dev_err(dev, "WMAB: call failed.\n"); return NULL; } return buffer.pointer; } -static union acpi_object *lg_wmbb(u32 method_id, u32 arg1, u32 arg2) +static union acpi_object *lg_wmbb(struct device *dev, u32 method_id, u32 arg1, u32 arg2) { union acpi_object args[3]; acpi_status status; - acpi_handle handle; struct acpi_object_list arg; struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; u8 buf[32]; @@ -180,18 +170,12 @@ static union acpi_object *lg_wmbb(u32 method_id, u32 arg1, u32 arg2) args[2].buffer.length = 32; args[2].buffer.pointer = buf; - status = acpi_get_handle(NULL, (acpi_string)WMBB_METHOD, &handle); - if (ACPI_FAILURE(status)) { - pr_err("Cannot get handle"); - return NULL; - } - arg.count = 3; arg.pointer = args; - status = acpi_evaluate_object(handle, NULL, &arg, &buffer); + status = acpi_evaluate_object(ACPI_HANDLE(dev), "WMBB", &arg, &buffer); if (ACPI_FAILURE(status)) { - acpi_handle_err(handle, "WMAB: call failed.\n"); + dev_err(dev, "WMBB: call failed.\n"); return NULL; } @@ -222,7 +206,7 @@ static void wmi_notify(u32 value, void *context) if (eventcode == 0x10000000) { led_classdev_notify_brightness_hw_changed( - &kbd_backlight, get_kbd_backlight_level()); + &kbd_backlight, get_kbd_backlight_level(kbd_backlight.dev->parent)); } else { key = sparse_keymap_entry_from_scancode( wmi_input_dev, eventcode); @@ -287,7 +271,7 @@ static ssize_t fan_mode_store(struct device *dev, if (ret) return ret; - r = lg_wmab(WM_FAN_MODE, WM_GET, 0); + r = lg_wmab(dev, WM_FAN_MODE, WM_GET, 0); if (!r) return -EIO; @@ -298,9 +282,9 @@ static ssize_t fan_mode_store(struct device *dev, m = r->integer.value; kfree(r); - r = lg_wmab(WM_FAN_MODE, WM_SET, (m & 0xffffff0f) | (value << 4)); + r = lg_wmab(dev, WM_FAN_MODE, WM_SET, (m & 0xffffff0f) | (value << 4)); kfree(r); - r = lg_wmab(WM_FAN_MODE, WM_SET, (m & 0xfffffff0) | value); + r = lg_wmab(dev, WM_FAN_MODE, WM_SET, (m & 0xfffffff0) | value); kfree(r); return count; @@ -312,7 +296,7 @@ static ssize_t fan_mode_show(struct device *dev, unsigned int status; union acpi_object *r; - r = lg_wmab(WM_FAN_MODE, WM_GET, 0); + r = lg_wmab(dev, WM_FAN_MODE, WM_GET, 0); if (!r) return -EIO; @@ -339,7 +323,7 @@ static ssize_t usb_charge_store(struct device *dev, if (ret) return ret; - r = lg_wmbb(WMBB_USB_CHARGE, WM_SET, value); + r = lg_wmbb(dev, WMBB_USB_CHARGE, WM_SET, value); if (!r) return -EIO; @@ -353,7 +337,7 @@ static ssize_t usb_charge_show(struct device *dev, unsigned int status; union acpi_object *r; - r = lg_wmbb(WMBB_USB_CHARGE, WM_GET, 0); + r = lg_wmbb(dev, WMBB_USB_CHARGE, WM_GET, 0); if (!r) return -EIO; @@ -381,7 +365,7 @@ static ssize_t reader_mode_store(struct device *dev, if (ret) return ret; - r = lg_wmab(WM_READER_MODE, WM_SET, value); + r = lg_wmab(dev, WM_READER_MODE, WM_SET, value); if (!r) return -EIO; @@ -395,7 +379,7 @@ static ssize_t reader_mode_show(struct device *dev, unsigned int status; union acpi_object *r; - r = lg_wmab(WM_READER_MODE, WM_GET, 0); + r = lg_wmab(dev, WM_READER_MODE, WM_GET, 0); if (!r) return -EIO; @@ -423,7 +407,7 @@ static ssize_t fn_lock_store(struct device *dev, if (ret) return ret; - r = lg_wmab(WM_FN_LOCK, WM_SET, value); + r = lg_wmab(dev, WM_FN_LOCK, WM_SET, value); if (!r) return -EIO; @@ -437,7 +421,7 @@ static ssize_t fn_lock_show(struct device *dev, unsigned int status; union acpi_object *r; - r = lg_wmab(WM_FN_LOCK, WM_GET, 0); + r = lg_wmab(dev, WM_FN_LOCK, WM_GET, 0); if (!r) return -EIO; @@ -467,9 +451,9 @@ static ssize_t charge_control_end_threshold_store(struct device *dev, union acpi_object *r; if (battery_limit_use_wmbb) - r = lg_wmbb(WMBB_BATT_LIMIT, WM_SET, value); + r = lg_wmbb(&pf_device->dev, WMBB_BATT_LIMIT, WM_SET, value); else - r = lg_wmab(WM_BATT_LIMIT, WM_SET, value); + r = lg_wmab(&pf_device->dev, WM_BATT_LIMIT, WM_SET, value); if (!r) return -EIO; @@ -488,7 +472,7 @@ static ssize_t charge_control_end_threshold_show(struct device *device, union acpi_object *r; if (battery_limit_use_wmbb) { - r = lg_wmbb(WMBB_BATT_LIMIT, WM_GET, 0); + r = lg_wmbb(&pf_device->dev, WMBB_BATT_LIMIT, WM_GET, 0); if (!r) return -EIO; @@ -499,7 +483,7 @@ static ssize_t charge_control_end_threshold_show(struct device *device, status = r->buffer.pointer[0x10]; } else { - r = lg_wmab(WM_BATT_LIMIT, WM_GET, 0); + r = lg_wmab(&pf_device->dev, WM_BATT_LIMIT, WM_GET, 0); if (!r) return -EIO; @@ -578,7 +562,7 @@ static void tpad_led_set(struct led_classdev *cdev, { union acpi_object *r; - r = lg_wmab(WM_TLED, WM_SET, brightness > LED_OFF); + r = lg_wmab(cdev->dev->parent, WM_TLED, WM_SET, brightness > LED_OFF); kfree(r); } @@ -600,16 +584,16 @@ static void kbd_backlight_set(struct led_classdev *cdev, val = 0; if (brightness >= LED_FULL) val = 0x24; - r = lg_wmab(WM_KEY_LIGHT, WM_SET, val); + r = lg_wmab(cdev->dev->parent, WM_KEY_LIGHT, WM_SET, val); kfree(r); } -static enum led_brightness get_kbd_backlight_level(void) +static enum led_brightness get_kbd_backlight_level(struct device *dev) { union acpi_object *r; int val; - r = lg_wmab(WM_KEY_LIGHT, WM_GET, 0); + r = lg_wmab(dev, WM_KEY_LIGHT, WM_GET, 0); if (!r) return LED_OFF; @@ -637,7 +621,7 @@ static enum led_brightness get_kbd_backlight_level(void) static enum led_brightness kbd_backlight_get(struct led_classdev *cdev) { - return get_kbd_backlight_level(); + return get_kbd_backlight_level(cdev->dev->parent); } static LED_DEVICE(kbd_backlight, 255, LED_BRIGHT_HW_CHANGED); @@ -664,6 +648,11 @@ static struct platform_driver pf_driver = { static int acpi_add(struct acpi_device *device) { + struct platform_device_info pdev_info = { + .fwnode = acpi_fwnode_handle(device), + .name = PLATFORM_NAME, + .id = PLATFORM_DEVID_NONE, + }; int ret; const char *product; int year = 2017; @@ -675,9 +664,7 @@ static int acpi_add(struct acpi_device *device) if (ret) return ret; - pf_device = platform_device_register_simple(PLATFORM_NAME, - PLATFORM_DEVID_NONE, - NULL, 0); + pf_device = platform_device_register_full(&pdev_info); if (IS_ERR(pf_device)) { ret = PTR_ERR(pf_device); pf_device = NULL; From 3fd8ca27073e963aba7e64cd087968d87a953ac1 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Tue, 11 Jun 2024 21:25:56 -0700 Subject: [PATCH 1347/1578] platform/x86/siemens: add missing MODULE_DESCRIPTION() macros With ARCH=x86, make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/platform/x86/siemens/simatic-ipc.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/platform/x86/siemens/simatic-ipc-batt.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/platform/x86/siemens/simatic-ipc-batt-apollolake.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/platform/x86/siemens/simatic-ipc-batt-elkhartlake.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/platform/x86/siemens/simatic-ipc-batt-f7188x.o Add the missing invocations of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Link: https://lore.kernel.org/r/20240611-md-drivers-platform-x86-siemens-v1-1-b399d7d6ae64@quicinc.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/siemens/simatic-ipc-batt-apollolake.c | 1 + drivers/platform/x86/siemens/simatic-ipc-batt-elkhartlake.c | 1 + drivers/platform/x86/siemens/simatic-ipc-batt-f7188x.c | 1 + drivers/platform/x86/siemens/simatic-ipc-batt.c | 1 + drivers/platform/x86/siemens/simatic-ipc.c | 1 + 5 files changed, 5 insertions(+) diff --git a/drivers/platform/x86/siemens/simatic-ipc-batt-apollolake.c b/drivers/platform/x86/siemens/simatic-ipc-batt-apollolake.c index 31a139d87d9a47..5edc294de6e4ad 100644 --- a/drivers/platform/x86/siemens/simatic-ipc-batt-apollolake.c +++ b/drivers/platform/x86/siemens/simatic-ipc-batt-apollolake.c @@ -45,6 +45,7 @@ static struct platform_driver simatic_ipc_batt_driver = { module_platform_driver(simatic_ipc_batt_driver); +MODULE_DESCRIPTION("CMOS Battery monitoring for Simatic IPCs based on Apollo Lake GPIO"); MODULE_LICENSE("GPL"); MODULE_ALIAS("platform:" KBUILD_MODNAME); MODULE_SOFTDEP("pre: simatic-ipc-batt platform:apollolake-pinctrl"); diff --git a/drivers/platform/x86/siemens/simatic-ipc-batt-elkhartlake.c b/drivers/platform/x86/siemens/simatic-ipc-batt-elkhartlake.c index a7676f224075fc..e6a56d14b505cc 100644 --- a/drivers/platform/x86/siemens/simatic-ipc-batt-elkhartlake.c +++ b/drivers/platform/x86/siemens/simatic-ipc-batt-elkhartlake.c @@ -45,6 +45,7 @@ static struct platform_driver simatic_ipc_batt_driver = { module_platform_driver(simatic_ipc_batt_driver); +MODULE_DESCRIPTION("CMOS Battery monitoring for Simatic IPCs based on Elkhart Lake GPIO"); MODULE_LICENSE("GPL"); MODULE_ALIAS("platform:" KBUILD_MODNAME); MODULE_SOFTDEP("pre: simatic-ipc-batt platform:elkhartlake-pinctrl"); diff --git a/drivers/platform/x86/siemens/simatic-ipc-batt-f7188x.c b/drivers/platform/x86/siemens/simatic-ipc-batt-f7188x.c index 5e77e05fdb5dc0..f8849d0e48a846 100644 --- a/drivers/platform/x86/siemens/simatic-ipc-batt-f7188x.c +++ b/drivers/platform/x86/siemens/simatic-ipc-batt-f7188x.c @@ -81,6 +81,7 @@ static struct platform_driver simatic_ipc_batt_driver = { module_platform_driver(simatic_ipc_batt_driver); +MODULE_DESCRIPTION("CMOS Battery monitoring for Simatic IPCs based on Nuvoton GPIO"); MODULE_LICENSE("GPL"); MODULE_ALIAS("platform:" KBUILD_MODNAME); MODULE_SOFTDEP("pre: simatic-ipc-batt gpio_f7188x platform:elkhartlake-pinctrl platform:alderlake-pinctrl"); diff --git a/drivers/platform/x86/siemens/simatic-ipc-batt.c b/drivers/platform/x86/siemens/simatic-ipc-batt.c index c6dd263b4ee364..d9aff10608cf28 100644 --- a/drivers/platform/x86/siemens/simatic-ipc-batt.c +++ b/drivers/platform/x86/siemens/simatic-ipc-batt.c @@ -247,6 +247,7 @@ static struct platform_driver simatic_ipc_batt_driver = { module_platform_driver(simatic_ipc_batt_driver); +MODULE_DESCRIPTION("CMOS core battery driver for Siemens Simatic IPCs"); MODULE_LICENSE("GPL"); MODULE_ALIAS("platform:" KBUILD_MODNAME); MODULE_AUTHOR("Henning Schild "); diff --git a/drivers/platform/x86/siemens/simatic-ipc.c b/drivers/platform/x86/siemens/simatic-ipc.c index 8ca6e277fa038e..7039874d8f1122 100644 --- a/drivers/platform/x86/siemens/simatic-ipc.c +++ b/drivers/platform/x86/siemens/simatic-ipc.c @@ -231,6 +231,7 @@ static void __exit simatic_ipc_exit_module(void) module_init(simatic_ipc_init_module); module_exit(simatic_ipc_exit_module); +MODULE_DESCRIPTION("Siemens SIMATIC IPC platform driver"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Gerd Haeussler "); MODULE_ALIAS("dmi:*:svnSIEMENSAG:*"); From 41ab81ce8490b9cad88e87c49315fb8c46208be7 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Tue, 11 Jun 2024 21:56:26 -0700 Subject: [PATCH 1348/1578] platform/x86/intel: add missing MODULE_DESCRIPTION() macros With ARCH=x86, make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/platform/x86/intel/pmc/intel_pmc_core_pltdrv.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/platform/x86/intel/intel-hid.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/platform/x86/intel/intel-vbtn.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/platform/x86/intel/intel-rst.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/platform/x86/intel/intel-smartconnect.o Add the missing invocations of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Link: https://lore.kernel.org/r/20240611-md-drivers-platform-x86-intel-v1-1-5ed967425b04@quicinc.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/hid.c | 1 + drivers/platform/x86/intel/pmc/pltdrv.c | 1 + drivers/platform/x86/intel/rst.c | 1 + drivers/platform/x86/intel/smartconnect.c | 1 + drivers/platform/x86/intel/vbtn.c | 1 + 5 files changed, 5 insertions(+) diff --git a/drivers/platform/x86/intel/hid.c b/drivers/platform/x86/intel/hid.c index c7a8276458640a..10cd65497cc162 100644 --- a/drivers/platform/x86/intel/hid.c +++ b/drivers/platform/x86/intel/hid.c @@ -38,6 +38,7 @@ MODULE_PARM_DESC(enable_sw_tablet_mode, /* When NOT in tablet mode, VGBS returns with the flag 0x40 */ #define TABLET_MODE_FLAG BIT(6) +MODULE_DESCRIPTION("Intel HID Event hotkey driver"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Alex Hung"); diff --git a/drivers/platform/x86/intel/pmc/pltdrv.c b/drivers/platform/x86/intel/pmc/pltdrv.c index ddfba38c210444..f2cb87dc2d37a2 100644 --- a/drivers/platform/x86/intel/pmc/pltdrv.c +++ b/drivers/platform/x86/intel/pmc/pltdrv.c @@ -86,4 +86,5 @@ static void __exit pmc_core_platform_exit(void) module_init(pmc_core_platform_init); module_exit(pmc_core_platform_exit); +MODULE_DESCRIPTION("Intel PMC Core platform driver"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/platform/x86/intel/rst.c b/drivers/platform/x86/intel/rst.c index 6bc9c4a603e076..f3a60e14d4c17c 100644 --- a/drivers/platform/x86/intel/rst.c +++ b/drivers/platform/x86/intel/rst.c @@ -7,6 +7,7 @@ #include #include +MODULE_DESCRIPTION("Intel Rapid Start Technology Driver"); MODULE_LICENSE("GPL"); static ssize_t irst_show_wakeup_events(struct device *dev, diff --git a/drivers/platform/x86/intel/smartconnect.c b/drivers/platform/x86/intel/smartconnect.c index cd25d05853249b..31019a1a6d5e88 100644 --- a/drivers/platform/x86/intel/smartconnect.c +++ b/drivers/platform/x86/intel/smartconnect.c @@ -6,6 +6,7 @@ #include #include +MODULE_DESCRIPTION("Intel Smart Connect disabling driver"); MODULE_LICENSE("GPL"); static int smartconnect_acpi_init(struct acpi_device *acpi) diff --git a/drivers/platform/x86/intel/vbtn.c b/drivers/platform/x86/intel/vbtn.c index 84c1353eb12bf9..9b7ce03ba085c2 100644 --- a/drivers/platform/x86/intel/vbtn.c +++ b/drivers/platform/x86/intel/vbtn.c @@ -24,6 +24,7 @@ #define VGBS_TABLET_MODE_FLAGS (VGBS_TABLET_MODE_FLAG | VGBS_TABLET_MODE_FLAG_ALT) +MODULE_DESCRIPTION("Intel Virtual Button driver"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("AceLan Kao"); From 7add1ee34692aabd146b86a8e88abad843ed6659 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Tue, 11 Jun 2024 22:20:59 -0700 Subject: [PATCH 1349/1578] platform/x86: add missing MODULE_DESCRIPTION() macros With ARCH=x86, make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/platform/x86/amilo-rfkill.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/platform/x86/uv_sysfs.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/platform/x86/ibm_rtl.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/platform/x86/xo1-rfkill.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/platform/x86/firmware_attributes_class.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/platform/x86/wireless-hotkey.o Add the missing invocations of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Link: https://lore.kernel.org/r/20240611-md-drivers-platform-x86-v1-1-d850e53619ee@quicinc.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/amilo-rfkill.c | 1 + drivers/platform/x86/firmware_attributes_class.c | 1 + drivers/platform/x86/ibm_rtl.c | 1 + drivers/platform/x86/uv_sysfs.c | 1 + drivers/platform/x86/wireless-hotkey.c | 1 + drivers/platform/x86/xo1-rfkill.c | 1 + 6 files changed, 6 insertions(+) diff --git a/drivers/platform/x86/amilo-rfkill.c b/drivers/platform/x86/amilo-rfkill.c index efcf909786a52c..2423dc91debb93 100644 --- a/drivers/platform/x86/amilo-rfkill.c +++ b/drivers/platform/x86/amilo-rfkill.c @@ -171,6 +171,7 @@ static void __exit amilo_rfkill_exit(void) } MODULE_AUTHOR("Ben Hutchings "); +MODULE_DESCRIPTION("Fujitsu-Siemens Amilo rfkill support"); MODULE_LICENSE("GPL"); MODULE_DEVICE_TABLE(dmi, amilo_rfkill_id_table); diff --git a/drivers/platform/x86/firmware_attributes_class.c b/drivers/platform/x86/firmware_attributes_class.c index dd8240009565dd..182a07d8ae3dfa 100644 --- a/drivers/platform/x86/firmware_attributes_class.c +++ b/drivers/platform/x86/firmware_attributes_class.c @@ -49,4 +49,5 @@ int fw_attributes_class_put(void) EXPORT_SYMBOL_GPL(fw_attributes_class_put); MODULE_AUTHOR("Mark Pearson "); +MODULE_DESCRIPTION("Firmware attributes class helper module"); MODULE_LICENSE("GPL"); diff --git a/drivers/platform/x86/ibm_rtl.c b/drivers/platform/x86/ibm_rtl.c index 1d4bbae115f1fb..231b3790980170 100644 --- a/drivers/platform/x86/ibm_rtl.c +++ b/drivers/platform/x86/ibm_rtl.c @@ -29,6 +29,7 @@ static bool debug; module_param(debug, bool, 0644); MODULE_PARM_DESC(debug, "Show debug output"); +MODULE_DESCRIPTION("IBM Premium Real Time Mode (PRTM) driver"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Keith Mannthey "); MODULE_AUTHOR("Vernon Mauery "); diff --git a/drivers/platform/x86/uv_sysfs.c b/drivers/platform/x86/uv_sysfs.c index 37372d7cc54a9f..f6a0627f36db5d 100644 --- a/drivers/platform/x86/uv_sysfs.c +++ b/drivers/platform/x86/uv_sysfs.c @@ -929,4 +929,5 @@ module_init(uv_sysfs_init); module_exit(uv_sysfs_exit); MODULE_AUTHOR("Hewlett Packard Enterprise"); +MODULE_DESCRIPTION("Sysfs structure for HPE UV systems"); MODULE_LICENSE("GPL"); diff --git a/drivers/platform/x86/wireless-hotkey.c b/drivers/platform/x86/wireless-hotkey.c index ab46164cbe1367..a220fe4f9ef8b4 100644 --- a/drivers/platform/x86/wireless-hotkey.c +++ b/drivers/platform/x86/wireless-hotkey.c @@ -14,6 +14,7 @@ #include #include +MODULE_DESCRIPTION("Airplane mode button for AMD, HP & Xiaomi laptops"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Alex Hung"); MODULE_ALIAS("acpi*:HPQ6001:*"); diff --git a/drivers/platform/x86/xo1-rfkill.c b/drivers/platform/x86/xo1-rfkill.c index e64d5646b4c799..5fe68296501c44 100644 --- a/drivers/platform/x86/xo1-rfkill.c +++ b/drivers/platform/x86/xo1-rfkill.c @@ -74,5 +74,6 @@ static struct platform_driver xo1_rfkill_driver = { module_platform_driver(xo1_rfkill_driver); MODULE_AUTHOR("Daniel Drake "); +MODULE_DESCRIPTION("OLPC XO-1 software RF kill switch"); MODULE_LICENSE("GPL"); MODULE_ALIAS("platform:xo1-rfkill"); From ecc54006f158ae0245a13e59026da2f0239c1b86 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Fri, 21 Jun 2024 17:28:09 +0800 Subject: [PATCH 1350/1578] arm64: Clear the initial ID map correctly before remapping In the attempt to clear and recreate the initial ID map for LPA2, we wrongly use 'start - end' as the map size and make the memset() almost a nop. Fix it by passing the correct map size. Fixes: 9684ec186f8f ("arm64: Enable LPA2 at boot if supported by the system") Signed-off-by: Zenghui Yu Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240621092809.162-1-yuzenghui@huawei.com Signed-off-by: Will Deacon --- arch/arm64/kernel/pi/map_kernel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c index 5fa08e13e17e52..f374a3e5a5fe10 100644 --- a/arch/arm64/kernel/pi/map_kernel.c +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -173,7 +173,7 @@ static void __init remap_idmap_for_lpa2(void) * Don't bother with the FDT, we no longer need it after this. */ memset(init_idmap_pg_dir, 0, - (u64)init_idmap_pg_dir - (u64)init_idmap_pg_end); + (u64)init_idmap_pg_end - (u64)init_idmap_pg_dir); create_init_idmap(init_idmap_pg_dir, mask); dsb(ishst); From 316930d06b92a2419d8e767193266e678545b31d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 21 Jun 2024 16:08:28 +0200 Subject: [PATCH 1351/1578] selftests/bpf: Add more ring buffer test coverage Add test coverage for reservations beyond the ring buffer size in order to validate that bpf_ringbuf_reserve() rejects the request with NULL, all other ring buffer tests keep passing as well: # ./vmtest.sh -- ./test_progs -t ringbuf [...] ./test_progs -t ringbuf [ 1.165434] bpf_testmod: loading out-of-tree module taints kernel. [ 1.165825] bpf_testmod: module verification failed: signature and/or required key missing - tainting kernel [ 1.284001] tsc: Refined TSC clocksource calibration: 3407.982 MHz [ 1.286871] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x311fc34e357, max_idle_ns: 440795379773 ns [ 1.289555] clocksource: Switched to clocksource tsc #274/1 ringbuf/ringbuf:OK #274/2 ringbuf/ringbuf_n:OK #274/3 ringbuf/ringbuf_map_key:OK #274/4 ringbuf/ringbuf_write:OK #274 ringbuf:OK #275 ringbuf_multi:OK [...] Signed-off-by: Daniel Borkmann Signed-off-by: Andrii Nakryiko [ Test fixups for getting BPF CI back to work ] Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240621140828.18238-2-daniel@iogearbox.net --- tools/testing/selftests/bpf/Makefile | 2 +- .../selftests/bpf/prog_tests/ringbuf.c | 56 +++++++++++++++++++ .../selftests/bpf/progs/test_ringbuf_write.c | 46 +++++++++++++++ 3 files changed, 103 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/progs/test_ringbuf_write.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index e0b3887b3d2df1..dd49c1d23a604c 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -457,7 +457,7 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ LSKELS := fentry_test.c fexit_test.c fexit_sleep.c atomics.c \ trace_printk.c trace_vprintk.c map_ptr_kern.c \ core_kern.c core_kern_overflow.c test_ringbuf.c \ - test_ringbuf_n.c test_ringbuf_map_key.c + test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c # Generate both light skeleton and libbpf skeleton for these LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test.c \ diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c index 4c6f42dae4096c..da430df45aa497 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -12,9 +12,11 @@ #include #include #include + #include "test_ringbuf.lskel.h" #include "test_ringbuf_n.lskel.h" #include "test_ringbuf_map_key.lskel.h" +#include "test_ringbuf_write.lskel.h" #define EDONE 7777 @@ -84,6 +86,58 @@ static void *poll_thread(void *input) return (void *)(long)ring_buffer__poll(ringbuf, timeout); } +static void ringbuf_write_subtest(void) +{ + struct test_ringbuf_write_lskel *skel; + int page_size = getpagesize(); + size_t *mmap_ptr; + int err, rb_fd; + + skel = test_ringbuf_write_lskel__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + skel->maps.ringbuf.max_entries = 0x4000; + + err = test_ringbuf_write_lskel__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + rb_fd = skel->maps.ringbuf.map_fd; + + mmap_ptr = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, rb_fd, 0); + if (!ASSERT_OK_PTR(mmap_ptr, "rw_cons_pos")) + goto cleanup; + *mmap_ptr = 0x3000; + ASSERT_OK(munmap(mmap_ptr, page_size), "unmap_rw"); + + skel->bss->pid = getpid(); + + ringbuf = ring_buffer__new(rb_fd, process_sample, NULL, NULL); + if (!ASSERT_OK_PTR(ringbuf, "ringbuf_new")) + goto cleanup; + + err = test_ringbuf_write_lskel__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup_ringbuf; + + skel->bss->discarded = 0; + skel->bss->passed = 0; + + /* trigger exactly two samples */ + syscall(__NR_getpgid); + syscall(__NR_getpgid); + + ASSERT_EQ(skel->bss->discarded, 2, "discarded"); + ASSERT_EQ(skel->bss->passed, 0, "passed"); + + test_ringbuf_write_lskel__detach(skel); +cleanup_ringbuf: + ring_buffer__free(ringbuf); +cleanup: + test_ringbuf_write_lskel__destroy(skel); +} + static void ringbuf_subtest(void) { const size_t rec_sz = BPF_RINGBUF_HDR_SZ + sizeof(struct sample); @@ -451,4 +505,6 @@ void test_ringbuf(void) ringbuf_n_subtest(); if (test__start_subtest("ringbuf_map_key")) ringbuf_map_key_subtest(); + if (test__start_subtest("ringbuf_write")) + ringbuf_write_subtest(); } diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf_write.c b/tools/testing/selftests/bpf/progs/test_ringbuf_write.c new file mode 100644 index 00000000000000..350513c0e4c985 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ringbuf_write.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); +} ringbuf SEC(".maps"); + +/* inputs */ +int pid = 0; + +/* outputs */ +long passed = 0; +long discarded = 0; + +SEC("fentry/" SYS_PREFIX "sys_getpgid") +int test_ringbuf_write(void *ctx) +{ + int *foo, cur_pid = bpf_get_current_pid_tgid() >> 32; + void *sample1, *sample2; + + if (cur_pid != pid) + return 0; + + sample1 = bpf_ringbuf_reserve(&ringbuf, 0x3000, 0); + if (!sample1) + return 0; + /* first one can pass */ + sample2 = bpf_ringbuf_reserve(&ringbuf, 0x3000, 0); + if (!sample2) { + bpf_ringbuf_discard(sample1, 0); + __sync_fetch_and_add(&discarded, 1); + return 0; + } + /* second one must not */ + __sync_fetch_and_add(&passed, 1); + foo = sample2 + 4084; + *foo = 256; + bpf_ringbuf_discard(sample1, 0); + bpf_ringbuf_discard(sample2, 0); + return 0; +} From 2b2efe1937ca9f8815884bd4dcd5b32733025103 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 19 Jun 2024 16:53:54 -0700 Subject: [PATCH 1352/1578] bpf: Fix may_goto with negative offset. Zac's syzbot crafted a bpf prog that exposed two bugs in may_goto. The 1st bug is the way may_goto is patched. When offset is negative it should be patched differently. The 2nd bug is in the verifier: when current state may_goto_depth is equal to visited state may_goto_depth it means there is an actual infinite loop. It's not correct to prune exploration of the program at this point. Note, that this check doesn't limit the program to only one may_goto insn, since 2nd and any further may_goto will increment may_goto_depth only in the queued state pushed for future exploration. The current state will have may_goto_depth == 0 regardless of number of may_goto insns and the verifier has to explore the program until bpf_exit. Fixes: 011832b97b31 ("bpf: Introduce may_goto instruction") Reported-by: Zac Ecob Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Eduard Zingerman Closes: https://lore.kernel.org/bpf/CAADnVQL-15aNp04-cyHRn47Yv61NXfYyhopyZtUyxNojUZUXpA@mail.gmail.com/ Link: https://lore.kernel.org/bpf/20240619235355.85031-1-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5586a571bf551e..214a9fa8c6fb74 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17460,11 +17460,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) goto skip_inf_loop_check; } if (is_may_goto_insn_at(env, insn_idx)) { - if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) { + if (sl->state.may_goto_depth != cur->may_goto_depth && + states_equal(env, &sl->state, cur, RANGE_WITHIN)) { update_loop_entry(cur, &sl->state); goto hit; } - goto skip_inf_loop_check; } if (calls_callback(env, insn_idx)) { if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) @@ -20049,7 +20049,10 @@ static int do_misc_fixups(struct bpf_verifier_env *env) stack_depth_extra = 8; insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off); - insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2); + if (insn->off >= 0) + insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2); + else + insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1); insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1); insn_buf[3] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off); cnt = 4; From 280e4ebffd16ea1b55dc09761448545e216f60a9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 19 Jun 2024 16:53:55 -0700 Subject: [PATCH 1353/1578] selftests/bpf: Add tests for may_goto with negative offset. Add few tests with may_goto and negative offset. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240619235355.85031-2-alexei.starovoitov@gmail.com --- .../bpf/progs/verifier_iterating_callbacks.c | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c index 8885e5239d6b73..80c737b6d34006 100644 --- a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c +++ b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c @@ -274,6 +274,58 @@ static __naked void iter_limit_bug_cb(void) ); } +int tmp_var; +SEC("socket") +__failure __msg("infinite loop detected at insn 2") +__naked void jgt_imm64_and_may_goto(void) +{ + asm volatile (" \ + r0 = %[tmp_var] ll; \ +l0_%=: .byte 0xe5; /* may_goto */ \ + .byte 0; /* regs */ \ + .short -3; /* off -3 */ \ + .long 0; /* imm */ \ + if r0 > 10 goto l0_%=; \ + r0 = 0; \ + exit; \ +" :: __imm_addr(tmp_var) + : __clobber_all); +} + +SEC("socket") +__failure __msg("infinite loop detected at insn 1") +__naked void may_goto_self(void) +{ + asm volatile (" \ + r0 = *(u32 *)(r10 - 4); \ +l0_%=: .byte 0xe5; /* may_goto */ \ + .byte 0; /* regs */ \ + .short -1; /* off -1 */ \ + .long 0; /* imm */ \ + if r0 > 10 goto l0_%=; \ + r0 = 0; \ + exit; \ +" ::: __clobber_all); +} + +SEC("socket") +__success __retval(0) +__naked void may_goto_neg_off(void) +{ + asm volatile (" \ + r0 = *(u32 *)(r10 - 4); \ + goto l0_%=; \ + goto l1_%=; \ +l0_%=: .byte 0xe5; /* may_goto */ \ + .byte 0; /* regs */ \ + .short -2; /* off -2 */ \ + .long 0; /* imm */ \ + if r0 > 10 goto l0_%=; \ +l1_%=: r0 = 0; \ + exit; \ +" ::: __clobber_all); +} + SEC("tc") __failure __flag(BPF_F_TEST_STATE_FREQ) From 7e9f79428372c6eab92271390851be34ab26bfb4 Mon Sep 17 00:00:00 2001 From: Daniil Dulov Date: Mon, 24 Jun 2024 11:07:47 +0300 Subject: [PATCH 1354/1578] xdp: Remove WARN() from __xdp_reg_mem_model() syzkaller reports a warning in __xdp_reg_mem_model(). The warning occurs only if __mem_id_init_hash_table() returns an error. It returns the error in two cases: 1. memory allocation fails; 2. rhashtable_init() fails when some fields of rhashtable_params struct are not initialized properly. The second case cannot happen since there is a static const rhashtable_params struct with valid fields. So, warning is only triggered when there is a problem with memory allocation. Thus, there is no sense in using WARN() to handle this error and it can be safely removed. WARNING: CPU: 0 PID: 5065 at net/core/xdp.c:299 __xdp_reg_mem_model+0x2d9/0x650 net/core/xdp.c:299 CPU: 0 PID: 5065 Comm: syz-executor883 Not tainted 6.8.0-syzkaller-05271-gf99c5f563c17 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024 RIP: 0010:__xdp_reg_mem_model+0x2d9/0x650 net/core/xdp.c:299 Call Trace: xdp_reg_mem_model+0x22/0x40 net/core/xdp.c:344 xdp_test_run_setup net/bpf/test_run.c:188 [inline] bpf_test_run_xdp_live+0x365/0x1e90 net/bpf/test_run.c:377 bpf_prog_test_run_xdp+0x813/0x11b0 net/bpf/test_run.c:1267 bpf_prog_test_run+0x33a/0x3b0 kernel/bpf/syscall.c:4240 __sys_bpf+0x48d/0x810 kernel/bpf/syscall.c:5649 __do_sys_bpf kernel/bpf/syscall.c:5738 [inline] __se_sys_bpf kernel/bpf/syscall.c:5736 [inline] __x64_sys_bpf+0x7c/0x90 kernel/bpf/syscall.c:5736 do_syscall_64+0xfb/0x240 entry_SYSCALL_64_after_hwframe+0x6d/0x75 Found by Linux Verification Center (linuxtesting.org) with syzkaller. Fixes: 8d5d88527587 ("xdp: rhashtable with allocator ID to pointer mapping") Signed-off-by: Daniil Dulov Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/all/20240617162708.492159-1-d.dulov@aladdin.ru Link: https://lore.kernel.org/bpf/20240624080747.36858-1-d.dulov@aladdin.ru --- net/core/xdp.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/core/xdp.c b/net/core/xdp.c index 41693154e426f7..022c12059cf2f7 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -295,10 +295,8 @@ static struct xdp_mem_allocator *__xdp_reg_mem_model(struct xdp_mem_info *mem, mutex_lock(&mem_id_lock); ret = __mem_id_init_hash_table(); mutex_unlock(&mem_id_lock); - if (ret < 0) { - WARN_ON(1); + if (ret < 0) return ERR_PTR(ret); - } } xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp); From 282a4482e198e03781c152c88aac8aa382ef9a55 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Mon, 24 Jun 2024 14:12:56 +0800 Subject: [PATCH 1355/1578] ASoC: mediatek: mt8195: Add platform entry for ETDM1_OUT_BE dai link Commit e70b8dd26711 ("ASoC: mediatek: mt8195: Remove afe-dai component and rework codec link") removed the codec entry for the ETDM1_OUT_BE dai link entirely instead of replacing it with COMP_EMPTY(). This worked by accident as the remaining COMP_EMPTY() platform entry became the codec entry, and the platform entry became completely empty, effectively the same as COMP_DUMMY() since snd_soc_fill_dummy_dai() doesn't do anything for platform entries. This causes a KASAN out-of-bounds warning in mtk_soundcard_common_probe() in sound/soc/mediatek/common/mtk-soundcard-driver.c: for_each_card_prelinks(card, i, dai_link) { if (adsp_node && !strncmp(dai_link->name, "AFE_SOF", strlen("AFE_SOF"))) dai_link->platforms->of_node = adsp_node; else if (!dai_link->platforms->name && !dai_link->platforms->of_node) dai_link->platforms->of_node = platform_node; } where the code expects the platforms array to have space for at least one entry. Add an COMP_EMPTY() entry so that dai_link->platforms has space. Fixes: e70b8dd26711 ("ASoC: mediatek: mt8195: Remove afe-dai component and rework codec link") Signed-off-by: Chen-Yu Tsai Reviewed-by: AngeloGioacchino Del Regno Link: https://patch.msgid.link/20240624061257.3115467-1-wenst@chromium.org Signed-off-by: Mark Brown --- sound/soc/mediatek/mt8195/mt8195-mt6359.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/mediatek/mt8195/mt8195-mt6359.c b/sound/soc/mediatek/mt8195/mt8195-mt6359.c index ca875119052032..2832ef78eaed72 100644 --- a/sound/soc/mediatek/mt8195/mt8195-mt6359.c +++ b/sound/soc/mediatek/mt8195/mt8195-mt6359.c @@ -827,6 +827,7 @@ SND_SOC_DAILINK_DEFS(ETDM2_IN_BE, SND_SOC_DAILINK_DEFS(ETDM1_OUT_BE, DAILINK_COMP_ARRAY(COMP_CPU("ETDM1_OUT")), + DAILINK_COMP_ARRAY(COMP_EMPTY()), DAILINK_COMP_ARRAY(COMP_EMPTY())); SND_SOC_DAILINK_DEFS(ETDM2_OUT_BE, From d941b5877a4eaef367ca48325c22d2e1bd8e8619 Mon Sep 17 00:00:00 2001 From: Kieran Bingham Date: Thu, 20 Jun 2024 23:10:42 +0100 Subject: [PATCH 1356/1578] staging: vc04_services: vchiq_arm: Fix initialisation check The vchiq_state used to be obtained through an accessor which would validate that the VCHIQ had been initialised correctly with the remote, or return a null state. In commit 42a2f6664e18 ("staging: vc04_services: Move global g_state to vchiq_state") the global state was moved to the vchiq_mgnt structures stored as a vchiq instance specific context. This conversion removed the helpers and instead replaced users of this helper with the assumption that the state is always available and the remote connected. The conversion does ensure that the state is always available, so some remaining state null pointer checks that remain are unnecessary, but the assumption that the remote is present and initialised is incorrect. Fix this broken assumption by re-introducing the logic that was lost during the conversion. Fixes: 42a2f6664e18 ("staging: vc04_services: Move global g_state to vchiq_state") Signed-off-by: Kieran Bingham Reviewed-by: Stefan Wahren Reviewed-by: Umang Jain Link: https://lore.kernel.org/r/20240620221046.2731704-1-kieran.bingham@ideasonboard.com Signed-off-by: Greg Kroah-Hartman --- .../staging/vc04_services/interface/vchiq_arm/vchiq_arm.c | 4 ++-- .../staging/vc04_services/interface/vchiq_arm/vchiq_core.h | 5 +++++ .../staging/vc04_services/interface/vchiq_arm/vchiq_dev.c | 7 ++++++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c index 69daeba974f2dc..5f518e5a92739e 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c @@ -707,7 +707,7 @@ int vchiq_initialise(struct vchiq_state *state, struct vchiq_instance **instance * block forever. */ for (i = 0; i < VCHIQ_INIT_RETRIES; i++) { - if (state) + if (vchiq_remote_initialised(state)) break; usleep_range(500, 600); } @@ -1202,7 +1202,7 @@ void vchiq_dump_platform_instances(struct vchiq_state *state, struct seq_file *f { int i; - if (!state) + if (!vchiq_remote_initialised(state)) return; /* diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.h b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.h index 8af209e34fb2c1..382ec08f6a1400 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.h +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.h @@ -413,6 +413,11 @@ struct vchiq_state { struct opaque_platform_state *platform_state; }; +static inline bool vchiq_remote_initialised(const struct vchiq_state *state) +{ + return state->remote && state->remote->initialised; +} + struct bulk_waiter { struct vchiq_bulk *bulk; struct completion event; diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_dev.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_dev.c index 3c63347d2d0874..430f2ed2ccd3b0 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_dev.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_dev.c @@ -1170,6 +1170,11 @@ static int vchiq_open(struct inode *inode, struct file *file) dev_dbg(state->dev, "arm: vchiq open\n"); + if (!vchiq_remote_initialised(state)) { + dev_dbg(state->dev, "arm: vchiq has no connection to VideoCore\n"); + return -ENOTCONN; + } + instance = kzalloc(sizeof(*instance), GFP_KERNEL); if (!instance) return -ENOMEM; @@ -1200,7 +1205,7 @@ static int vchiq_release(struct inode *inode, struct file *file) dev_dbg(state->dev, "arm: instance=%p\n", instance); - if (!state) { + if (!vchiq_remote_initialised(state)) { ret = -EPERM; goto out; } From aef5daa2c49d510436b733827d4f0bab79fcc4a0 Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Fri, 21 Jun 2024 10:41:13 +0800 Subject: [PATCH 1357/1578] netfilter: fix undefined reference to 'netfilter_lwtunnel_*' when CONFIG_SYSCTL=n if CONFIG_SYSFS is not enabled in config, we get the below compile error, All errors (new ones prefixed by >>): csky-linux-ld: net/netfilter/core.o: in function `netfilter_init': core.c:(.init.text+0x42): undefined reference to `netfilter_lwtunnel_init' >> csky-linux-ld: core.c:(.init.text+0x56): undefined reference to `netfilter_lwtunnel_fini' >> csky-linux-ld: core.c:(.init.text+0x70): undefined reference to `netfilter_lwtunnel_init' csky-linux-ld: core.c:(.init.text+0x78): undefined reference to `netfilter_lwtunnel_fini' Fixes: a2225e0250c5 ("netfilter: move the sysctl nf_hooks_lwtunnel into the netfilter core") Reported-by: Mirsad Todorovac Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202406210511.8vbByYj3-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202406210520.6HmrUaA2-lkp@intel.com/ Signed-off-by: Jianguo Wu Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_hooks_lwtunnel.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nf_hooks_lwtunnel.c b/net/netfilter/nf_hooks_lwtunnel.c index 7cdb59bb4459f3..d8ebebc9775d78 100644 --- a/net/netfilter/nf_hooks_lwtunnel.c +++ b/net/netfilter/nf_hooks_lwtunnel.c @@ -117,4 +117,7 @@ void netfilter_lwtunnel_fini(void) { unregister_pernet_subsys(&nf_lwtunnel_net_ops); } +#else +int __init netfilter_lwtunnel_init(void) { return 0; } +void netfilter_lwtunnel_fini(void) {} #endif /* CONFIG_SYSCTL */ From 0ac18dac43103ab1df6d26ec9a781c0126f83ced Mon Sep 17 00:00:00 2001 From: Crescent Hsieh Date: Mon, 17 Jun 2024 14:30:58 +0800 Subject: [PATCH 1358/1578] tty: serial: 8250: Fix port count mismatch with the device Normally, the number of ports is indicated by the third digit of the device ID on Moxa PCI serial boards. For example, `0x1121` indicates a device with 2 ports. However, `CP116E_A_A` and `CP116E_A_B` are exceptions; they have 8 ports, but the third digit of the device ID is `6`. This patch introduces a function to retrieve the number of ports on Moxa PCI serial boards, addressing the issue described above. Fixes: 37058fd5d239 ("tty: serial: 8250: Add support for MOXA Mini PCIe boards") Cc: stable Signed-off-by: Crescent Hsieh Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240617063058.18866-1-crescentcy.hsieh@moxa.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_pci.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c index 40af74b55933cf..e1d7aa2fa3474d 100644 --- a/drivers/tty/serial/8250/8250_pci.c +++ b/drivers/tty/serial/8250/8250_pci.c @@ -1985,6 +1985,17 @@ enum { MOXA_SUPP_RS485 = BIT(2), }; +static unsigned short moxa_get_nports(unsigned short device) +{ + switch (device) { + case PCI_DEVICE_ID_MOXA_CP116E_A_A: + case PCI_DEVICE_ID_MOXA_CP116E_A_B: + return 8; + } + + return FIELD_GET(0x00F0, device); +} + static bool pci_moxa_is_mini_pcie(unsigned short device) { if (device == PCI_DEVICE_ID_MOXA_CP102N || @@ -2038,7 +2049,7 @@ static int pci_moxa_init(struct pci_dev *dev) { unsigned short device = dev->device; resource_size_t iobar_addr = pci_resource_start(dev, 2); - unsigned int num_ports = (device & 0x00F0) >> 4, i; + unsigned int i, num_ports = moxa_get_nports(device); u8 val, init_mode = MOXA_RS232; if (!(pci_moxa_supported_rs(dev) & MOXA_SUPP_RS232)) { From 9d141c1e615795eeb93cd35501ad144ee997a826 Mon Sep 17 00:00:00 2001 From: Udit Kumar Date: Wed, 19 Jun 2024 16:29:03 +0530 Subject: [PATCH 1359/1578] serial: 8250_omap: Implementation of Errata i2310 As per Errata i2310[0], Erroneous timeout can be triggered, if this Erroneous interrupt is not cleared then it may leads to storm of interrupts, therefore apply Errata i2310 solution. [0] https://www.ti.com/lit/pdf/sprz536 page 23 Fixes: b67e830d38fa ("serial: 8250: 8250_omap: Fix possible interrupt storm on K3 SoCs") Cc: stable@vger.kernel.org Signed-off-by: Udit Kumar Link: https://lore.kernel.org/r/20240619105903.165434-1-u-kumar1@ti.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_omap.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c index 170639d12b2a0f..ddac0a13cf8400 100644 --- a/drivers/tty/serial/8250/8250_omap.c +++ b/drivers/tty/serial/8250/8250_omap.c @@ -115,6 +115,10 @@ /* RX FIFO occupancy indicator */ #define UART_OMAP_RX_LVL 0x19 +/* Timeout low and High */ +#define UART_OMAP_TO_L 0x26 +#define UART_OMAP_TO_H 0x27 + /* * Copy of the genpd flags for the console. * Only used if console suspend is disabled @@ -663,13 +667,24 @@ static irqreturn_t omap8250_irq(int irq, void *dev_id) /* * On K3 SoCs, it is observed that RX TIMEOUT is signalled after - * FIFO has been drained, in which case a dummy read of RX FIFO - * is required to clear RX TIMEOUT condition. + * FIFO has been drained or erroneously. + * So apply solution of Errata i2310 as mentioned in + * https://www.ti.com/lit/pdf/sprz536 */ if (priv->habit & UART_RX_TIMEOUT_QUIRK && - (iir & UART_IIR_RX_TIMEOUT) == UART_IIR_RX_TIMEOUT && - serial_port_in(port, UART_OMAP_RX_LVL) == 0) { - serial_port_in(port, UART_RX); + (iir & UART_IIR_RX_TIMEOUT) == UART_IIR_RX_TIMEOUT) { + unsigned char efr2, timeout_h, timeout_l; + + efr2 = serial_in(up, UART_OMAP_EFR2); + timeout_h = serial_in(up, UART_OMAP_TO_H); + timeout_l = serial_in(up, UART_OMAP_TO_L); + serial_out(up, UART_OMAP_TO_H, 0xFF); + serial_out(up, UART_OMAP_TO_L, 0xFF); + serial_out(up, UART_OMAP_EFR2, UART_OMAP_EFR2_TIMEOUT_BEHAVE); + serial_in(up, UART_IIR); + serial_out(up, UART_OMAP_EFR2, efr2); + serial_out(up, UART_OMAP_TO_H, timeout_h); + serial_out(up, UART_OMAP_TO_L, timeout_l); } /* Stop processing interrupts on input overrun */ From 7c92a8bd53f24d50c8cf4aba53bb75505b382fed Mon Sep 17 00:00:00 2001 From: Jean-Michel Hautbois Date: Thu, 20 Jun 2024 18:29:59 +0200 Subject: [PATCH 1360/1578] tty: mcf: MCF54418 has 10 UARTS Most of the colfires have up to 5 UARTs but MCF54418 has up-to 10 ! Change the maximum value authorized. Signed-off-by: Jean-Michel Hautbois Cc: stable Fixes: 2545cf6e94b4 ("m68knommu: allow 4 coldfire serial ports") Link: https://lore.kernel.org/r/20240620-upstream-uart-v1-1-a9d0d95fb19e@yoseli.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/mcf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/mcf.c b/drivers/tty/serial/mcf.c index b0604d6da02579..58858dd352c59b 100644 --- a/drivers/tty/serial/mcf.c +++ b/drivers/tty/serial/mcf.c @@ -462,7 +462,7 @@ static const struct uart_ops mcf_uart_ops = { .verify_port = mcf_verify_port, }; -static struct mcf_uart mcf_ports[4]; +static struct mcf_uart mcf_ports[10]; #define MCF_MAXPORTS ARRAY_SIZE(mcf_ports) From a81dbd0463eca317eee44985a66aa6cc2ce5c101 Mon Sep 17 00:00:00 2001 From: Stefan Eichenberger Date: Fri, 21 Jun 2024 17:37:49 +0200 Subject: [PATCH 1361/1578] serial: imx: set receiver level before starting uart Set the receiver level to something > 0 before calling imx_uart_start_rx in rs485_config. This is necessary to avoid an interrupt storm that might prevent the system from booting. This was seen on an i.MX7 device when the rs485-rts-active-low property was active in the device tree. Fixes: 6d215f83e5fc ("serial: imx: warn user when using unsupported configuration") Cc: stable Signed-off-by: Stefan Eichenberger Link: https://lore.kernel.org/r/20240621153829.183780-1-eichest@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/imx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c index 2eb22594960f35..f4f40c9373c2f7 100644 --- a/drivers/tty/serial/imx.c +++ b/drivers/tty/serial/imx.c @@ -1952,8 +1952,10 @@ static int imx_uart_rs485_config(struct uart_port *port, struct ktermios *termio /* Make sure Rx is enabled in case Tx is active with Rx disabled */ if (!(rs485conf->flags & SER_RS485_ENABLED) || - rs485conf->flags & SER_RS485_RX_DURING_TX) + rs485conf->flags & SER_RS485_RX_DURING_TX) { + imx_uart_setup_ufcr(sport, TXTL_DEFAULT, RXTL_DEFAULT); imx_uart_start_rx(port); + } return 0; } From c5603e2a621dac10c5e21cc430848ebcfa6c7e01 Mon Sep 17 00:00:00 2001 From: Doug Brown Date: Thu, 6 Jun 2024 12:56:31 -0700 Subject: [PATCH 1362/1578] Revert "serial: core: only stop transmit when HW fifo is empty" This reverts commit 7bfb915a597a301abb892f620fe5c283a9fdbd77. This commit broke pxa and omap-serial, because it inhibited them from calling stop_tx() if their TX FIFOs weren't completely empty. This resulted in these two drivers hanging during transmits because the TX interrupt would stay enabled, and a new TX interrupt would never fire. Cc: stable@vger.kernel.org Fixes: 7bfb915a597a ("serial: core: only stop transmit when HW fifo is empty") Signed-off-by: Doug Brown Link: https://lore.kernel.org/r/20240606195632.173255-2-doug@schmorgal.com Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 8cb65f50e830c8..3fb9a29e025fd8 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -811,8 +811,7 @@ enum UART_TX_FLAGS { if (pending < WAKEUP_CHARS) { \ uart_write_wakeup(__port); \ \ - if (!((flags) & UART_TX_NOSTOP) && pending == 0 && \ - __port->ops->tx_empty(__port)) \ + if (!((flags) & UART_TX_NOSTOP) && pending == 0) \ __port->ops->stop_tx(__port); \ } \ \ From 9bb43b9e8d9a288a214e9b17acc9e46fda3977cf Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Thu, 6 Jun 2024 12:56:32 -0700 Subject: [PATCH 1363/1578] serial: core: introduce uart_port_tx_limited_flags() Analogue to uart_port_tx_flags() introduced in commit 3ee07964d407 ("serial: core: introduce uart_port_tx_flags()"), add a _flags variant for uart_port_tx_limited(). Fixes: d11cc8c3c4b6 ("tty: serial: use uart_port_tx_limited()") Cc: stable@vger.kernel.org Signed-off-by: Jonas Gorski Signed-off-by: Doug Brown Link: https://lore.kernel.org/r/20240606195632.173255-3-doug@schmorgal.com Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 3fb9a29e025fd8..aea25eef9a1a7b 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -850,6 +850,24 @@ enum UART_TX_FLAGS { __count--); \ }) +/** + * uart_port_tx_limited_flags -- transmit helper for uart_port with count limiting with flags + * @port: uart port + * @ch: variable to store a character to be written to the HW + * @flags: %UART_TX_NOSTOP or similar + * @count: a limit of characters to send + * @tx_ready: can HW accept more data function + * @put_char: function to write a character + * @tx_done: function to call after the loop is done + * + * See uart_port_tx_limited() for more details. + */ +#define uart_port_tx_limited_flags(port, ch, flags, count, tx_ready, put_char, tx_done) ({ \ + unsigned int __count = (count); \ + __uart_port_tx(port, ch, flags, tx_ready, put_char, tx_done, __count, \ + __count--); \ +}) + /** * uart_port_tx -- transmit helper for uart_port * @port: uart port From ea55c65dedf40e9c1911dc1e63e26bc9a59692b9 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Thu, 6 Jun 2024 12:56:33 -0700 Subject: [PATCH 1364/1578] serial: bcm63xx-uart: fix tx after conversion to uart_port_tx_limited() When bcm63xx-uart was converted to uart_port_tx_limited(), it implicitly added a call to stop_tx(). This causes garbage to be put out on the serial console. To fix this, pass UART_TX_NOSTOP in flags, and manually call stop_tx() ourselves analogue to how a similar issue was fixed in commit 7be50f2e8f20 ("serial: mxs-auart: fix tx"). Fixes: d11cc8c3c4b6 ("tty: serial: use uart_port_tx_limited()") Cc: stable@vger.kernel.org Signed-off-by: Jonas Gorski Signed-off-by: Doug Brown Link: https://lore.kernel.org/r/20240606195632.173255-4-doug@schmorgal.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/bcm63xx_uart.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/tty/serial/bcm63xx_uart.c b/drivers/tty/serial/bcm63xx_uart.c index 34801a6f300b68..b88cc28c94e337 100644 --- a/drivers/tty/serial/bcm63xx_uart.c +++ b/drivers/tty/serial/bcm63xx_uart.c @@ -308,8 +308,8 @@ static void bcm_uart_do_tx(struct uart_port *port) val = bcm_uart_readl(port, UART_MCTL_REG); val = (val & UART_MCTL_TXFIFOFILL_MASK) >> UART_MCTL_TXFIFOFILL_SHIFT; - - pending = uart_port_tx_limited(port, ch, port->fifosize - val, + pending = uart_port_tx_limited_flags(port, ch, UART_TX_NOSTOP, + port->fifosize - val, true, bcm_uart_writel(port, ch, UART_FIFO_REG), ({})); @@ -320,6 +320,9 @@ static void bcm_uart_do_tx(struct uart_port *port) val = bcm_uart_readl(port, UART_IR_REG); val &= ~UART_TX_INT_MASK; bcm_uart_writel(port, val, UART_IR_REG); + + if (uart_tx_stopped(port)) + bcm_uart_stop_tx(port); } /* From 876835b128976e2e9a7d18daab58b4cba7742787 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Sun, 2 Jun 2024 16:46:25 -0700 Subject: [PATCH 1365/1578] brd: add missing MODULE_DESCRIPTION() macro make allmodconfig && make W=1 C=1 reports: modpost: missing MODULE_DESCRIPTION() in drivers/block/brd.o Add the missing invocation of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Link: https://lore.kernel.org/r/20240602-md-block-brd-v1-1-e71338e131b6@quicinc.com Signed-off-by: Jens Axboe --- drivers/block/brd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index a300645cd9d4a5..2fd1ed1017481b 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -296,6 +296,7 @@ static int max_part = 1; module_param(max_part, int, 0444); MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices"); +MODULE_DESCRIPTION("Ram backed block device driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR); MODULE_ALIAS("rd"); From d57afd8bb7f2c4f0d86e9e9b276f7c3a7fedfc6d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 28 May 2024 08:40:12 -0600 Subject: [PATCH 1366/1578] io_uring/msg_ring: tighten requirement for remote posting Currently this is gated on whether or not the target ring needs a local completion - and if so, whether or not we're running on the right task. The use case for same thread cross posting is probably a lot less relevant than remote posting. And since we're going to improve this situation anyway, just gate it on local posting and ignore what task we're currently running on. Signed-off-by: Jens Axboe --- io_uring/msg_ring.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 81c4a9d437296c..9fdb0cc19bfdd8 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -68,9 +68,7 @@ void io_msg_ring_cleanup(struct io_kiocb *req) static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx) { - if (!target_ctx->task_complete) - return false; - return current != target_ctx->submitter_task; + return target_ctx->task_complete; } static int io_msg_exec_remote(struct io_kiocb *req, task_work_func_t func) From c3ac76f9ca7a621428851149bc56bfca0aacaef4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Mar 2024 12:38:44 -0600 Subject: [PATCH 1367/1578] io_uring: add remote task_work execution helper All our task_work handling is targeted at the state in the io_kiocb itself, which is what it is being used for. However, MSG_RING rolls its own task_work handling, ignoring how that is usually done. In preparation for switching MSG_RING to be able to use the normal task_work handling, add io_req_task_work_add_remote() which allows the caller to pass in the target io_ring_ctx. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 24 ++++++++++++++++-------- io_uring/io_uring.h | 2 ++ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 438c44ca3abd63..85b2ce54328c3e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1098,9 +1098,10 @@ void tctx_task_work(struct callback_head *cb) WARN_ON_ONCE(ret); } -static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) +static inline void io_req_local_work_add(struct io_kiocb *req, + struct io_ring_ctx *ctx, + unsigned flags) { - struct io_ring_ctx *ctx = req->ctx; unsigned nr_wait, nr_tw, nr_tw_prev; struct llist_node *head; @@ -1114,6 +1115,8 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) flags &= ~IOU_F_TWQ_LAZY_WAKE; + guard(rcu)(); + head = READ_ONCE(ctx->work_llist.first); do { nr_tw_prev = 0; @@ -1195,13 +1198,18 @@ static void io_req_normal_work_add(struct io_kiocb *req) void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) { - if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - rcu_read_lock(); - io_req_local_work_add(req, flags); - rcu_read_unlock(); - } else { + if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) + io_req_local_work_add(req, req->ctx, flags); + else io_req_normal_work_add(req); - } +} + +void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx, + unsigned flags) +{ + if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))) + return; + io_req_local_work_add(req, ctx, flags); } static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index cd43924eed04e1..7a8641214509ff 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -73,6 +73,8 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); +void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx, + unsigned flags); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req); void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); From f33096a3c99c0149be49fe1e107244a7ed860ecb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 6 Jun 2024 10:28:26 -0600 Subject: [PATCH 1368/1578] io_uring: add io_add_aux_cqe() helper This helper will post a CQE, and can be called from task_work where we now that the ctx is already properly locked and that deferred completions will get flushed later on. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 23 +++++++++++++++++++++-- io_uring/io_uring.h | 1 + 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 85b2ce54328c3e..cdeb94d2a26b61 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -801,19 +801,38 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, return false; } -bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) +static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, + u32 cflags) { bool filled; - io_cq_lock(ctx); filled = io_fill_cqe_aux(ctx, user_data, res, cflags); if (!filled) filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); + return filled; +} + +bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) +{ + bool filled; + + io_cq_lock(ctx); + filled = __io_post_aux_cqe(ctx, user_data, res, cflags); io_cq_unlock_post(ctx); return filled; } +/* + * Must be called from inline task_work so we now a flush will happen later, + * and obviously with ctx->uring_lock held (tw always has that). + */ +void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) +{ + __io_post_aux_cqe(ctx, user_data, res, cflags); + ctx->submit_state.cq_flush = true; +} + /* * A helper for multishot requests posting additional CQEs. * Should only be used from a task_work including IO_URING_F_MULTISHOT. diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 7a8641214509ff..e1ce908f067992 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -65,6 +65,7 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); +void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); From 0617bb500bfabf8447062f1e1edde92ed2b638f1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Mar 2024 11:00:21 -0600 Subject: [PATCH 1369/1578] io_uring/msg_ring: improve handling of target CQE posting Use the exported helper for queueing task_work for message passing, rather than rolling our own. Note that this is only done for strict data messages for now, file descriptor passing messages still rely on the kernel task_work. It could get converted at some point if it's performance critical. This improves peak performance of message passing by about 5x in some basic testing, with 2 threads just sending messages to each other. Before this change, it was capped at around 700K/sec, with the change it's at over 4M/sec. Signed-off-by: Jens Axboe --- io_uring/msg_ring.c | 90 +++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 43 deletions(-) diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 9fdb0cc19bfdd8..ad7d67d444612b 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -13,7 +13,6 @@ #include "filetable.h" #include "msg_ring.h" - /* All valid masks for MSG_RING */ #define IORING_MSG_RING_MASK (IORING_MSG_RING_CQE_SKIP | \ IORING_MSG_RING_FLAGS_PASS) @@ -71,54 +70,43 @@ static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx) return target_ctx->task_complete; } -static int io_msg_exec_remote(struct io_kiocb *req, task_work_func_t func) +static void io_msg_tw_complete(struct io_kiocb *req, struct io_tw_state *ts) { - struct io_ring_ctx *ctx = req->file->private_data; - struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); - struct task_struct *task = READ_ONCE(ctx->submitter_task); - - if (unlikely(!task)) - return -EOWNERDEAD; + struct io_ring_ctx *ctx = req->ctx; - init_task_work(&msg->tw, func); - if (task_work_add(task, &msg->tw, TWA_SIGNAL)) - return -EOWNERDEAD; + io_add_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags); + kmem_cache_free(req_cachep, req); + percpu_ref_put(&ctx->refs); +} - return IOU_ISSUE_SKIP_COMPLETE; +static void io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, + int res, u32 cflags, u64 user_data) +{ + req->cqe.user_data = user_data; + io_req_set_res(req, res, cflags); + percpu_ref_get(&ctx->refs); + req->ctx = ctx; + req->task = READ_ONCE(ctx->submitter_task); + req->io_task_work.func = io_msg_tw_complete; + io_req_task_work_add_remote(req, ctx, IOU_F_TWQ_LAZY_WAKE); } -static void io_msg_tw_complete(struct callback_head *head) +static int io_msg_data_remote(struct io_kiocb *req) { - struct io_msg *msg = container_of(head, struct io_msg, tw); - struct io_kiocb *req = cmd_to_io_kiocb(msg); struct io_ring_ctx *target_ctx = req->file->private_data; - int ret = 0; - - if (current->flags & PF_EXITING) { - ret = -EOWNERDEAD; - } else { - u32 flags = 0; - - if (msg->flags & IORING_MSG_RING_FLAGS_PASS) - flags = msg->cqe_flags; - - /* - * If the target ring is using IOPOLL mode, then we need to be - * holding the uring_lock for posting completions. Other ring - * types rely on the regular completion locking, which is - * handled while posting. - */ - if (target_ctx->flags & IORING_SETUP_IOPOLL) - mutex_lock(&target_ctx->uring_lock); - if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) - ret = -EOVERFLOW; - if (target_ctx->flags & IORING_SETUP_IOPOLL) - mutex_unlock(&target_ctx->uring_lock); - } + struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); + struct io_kiocb *target; + u32 flags = 0; - if (ret < 0) - req_set_fail(req); - io_req_queue_tw_complete(req, ret); + target = kmem_cache_alloc(req_cachep, GFP_KERNEL); + if (unlikely(!target)) + return -ENOMEM; + + if (msg->flags & IORING_MSG_RING_FLAGS_PASS) + flags = msg->cqe_flags; + + io_msg_remote_post(target_ctx, target, msg->len, flags, msg->user_data); + return 0; } static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) @@ -136,7 +124,7 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) return -EBADFD; if (io_msg_need_remote(target_ctx)) - return io_msg_exec_remote(req, io_msg_tw_complete); + return io_msg_data_remote(req); if (msg->flags & IORING_MSG_RING_FLAGS_PASS) flags = msg->cqe_flags; @@ -216,6 +204,22 @@ static void io_msg_tw_fd_complete(struct callback_head *head) io_req_queue_tw_complete(req, ret); } +static int io_msg_fd_remote(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->file->private_data; + struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); + struct task_struct *task = READ_ONCE(ctx->submitter_task); + + if (unlikely(!task)) + return -EOWNERDEAD; + + init_task_work(&msg->tw, io_msg_tw_fd_complete); + if (task_work_add(task, &msg->tw, TWA_SIGNAL)) + return -EOWNERDEAD; + + return IOU_ISSUE_SKIP_COMPLETE; +} + static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *target_ctx = req->file->private_data; @@ -238,7 +242,7 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) } if (io_msg_need_remote(target_ctx)) - return io_msg_exec_remote(req, io_msg_tw_fd_complete); + return io_msg_fd_remote(req); return io_msg_install_complete(req, issue_flags); } From 50cf5f3842af3135b88b041890e7e12a74425fcb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 6 Jun 2024 12:25:01 -0600 Subject: [PATCH 1370/1578] io_uring/msg_ring: add an alloc cache for io_kiocb entries With slab accounting, allocating and freeing memory has considerable overhead. Add a basic alloc cache for the io_kiocb allocations that msg_ring needs to do. Unlike other caches, this one is used by the sender, grabbing it from the remote ring. When the remote ring gets the posted completion, it'll free it locally. Hence it is separately locked, using ctx->msg_lock. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ io_uring/io_uring.c | 6 ++++++ io_uring/msg_ring.c | 31 +++++++++++++++++++++++++++++-- io_uring/msg_ring.h | 1 + 4 files changed, 39 insertions(+), 2 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 1052a68fd68df4..ede42dce15067c 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -397,6 +397,9 @@ struct io_ring_ctx { struct callback_head poll_wq_task_work; struct list_head defer_list; + struct io_alloc_cache msg_cache; + spinlock_t msg_lock; + #ifdef CONFIG_NET_RX_BUSY_POLL struct list_head napi_list; /* track busy poll napi_id */ spinlock_t napi_lock; /* napi_list lock */ diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index cdeb94d2a26b61..7ed1e009aaecb0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -95,6 +95,7 @@ #include "futex.h" #include "napi.h" #include "uring_cmd.h" +#include "msg_ring.h" #include "memmap.h" #include "timeout.h" @@ -315,6 +316,9 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) sizeof(struct io_async_rw)); ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX, sizeof(struct uring_cache)); + spin_lock_init(&ctx->msg_lock); + ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct io_kiocb)); ret |= io_futex_cache_init(ctx); if (ret) goto err; @@ -351,6 +355,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); io_alloc_cache_free(&ctx->uring_cache, kfree); + io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free); io_futex_cache_free(ctx); kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table_locked.hbs); @@ -2599,6 +2604,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); io_alloc_cache_free(&ctx->uring_cache, kfree); + io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free); io_futex_cache_free(ctx); io_destroy_buffers(ctx); mutex_unlock(&ctx->uring_lock); diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index ad7d67d444612b..47a754e83b49a9 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -11,6 +11,7 @@ #include "io_uring.h" #include "rsrc.h" #include "filetable.h" +#include "alloc_cache.h" #include "msg_ring.h" /* All valid masks for MSG_RING */ @@ -75,7 +76,13 @@ static void io_msg_tw_complete(struct io_kiocb *req, struct io_tw_state *ts) struct io_ring_ctx *ctx = req->ctx; io_add_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags); - kmem_cache_free(req_cachep, req); + if (spin_trylock(&ctx->msg_lock)) { + if (io_alloc_cache_put(&ctx->msg_cache, req)) + req = NULL; + spin_unlock(&ctx->msg_lock); + } + if (req) + kfree(req); percpu_ref_put(&ctx->refs); } @@ -91,6 +98,19 @@ static void io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, io_req_task_work_add_remote(req, ctx, IOU_F_TWQ_LAZY_WAKE); } +static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx) +{ + struct io_kiocb *req = NULL; + + if (spin_trylock(&ctx->msg_lock)) { + req = io_alloc_cache_get(&ctx->msg_cache); + spin_unlock(&ctx->msg_lock); + } + if (req) + return req; + return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN); +} + static int io_msg_data_remote(struct io_kiocb *req) { struct io_ring_ctx *target_ctx = req->file->private_data; @@ -98,7 +118,7 @@ static int io_msg_data_remote(struct io_kiocb *req) struct io_kiocb *target; u32 flags = 0; - target = kmem_cache_alloc(req_cachep, GFP_KERNEL); + target = io_msg_get_kiocb(req->ctx); if (unlikely(!target)) return -ENOMEM; @@ -296,3 +316,10 @@ int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) io_req_set_res(req, ret, 0); return IOU_OK; } + +void io_msg_cache_free(const void *entry) +{ + struct io_kiocb *req = (struct io_kiocb *) entry; + + kmem_cache_free(req_cachep, req); +} diff --git a/io_uring/msg_ring.h b/io_uring/msg_ring.h index 3987ee6c0e5f1d..3030f3942f0f5e 100644 --- a/io_uring/msg_ring.h +++ b/io_uring/msg_ring.h @@ -3,3 +3,4 @@ int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags); void io_msg_ring_cleanup(struct io_kiocb *req); +void io_msg_cache_free(const void *entry); From 4b8e88e563b5f666446d002ad0dc1e6e8e7102b0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 19 Jun 2024 11:34:09 +0200 Subject: [PATCH 1371/1578] ftruncate: pass a signed offset The old ftruncate() syscall, using the 32-bit off_t misses a sign extension when called in compat mode on 64-bit architectures. As a result, passing a negative length accidentally succeeds in truncating to file size between 2GiB and 4GiB. Changing the type of the compat syscall to the signed compat_off_t changes the behavior so it instead returns -EINVAL. The native entry point, the truncate() syscall and the corresponding loff_t based variants are all correct already and do not suffer from this mistake. Fixes: 3f6d078d4acc ("fix compat truncate/ftruncate") Reviewed-by: Christian Brauner Cc: stable@vger.kernel.org Signed-off-by: Arnd Bergmann --- fs/open.c | 4 ++-- include/linux/compat.h | 2 +- include/linux/syscalls.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/open.c b/fs/open.c index 89cafb57206118..50e45bc7c4d8b9 100644 --- a/fs/open.c +++ b/fs/open.c @@ -202,13 +202,13 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small) return error; } -SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length) +SYSCALL_DEFINE2(ftruncate, unsigned int, fd, off_t, length) { return do_sys_ftruncate(fd, length, 1); } #ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length) +COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_off_t, length) { return do_sys_ftruncate(fd, length, 1); } diff --git a/include/linux/compat.h b/include/linux/compat.h index 233f61ec8afca3..56cebaff0c910f 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -608,7 +608,7 @@ asmlinkage long compat_sys_fstatfs(unsigned int fd, asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf); asmlinkage long compat_sys_truncate(const char __user *, compat_off_t); -asmlinkage long compat_sys_ftruncate(unsigned int, compat_ulong_t); +asmlinkage long compat_sys_ftruncate(unsigned int, compat_off_t); /* No generic prototype for truncate64, ftruncate64, fallocate */ asmlinkage long compat_sys_openat(int dfd, const char __user *filename, int flags, umode_t mode); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 9104952d323d1d..ba933770987860 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -418,7 +418,7 @@ asmlinkage long sys_listmount(const struct mnt_id_req __user *req, u64 __user *mnt_ids, size_t nr_mnt_ids, unsigned int flags); asmlinkage long sys_truncate(const char __user *path, long length); -asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length); +asmlinkage long sys_ftruncate(unsigned int fd, off_t length); #if BITS_PER_LONG == 32 asmlinkage long sys_truncate64(const char __user *path, loff_t length); asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length); From a1ff59784b277795a613beaa5d3dd9c5595c69a7 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 20 Jun 2024 18:14:53 +0200 Subject: [PATCH 1372/1578] cpufreq: intel_pstate: Use HWP to initialize ITMT if CPPC is missing It is reported that single-thread performance on some hybrid systems dropped significantly after commit 7feec7430edd ("ACPI: CPPC: Only probe for _CPC if CPPC v2 is acked") which prevented _CPC from being used if the support for it had not been confirmed by the platform firmware. The problem is that if the platform firmware does not confirm CPPC v2 support, cppc_get_perf_caps() returns an error which prevents the intel_pstate driver from enabling ITMT. Consequently, the scheduler does not get any hints on CPU performance differences, so in a hybrid system some tasks may run on CPUs with lower capacity even though they should be running on high-capacity CPUs. To address this, modify intel_pstate to use the information from MSR_HWP_CAPABILITIES to enable ITMT if CPPC is not available (which is done already if the highest performance number coming from CPPC is not realistic). Fixes: 7feec7430edd ("ACPI: CPPC: Only probe for _CPC if CPPC v2 is acked") Closes: https://lore.kernel.org/linux-acpi/d01b0a1f-bd33-47fe-ab41-43843d8a374f@kfocus.org Link: https://lore.kernel.org/linux-acpi/ZnD22b3Br1ng7alf@kf-XE Reported-by: Aaron Rainbolt Tested-by: Aaron Rainbolt Cc: 5.19+ # 5.19+ Link: https://patch.msgid.link/12460110.O9o76ZdvQC@rjwysocki.net Signed-off-by: Rafael J. Wysocki Reviewed-by: Mario Limonciello --- drivers/cpufreq/intel_pstate.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 15de5e3d96fd4d..c31914a9876fa1 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -355,15 +355,14 @@ static void intel_pstate_set_itmt_prio(int cpu) int ret; ret = cppc_get_perf_caps(cpu, &cppc_perf); - if (ret) - return; - /* - * On some systems with overclocking enabled, CPPC.highest_perf is hardcoded to 0xff. - * In this case we can't use CPPC.highest_perf to enable ITMT. - * In this case we can look at MSR_HWP_CAPABILITIES bits [8:0] to decide. + * If CPPC is not available, fall back to MSR_HWP_CAPABILITIES bits [8:0]. + * + * Also, on some systems with overclocking enabled, CPPC.highest_perf is + * hardcoded to 0xff, so CPPC.highest_perf cannot be used to enable ITMT. + * Fall back to MSR_HWP_CAPABILITIES then too. */ - if (cppc_perf.highest_perf == CPPC_MAX_PERF) + if (ret || cppc_perf.highest_perf == CPPC_MAX_PERF) cppc_perf.highest_perf = HWP_HIGHEST_PERF(READ_ONCE(all_cpu_data[cpu]->hwp_cap_cached)); /* From 44348870de4b8f292f97b84583a298d66fbaf738 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jun 2024 19:38:35 +0200 Subject: [PATCH 1373/1578] block: fix the blk_queue_nonrot polarity Take care of the inverse polarity of the BLK_FEAT_ROTATIONAL flag vs the old nonrot helper. Fixes: bd4a633b6f7c ("block: move the nonrot flag to queue_limits") Reported-by: kernel test robot Reported-by: Keith Busch Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20240624173835.76753-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e89003360c17c7..b2f1362c46814f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -617,7 +617,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) #define blk_queue_noxmerges(q) \ test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) -#define blk_queue_nonrot(q) ((q)->limits.features & BLK_FEAT_ROTATIONAL) +#define blk_queue_nonrot(q) (!((q)->limits.features & BLK_FEAT_ROTATIONAL)) #define blk_queue_io_stat(q) ((q)->limits.features & BLK_FEAT_IO_STAT) #define blk_queue_zone_resetall(q) \ ((q)->limits.features & BLK_FEAT_ZONE_RESETALL) From 9eee5330656bf92f51cb1f09b2dc9f8cf975b3d1 Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Mon, 24 Jun 2024 20:37:28 +0000 Subject: [PATCH 1374/1578] PCI/MSI: Fix UAF in msi_capability_init KFENCE reports the following UAF: BUG: KFENCE: use-after-free read in __pci_enable_msi_range+0x2c0/0x488 Use-after-free read at 0x0000000024629571 (in kfence-#12): __pci_enable_msi_range+0x2c0/0x488 pci_alloc_irq_vectors_affinity+0xec/0x14c pci_alloc_irq_vectors+0x18/0x28 kfence-#12: 0x0000000008614900-0x00000000e06c228d, size=104, cache=kmalloc-128 allocated by task 81 on cpu 7 at 10.808142s: __kmem_cache_alloc_node+0x1f0/0x2bc kmalloc_trace+0x44/0x138 msi_alloc_desc+0x3c/0x9c msi_domain_insert_msi_desc+0x30/0x78 msi_setup_msi_desc+0x13c/0x184 __pci_enable_msi_range+0x258/0x488 pci_alloc_irq_vectors_affinity+0xec/0x14c pci_alloc_irq_vectors+0x18/0x28 freed by task 81 on cpu 7 at 10.811436s: msi_domain_free_descs+0xd4/0x10c msi_domain_free_locked.part.0+0xc0/0x1d8 msi_domain_alloc_irqs_all_locked+0xb4/0xbc pci_msi_setup_msi_irqs+0x30/0x4c __pci_enable_msi_range+0x2a8/0x488 pci_alloc_irq_vectors_affinity+0xec/0x14c pci_alloc_irq_vectors+0x18/0x28 Descriptor allocation done in: __pci_enable_msi_range msi_capability_init msi_setup_msi_desc msi_insert_msi_desc msi_domain_insert_msi_desc msi_alloc_desc ... Freed in case of failure in __msi_domain_alloc_locked() __pci_enable_msi_range msi_capability_init pci_msi_setup_msi_irqs msi_domain_alloc_irqs_all_locked msi_domain_alloc_locked __msi_domain_alloc_locked => fails msi_domain_free_locked ... That failure propagates back to pci_msi_setup_msi_irqs() in msi_capability_init() which accesses the descriptor for unmasking in the error exit path. Cure it by copying the descriptor and using the copy for the error exit path unmask operation. [ tglx: Massaged change log ] Fixes: bf6e054e0e3f ("genirq/msi: Provide msi_device_populate/destroy_sysfs()") Suggested-by: Thomas Gleixner Signed-off-by: Mostafa Saleh Signed-off-by: Thomas Gleixner Cc: Bjorn Heelgas Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240624203729.1094506-1-smostafa@google.com --- drivers/pci/msi/msi.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c index c5625dd9bf490d..3a45879d85db96 100644 --- a/drivers/pci/msi/msi.c +++ b/drivers/pci/msi/msi.c @@ -352,7 +352,7 @@ static int msi_capability_init(struct pci_dev *dev, int nvec, struct irq_affinity *affd) { struct irq_affinity_desc *masks = NULL; - struct msi_desc *entry; + struct msi_desc *entry, desc; int ret; /* Reject multi-MSI early on irq domain enabled architectures */ @@ -377,6 +377,12 @@ static int msi_capability_init(struct pci_dev *dev, int nvec, /* All MSIs are unmasked by default; mask them all */ entry = msi_first_desc(&dev->dev, MSI_DESC_ALL); pci_msi_mask(entry, msi_multi_mask(entry)); + /* + * Copy the MSI descriptor for the error path because + * pci_msi_setup_msi_irqs() will free it for the hierarchical + * interrupt domain case. + */ + memcpy(&desc, entry, sizeof(desc)); /* Configure MSI capability structure */ ret = pci_msi_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSI); @@ -396,7 +402,7 @@ static int msi_capability_init(struct pci_dev *dev, int nvec, goto unlock; err: - pci_msi_unmask(entry, msi_multi_mask(entry)); + pci_msi_unmask(&desc, msi_multi_mask(&desc)); pci_free_msi_irqs(dev); fail: dev->msi_enabled = 0; From d1825752e3074b5ff8d7f6016160e2b7c5c367ca Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 13 Jun 2024 11:16:19 +0100 Subject: [PATCH 1375/1578] btrfs: use NOFS context when getting inodes during logging and log replay During inode logging (and log replay too), we are holding a transaction handle and we often need to call btrfs_iget(), which will read an inode from its subvolume btree if it's not loaded in memory and that results in allocating an inode with GFP_KERNEL semantics at the btrfs_alloc_inode() callback - and this may recurse into the filesystem in case we are under memory pressure and attempt to commit the current transaction, resulting in a deadlock since the logging (or log replay) task is holding a transaction handle open. Syzbot reported this with the following stack traces: WARNING: possible circular locking dependency detected 6.10.0-rc2-syzkaller-00361-g061d1af7b030 #0 Not tainted ------------------------------------------------------ syz-executor.1/9919 is trying to acquire lock: ffffffff8dd3aac0 (fs_reclaim){+.+.}-{0:0}, at: might_alloc include/linux/sched/mm.h:334 [inline] ffffffff8dd3aac0 (fs_reclaim){+.+.}-{0:0}, at: slab_pre_alloc_hook mm/slub.c:3891 [inline] ffffffff8dd3aac0 (fs_reclaim){+.+.}-{0:0}, at: slab_alloc_node mm/slub.c:3981 [inline] ffffffff8dd3aac0 (fs_reclaim){+.+.}-{0:0}, at: kmem_cache_alloc_lru_noprof+0x58/0x2f0 mm/slub.c:4020 but task is already holding lock: ffff88804b569358 (&ei->log_mutex){+.+.}-{3:3}, at: btrfs_log_inode+0x39c/0x4660 fs/btrfs/tree-log.c:6481 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 (&ei->log_mutex){+.+.}-{3:3}: __mutex_lock_common kernel/locking/mutex.c:608 [inline] __mutex_lock+0x175/0x9c0 kernel/locking/mutex.c:752 btrfs_log_inode+0x39c/0x4660 fs/btrfs/tree-log.c:6481 btrfs_log_inode_parent+0x8cb/0x2a90 fs/btrfs/tree-log.c:7079 btrfs_log_dentry_safe+0x59/0x80 fs/btrfs/tree-log.c:7180 btrfs_sync_file+0x9c1/0xe10 fs/btrfs/file.c:1959 vfs_fsync_range+0x141/0x230 fs/sync.c:188 generic_write_sync include/linux/fs.h:2794 [inline] btrfs_do_write_iter+0x584/0x10c0 fs/btrfs/file.c:1705 new_sync_write fs/read_write.c:497 [inline] vfs_write+0x6b6/0x1140 fs/read_write.c:590 ksys_write+0x12f/0x260 fs/read_write.c:643 do_syscall_32_irqs_on arch/x86/entry/common.c:165 [inline] __do_fast_syscall_32+0x73/0x120 arch/x86/entry/common.c:386 do_fast_syscall_32+0x32/0x80 arch/x86/entry/common.c:411 entry_SYSENTER_compat_after_hwframe+0x84/0x8e -> #2 (btrfs_trans_num_extwriters){++++}-{0:0}: join_transaction+0x164/0xf40 fs/btrfs/transaction.c:315 start_transaction+0x427/0x1a70 fs/btrfs/transaction.c:700 btrfs_commit_super+0xa1/0x110 fs/btrfs/disk-io.c:4170 close_ctree+0xcb0/0xf90 fs/btrfs/disk-io.c:4324 generic_shutdown_super+0x159/0x3d0 fs/super.c:642 kill_anon_super+0x3a/0x60 fs/super.c:1226 btrfs_kill_super+0x3b/0x50 fs/btrfs/super.c:2096 deactivate_locked_super+0xbe/0x1a0 fs/super.c:473 deactivate_super+0xde/0x100 fs/super.c:506 cleanup_mnt+0x222/0x450 fs/namespace.c:1267 task_work_run+0x14e/0x250 kernel/task_work.c:180 resume_user_mode_work include/linux/resume_user_mode.h:50 [inline] exit_to_user_mode_loop kernel/entry/common.c:114 [inline] exit_to_user_mode_prepare include/linux/entry-common.h:328 [inline] __syscall_exit_to_user_mode_work kernel/entry/common.c:207 [inline] syscall_exit_to_user_mode+0x278/0x2a0 kernel/entry/common.c:218 __do_fast_syscall_32+0x80/0x120 arch/x86/entry/common.c:389 do_fast_syscall_32+0x32/0x80 arch/x86/entry/common.c:411 entry_SYSENTER_compat_after_hwframe+0x84/0x8e -> #1 (btrfs_trans_num_writers){++++}-{0:0}: __lock_release kernel/locking/lockdep.c:5468 [inline] lock_release+0x33e/0x6c0 kernel/locking/lockdep.c:5774 percpu_up_read include/linux/percpu-rwsem.h:99 [inline] __sb_end_write include/linux/fs.h:1650 [inline] sb_end_intwrite include/linux/fs.h:1767 [inline] __btrfs_end_transaction+0x5ca/0x920 fs/btrfs/transaction.c:1071 btrfs_commit_inode_delayed_inode+0x228/0x330 fs/btrfs/delayed-inode.c:1301 btrfs_evict_inode+0x960/0xe80 fs/btrfs/inode.c:5291 evict+0x2ed/0x6c0 fs/inode.c:667 iput_final fs/inode.c:1741 [inline] iput.part.0+0x5a8/0x7f0 fs/inode.c:1767 iput+0x5c/0x80 fs/inode.c:1757 dentry_unlink_inode+0x295/0x480 fs/dcache.c:400 __dentry_kill+0x1d0/0x600 fs/dcache.c:603 dput.part.0+0x4b1/0x9b0 fs/dcache.c:845 dput+0x1f/0x30 fs/dcache.c:835 ovl_stack_put+0x60/0x90 fs/overlayfs/util.c:132 ovl_destroy_inode+0xc6/0x190 fs/overlayfs/super.c:182 destroy_inode+0xc4/0x1b0 fs/inode.c:311 iput_final fs/inode.c:1741 [inline] iput.part.0+0x5a8/0x7f0 fs/inode.c:1767 iput+0x5c/0x80 fs/inode.c:1757 dentry_unlink_inode+0x295/0x480 fs/dcache.c:400 __dentry_kill+0x1d0/0x600 fs/dcache.c:603 shrink_kill fs/dcache.c:1048 [inline] shrink_dentry_list+0x140/0x5d0 fs/dcache.c:1075 prune_dcache_sb+0xeb/0x150 fs/dcache.c:1156 super_cache_scan+0x32a/0x550 fs/super.c:221 do_shrink_slab+0x44f/0x11c0 mm/shrinker.c:435 shrink_slab_memcg mm/shrinker.c:548 [inline] shrink_slab+0xa87/0x1310 mm/shrinker.c:626 shrink_one+0x493/0x7c0 mm/vmscan.c:4790 shrink_many mm/vmscan.c:4851 [inline] lru_gen_shrink_node+0x89f/0x1750 mm/vmscan.c:4951 shrink_node mm/vmscan.c:5910 [inline] kswapd_shrink_node mm/vmscan.c:6720 [inline] balance_pgdat+0x1105/0x1970 mm/vmscan.c:6911 kswapd+0x5ea/0xbf0 mm/vmscan.c:7180 kthread+0x2c1/0x3a0 kernel/kthread.c:389 ret_from_fork+0x45/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 -> #0 (fs_reclaim){+.+.}-{0:0}: check_prev_add kernel/locking/lockdep.c:3134 [inline] check_prevs_add kernel/locking/lockdep.c:3253 [inline] validate_chain kernel/locking/lockdep.c:3869 [inline] __lock_acquire+0x2478/0x3b30 kernel/locking/lockdep.c:5137 lock_acquire kernel/locking/lockdep.c:5754 [inline] lock_acquire+0x1b1/0x560 kernel/locking/lockdep.c:5719 __fs_reclaim_acquire mm/page_alloc.c:3801 [inline] fs_reclaim_acquire+0x102/0x160 mm/page_alloc.c:3815 might_alloc include/linux/sched/mm.h:334 [inline] slab_pre_alloc_hook mm/slub.c:3891 [inline] slab_alloc_node mm/slub.c:3981 [inline] kmem_cache_alloc_lru_noprof+0x58/0x2f0 mm/slub.c:4020 btrfs_alloc_inode+0x118/0xb20 fs/btrfs/inode.c:8411 alloc_inode+0x5d/0x230 fs/inode.c:261 iget5_locked fs/inode.c:1235 [inline] iget5_locked+0x1c9/0x2c0 fs/inode.c:1228 btrfs_iget_locked fs/btrfs/inode.c:5590 [inline] btrfs_iget_path fs/btrfs/inode.c:5607 [inline] btrfs_iget+0xfb/0x230 fs/btrfs/inode.c:5636 add_conflicting_inode fs/btrfs/tree-log.c:5657 [inline] copy_inode_items_to_log+0x1039/0x1e30 fs/btrfs/tree-log.c:5928 btrfs_log_inode+0xa48/0x4660 fs/btrfs/tree-log.c:6592 log_new_delayed_dentries fs/btrfs/tree-log.c:6363 [inline] btrfs_log_inode+0x27dd/0x4660 fs/btrfs/tree-log.c:6718 btrfs_log_all_parents fs/btrfs/tree-log.c:6833 [inline] btrfs_log_inode_parent+0x22ba/0x2a90 fs/btrfs/tree-log.c:7141 btrfs_log_dentry_safe+0x59/0x80 fs/btrfs/tree-log.c:7180 btrfs_sync_file+0x9c1/0xe10 fs/btrfs/file.c:1959 vfs_fsync_range+0x141/0x230 fs/sync.c:188 generic_write_sync include/linux/fs.h:2794 [inline] btrfs_do_write_iter+0x584/0x10c0 fs/btrfs/file.c:1705 do_iter_readv_writev+0x504/0x780 fs/read_write.c:741 vfs_writev+0x36f/0xde0 fs/read_write.c:971 do_pwritev+0x1b2/0x260 fs/read_write.c:1072 __do_compat_sys_pwritev2 fs/read_write.c:1218 [inline] __se_compat_sys_pwritev2 fs/read_write.c:1210 [inline] __ia32_compat_sys_pwritev2+0x121/0x1b0 fs/read_write.c:1210 do_syscall_32_irqs_on arch/x86/entry/common.c:165 [inline] __do_fast_syscall_32+0x73/0x120 arch/x86/entry/common.c:386 do_fast_syscall_32+0x32/0x80 arch/x86/entry/common.c:411 entry_SYSENTER_compat_after_hwframe+0x84/0x8e other info that might help us debug this: Chain exists of: fs_reclaim --> btrfs_trans_num_extwriters --> &ei->log_mutex Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&ei->log_mutex); lock(btrfs_trans_num_extwriters); lock(&ei->log_mutex); lock(fs_reclaim); *** DEADLOCK *** 7 locks held by syz-executor.1/9919: #0: ffff88802be20420 (sb_writers#23){.+.+}-{0:0}, at: do_pwritev+0x1b2/0x260 fs/read_write.c:1072 #1: ffff888065c0f8f0 (&sb->s_type->i_mutex_key#33){++++}-{3:3}, at: inode_lock include/linux/fs.h:791 [inline] #1: ffff888065c0f8f0 (&sb->s_type->i_mutex_key#33){++++}-{3:3}, at: btrfs_inode_lock+0xc8/0x110 fs/btrfs/inode.c:385 #2: ffff888065c0f778 (&ei->i_mmap_lock){++++}-{3:3}, at: btrfs_inode_lock+0xee/0x110 fs/btrfs/inode.c:388 #3: ffff88802be20610 (sb_internal#4){.+.+}-{0:0}, at: btrfs_sync_file+0x95b/0xe10 fs/btrfs/file.c:1952 #4: ffff8880546323f0 (btrfs_trans_num_writers){++++}-{0:0}, at: join_transaction+0x430/0xf40 fs/btrfs/transaction.c:290 #5: ffff888054632418 (btrfs_trans_num_extwriters){++++}-{0:0}, at: join_transaction+0x430/0xf40 fs/btrfs/transaction.c:290 #6: ffff88804b569358 (&ei->log_mutex){+.+.}-{3:3}, at: btrfs_log_inode+0x39c/0x4660 fs/btrfs/tree-log.c:6481 stack backtrace: CPU: 2 PID: 9919 Comm: syz-executor.1 Not tainted 6.10.0-rc2-syzkaller-00361-g061d1af7b030 #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:114 check_noncircular+0x31a/0x400 kernel/locking/lockdep.c:2187 check_prev_add kernel/locking/lockdep.c:3134 [inline] check_prevs_add kernel/locking/lockdep.c:3253 [inline] validate_chain kernel/locking/lockdep.c:3869 [inline] __lock_acquire+0x2478/0x3b30 kernel/locking/lockdep.c:5137 lock_acquire kernel/locking/lockdep.c:5754 [inline] lock_acquire+0x1b1/0x560 kernel/locking/lockdep.c:5719 __fs_reclaim_acquire mm/page_alloc.c:3801 [inline] fs_reclaim_acquire+0x102/0x160 mm/page_alloc.c:3815 might_alloc include/linux/sched/mm.h:334 [inline] slab_pre_alloc_hook mm/slub.c:3891 [inline] slab_alloc_node mm/slub.c:3981 [inline] kmem_cache_alloc_lru_noprof+0x58/0x2f0 mm/slub.c:4020 btrfs_alloc_inode+0x118/0xb20 fs/btrfs/inode.c:8411 alloc_inode+0x5d/0x230 fs/inode.c:261 iget5_locked fs/inode.c:1235 [inline] iget5_locked+0x1c9/0x2c0 fs/inode.c:1228 btrfs_iget_locked fs/btrfs/inode.c:5590 [inline] btrfs_iget_path fs/btrfs/inode.c:5607 [inline] btrfs_iget+0xfb/0x230 fs/btrfs/inode.c:5636 add_conflicting_inode fs/btrfs/tree-log.c:5657 [inline] copy_inode_items_to_log+0x1039/0x1e30 fs/btrfs/tree-log.c:5928 btrfs_log_inode+0xa48/0x4660 fs/btrfs/tree-log.c:6592 log_new_delayed_dentries fs/btrfs/tree-log.c:6363 [inline] btrfs_log_inode+0x27dd/0x4660 fs/btrfs/tree-log.c:6718 btrfs_log_all_parents fs/btrfs/tree-log.c:6833 [inline] btrfs_log_inode_parent+0x22ba/0x2a90 fs/btrfs/tree-log.c:7141 btrfs_log_dentry_safe+0x59/0x80 fs/btrfs/tree-log.c:7180 btrfs_sync_file+0x9c1/0xe10 fs/btrfs/file.c:1959 vfs_fsync_range+0x141/0x230 fs/sync.c:188 generic_write_sync include/linux/fs.h:2794 [inline] btrfs_do_write_iter+0x584/0x10c0 fs/btrfs/file.c:1705 do_iter_readv_writev+0x504/0x780 fs/read_write.c:741 vfs_writev+0x36f/0xde0 fs/read_write.c:971 do_pwritev+0x1b2/0x260 fs/read_write.c:1072 __do_compat_sys_pwritev2 fs/read_write.c:1218 [inline] __se_compat_sys_pwritev2 fs/read_write.c:1210 [inline] __ia32_compat_sys_pwritev2+0x121/0x1b0 fs/read_write.c:1210 do_syscall_32_irqs_on arch/x86/entry/common.c:165 [inline] __do_fast_syscall_32+0x73/0x120 arch/x86/entry/common.c:386 do_fast_syscall_32+0x32/0x80 arch/x86/entry/common.c:411 entry_SYSENTER_compat_after_hwframe+0x84/0x8e RIP: 0023:0xf7334579 Code: b8 01 10 06 03 (...) RSP: 002b:00000000f5f265ac EFLAGS: 00000292 ORIG_RAX: 000000000000017b RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00000000200002c0 RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000292 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Fix this by ensuring we are under a NOFS scope whenever we call btrfs_iget() during inode logging and log replay. Reported-by: syzbot+8576cfa84070dce4d59b@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/000000000000274a3a061abbd928@google.com/ Fixes: 712e36c5f2a7 ("btrfs: use GFP_KERNEL in btrfs_alloc_inode") Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 26a2e5aa08e9ca..0bce1d45e25267 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -138,6 +138,25 @@ static void wait_log_commit(struct btrfs_root *root, int transid); * and once to do all the other items. */ +static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root) +{ + unsigned int nofs_flag; + struct inode *inode; + + /* + * We're holding a transaction handle whether we are logging or + * replaying a log tree, so we must make sure NOFS semantics apply + * because btrfs_alloc_inode() may be triggered and it uses GFP_KERNEL + * to allocate an inode, which can recurse back into the filesystem and + * attempt a transaction commit, resulting in a deadlock. + */ + nofs_flag = memalloc_nofs_save(); + inode = btrfs_iget(root->fs_info->sb, objectid, root); + memalloc_nofs_restore(nofs_flag); + + return inode; +} + /* * start a sub transaction and setup the log tree * this increments the log tree writer count to make the people @@ -600,7 +619,7 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root, { struct inode *inode; - inode = btrfs_iget(root->fs_info->sb, objectid, root); + inode = btrfs_iget_logging(objectid, root); if (IS_ERR(inode)) inode = NULL; return inode; @@ -5438,7 +5457,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { struct btrfs_root *root = start_inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; LIST_HEAD(dir_list); struct btrfs_dir_list *dir_elem; @@ -5499,7 +5517,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, continue; btrfs_release_path(path); - di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root); + di_inode = btrfs_iget_logging(di_key.objectid, root); if (IS_ERR(di_inode)) { ret = PTR_ERR(di_inode); goto out; @@ -5559,7 +5577,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, btrfs_add_delayed_iput(curr_inode); curr_inode = NULL; - vfs_inode = btrfs_iget(fs_info->sb, ino, root); + vfs_inode = btrfs_iget_logging(ino, root); if (IS_ERR(vfs_inode)) { ret = PTR_ERR(vfs_inode); break; @@ -5654,7 +5672,7 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) return BTRFS_LOG_FORCE_COMMIT; - inode = btrfs_iget(root->fs_info->sb, ino, root); + inode = btrfs_iget_logging(ino, root); /* * If the other inode that had a conflicting dir entry was deleted in * the current transaction then we either: @@ -5755,7 +5773,6 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_log_ctx *ctx) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; /* @@ -5786,7 +5803,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, list_del(&curr->list); kfree(curr); - inode = btrfs_iget(fs_info->sb, ino, root); + inode = btrfs_iget_logging(ino, root); /* * If the other inode that had a conflicting dir entry was * deleted in the current transaction, we need to log its parent @@ -5797,7 +5814,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, if (ret != -ENOENT) break; - inode = btrfs_iget(fs_info->sb, parent, root); + inode = btrfs_iget_logging(parent, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); break; @@ -6319,7 +6336,6 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { const bool orig_log_new_dentries = ctx->log_new_dentries; - struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_item *item; int ret = 0; @@ -6345,7 +6361,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, if (key.type == BTRFS_ROOT_ITEM_KEY) continue; - di_inode = btrfs_iget(fs_info->sb, key.objectid, inode->root); + di_inode = btrfs_iget_logging(key.objectid, inode->root); if (IS_ERR(di_inode)) { ret = PTR_ERR(di_inode); break; @@ -6729,7 +6745,6 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_log_ctx *ctx) { - struct btrfs_fs_info *fs_info = trans->fs_info; int ret; struct btrfs_path *path; struct btrfs_key key; @@ -6794,8 +6809,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, cur_offset = item_size; } - dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid, - root); + dir_inode = btrfs_iget_logging(inode_key.objectid, root); /* * If the parent inode was deleted, return an error to * fallback to a transaction commit. This is to prevent @@ -6857,7 +6871,6 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); while (true) { - struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; int slot; struct btrfs_key search_key; @@ -6872,7 +6885,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, search_key.objectid = found_key.offset; search_key.type = BTRFS_INODE_ITEM_KEY; search_key.offset = 0; - inode = btrfs_iget(fs_info->sb, ino, root); + inode = btrfs_iget_logging(ino, root); if (IS_ERR(inode)) return PTR_ERR(inode); From b9fd2affe4aa99a4ca14ee87e1f38fea22ece52a Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 11 Jun 2024 17:17:30 +0900 Subject: [PATCH 1376/1578] btrfs: zoned: fix initial free space detection When creating a new block group, it calls btrfs_add_new_free_space() to add the entire block group range into the free space accounting. __btrfs_add_free_space_zoned() checks if size == block_group->length to detect the initial free space adding, and proceed that case properly. However, if the zone_capacity == zone_size and the over-write speed is fast enough, the entire zone can be over-written within one transaction. That confuses __btrfs_add_free_space_zoned() to handle it as an initial free space accounting. As a result, that block group becomes a strange state: 0 used bytes, 0 zone_unusable bytes, but alloc_offset == zone_capacity (no allocation anymore). The initial free space accounting can properly be checked by checking alloc_offset too. Fixes: 98173255bddd ("btrfs: zoned: calculate free space from zone capacity") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index c8a05d5eb9cbc5..b642df5e525584 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2697,7 +2697,7 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, u64 offset = bytenr - block_group->start; u64 to_free, to_unusable; int bg_reclaim_threshold = 0; - bool initial = (size == block_group->length); + bool initial = ((size == block_group->length) && (block_group->alloc_offset == 0)); u64 reclaimable_unusable; WARN_ON(!initial && offset + size > block_group->zone_capacity); From 2c49908634a2b97b1c3abe0589be2739ac5e7fd5 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 17 Jun 2024 15:18:44 +0930 Subject: [PATCH 1377/1578] btrfs: scrub: handle RST lookup error correctly [BUG] When running btrfs/060 with forced RST feature, it would crash the following ASSERT() inside scrub_read_endio(): ASSERT(sector_nr < stripe->nr_sectors); Before that, we would have tree dump from btrfs_get_raid_extent_offset(), as we failed to find the RST entry for the range. [CAUSE] Inside scrub_submit_extent_sector_read() every time we allocated a new bbio we immediately called btrfs_map_block() to make sure there was some RST range covering the scrub target. But if btrfs_map_block() fails, we immediately call endio for the bbio, while the bbio is newly allocated, it's completely empty. Then inside scrub_read_endio(), we go through the bvecs to find the sector number (as bi_sector is no longer reliable if the bio is submitted to lower layers). And since the bio is empty, such bvecs iteration would not find any sector matching the sector, and return sector_nr == stripe->nr_sectors, triggering the ASSERT(). [FIX] Instead of calling btrfs_map_block() after allocating a new bbio, call btrfs_map_block() first. Since our only objective of calling btrfs_map_block() is only to update stripe_len, there is really no need to do that after btrfs_alloc_bio(). This new timing would avoid the problem of handling empty bbio completely, and in fact fixes a possible race window for the old code, where if the submission thread is the only owner of the pending_io, the scrub would never finish (since we didn't decrease the pending_io counter). Although the root cause of RST lookup failure still needs to be addressed. Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index afd6932f5e895c..d7caa3732f074b 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1688,20 +1688,24 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, (i << fs_info->sectorsize_bits); int err; - bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ, - fs_info, scrub_read_endio, stripe); - bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT; - io_stripe.is_scrub = true; + stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits; + /* + * For RST cases, we need to manually split the bbio to + * follow the RST boundary. + */ err = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, - &stripe_len, &bioc, &io_stripe, - &mirror); + &stripe_len, &bioc, &io_stripe, &mirror); btrfs_put_bioc(bioc); - if (err) { - btrfs_bio_end_io(bbio, - errno_to_blk_status(err)); - return; + if (err < 0) { + set_bit(i, &stripe->io_error_bitmap); + set_bit(i, &stripe->error_bitmap); + continue; } + + bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ, + fs_info, scrub_read_endio, stripe); + bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT; } __bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); From a7e4c6a3031c74078dba7fa36239d0f4fe476c53 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 20 Jun 2024 12:32:00 +0100 Subject: [PATCH 1378/1578] btrfs: qgroup: fix quota root leak after quota disable failure If during the quota disable we fail when cleaning the quota tree or when deleting the root from the root tree, we jump to the 'out' label without ever dropping the reference on the quota root, resulting in a leak of the root since fs_info->quota_root is no longer pointing to the root (we have set it to NULL just before those steps). Fix this by always doing a btrfs_put_root() call under the 'out' label. This is a problem that exists since qgroups were first added in 2012 by commit bed92eae26cc ("Btrfs: qgroup implementation and prototypes"), but back then we missed a kfree on the quota root and free_extent_buffer() calls on its root and commit root nodes, since back then roots were not yet reference counted. Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index fc2a7ea2635418..bf0f81d59b6bca 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1351,7 +1351,7 @@ static int flush_reservations(struct btrfs_fs_info *fs_info) int btrfs_quota_disable(struct btrfs_fs_info *fs_info) { - struct btrfs_root *quota_root; + struct btrfs_root *quota_root = NULL; struct btrfs_trans_handle *trans = NULL; int ret = 0; @@ -1449,9 +1449,9 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) btrfs_free_tree_block(trans, btrfs_root_id(quota_root), quota_root->node, 0, 1); - btrfs_put_root(quota_root); out: + btrfs_put_root(quota_root); mutex_unlock(&fs_info->qgroup_ioctl_lock); if (ret && trans) btrfs_end_transaction(trans); From 26b97668e5339434e5df8ddc7b1898a37a350112 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 24 Jun 2024 13:47:22 -0600 Subject: [PATCH 1379/1578] io_uring: remove dead struct io_submit_state member When the intermediate CQE aux cache got removed, any usage of the this member went away. As it isn't used anymore, kill it. Fixes: 902ce82c2aa1 ("io_uring: get rid of intermediate aux cqe caches") Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index b48570eaa4497f..7abdc09271245f 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -207,7 +207,6 @@ struct io_submit_state { bool need_plug; bool cq_flush; unsigned short submit_nr; - unsigned int cqes_count; struct blk_plug plug; }; From dbcabac138fdfc730ba458ed2199ff1f29a271fc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 24 Jun 2024 19:07:18 -0600 Subject: [PATCH 1380/1578] io_uring: signal SQPOLL task_work with TWA_SIGNAL_NO_IPI Before SQPOLL was transitioned to managing its own task_work, the core used TWA_SIGNAL_NO_IPI to ensure that task_work was processed. If not, we can't be sure that all task_work is processed at SQPOLL thread exit time. Fixes: af5d68f8892f ("io_uring/sqpoll: manage task_work privately") Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 154b25b8a613b1..c326e2127dd4dd 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1259,8 +1259,8 @@ static void io_req_normal_work_add(struct io_kiocb *req) if (ctx->flags & IORING_SETUP_SQPOLL) { struct io_sq_data *sqd = ctx->sq_data; - if (wq_has_sleeper(&sqd->wait)) - wake_up(&sqd->wait); + if (sqd->thread) + __set_notify_signal(sqd->thread); return; } From 8c61291fd8500e3b35c7ec0c781b273d8cc96cde Mon Sep 17 00:00:00 2001 From: Zhaoyang Huang Date: Fri, 7 Jun 2024 10:31:16 +0800 Subject: [PATCH 1381/1578] mm: fix incorrect vbq reference in purge_fragmented_block MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit xa_for_each() in _vm_unmap_aliases() loops through all vbs. However, since commit 062eacf57ad9 ("mm: vmalloc: remove a global vmap_blocks xarray") the vb from xarray may not be on the corresponding CPU vmap_block_queue. Consequently, purge_fragmented_block() might use the wrong vbq->lock to protect the free list, leading to vbq->free breakage. Incorrect lock protection can exhaust all vmalloc space as follows: CPU0 CPU1 +--------------------------------------------+ | +--------------------+ +-----+ | +--> | |---->| |------+ | CPU1:vbq free_list | | vb1 | +--- | |<----| |<-----+ | +--------------------+ +-----+ | +--------------------------------------------+ _vm_unmap_aliases() vb_alloc() new_vmap_block() xa_for_each(&vbq->vmap_blocks, idx, vb) --> vb in CPU1:vbq->freelist purge_fragmented_block(vb) spin_lock(&vbq->lock) spin_lock(&vbq->lock) --> use CPU0:vbq->lock --> use CPU1:vbq->lock list_del_rcu(&vb->free_list) list_add_tail_rcu(&vb->free_list, &vbq->free) __list_del(vb->prev, vb->next) next->prev = prev +--------------------+ | | | CPU1:vbq free_list | +---| |<--+ | +--------------------+ | +----------------------------+ __list_add(new, head->prev, head) +--------------------------------------------+ | +--------------------+ +-----+ | +--> | |---->| |------+ | CPU1:vbq free_list | | vb2 | +--- | |<----| |<-----+ | +--------------------+ +-----+ | +--------------------------------------------+ prev->next = next +--------------------------------------------+ |----------------------------+ | | +--------------------+ | +-----+ | +--> | |--+ | |------+ | CPU1:vbq free_list | | vb2 | +--- | |<----| |<-----+ | +--------------------+ +-----+ | +--------------------------------------------+ Here’s a list breakdown. All vbs, which were to be added to ‘prev’, cannot be used by list_for_each_entry_rcu(vb, &vbq->free, free_list) in vb_alloc(). Thus, vmalloc space is exhausted. This issue affects both erofs and f2fs, the stacktrace is as follows: erofs: [] __switch_to+0x174 [] __schedule+0x624 [] schedule+0x7c [] schedule_preempt_disabled+0x24 [] __mutex_lock+0x374 [] __mutex_lock_slowpath+0x14 [] mutex_lock+0x24 [] reclaim_and_purge_vmap_areas+0x44 [] alloc_vmap_area+0x2e0 [] vm_map_ram+0x1b0 [] z_erofs_lz4_decompress+0x278 [] z_erofs_decompress_queue+0x650 [] z_erofs_runqueue+0x7f4 [] z_erofs_read_folio+0x104 [] filemap_read_folio+0x6c [] filemap_fault+0x300 [] __do_fault+0xc8 [] handle_mm_fault+0xb38 [] do_page_fault+0x288 [] do_translation_fault[jt]+0x40 [] do_mem_abort+0x58 [] el0_ia+0x70 [] el0t_64_sync_handler[jt]+0xb0 [] ret_to_user[jt]+0x0 f2fs: [] __switch_to+0x174 [] __schedule+0x624 [] schedule+0x7c [] schedule_preempt_disabled+0x24 [] __mutex_lock+0x374 [] __mutex_lock_slowpath+0x14 [] mutex_lock+0x24 [] reclaim_and_purge_vmap_areas+0x44 [] alloc_vmap_area+0x2e0 [] vm_map_ram+0x1b0 [] f2fs_prepare_decomp_mem+0x144 [] f2fs_alloc_dic+0x264 [] f2fs_read_multi_pages+0x428 [] f2fs_mpage_readpages+0x314 [] f2fs_readahead+0x50 [] read_pages+0x80 [] page_cache_ra_unbounded+0x1a0 [] page_cache_ra_order+0x274 [] do_sync_mmap_readahead+0x11c [] filemap_fault+0x1a0 [] f2fs_filemap_fault+0x28 [] __do_fault+0xc8 [] handle_mm_fault+0xb38 [] do_page_fault+0x288 [] do_translation_fault[jt]+0x40 [] do_mem_abort+0x58 [] el0_ia+0x70 [] el0t_64_sync_handler[jt]+0xb0 [] ret_to_user[jt]+0x0 To fix this, introducee cpu within vmap_block to record which this vb belongs to. Link: https://lkml.kernel.org/r/20240614021352.1822225-1-zhaoyang.huang@unisoc.com Link: https://lkml.kernel.org/r/20240607023116.1720640-1-zhaoyang.huang@unisoc.com Fixes: fc1e0d980037 ("mm/vmalloc: prevent stale TLBs in fully utilized blocks") Signed-off-by: Zhaoyang Huang Suggested-by: Hailong.Liu Reviewed-by: Uladzislau Rezki (Sony) Cc: Baoquan He Cc: Christoph Hellwig Cc: Lorenzo Stoakes Cc: Thomas Gleixner Cc: Signed-off-by: Andrew Morton --- mm/vmalloc.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 45e1506d58c3a4..d0cbdd7c1e5bc8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2498,6 +2498,7 @@ struct vmap_block { struct list_head free_list; struct rcu_head rcu_head; struct list_head purge; + unsigned int cpu; }; /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ @@ -2625,8 +2626,15 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) free_vmap_area(va); return ERR_PTR(err); } - - vbq = raw_cpu_ptr(&vmap_block_queue); + /* + * list_add_tail_rcu could happened in another core + * rather than vb->cpu due to task migration, which + * is safe as list_add_tail_rcu will ensure the list's + * integrity together with list_for_each_rcu from read + * side. + */ + vb->cpu = raw_smp_processor_id(); + vbq = per_cpu_ptr(&vmap_block_queue, vb->cpu); spin_lock(&vbq->lock); list_add_tail_rcu(&vb->free_list, &vbq->free); spin_unlock(&vbq->lock); @@ -2654,9 +2662,10 @@ static void free_vmap_block(struct vmap_block *vb) } static bool purge_fragmented_block(struct vmap_block *vb, - struct vmap_block_queue *vbq, struct list_head *purge_list, - bool force_purge) + struct list_head *purge_list, bool force_purge) { + struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, vb->cpu); + if (vb->free + vb->dirty != VMAP_BBMAP_BITS || vb->dirty == VMAP_BBMAP_BITS) return false; @@ -2704,7 +2713,7 @@ static void purge_fragmented_blocks(int cpu) continue; spin_lock(&vb->lock); - purge_fragmented_block(vb, vbq, &purge, true); + purge_fragmented_block(vb, &purge, true); spin_unlock(&vb->lock); } rcu_read_unlock(); @@ -2841,7 +2850,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) * not purgeable, check whether there is dirty * space to be flushed. */ - if (!purge_fragmented_block(vb, vbq, &purge_list, false) && + if (!purge_fragmented_block(vb, &purge_list, false) && vb->dirty_max && vb->dirty != VMAP_BBMAP_BITS) { unsigned long va_start = vb->va->va_start; unsigned long s, e; From 399ab86ea55039f9d0a5f621a68cb4631f796f37 Mon Sep 17 00:00:00 2001 From: Jeff Xu Date: Fri, 14 Jun 2024 23:20:14 +0000 Subject: [PATCH 1382/1578] /proc/pid/smaps: add mseal info for vma MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add sl in /proc/pid/smaps to indicate vma is sealed Link: https://lkml.kernel.org/r/20240614232014.806352-2-jeffxu@google.com Fixes: 8be7258aad44 ("mseal: add mseal syscall") Signed-off-by: Jeff Xu Acked-by: David Hildenbrand Cc: Adhemerval Zanella Cc: Jann Horn Cc: Jorge Lucangeli Obes Cc: Kees Cook Cc: Randy Dunlap Cc: Stephen Röttger Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.rst | 1 + fs/proc/task_mmu.c | 3 +++ include/linux/mm.h | 5 +++++ mm/internal.h | 5 ----- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 7c3a565ffbef37..82d142de3461bb 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -571,6 +571,7 @@ encoded manner. The codes are the following: um userfaultfd missing tracking uw userfaultfd wr-protect tracking ss shadow stack page + sl sealed == ======================================= Note that there is no guarantee that every flag and associated mnemonic will diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f8d35f993fe501..71e5039d940dcb 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -706,6 +706,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ #ifdef CONFIG_X86_USER_SHADOW_STACK [ilog2(VM_SHADOW_STACK)] = "ss", +#endif +#ifdef CONFIG_64BIT + [ilog2(VM_SEALED)] = "sl", #endif }; size_t i; diff --git a/include/linux/mm.h b/include/linux/mm.h index 9a5652c5fadd51..eb7c96d24ac02a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -406,6 +406,11 @@ extern unsigned int kobjsize(const void *objp); #define VM_ALLOW_ANY_UNCACHED VM_NONE #endif +#ifdef CONFIG_64BIT +/* VM is sealed, in vm_flags */ +#define VM_SEALED _BITUL(63) +#endif + /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) diff --git a/mm/internal.h b/mm/internal.h index c72c306761a48a..6902b7dd85091c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1434,11 +1434,6 @@ void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority); -#ifdef CONFIG_64BIT -/* VM is sealed, in vm_flags */ -#define VM_SEALED _BITUL(63) -#endif - #ifdef CONFIG_64BIT static inline int can_do_mseal(unsigned long flags) { From b4601d096aac8ed26afa88ef8b249975b0530ca1 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 14 Jun 2024 15:59:51 -0700 Subject: [PATCH 1383/1578] mm/slab: fix 'variable obj_exts set but not used' warning slab_post_alloc_hook() uses prepare_slab_obj_exts_hook() to obtain slabobj_ext object. Currently the only user of slabobj_ext object in this path is memory allocation profiling, therefore when it's not enabled this object is not needed. This also generates a warning when compiling with CONFIG_MEM_ALLOC_PROFILING=n. Move the code under this configuration to fix the warning. If more slabobj_ext users appear in the future, the code will have to be changed back to call prepare_slab_obj_exts_hook(). Link: https://lkml.kernel.org/r/20240614225951.3845577-1-surenb@google.com Fixes: 4b8736964640 ("mm/slab: add allocation accounting into slab allocation and free paths") Signed-off-by: Suren Baghdasaryan Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202406150444.F6neSaiy-lkp@intel.com/ Cc: Kent Overstreet Cc: Kees Cook Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/slub.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 1373ac365a46f4..4927edec6a8c93 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3902,7 +3902,6 @@ bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, unsigned int orig_size) { unsigned int zero_size = s->object_size; - struct slabobj_ext *obj_exts; bool kasan_init = init; size_t i; gfp_t init_flags = flags & gfp_allowed_mask; @@ -3945,9 +3944,11 @@ bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, kmemleak_alloc_recursive(p[i], s->object_size, 1, s->flags, init_flags); kmsan_slab_alloc(s, p[i], init_flags); +#ifdef CONFIG_MEM_ALLOC_PROFILING if (need_slab_obj_ext()) { + struct slabobj_ext *obj_exts; + obj_exts = prepare_slab_obj_exts_hook(s, flags, p[i]); -#ifdef CONFIG_MEM_ALLOC_PROFILING /* * Currently obj_exts is used only for allocation profiling. * If other users appear then mem_alloc_profiling_enabled() @@ -3955,8 +3956,8 @@ bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, */ if (likely(obj_exts)) alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size); -#endif } +#endif } return memcg_slab_post_alloc_hook(s, lru, flags, size, p); From 34a023dc88696afed9ade7825f11f87ba657b133 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 14 Jun 2024 16:05:04 -0700 Subject: [PATCH 1384/1578] mm: handle profiling for fake memory allocations during compaction During compaction isolated free pages are marked allocated so that they can be split and/or freed. For that, post_alloc_hook() is used inside split_map_pages() and release_free_list(). split_map_pages() marks free pages allocated, splits the pages and then lets alloc_contig_range_noprof() free those pages. release_free_list() marks free pages and immediately frees them. This usage of post_alloc_hook() affect memory allocation profiling because these functions might not be called from an instrumented allocator, therefore current->alloc_tag is NULL and when debugging is enabled (CONFIG_MEM_ALLOC_PROFILING_DEBUG=y) that causes warnings. To avoid that, wrap such post_alloc_hook() calls into an instrumented function which acts as an allocator which will be charged for these fake allocations. Note that these allocations are very short lived until they are freed, therefore the associated counters should usually read 0. Link: https://lkml.kernel.org/r/20240614230504.3849136-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Vlastimil Babka Cc: Kees Cook Cc: Kent Overstreet Cc: Pasha Tatashin Cc: Sourav Panda Signed-off-by: Andrew Morton --- mm/compaction.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index e731d45befc780..739b1bf3d6377f 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -79,6 +79,13 @@ static inline bool is_via_compact_memory(int order) { return false; } #define COMPACTION_HPAGE_ORDER (PMD_SHIFT - PAGE_SHIFT) #endif +static struct page *mark_allocated_noprof(struct page *page, unsigned int order, gfp_t gfp_flags) +{ + post_alloc_hook(page, order, __GFP_MOVABLE); + return page; +} +#define mark_allocated(...) alloc_hooks(mark_allocated_noprof(__VA_ARGS__)) + static void split_map_pages(struct list_head *freepages) { unsigned int i, order; @@ -93,7 +100,7 @@ static void split_map_pages(struct list_head *freepages) nr_pages = 1 << order; - post_alloc_hook(page, order, __GFP_MOVABLE); + mark_allocated(page, order, __GFP_MOVABLE); if (order) split_page(page, order); @@ -122,7 +129,7 @@ static unsigned long release_free_list(struct list_head *freepages) * Convert free pages into post allocation pages, so * that we can free them via __free_page. */ - post_alloc_hook(page, order, __GFP_MOVABLE); + mark_allocated(page, order, __GFP_MOVABLE); __free_pages(page, order); if (pfn > high_pfn) high_pfn = pfn; From 1c61990d3762a020817daa353da0a0af6794140b Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 14 Jun 2024 16:32:38 +0200 Subject: [PATCH 1385/1578] kasan: fix bad call to unpoison_slab_object Commit 29d7355a9d05 ("kasan: save alloc stack traces for mempool") messed up one of the calls to unpoison_slab_object: the last two arguments are supposed to be GFP flags and whether to init the object memory. Fix the call. Without this fix, __kasan_mempool_unpoison_object provides the object's size as GFP flags to unpoison_slab_object, which can cause LOCKDEP reports (and probably other issues). Link: https://lkml.kernel.org/r/20240614143238.60323-1-andrey.konovalov@linux.dev Fixes: 29d7355a9d05 ("kasan: save alloc stack traces for mempool") Signed-off-by: Andrey Konovalov Reported-by: Brad Spengler Acked-by: Marco Elver Cc: Signed-off-by: Andrew Morton --- mm/kasan/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index e7c9a4dc89f826..85e7c6b4575c21 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -532,7 +532,7 @@ void __kasan_mempool_unpoison_object(void *ptr, size_t size, unsigned long ip) return; /* Unpoison the object and save alloc info for non-kmalloc() allocations. */ - unpoison_slab_object(slab->slab_cache, ptr, size, flags); + unpoison_slab_object(slab->slab_cache, ptr, flags, false); /* Poison the redzone and save alloc info for kmalloc() allocations. */ if (is_kmalloc_cache(slab->slab_cache)) From be346c1a6eeb49d8fda827d2a9522124c2f72f36 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 14 Jun 2024 16:52:43 +0200 Subject: [PATCH 1386/1578] ocfs2: fix DIO failure due to insufficient transaction credits The code in ocfs2_dio_end_io_write() estimates number of necessary transaction credits using ocfs2_calc_extend_credits(). This however does not take into account that the IO could be arbitrarily large and can contain arbitrary number of extents. Extent tree manipulations do often extend the current transaction but not in all of the cases. For example if we have only single block extents in the tree, ocfs2_mark_extent_written() will end up calling ocfs2_replace_extent_rec() all the time and we will never extend the current transaction and eventually exhaust all the transaction credits if the IO contains many single block extents. Once that happens a WARN_ON(jbd2_handle_buffer_credits(handle) <= 0) is triggered in jbd2_journal_dirty_metadata() and subsequently OCFS2 aborts in response to this error. This was actually triggered by one of our customers on a heavily fragmented OCFS2 filesystem. To fix the issue make sure the transaction always has enough credits for one extent insert before each call of ocfs2_mark_extent_written(). Heming Zhao said: ------ PANIC: "Kernel panic - not syncing: OCFS2: (device dm-1): panic forced after error" PID: xxx TASK: xxxx CPU: 5 COMMAND: "SubmitThread-CA" #0 machine_kexec at ffffffff8c069932 #1 __crash_kexec at ffffffff8c1338fa #2 panic at ffffffff8c1d69b9 #3 ocfs2_handle_error at ffffffffc0c86c0c [ocfs2] #4 __ocfs2_abort at ffffffffc0c88387 [ocfs2] #5 ocfs2_journal_dirty at ffffffffc0c51e98 [ocfs2] #6 ocfs2_split_extent at ffffffffc0c27ea3 [ocfs2] #7 ocfs2_change_extent_flag at ffffffffc0c28053 [ocfs2] #8 ocfs2_mark_extent_written at ffffffffc0c28347 [ocfs2] #9 ocfs2_dio_end_io_write at ffffffffc0c2bef9 [ocfs2] #10 ocfs2_dio_end_io at ffffffffc0c2c0f5 [ocfs2] #11 dio_complete at ffffffff8c2b9fa7 #12 do_blockdev_direct_IO at ffffffff8c2bc09f #13 ocfs2_direct_IO at ffffffffc0c2b653 [ocfs2] #14 generic_file_direct_write at ffffffff8c1dcf14 #15 __generic_file_write_iter at ffffffff8c1dd07b #16 ocfs2_file_write_iter at ffffffffc0c49f1f [ocfs2] #17 aio_write at ffffffff8c2cc72e #18 kmem_cache_alloc at ffffffff8c248dde #19 do_io_submit at ffffffff8c2ccada #20 do_syscall_64 at ffffffff8c004984 #21 entry_SYSCALL_64_after_hwframe at ffffffff8c8000ba Link: https://lkml.kernel.org/r/20240617095543.6971-1-jack@suse.cz Link: https://lkml.kernel.org/r/20240614145243.8837-1-jack@suse.cz Fixes: c15471f79506 ("ocfs2: fix sparse file & data ordering issue in direct io") Signed-off-by: Jan Kara Reviewed-by: Joseph Qi Reviewed-by: Heming Zhao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/aops.c | 5 +++++ fs/ocfs2/journal.c | 17 +++++++++++++++++ fs/ocfs2/journal.h | 2 ++ fs/ocfs2/ocfs2_trace.h | 2 ++ 4 files changed, 26 insertions(+) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index f0467d3b3c880e..6be175a1ab3ce1 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2366,6 +2366,11 @@ static int ocfs2_dio_end_io_write(struct inode *inode, } list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) { + ret = ocfs2_assure_trans_credits(handle, credits); + if (ret < 0) { + mlog_errno(ret); + break; + } ret = ocfs2_mark_extent_written(inode, &et, handle, ue->ue_cpos, 1, ue->ue_phys, diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 86807086b2dfd4..530fba34f6d312 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -445,6 +445,23 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks) return status; } +/* + * Make sure handle has at least 'nblocks' credits available. If it does not + * have that many credits available, we will try to extend the handle to have + * enough credits. If that fails, we will restart transaction to have enough + * credits. Similar notes regarding data consistency and locking implications + * as for ocfs2_extend_trans() apply here. + */ +int ocfs2_assure_trans_credits(handle_t *handle, int nblocks) +{ + int old_nblks = jbd2_handle_buffer_credits(handle); + + trace_ocfs2_assure_trans_credits(old_nblks); + if (old_nblks >= nblocks) + return 0; + return ocfs2_extend_trans(handle, nblocks - old_nblks); +} + /* * If we have fewer than thresh credits, extend by OCFS2_MAX_TRANS_DATA. * If that fails, restart the transaction & regain write access for the diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 41c9fe7e62f9b7..e3c3a35dc5e0e7 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -243,6 +243,8 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int ocfs2_commit_trans(struct ocfs2_super *osb, handle_t *handle); int ocfs2_extend_trans(handle_t *handle, int nblocks); +int ocfs2_assure_trans_credits(handle_t *handle, + int nblocks); int ocfs2_allocate_extend_trans(handle_t *handle, int thresh); diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 60e208b01c8dc4..0511c69c9fde17 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -2577,6 +2577,8 @@ DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_commit_cache_end); DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans); +DEFINE_OCFS2_INT_EVENT(ocfs2_assure_trans_credits); + DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart); DEFINE_OCFS2_INT_INT_EVENT(ocfs2_allocate_extend_trans); From ff202303c398ed56386ca4954154de9a96eb732a Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Fri, 7 Jun 2024 13:29:53 -0700 Subject: [PATCH 1387/1578] mm: convert page type macros to enum Changing PG_slab from a page flag to a page type in commit 46df8e73a4a3 ("mm: free up PG_slab") in has the unintended consequence of removing the PG_slab constant from kernel debuginfo. The commit does add the value to the vmcoreinfo note, which allows debuggers to find the value without hardcoding it. However it's most flexible to continue representing the constant with an enum. To that end, convert the page type fields into an enum. Debuggers will now be able to detect that PG_slab's type has changed from enum pageflags to enum pagetype. Link: https://lkml.kernel.org/r/20240607202954.1198180-1-stephen.s.brennan@oracle.com Fixes: 46df8e73a4a3 ("mm: free up PG_slab") Signed-off-by: Stephen Brennan Acked-by: Vlastimil Babka Cc: David Hildenbrand Cc: Hao Ge Cc: Matthew Wilcox (Oracle) Cc: Omar Sandoval Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 104078afe0b16f..b9e914e1face8d 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -944,15 +944,18 @@ PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) * mistaken for a page type value. */ -#define PAGE_TYPE_BASE 0xf0000000 -/* Reserve 0x0000007f to catch underflows of _mapcount */ -#define PAGE_MAPCOUNT_RESERVE -128 -#define PG_buddy 0x00000080 -#define PG_offline 0x00000100 -#define PG_table 0x00000200 -#define PG_guard 0x00000400 -#define PG_hugetlb 0x00000800 -#define PG_slab 0x00001000 +enum pagetype { + PG_buddy = 0x00000080, + PG_offline = 0x00000100, + PG_table = 0x00000200, + PG_guard = 0x00000400, + PG_hugetlb = 0x00000800, + PG_slab = 0x00001000, + + PAGE_TYPE_BASE = 0xf0000000, + /* Reserve 0x0000007f to catch underflows of _mapcount */ + PAGE_MAPCOUNT_RESERVE = -128, +}; #define PageType(page, flag) \ ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) From 8b8546d298dc9ce9d5d01a06c822e255d2159ca7 Mon Sep 17 00:00:00 2001 From: aigourensheng Date: Mon, 17 Jun 2024 01:29:34 -0400 Subject: [PATCH 1388/1578] selftests/mm:fix test_prctl_fork_exec return failure After calling fork() in test_prctl_fork_exec(), the global variable ksm_full_scans_fd is initialized to 0 in the child process upon entering the main function of ./ksm_functional_tests. In the function call chain test_child_ksm() -> __mmap_and_merge_range -> ksm_merge-> ksm_get_full_scans, start_scans = ksm_get_full_scans() will return an error. Therefore, the value of ksm_full_scans_fd needs to be initialized before calling test_child_ksm in the child process. Link: https://lkml.kernel.org/r/20240617052934.5834-1-shechenglong001@gmail.com Signed-off-by: aigourensheng Acked-by: David Hildenbrand Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- .../selftests/mm/ksm_functional_tests.c | 38 +++++++++++-------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index 37de82da9be76d..b61803e36d1cf5 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -656,12 +656,33 @@ static void test_prot_none(void) munmap(map, size); } +static void init_global_file_handles(void) +{ + mem_fd = open("/proc/self/mem", O_RDWR); + if (mem_fd < 0) + ksft_exit_fail_msg("opening /proc/self/mem failed\n"); + ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR); + if (ksm_fd < 0) + ksft_exit_skip("open(\"/sys/kernel/mm/ksm/run\") failed\n"); + ksm_full_scans_fd = open("/sys/kernel/mm/ksm/full_scans", O_RDONLY); + if (ksm_full_scans_fd < 0) + ksft_exit_skip("open(\"/sys/kernel/mm/ksm/full_scans\") failed\n"); + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd < 0) + ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n"); + proc_self_ksm_stat_fd = open("/proc/self/ksm_stat", O_RDONLY); + proc_self_ksm_merging_pages_fd = open("/proc/self/ksm_merging_pages", + O_RDONLY); + ksm_use_zero_pages_fd = open("/sys/kernel/mm/ksm/use_zero_pages", O_RDWR); +} + int main(int argc, char **argv) { unsigned int tests = 8; int err; if (argc > 1 && !strcmp(argv[1], FORK_EXEC_CHILD_PRG_NAME)) { + init_global_file_handles(); exit(test_child_ksm()); } @@ -674,22 +695,7 @@ int main(int argc, char **argv) pagesize = getpagesize(); - mem_fd = open("/proc/self/mem", O_RDWR); - if (mem_fd < 0) - ksft_exit_fail_msg("opening /proc/self/mem failed\n"); - ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR); - if (ksm_fd < 0) - ksft_exit_skip("open(\"/sys/kernel/mm/ksm/run\") failed\n"); - ksm_full_scans_fd = open("/sys/kernel/mm/ksm/full_scans", O_RDONLY); - if (ksm_full_scans_fd < 0) - ksft_exit_skip("open(\"/sys/kernel/mm/ksm/full_scans\") failed\n"); - pagemap_fd = open("/proc/self/pagemap", O_RDONLY); - if (pagemap_fd < 0) - ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n"); - proc_self_ksm_stat_fd = open("/proc/self/ksm_stat", O_RDONLY); - proc_self_ksm_merging_pages_fd = open("/proc/self/ksm_merging_pages", - O_RDONLY); - ksm_use_zero_pages_fd = open("/sys/kernel/mm/ksm/use_zero_pages", O_RDWR); + init_global_file_handles(); test_unmerge(); test_unmerge_zero_pages(); From f3228a2d4c3bf19e49bf933ca9a6e64555aafee3 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 18 Jun 2024 16:35:56 +0300 Subject: [PATCH 1389/1578] MAINTAINERS: TPM DEVICE DRIVER: update the W-tag Git hosting for the test suite has been migrated from Gitlab to Codeberg, given the "less hostile environment". Link: https://lkml.kernel.org/r/20240618133556.105604-1-jarkko@kernel.org Link: https://codeberg.org/jarkko/linux-tpmdd-test Signed-off-by: Jarkko Sakkinen Signed-off-by: Andrew Morton --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 2ca8f35dfe0399..43353b7059883c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22745,7 +22745,7 @@ M: Jarkko Sakkinen R: Jason Gunthorpe L: linux-integrity@vger.kernel.org S: Maintained -W: https://gitlab.com/jarkkojs/linux-tpmdd-test +W: https://codeberg.org/jarkko/linux-tpmdd-test Q: https://patchwork.kernel.org/project/linux-integrity/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-tpmdd.git F: Documentation/devicetree/bindings/tpm/ From c6408250703530187cc6250dcd702d12a71c44f5 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Tue, 18 Jun 2024 09:41:51 -0400 Subject: [PATCH 1390/1578] mm/migrate: make migrate_pages_batch() stats consistent As Ying pointed out in [1], stats->nr_thp_failed needs to be updated to avoid stats inconsistency between MIGRATE_SYNC and MIGRATE_ASYNC when calling migrate_pages_batch(). Because if not, when migrate_pages_batch() is called via migrate_pages(MIGRATE_ASYNC), nr_thp_failed will not be increased and when migrate_pages_batch() is called via migrate_pages(MIGRATE_SYNC*), nr_thp_failed will be increase in migrate_pages_sync() by stats->nr_thp_failed += astats.nr_thp_split. [1] https://lore.kernel.org/linux-mm/87msnq7key.fsf@yhuang6-desk2.ccr.corp.intel.com/ Link: https://lkml.kernel.org/r/20240620012712.19804-1-zi.yan@sent.com Link: https://lkml.kernel.org/r/20240618134151.29214-1-zi.yan@sent.com Fixes: 7262f208ca68 ("mm/migrate: split source folio if it is on deferred split list") Signed-off-by: Zi Yan Suggested-by: "Huang, Ying" Reviewed-by: "Huang, Ying" Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Yang Shi Cc: Yin Fengwei Signed-off-by: Andrew Morton --- mm/migrate.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/migrate.c b/mm/migrate.c index 2cc5a68f684352..20cb9f5f744605 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1659,6 +1659,10 @@ static int migrate_pages_batch(struct list_head *from, * migrate_pages() may report success with (split but * unmigrated) pages still on its fromlist; whereas it * always reports success when its fromlist is empty. + * stats->nr_thp_failed should be increased too, + * otherwise stats inconsistency will happen when + * migrate_pages_batch is called via migrate_pages() + * with MIGRATE_SYNC and MIGRATE_ASYNC. * * Only check it without removing it from the list. * Since the folio can be on deferred_split_scan() @@ -1675,6 +1679,7 @@ static int migrate_pages_batch(struct list_head *from, !list_empty(&folio->_deferred_list)) { if (try_split_folio(folio, split_folios) == 0) { nr_failed++; + stats->nr_thp_failed += is_thp; stats->nr_thp_split += is_thp; stats->nr_split++; continue; From 54e7d59841dab977f6cb1183d658b1b82c9f4e94 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Jun 2024 18:56:47 +1200 Subject: [PATCH 1391/1578] nfs: drop the incorrect assertion in nfs_swap_rw() Since commit 2282679fb20b ("mm: submit multipage write for SWP_FS_OPS swap-space"), we can plug multiple pages then unplug them all together. That means iov_iter_count(iter) could be way bigger than PAGE_SIZE, it actually equals the size of iov_iter_npages(iter, INT_MAX). Note this issue has nothing to do with large folios as we don't support THP_SWPOUT to non-block devices. [v-songbaohua@oppo.com: figure out the cause and correct the commit message] Link: https://lkml.kernel.org/r/20240618065647.21791-1-21cnbao@gmail.com Fixes: 2282679fb20b ("mm: submit multipage write for SWP_FS_OPS swap-space") Signed-off-by: Christoph Hellwig Signed-off-by: Barry Song Closes: https://lore.kernel.org/linux-mm/20240617053201.GA16852@lst.de/ Reviewed-by: Martin Wege Cc: NeilBrown Cc: Anna Schumaker Cc: Steve French Cc: Trond Myklebust Cc: Chuanhua Han Cc: Ryan Roberts Cc: Chris Li Cc: "Huang, Ying" Cc: Jeff Layton Cc: Matthew Wilcox Cc: Signed-off-by: Andrew Morton --- fs/nfs/direct.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index bb2f583eb28bf1..90079ca134dd3c 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -141,8 +141,6 @@ int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter) { ssize_t ret; - VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE); - if (iov_iter_rw(iter) == READ) ret = nfs_file_direct_read(iocb, iter, true); else From bf14ed81f571f8dba31cd72ab2e50fbcc877cc31 Mon Sep 17 00:00:00 2001 From: yangge Date: Thu, 20 Jun 2024 08:59:50 +0800 Subject: [PATCH 1392/1578] mm/page_alloc: Separate THP PCP into movable and non-movable categories Since commit 5d0a661d808f ("mm/page_alloc: use only one PCP list for THP-sized allocations") no longer differentiates the migration type of pages in THP-sized PCP list, it's possible that non-movable allocation requests may get a CMA page from the list, in some cases, it's not acceptable. If a large number of CMA memory are configured in system (for example, the CMA memory accounts for 50% of the system memory), starting a virtual machine with device passthrough will get stuck. During starting the virtual machine, it will call pin_user_pages_remote(..., FOLL_LONGTERM, ...) to pin memory. Normally if a page is present and in CMA area, pin_user_pages_remote() will migrate the page from CMA area to non-CMA area because of FOLL_LONGTERM flag. But if non-movable allocation requests return CMA memory, migrate_longterm_unpinnable_pages() will migrate a CMA page to another CMA page, which will fail to pass the check in check_and_migrate_movable_pages() and cause migration endless. Call trace: pin_user_pages_remote --__gup_longterm_locked // endless loops in this function ----_get_user_pages_locked ----check_and_migrate_movable_pages ------migrate_longterm_unpinnable_pages --------alloc_migration_target This problem will also have a negative impact on CMA itself. For example, when CMA is borrowed by THP, and we need to reclaim it through cma_alloc() or dma_alloc_coherent(), we must move those pages out to ensure CMA's users can retrieve that contigous memory. Currently, CMA's memory is occupied by non-movable pages, meaning we can't relocate them. As a result, cma_alloc() is more likely to fail. To fix the problem above, we add one PCP list for THP, which will not introduce a new cacheline for struct per_cpu_pages. THP will have 2 PCP lists, one PCP list is used by MOVABLE allocation, and the other PCP list is used by UNMOVABLE allocation. MOVABLE allocation contains GPF_MOVABLE, and UNMOVABLE allocation contains GFP_UNMOVABLE and GFP_RECLAIMABLE. Link: https://lkml.kernel.org/r/1718845190-4456-1-git-send-email-yangge1116@126.com Fixes: 5d0a661d808f ("mm/page_alloc: use only one PCP list for THP-sized allocations") Signed-off-by: yangge Cc: Baolin Wang Cc: Barry Song <21cnbao@gmail.com> Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 9 ++++----- mm/page_alloc.c | 9 +++++++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8f9c9590a42cfd..586a8f0104d73a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -654,13 +654,12 @@ enum zone_watermarks { }; /* - * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. One additional list - * for THP which will usually be GFP_MOVABLE. Even if it is another type, - * it should not contribute to serious fragmentation causing THP allocation - * failures. + * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. Two additional lists + * are added for THP. One PCP list is used by GPF_MOVABLE, and the other PCP list + * is used by GFP_UNMOVABLE and GFP_RECLAIMABLE. */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define NR_PCP_THP 1 +#define NR_PCP_THP 2 #else #define NR_PCP_THP 0 #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7300aa9f14b0b1..9ecf99190ea201 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -504,10 +504,15 @@ static void bad_page(struct page *page, const char *reason) static inline unsigned int order_to_pindex(int migratetype, int order) { + bool __maybe_unused movable; + #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (order > PAGE_ALLOC_COSTLY_ORDER) { VM_BUG_ON(order != HPAGE_PMD_ORDER); - return NR_LOWORDER_PCP_LISTS; + + movable = migratetype == MIGRATE_MOVABLE; + + return NR_LOWORDER_PCP_LISTS + movable; } #else VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); @@ -521,7 +526,7 @@ static inline int pindex_to_order(unsigned int pindex) int order = pindex / MIGRATE_PCPTYPES; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pindex == NR_LOWORDER_PCP_LISTS) + if (pindex >= NR_LOWORDER_PCP_LISTS) order = HPAGE_PMD_ORDER; #else VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); From ab1ffc86cb5bec1c92387b9811d9036512f8f4eb Mon Sep 17 00:00:00 2001 From: Andrew Bresticker Date: Tue, 11 Jun 2024 08:32:16 -0700 Subject: [PATCH 1393/1578] mm/memory: don't require head page for do_set_pmd() The requirement that the head page be passed to do_set_pmd() was added in commit ef37b2ea08ac ("mm/memory: page_add_file_rmap() -> folio_add_file_rmap_[pte|pmd]()") and prevents pmd-mapping in the finish_fault() and filemap_map_pages() paths if the page to be inserted is anything but the head page for an otherwise suitable vma and pmd-sized page. Matthew said: : We're going to stop using PMDs to map large folios unless the fault is : within the first 4KiB of the PMD. No idea how many workloads that : affects, but it only needs to be backported as far as v6.8, so we may : as well backport it. Link: https://lkml.kernel.org/r/20240611153216.2794513-1-abrestic@rivosinc.com Fixes: ef37b2ea08ac ("mm/memory: page_add_file_rmap() -> folio_add_file_rmap_[pte|pmd]()") Signed-off-by: Andrew Bresticker Acked-by: David Hildenbrand Acked-by: Hugh Dickins Cc: Signed-off-by: Andrew Morton --- mm/memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 25a77c4fe4a03a..d10e616d73898e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4608,8 +4608,9 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return ret; - if (page != &folio->page || folio_order(folio) != HPAGE_PMD_ORDER) + if (folio_order(folio) != HPAGE_PMD_ORDER) return ret; + page = &folio->page; /* * Just backoff if any subpage of a THP is corrupted otherwise From 7312b740b70e77ccdc0dce11316f05861907d3ca Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 25 Jun 2024 07:54:54 +0200 Subject: [PATCH 1394/1578] Revert "serial: core: Fix ifdef for serial base console functions" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit b20172ca6bf489534892b801a5db41bbf5ceec75. Let's roll back all of the serial core and printk console changes that went into 6.10-rc1 as there still are problems with them that need to be sorted out. Link: https://lore.kernel.org/r/ZnpRozsdw6zbjqze@tlindgre-MOBL1 Reported-by: Petr Mladek Reported-by: Tony Lindgren Cc: Jiri Slaby Cc: John Ogness Cc: Sergey Senozhatsky Cc: Andy Shevchenko Cc: Ilpo Järvinen Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_base_bus.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/tty/serial/serial_base_bus.c b/drivers/tty/serial/serial_base_bus.c index 73c6ee540c836c..281a2d5a733232 100644 --- a/drivers/tty/serial/serial_base_bus.c +++ b/drivers/tty/serial/serial_base_bus.c @@ -219,6 +219,8 @@ static int serial_base_add_one_prefcon(const char *match, const char *dev_name, return ret; } +#endif + #ifdef __sparc__ /* Handle Sparc ttya and ttyb options as done in console_setup() */ @@ -317,8 +319,6 @@ int serial_base_add_preferred_console(struct uart_driver *drv, return serial_base_add_one_prefcon(port_match, drv->dev_name, port->line); } -#endif - #ifdef CONFIG_SERIAL_8250_CONSOLE /* From 3ddbc427df968f50e6048368fc01088d428491c7 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 25 Jun 2024 07:57:48 +0200 Subject: [PATCH 1395/1578] Revert "serial: 8250: Fix add preferred console for serial8250_isa_init_ports()" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 4547cd76f08a6f301f6ad563f5d0e4566924ec6b. Let's roll back all of the serial core and printk console changes that went into 6.10-rc1 as there still are problems with them that need to be sorted out. Link: https://lore.kernel.org/r/ZnpRozsdw6zbjqze@tlindgre-MOBL1 Reported-by: Petr Mladek Reported-by: Tony Lindgren Cc: Jiri Slaby Cc: John Ogness Cc: Sergey Senozhatsky Cc: Andy Shevchenko Cc: Ilpo Järvinen Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_base.h | 10 ++-------- drivers/tty/serial/serial_base_bus.c | 11 +++++++++-- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/tty/serial/serial_base.h b/drivers/tty/serial/serial_base.h index 743a72ac34f3d1..40af7052b4b64f 100644 --- a/drivers/tty/serial/serial_base.h +++ b/drivers/tty/serial/serial_base.h @@ -55,6 +55,8 @@ void serial_core_unregister_port(struct uart_driver *drv, struct uart_port *port int serial_base_add_preferred_console(struct uart_driver *drv, struct uart_port *port); +int serial_base_add_isa_preferred_console(const char *name, int idx); + #else static inline @@ -64,14 +66,6 @@ int serial_base_add_preferred_console(struct uart_driver *drv, return 0; } -#endif - -#ifdef CONFIG_SERIAL_8250_CONSOLE - -int serial_base_add_isa_preferred_console(const char *name, int idx); - -#else - static inline int serial_base_add_isa_preferred_console(const char *name, int idx) { diff --git a/drivers/tty/serial/serial_base_bus.c b/drivers/tty/serial/serial_base_bus.c index 281a2d5a733232..ae18871c0f3641 100644 --- a/drivers/tty/serial/serial_base_bus.c +++ b/drivers/tty/serial/serial_base_bus.c @@ -219,8 +219,6 @@ static int serial_base_add_one_prefcon(const char *match, const char *dev_name, return ret; } -#endif - #ifdef __sparc__ /* Handle Sparc ttya and ttyb options as done in console_setup() */ @@ -331,6 +329,15 @@ int serial_base_add_isa_preferred_console(const char *name, int idx) return serial_base_add_prefcon(name, idx); } +#else + +int serial_base_add_isa_preferred_console(const char *name, int idx) +{ + return 0; +} + +#endif + #endif static int serial_base_init(void) From 12b7210ea83e7119a0041a54027948d65c5e6206 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 25 Jun 2024 07:57:51 +0200 Subject: [PATCH 1396/1578] Revert "Documentation: kernel-parameters: Add DEVNAME:0.0 format for serial ports" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 5c3a766e9f057ee7a54b5d7addff7fab02676fea. Let's roll back all of the serial core and printk console changes that went into 6.10-rc1 as there still are problems with them that need to be sorted out. Link: https://lore.kernel.org/r/ZnpRozsdw6zbjqze@tlindgre-MOBL1 Reported-by: Petr Mladek Reported-by: Tony Lindgren Cc: Jiri Slaby Cc: John Ogness Cc: Sergey Senozhatsky Cc: Andy Shevchenko Cc: Ilpo Järvinen Signed-off-by: Greg Kroah-Hartman --- .../admin-guide/kernel-parameters.txt | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index b600df82669db0..4d4c640e51e71c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -788,25 +788,6 @@ Documentation/networking/netconsole.rst for an alternative. - :.[,options] - Use the specified serial port on the serial core bus. - The addressing uses DEVNAME of the physical serial port - device, followed by the serial core controller instance, - and the serial port instance. The options are the same - as documented for the ttyS addressing above. - - The mapping of the serial ports to the tty instances - can be viewed with: - - $ ls -d /sys/bus/serial-base/devices/*:*.*/tty/* - /sys/bus/serial-base/devices/00:04:0.0/tty/ttyS0 - - In the above example, the console can be addressed with - console=00:04:0.0. Note that a console addressed this - way will only get added when the related device driver - is ready. The use of an earlycon parameter in addition to - the console may be desired for console output early on. - uart[8250],io,[,options] uart[8250],mmio,[,options] uart[8250],mmio16,[,options] From 740a79675833db5e14c89113d94be769ab8369c0 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 25 Jun 2024 07:57:55 +0200 Subject: [PATCH 1397/1578] Revert "serial: 8250: Add preferred console in serial8250_isa_init_ports()" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit a8b04cfe7dad84e65df5996e14b435fd356fe62c. Let's roll back all of the serial core and printk console changes that went into 6.10-rc1 as there still are problems with them that need to be sorted out. Link: https://lore.kernel.org/r/ZnpRozsdw6zbjqze@tlindgre-MOBL1 Reported-by: Petr Mladek Reported-by: Tony Lindgren Cc: Jiri Slaby Cc: John Ogness Cc: Sergey Senozhatsky Cc: Andy Shevchenko Cc: Ilpo Järvinen Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_core.c | 5 ----- drivers/tty/serial/serial_base.h | 8 -------- drivers/tty/serial/serial_base_bus.c | 21 --------------------- 3 files changed, 34 deletions(-) diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c index ff15022369e459..b0adafc44747de 100644 --- a/drivers/tty/serial/8250/8250_core.c +++ b/drivers/tty/serial/8250/8250_core.c @@ -15,7 +15,6 @@ */ #include -#include #include #include #include @@ -42,8 +41,6 @@ #include -#include "../serial_base.h" /* For serial_base_add_isa_preferred_console() */ - #include "8250.h" /* @@ -563,8 +560,6 @@ static void __init serial8250_isa_init_ports(void) port->irqflags |= irqflag; if (serial8250_isa_config != NULL) serial8250_isa_config(i, &up->port, &up->capabilities); - - serial_base_add_isa_preferred_console(serial8250_reg.dev_name, i); } } diff --git a/drivers/tty/serial/serial_base.h b/drivers/tty/serial/serial_base.h index 40af7052b4b64f..68a86df984dabe 100644 --- a/drivers/tty/serial/serial_base.h +++ b/drivers/tty/serial/serial_base.h @@ -55,8 +55,6 @@ void serial_core_unregister_port(struct uart_driver *drv, struct uart_port *port int serial_base_add_preferred_console(struct uart_driver *drv, struct uart_port *port); -int serial_base_add_isa_preferred_console(const char *name, int idx); - #else static inline @@ -66,10 +64,4 @@ int serial_base_add_preferred_console(struct uart_driver *drv, return 0; } -static inline -int serial_base_add_isa_preferred_console(const char *name, int idx) -{ - return 0; -} - #endif diff --git a/drivers/tty/serial/serial_base_bus.c b/drivers/tty/serial/serial_base_bus.c index ae18871c0f3641..1375be3315e6d4 100644 --- a/drivers/tty/serial/serial_base_bus.c +++ b/drivers/tty/serial/serial_base_bus.c @@ -317,27 +317,6 @@ int serial_base_add_preferred_console(struct uart_driver *drv, return serial_base_add_one_prefcon(port_match, drv->dev_name, port->line); } -#ifdef CONFIG_SERIAL_8250_CONSOLE - -/* - * Early ISA ports initialize the console before there is no struct device. - * This should be only called from serial8250_isa_init_preferred_console(), - * other callers are likely wrong and should rely on earlycon instead. - */ -int serial_base_add_isa_preferred_console(const char *name, int idx) -{ - return serial_base_add_prefcon(name, idx); -} - -#else - -int serial_base_add_isa_preferred_console(const char *name, int idx) -{ - return 0; -} - -#endif - #endif static int serial_base_init(void) From e406c56dd9e35cc0b456b592d6814b5d2d8791de Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 25 Jun 2024 07:57:58 +0200 Subject: [PATCH 1398/1578] Revert "serial: core: Handle serial console options" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit a0f32e2dd99867b164bfebcf36729c2a0d41b30b. Let's roll back all of the serial core and printk console changes that went into 6.10-rc1 as there still are problems with them that need to be sorted out. Link: https://lore.kernel.org/r/ZnpRozsdw6zbjqze@tlindgre-MOBL1 Reported-by: Petr Mladek Reported-by: Tony Lindgren Cc: Jiri Slaby Cc: John Ogness Cc: Sergey Senozhatsky Cc: Andy Shevchenko Cc: Ilpo Järvinen Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_base_bus.c | 49 ---------------------------- 1 file changed, 49 deletions(-) diff --git a/drivers/tty/serial/serial_base_bus.c b/drivers/tty/serial/serial_base_bus.c index 1375be3315e6d4..4ef26a1b09b112 100644 --- a/drivers/tty/serial/serial_base_bus.c +++ b/drivers/tty/serial/serial_base_bus.c @@ -219,58 +219,9 @@ static int serial_base_add_one_prefcon(const char *match, const char *dev_name, return ret; } -#ifdef __sparc__ - -/* Handle Sparc ttya and ttyb options as done in console_setup() */ -static int serial_base_add_sparc_console(const char *dev_name, int idx) -{ - const char *name; - - switch (idx) { - case 0: - name = "ttya"; - break; - case 1: - name = "ttyb"; - break; - default: - return 0; - } - - return serial_base_add_one_prefcon(name, dev_name, idx); -} - -#else - -static inline int serial_base_add_sparc_console(const char *dev_name, int idx) -{ - return 0; -} - -#endif - static int serial_base_add_prefcon(const char *name, int idx) { const char *char_match __free(kfree) = NULL; - const char *nmbr_match __free(kfree) = NULL; - int ret; - - /* Handle ttyS specific options */ - if (strstarts(name, "ttyS")) { - /* No name, just a number */ - nmbr_match = kasprintf(GFP_KERNEL, "%i", idx); - if (!nmbr_match) - return -ENODEV; - - ret = serial_base_add_one_prefcon(nmbr_match, name, idx); - if (ret) - return ret; - - /* Sparc ttya and ttyb */ - ret = serial_base_add_sparc_console(name, idx); - if (ret) - return ret; - } /* Handle the traditional character device name style console=ttyS0 */ char_match = kasprintf(GFP_KERNEL, "%s%i", name, idx); From a5e4bb69ecddb4ad7895e21091f3657ad832e270 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 25 Jun 2024 07:58:00 +0200 Subject: [PATCH 1399/1578] Revert "serial: core: Add support for DEVNAME:0.0 style naming for kernel console" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 787a1cabac01c99846070fcf702e53befaf89f79. Let's roll back all of the serial core and printk console changes that went into 6.10-rc1 as there still are problems with them that need to be sorted out. Link: https://lore.kernel.org/r/ZnpRozsdw6zbjqze@tlindgre-MOBL1 Reported-by: Petr Mladek Reported-by: Tony Lindgren Cc: Jiri Slaby Cc: John Ogness Cc: Sergey Senozhatsky Cc: Andy Shevchenko Cc: Ilpo Järvinen Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_base.h | 16 ------- drivers/tty/serial/serial_base_bus.c | 66 ---------------------------- drivers/tty/serial/serial_core.c | 4 -- 3 files changed, 86 deletions(-) diff --git a/drivers/tty/serial/serial_base.h b/drivers/tty/serial/serial_base.h index 68a86df984dabe..b6c38d2edfd401 100644 --- a/drivers/tty/serial/serial_base.h +++ b/drivers/tty/serial/serial_base.h @@ -49,19 +49,3 @@ void serial_ctrl_unregister_port(struct uart_driver *drv, struct uart_port *port int serial_core_register_port(struct uart_driver *drv, struct uart_port *port); void serial_core_unregister_port(struct uart_driver *drv, struct uart_port *port); - -#ifdef CONFIG_SERIAL_CORE_CONSOLE - -int serial_base_add_preferred_console(struct uart_driver *drv, - struct uart_port *port); - -#else - -static inline -int serial_base_add_preferred_console(struct uart_driver *drv, - struct uart_port *port) -{ - return 0; -} - -#endif diff --git a/drivers/tty/serial/serial_base_bus.c b/drivers/tty/serial/serial_base_bus.c index 4ef26a1b09b112..4df2a4b10445ad 100644 --- a/drivers/tty/serial/serial_base_bus.c +++ b/drivers/tty/serial/serial_base_bus.c @@ -8,7 +8,6 @@ * The serial core bus manages the serial core controller instances. */ -#include #include #include #include @@ -205,71 +204,6 @@ void serial_base_port_device_remove(struct serial_port_device *port_dev) put_device(&port_dev->dev); } -#ifdef CONFIG_SERIAL_CORE_CONSOLE - -static int serial_base_add_one_prefcon(const char *match, const char *dev_name, - int port_id) -{ - int ret; - - ret = add_preferred_console_match(match, dev_name, port_id); - if (ret == -ENOENT) - return 0; - - return ret; -} - -static int serial_base_add_prefcon(const char *name, int idx) -{ - const char *char_match __free(kfree) = NULL; - - /* Handle the traditional character device name style console=ttyS0 */ - char_match = kasprintf(GFP_KERNEL, "%s%i", name, idx); - if (!char_match) - return -ENOMEM; - - return serial_base_add_one_prefcon(char_match, name, idx); -} - -/** - * serial_base_add_preferred_console - Adds a preferred console - * @drv: Serial port device driver - * @port: Serial port instance - * - * Tries to add a preferred console for a serial port if specified in the - * kernel command line. Supports both the traditional character device such - * as console=ttyS0, and a hardware addressing based console=DEVNAME:0.0 - * style name. - * - * Translates the kernel command line option using a hardware based addressing - * console=DEVNAME:0.0 to the serial port character device such as ttyS0. - * Cannot be called early for ISA ports, depends on struct device. - * - * Note that duplicates are ignored by add_preferred_console(). - * - * Return: 0 on success, negative error code on failure. - */ -int serial_base_add_preferred_console(struct uart_driver *drv, - struct uart_port *port) -{ - const char *port_match __free(kfree) = NULL; - int ret; - - ret = serial_base_add_prefcon(drv->dev_name, port->line); - if (ret) - return ret; - - port_match = kasprintf(GFP_KERNEL, "%s:%i.%i", dev_name(port->dev), - port->ctrl_id, port->port_id); - if (!port_match) - return -ENOMEM; - - /* Translate a hardware addressing style console=DEVNAME:0.0 */ - return serial_base_add_one_prefcon(port_match, drv->dev_name, port->line); -} - -#endif - static int serial_base_init(void) { int ret; diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 0c4d609766633b..2a8006e3d68786 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -3422,10 +3422,6 @@ int serial_core_register_port(struct uart_driver *drv, struct uart_port *port) if (ret) goto err_unregister_ctrl_dev; - ret = serial_base_add_preferred_console(drv, port); - if (ret) - goto err_unregister_port_dev; - ret = serial_core_add_one_port(drv, port); if (ret) goto err_unregister_port_dev; From deb091cb05a2b8555e15fcc2df5a0dcd9d06fea0 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 25 Jun 2024 07:58:03 +0200 Subject: [PATCH 1400/1578] Revert "printk: Flag register_console() if console is set on command line" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit b73c9cbe4f1fc02645228aa575998dd54067f8ef. Let's roll back all of the serial core and printk console changes that went into 6.10-rc1 as there still are problems with them that need to be sorted out. Link: https://lore.kernel.org/r/ZnpRozsdw6zbjqze@tlindgre-MOBL1 Reported-by: Petr Mladek Reported-by: Tony Lindgren Cc: Jiri Slaby Cc: John Ogness Cc: Sergey Senozhatsky Cc: Andy Shevchenko Cc: Ilpo Järvinen Signed-off-by: Greg Kroah-Hartman --- kernel/printk/printk.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 420fd310129d4b..e6e6a47acec833 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2504,9 +2504,6 @@ static int __init console_setup(char *str) if (console_opt_save(str, brl_options)) return 1; - /* Flag register_console() to not call try_enable_default_console() */ - console_set_on_cmdline = 1; - /* Don't attempt to parse a DEVNAME:0.0 style console */ if (strchr(str, ':')) return 1; @@ -3522,7 +3519,7 @@ void register_console(struct console *newcon) * Note that a console with tty binding will have CON_CONSDEV * flag set and will be first in the list. */ - if (preferred_console < 0 && !console_set_on_cmdline) { + if (preferred_console < 0) { if (hlist_empty(&console_list) || !console_first()->device || console_first()->flags & CON_BOOT) { try_enable_default_console(newcon); From 64f9f010c6177dd4f33e5023d2eab9af4af291e9 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 25 Jun 2024 07:58:07 +0200 Subject: [PATCH 1401/1578] Revert "printk: Don't try to parse DEVNAME:0.0 console options" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 8a831c584e6e80cf68f79893dc395c16cdf47dc8. Let's roll back all of the serial core and printk console changes that went into 6.10-rc1 as there still are problems with them that need to be sorted out. Link: https://lore.kernel.org/r/ZnpRozsdw6zbjqze@tlindgre-MOBL1 Reported-by: Petr Mladek Reported-by: Tony Lindgren Cc: Jiri Slaby Cc: John Ogness Cc: Sergey Senozhatsky Cc: Andy Shevchenko Cc: Ilpo Järvinen Signed-off-by: Greg Kroah-Hartman --- kernel/printk/printk.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index e6e6a47acec833..b582404cd29d19 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2504,10 +2504,6 @@ static int __init console_setup(char *str) if (console_opt_save(str, brl_options)) return 1; - /* Don't attempt to parse a DEVNAME:0.0 style console */ - if (strchr(str, ':')) - return 1; - /* * Decode str into name, index, options. */ From cc8d5a2f09a54405321769abfd6ec3395482336a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 25 Jun 2024 07:58:10 +0200 Subject: [PATCH 1402/1578] Revert "printk: Save console options for add_preferred_console_match()" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit f03e8c1060f86c23eb49bafee99d9fcbd1c1bd77. Let's roll back all of the serial core and printk console changes that went into 6.10-rc1 as there still are problems with them that need to be sorted out. Link: https://lore.kernel.org/r/ZnpRozsdw6zbjqze@tlindgre-MOBL1 Reported-by: Petr Mladek Reported-by: Tony Lindgren Cc: Jiri Slaby Cc: John Ogness Cc: Sergey Senozhatsky Cc: Andy Shevchenko Cc: Ilpo Järvinen Signed-off-by: Greg Kroah-Hartman --- include/linux/printk.h | 3 - kernel/printk/Makefile | 2 +- kernel/printk/conopt.c | 146 -------------------------------- kernel/printk/console_cmdline.h | 6 -- kernel/printk/printk.c | 14 +-- 5 files changed, 4 insertions(+), 167 deletions(-) delete mode 100644 kernel/printk/conopt.c diff --git a/include/linux/printk.h b/include/linux/printk.h index 40afab23881a0a..65c5184470f18c 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -60,9 +60,6 @@ static inline const char *printk_skip_headers(const char *buffer) #define CONSOLE_LOGLEVEL_DEFAULT CONFIG_CONSOLE_LOGLEVEL_DEFAULT #define CONSOLE_LOGLEVEL_QUIET CONFIG_CONSOLE_LOGLEVEL_QUIET -int add_preferred_console_match(const char *match, const char *name, - const short idx); - extern int console_printk[]; #define console_loglevel (console_printk[0]) diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 040fe7d1eda2f4..39a2b61c7232e7 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y = printk.o conopt.o +obj-y = printk.o obj-$(CONFIG_PRINTK) += printk_safe.o nbcon.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o obj-$(CONFIG_PRINTK_INDEX) += index.o diff --git a/kernel/printk/conopt.c b/kernel/printk/conopt.c deleted file mode 100644 index 9d507bac36579a..00000000000000 --- a/kernel/printk/conopt.c +++ /dev/null @@ -1,146 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Kernel command line console options for hardware based addressing - * - * Copyright (C) 2023 Texas Instruments Incorporated - https://www.ti.com/ - * Author: Tony Lindgren - */ - -#include -#include -#include -#include - -#include - -#include "console_cmdline.h" - -/* - * Allow longer DEVNAME:0.0 style console naming such as abcd0000.serial:0.0 - * in addition to the legacy ttyS0 style naming. - */ -#define CONSOLE_NAME_MAX 32 - -#define CONSOLE_OPT_MAX 16 -#define CONSOLE_BRL_OPT_MAX 16 - -struct console_option { - char name[CONSOLE_NAME_MAX]; - char opt[CONSOLE_OPT_MAX]; - char brl_opt[CONSOLE_BRL_OPT_MAX]; - u8 has_brl_opt:1; -}; - -/* Updated only at console_setup() time, no locking needed */ -static struct console_option conopt[MAX_CMDLINECONSOLES]; - -/** - * console_opt_save - Saves kernel command line console option for driver use - * @str: Kernel command line console name and option - * @brl_opt: Braille console options - * - * Saves a kernel command line console option for driver subsystems to use for - * adding a preferred console during init. Called from console_setup() only. - * - * Return: 0 on success, negative error code on failure. - */ -int __init console_opt_save(const char *str, const char *brl_opt) -{ - struct console_option *con; - size_t namelen, optlen; - const char *opt; - int i; - - namelen = strcspn(str, ","); - if (namelen == 0 || namelen >= CONSOLE_NAME_MAX) - return -EINVAL; - - opt = str + namelen; - if (*opt == ',') - opt++; - - optlen = strlen(opt); - if (optlen >= CONSOLE_OPT_MAX) - return -EINVAL; - - for (i = 0; i < MAX_CMDLINECONSOLES; i++) { - con = &conopt[i]; - - if (con->name[0]) { - if (!strncmp(str, con->name, namelen)) - return 0; - continue; - } - - /* - * The name isn't terminated, only opt is. Empty opt is fine, - * but brl_opt can be either empty or NULL. For more info, see - * _braille_console_setup(). - */ - strscpy(con->name, str, namelen + 1); - strscpy(con->opt, opt, CONSOLE_OPT_MAX); - if (brl_opt) { - strscpy(con->brl_opt, brl_opt, CONSOLE_BRL_OPT_MAX); - con->has_brl_opt = 1; - } - - return 0; - } - - return -ENOMEM; -} - -static struct console_option *console_opt_find(const char *name) -{ - struct console_option *con; - int i; - - for (i = 0; i < MAX_CMDLINECONSOLES; i++) { - con = &conopt[i]; - if (!strcmp(name, con->name)) - return con; - } - - return NULL; -} - -/** - * add_preferred_console_match - Adds a preferred console if a match is found - * @match: Expected console on kernel command line, such as console=DEVNAME:0.0 - * @name: Name of the console character device to add such as ttyS - * @idx: Index for the console - * - * Allows driver subsystems to add a console after translating the command - * line name to the character device name used for the console. Options are - * added automatically based on the kernel command line. Duplicate preferred - * consoles are ignored by __add_preferred_console(). - * - * Return: 0 on success, negative error code on failure. - */ -int add_preferred_console_match(const char *match, const char *name, - const short idx) -{ - struct console_option *con; - char *brl_opt = NULL; - - if (!match || !strlen(match) || !name || !strlen(name) || - idx < 0) - return -EINVAL; - - con = console_opt_find(match); - if (!con) - return -ENOENT; - - /* - * See __add_preferred_console(). It checks for NULL brl_options to set - * the preferred_console flag. Empty brl_opt instead of NULL leads into - * the preferred_console flag not set, and CON_CONSDEV not being set, - * and the boot console won't get disabled at the end of console_setup(). - */ - if (con->has_brl_opt) - brl_opt = con->brl_opt; - - console_opt_add_preferred_console(name, idx, con->opt, brl_opt); - - return 0; -} diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h index a125e0235589d5..3ca74ad391d6e8 100644 --- a/kernel/printk/console_cmdline.h +++ b/kernel/printk/console_cmdline.h @@ -2,12 +2,6 @@ #ifndef _CONSOLE_CMDLINE_H #define _CONSOLE_CMDLINE_H -#define MAX_CMDLINECONSOLES 8 - -int console_opt_save(const char *str, const char *brl_opt); -int console_opt_add_preferred_console(const char *name, const short idx, - char *options, char *brl_options); - struct console_cmdline { char name[16]; /* Name of the driver */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b582404cd29d19..dddb15f48d5957 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -383,6 +383,9 @@ static int console_locked; /* * Array of consoles built from command line options (console=) */ + +#define MAX_CMDLINECONSOLES 8 + static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; static int preferred_console = -1; @@ -2500,10 +2503,6 @@ static int __init console_setup(char *str) if (_braille_console_setup(&str, &brl_options)) return 1; - /* Save the console for driver subsystem use */ - if (console_opt_save(str, brl_options)) - return 1; - /* * Decode str into name, index, options. */ @@ -2534,13 +2533,6 @@ static int __init console_setup(char *str) } __setup("console=", console_setup); -/* Only called from add_preferred_console_match() */ -int console_opt_add_preferred_console(const char *name, const short idx, - char *options, char *brl_options) -{ - return __add_preferred_console(name, idx, options, brl_options, true); -} - /** * add_preferred_console - add a device to the list of preferred consoles. * @name: device name From 0983d288caf984de0202c66641577b739caad561 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 20 Jun 2024 10:23:11 -0500 Subject: [PATCH 1403/1578] ibmvnic: Add tx check to prevent skb leak Below is a summary of how the driver stores a reference to an skb during transmit: tx_buff[free_map[consumer_index]]->skb = new_skb; free_map[consumer_index] = IBMVNIC_INVALID_MAP; consumer_index ++; Where variable data looks like this: free_map == [4, IBMVNIC_INVALID_MAP, IBMVNIC_INVALID_MAP, 0, 3] consumer_index^ tx_buff == [skb=null, skb=, skb=, skb=null, skb=null] The driver has checks to ensure that free_map[consumer_index] pointed to a valid index but there was no check to ensure that this index pointed to an unused/null skb address. So, if, by some chance, our free_map and tx_buff lists become out of sync then we were previously risking an skb memory leak. This could then cause tcp congestion control to stop sending packets, eventually leading to ETIMEDOUT. Therefore, add a conditional to ensure that the skb address is null. If not then warn the user (because this is still a bug that should be patched) and free the old pointer to prevent memleak/tcp problems. Signed-off-by: Nick Child Signed-off-by: Paolo Abeni --- drivers/net/ethernet/ibm/ibmvnic.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 5490f0f9c112b6..23ebeb143987ff 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2482,6 +2482,18 @@ static netdev_tx_t ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) (tx_pool->consumer_index + 1) % tx_pool->num_buffers; tx_buff = &tx_pool->tx_buff[bufidx]; + + /* Sanity checks on our free map to make sure it points to an index + * that is not being occupied by another skb. If skb memory is + * not freed then we see congestion control kick in and halt tx. + */ + if (unlikely(tx_buff->skb)) { + dev_warn_ratelimited(dev, "TX free map points to untracked skb (%s %d idx=%d)\n", + skb_is_gso(skb) ? "tso_pool" : "tx_pool", + queue_num, bufidx); + dev_kfree_skb_any(tx_buff->skb); + } + tx_buff->skb = skb; tx_buff->index = bufidx; tx_buff->pool_index = queue_num; From ff46e3b4421923937b7f6e44ffcd3549a074f321 Mon Sep 17 00:00:00 2001 From: luoxuanqiang Date: Fri, 21 Jun 2024 09:39:29 +0800 Subject: [PATCH 1404/1578] Fix race for duplicate reqsk on identical SYN When bonding is configured in BOND_MODE_BROADCAST mode, if two identical SYN packets are received at the same time and processed on different CPUs, it can potentially create the same sk (sock) but two different reqsk (request_sock) in tcp_conn_request(). These two different reqsk will respond with two SYNACK packets, and since the generation of the seq (ISN) incorporates a timestamp, the final two SYNACK packets will have different seq values. The consequence is that when the Client receives and replies with an ACK to the earlier SYNACK packet, we will reset(RST) it. ======================================================================== This behavior is consistently reproducible in my local setup, which comprises: | NETA1 ------ NETB1 | PC_A --- bond --- | | --- bond --- PC_B | NETA2 ------ NETB2 | - PC_A is the Server and has two network cards, NETA1 and NETA2. I have bonded these two cards using BOND_MODE_BROADCAST mode and configured them to be handled by different CPU. - PC_B is the Client, also equipped with two network cards, NETB1 and NETB2, which are also bonded and configured in BOND_MODE_BROADCAST mode. If the client attempts a TCP connection to the server, it might encounter a failure. Capturing packets from the server side reveals: 10.10.10.10.45182 > localhost: Flags [S], seq 320236027, 10.10.10.10.45182 > localhost: Flags [S], seq 320236027, localhost > 10.10.10.10.45182: Flags [S.], seq 2967855116, localhost > 10.10.10.10.45182: Flags [S.], seq 2967855123, <== 10.10.10.10.45182 > localhost: Flags [.], ack 4294967290, 10.10.10.10.45182 > localhost: Flags [.], ack 4294967290, localhost > 10.10.10.10.45182: Flags [R], seq 2967855117, <== localhost > 10.10.10.10.45182: Flags [R], seq 2967855117, Two SYNACKs with different seq numbers are sent by localhost, resulting in an anomaly. ======================================================================== The attempted solution is as follows: Add a return value to inet_csk_reqsk_queue_hash_add() to confirm if the ehash insertion is successful (Up to now, the reason for unsuccessful insertion is that a reqsk for the same connection has already been inserted). If the insertion fails, release the reqsk. Due to the refcnt, Kuniyuki suggests also adding a return value check for the DCCP module; if ehash insertion fails, indicating a successful insertion of the same connection, simply release the reqsk as well. Simultaneously, In the reqsk_queue_hash_req(), the start of the req->rsk_timer is adjusted to be after successful insertion. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: luoxuanqiang Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240621013929.1386815-1-luoxuanqiang@kylinos.cn Signed-off-by: Paolo Abeni --- include/net/inet_connection_sock.h | 2 +- net/dccp/ipv4.c | 7 +++++-- net/dccp/ipv6.c | 7 +++++-- net/ipv4/inet_connection_sock.c | 17 +++++++++++++---- net/ipv4/tcp_input.c | 7 ++++++- 5 files changed, 30 insertions(+), 10 deletions(-) diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 7d6b1254c92d53..c0deaafebfdc0b 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -263,7 +263,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, struct sock *inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req, struct sock *child); -void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, +bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, unsigned long timeout); struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, struct request_sock *req, diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index ff41bd6f99c31c..5926159a6f2040 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -657,8 +657,11 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (dccp_v4_send_response(sk, req)) goto drop_and_free; - inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); - reqsk_put(req); + if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT))) + reqsk_free(req); + else + reqsk_put(req); + return 0; drop_and_free: diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 85f4b8fdbe5e08..da5dba120bc9a5 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -400,8 +400,11 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (dccp_v6_send_response(sk, req)) goto drop_and_free; - inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); - reqsk_put(req); + if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT))) + reqsk_free(req); + else + reqsk_put(req); + return 0; drop_and_free: diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index d81f74ce0f02e6..d4f0eff8b20ffe 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1122,25 +1122,34 @@ static void reqsk_timer_handler(struct timer_list *t) inet_csk_reqsk_queue_drop_and_put(oreq->rsk_listener, oreq); } -static void reqsk_queue_hash_req(struct request_sock *req, +static bool reqsk_queue_hash_req(struct request_sock *req, unsigned long timeout) { + bool found_dup_sk = false; + + if (!inet_ehash_insert(req_to_sk(req), NULL, &found_dup_sk)) + return false; + + /* The timer needs to be setup after a successful insertion. */ timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED); mod_timer(&req->rsk_timer, jiffies + timeout); - inet_ehash_insert(req_to_sk(req), NULL, NULL); /* before letting lookups find us, make sure all req fields * are committed to memory and refcnt initialized. */ smp_wmb(); refcount_set(&req->rsk_refcnt, 2 + 1); + return true; } -void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, +bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, unsigned long timeout) { - reqsk_queue_hash_req(req, timeout); + if (!reqsk_queue_hash_req(req, timeout)) + return false; + inet_csk_reqsk_queue_added(sk); + return true; } EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 01d208e0eef31f..b6d7666ac912e6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -7257,7 +7257,12 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->tfo_listener = false; if (!want_cookie) { req->timeout = tcp_timeout_init((struct sock *)req); - inet_csk_reqsk_queue_hash_add(sk, req, req->timeout); + if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req, + req->timeout))) { + reqsk_free(req); + return 0; + } + } af_ops->send_synack(sk, dst, &fl, req, &foc, !want_cookie ? TCP_SYNACK_NORMAL : From c5ab94ea280a9b4108723eecf0a636e22a5bb137 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 25 Jun 2024 11:51:58 +0200 Subject: [PATCH 1405/1578] ALSA: seq: Fix missing channel at encoding RPN/NRPN MIDI2 messages The conversion from the legacy event to MIDI2 UMP for RPN and NRPN missed the setup of the channel number, resulting in always the channel 0. Fix it. Fixes: e9e02819a98a ("ALSA: seq: Automatic conversion of UMP events") Link: https://patch.msgid.link/20240625095200.25745-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/seq/seq_ump_convert.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sound/core/seq/seq_ump_convert.c b/sound/core/seq/seq_ump_convert.c index d81f776a4c3ddd..6687efdceea13c 100644 --- a/sound/core/seq/seq_ump_convert.c +++ b/sound/core/seq/seq_ump_convert.c @@ -791,7 +791,8 @@ static int paf_ev_to_ump_midi2(const struct snd_seq_event *event, /* set up the MIDI2 RPN/NRPN packet data from the parsed info */ static void fill_rpn(struct snd_seq_ump_midi2_bank *cc, - union snd_ump_midi2_msg *data) + union snd_ump_midi2_msg *data, + unsigned char channel) { if (cc->rpn_set) { data->rpn.status = UMP_MSG_STATUS_RPN; @@ -808,6 +809,7 @@ static void fill_rpn(struct snd_seq_ump_midi2_bank *cc, } data->rpn.data = upscale_14_to_32bit((cc->cc_data_msb << 7) | cc->cc_data_lsb); + data->rpn.channel = channel; cc->cc_data_msb = cc->cc_data_lsb = 0; } @@ -855,7 +857,7 @@ static int cc_ev_to_ump_midi2(const struct snd_seq_event *event, cc->cc_data_lsb = val; if (!(cc->rpn_set || cc->nrpn_set)) return 0; // skip - fill_rpn(cc, data); + fill_rpn(cc, data, channel); return 1; } @@ -957,7 +959,7 @@ static int ctrl14_ev_to_ump_midi2(const struct snd_seq_event *event, cc->cc_data_lsb = lsb; if (!(cc->rpn_set || cc->nrpn_set)) return 0; // skip - fill_rpn(cc, data); + fill_rpn(cc, data, channel); return 1; } From 1225675ca74c746f09211528588e83b3def1ff6a Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 24 Jun 2024 14:54:34 +0200 Subject: [PATCH 1406/1578] ALSA: PCM: Allow resume only for suspended streams snd_pcm_resume() should bail out if the stream isn't in a suspended state. Otherwise it'd allow doubly resume. Link: https://patch.msgid.link/20240624125443.27808-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/pcm_native.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 521ba56392a07f..c152ccf32214fc 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -1775,6 +1775,8 @@ static int snd_pcm_pre_resume(struct snd_pcm_substream *substream, snd_pcm_state_t state) { struct snd_pcm_runtime *runtime = substream->runtime; + if (runtime->state != SNDRV_PCM_STATE_SUSPENDED) + return -EBADFD; if (!(runtime->info & SNDRV_PCM_INFO_RESUME)) return -ENOSYS; runtime->trigger_master = substream; From 1d091a98c399c17d0571fa1d91a7123a698446e4 Mon Sep 17 00:00:00 2001 From: Aivaz Latypov Date: Tue, 25 Jun 2024 13:12:02 +0500 Subject: [PATCH 1407/1578] ALSA: hda/relatek: Enable Mute LED on HP Laptop 15-gw0xxx This HP Laptop uses ALC236 codec with COEF 0x07 controlling the mute LED. Enable existing quirk for this device. Signed-off-by: Aivaz Latypov Link: https://patch.msgid.link/20240625081217.1049-1-reichaivaz@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index f4454abadc8d9e..cb9b11da5581e2 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10069,6 +10069,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8788, "HP OMEN 15", ALC285_FIXUP_HP_MUTE_LED), SND_PCI_QUIRK(0x103c, 0x87b7, "HP Laptop 14-fq0xxx", ALC236_FIXUP_HP_MUTE_LED_COEFBIT2), SND_PCI_QUIRK(0x103c, 0x87c8, "HP", ALC287_FIXUP_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x87d3, "HP Laptop 15-gw0xxx", ALC236_FIXUP_HP_MUTE_LED_COEFBIT2), SND_PCI_QUIRK(0x103c, 0x87e5, "HP ProBook 440 G8 Notebook PC", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x87e7, "HP ProBook 450 G8 Notebook PC", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x87f1, "HP ProBook 630 G8 Notebook PC", ALC236_FIXUP_HP_GPIO_LED), From 6a7db25aad8ce6512b366d2ce1d0e60bac00a09d Mon Sep 17 00:00:00 2001 From: Shengjiu Wang Date: Thu, 20 Jun 2024 10:40:18 +0800 Subject: [PATCH 1408/1578] ALSA: dmaengine_pcm: terminate dmaengine before synchronize When dmaengine supports pause function, in suspend state, dmaengine_pause() is called instead of dmaengine_terminate_async(), In end of playback stream, the runtime->state will go to SNDRV_PCM_STATE_DRAINING, if system suspend & resume happen at this time, application will not resume playback stream, the stream will be closed directly, the dmaengine_terminate_async() will not be called before the dmaengine_synchronize(), which violates the call sequence for dmaengine_synchronize(). This behavior also happens for capture streams, but there is no SNDRV_PCM_STATE_DRAINING state for capture. So use dmaengine_tx_status() to check the DMA status if the status is DMA_PAUSED, then call dmaengine_terminate_async() to terminate dmaengine before dmaengine_synchronize(). Signed-off-by: Shengjiu Wang Link: https://patch.msgid.link/1718851218-27803-1-git-send-email-shengjiu.wang@nxp.com Signed-off-by: Takashi Iwai --- sound/core/pcm_dmaengine.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sound/core/pcm_dmaengine.c b/sound/core/pcm_dmaengine.c index 12aa1cef11a133..7346c993a06713 100644 --- a/sound/core/pcm_dmaengine.c +++ b/sound/core/pcm_dmaengine.c @@ -358,6 +358,12 @@ EXPORT_SYMBOL_GPL(snd_dmaengine_pcm_open_request_chan); int snd_dmaengine_pcm_close(struct snd_pcm_substream *substream) { struct dmaengine_pcm_runtime_data *prtd = substream_to_prtd(substream); + struct dma_tx_state state; + enum dma_status status; + + status = dmaengine_tx_status(prtd->dma_chan, prtd->cookie, &state); + if (status == DMA_PAUSED) + dmaengine_terminate_async(prtd->dma_chan); dmaengine_synchronize(prtd->dma_chan); kfree(prtd); @@ -378,6 +384,12 @@ EXPORT_SYMBOL_GPL(snd_dmaengine_pcm_close); int snd_dmaengine_pcm_close_release_chan(struct snd_pcm_substream *substream) { struct dmaengine_pcm_runtime_data *prtd = substream_to_prtd(substream); + struct dma_tx_state state; + enum dma_status status; + + status = dmaengine_tx_status(prtd->dma_chan, prtd->cookie, &state); + if (status == DMA_PAUSED) + dmaengine_terminate_async(prtd->dma_chan); dmaengine_synchronize(prtd->dma_chan); dma_release_channel(prtd->dma_chan); From b1c4b4d45263241ec6c2405a8df8265d4b58e707 Mon Sep 17 00:00:00 2001 From: Tristram Ha Date: Fri, 21 Jun 2024 15:34:22 -0700 Subject: [PATCH 1409/1578] net: dsa: microchip: fix wrong register write when masking interrupt The switch global port interrupt mask, REG_SW_PORT_INT_MASK__4, is defined as 0x001C in ksz9477_reg.h. The designers used 32-bit value in anticipation for increase of port count in future product but currently the maximum port count is 7 and the effective value is 0x7F in register 0x001F. Each port has its own interrupt mask and is defined as 0x#01F. It uses only 4 bits for different interrupts. The developer who implemented the current interrupt mechanism in the switch driver noticed there are similarities between the mechanism to mask port interrupts in global interrupt and individual interrupts in each port and so used the same code to handle these interrupts. He updated the code to use the new macro REG_SW_PORT_INT_MASK__1 which is defined as 0x1F in ksz_common.h but he forgot to update the 32-bit write to 8-bit as now the mask registers are 0x1F and 0x#01F. In addition all KSZ switches other than the KSZ9897/KSZ9893 and LAN937X families use only 8-bit access and so this common code will eventually be changed to accommodate them. Fixes: e1add7dd6183 ("net: dsa: microchip: use common irq routines for girq and pirq") Signed-off-by: Tristram Ha Link: https://lore.kernel.org/r/1719009262-2948-1-git-send-email-Tristram.Ha@microchip.com Signed-off-by: Paolo Abeni --- drivers/net/dsa/microchip/ksz_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 0433109b42e570..0580b2fee21c38 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -2196,7 +2196,7 @@ static void ksz_irq_bus_sync_unlock(struct irq_data *d) struct ksz_device *dev = kirq->dev; int ret; - ret = ksz_write32(dev, kirq->reg_mask, kirq->masked); + ret = ksz_write8(dev, kirq->reg_mask, kirq->masked); if (ret) dev_err(dev->dev, "failed to change IRQ mask\n"); From 529038146ba189f7551d64faf4f4871e4ab97538 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 22 Jun 2024 14:26:33 +0200 Subject: [PATCH 1410/1578] thermal: gov_step_wise: Go straight to instance->lower when mitigation is over Commit b6846826982b ("thermal: gov_step_wise: Restore passive polling management") attempted to fix a Step-Wise thermal governor issue introduced by commit 042a3d80f118 ("thermal: core: Move passive polling management to the core"), which caused the governor to leave cooling devices in high states, by partially reverting that commit. However, this turns out to be insufficient on some systems due to interactions between the governor code restored by commit b6846826982b and the passive polling management in the thermal core. For this reason, revert commit b6846826982b and make the governor set the target cooling device state to the "lower" one as soon as the zone temperature falls below the threshold of the trip point corresponding to the given thermal instance, which means that thermal mitigation is not necessary any more. Before this change the "lower" cooling device state would be reached in steps through the passive polling mechanism which was questionable for three reasons: (1) cooling device were kept in high states when that was not necessary (and it could adversely impact performance), (2) it only worked for thermal zones with nonzero passive_delay_jiffies value, and (3) passive polling belongs to the core and should not be hijacked by governors for their internal purposes. Fixes: b6846826982b ("thermal: gov_step_wise: Restore passive polling management") Closes: https://lore.kernel.org/linux-pm/6759ce9f-281d-4fcd-bb4c-b784a1cc5f6e@oldschoolsolutions.biz Reported-by: Jens Glathe Tested-by: Jens Glathe Link: https://patch.msgid.link/12464461.O9o76ZdvQC@rjwysocki.net Signed-off-by: Rafael J. Wysocki Tested-by: Steev Klimaszewski Tested-by: Johan Hovold --- drivers/thermal/gov_step_wise.c | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/drivers/thermal/gov_step_wise.c b/drivers/thermal/gov_step_wise.c index 65974fe8be0db5..fd5527188cf91a 100644 --- a/drivers/thermal/gov_step_wise.c +++ b/drivers/thermal/gov_step_wise.c @@ -55,7 +55,11 @@ static unsigned long get_target_state(struct thermal_instance *instance, if (cur_state <= instance->lower) return THERMAL_NO_TARGET; - return clamp(cur_state - 1, instance->lower, instance->upper); + /* + * If 'throttle' is false, no mitigation is necessary, so + * request the lower state for this instance. + */ + return instance->lower; } return instance->target; @@ -93,23 +97,6 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz, if (instance->initialized && old_target == instance->target) continue; - if (trip->type == THERMAL_TRIP_PASSIVE) { - /* - * If the target state for this thermal instance - * changes from THERMAL_NO_TARGET to something else, - * ensure that the zone temperature will be updated - * (assuming enabled passive cooling) until it becomes - * THERMAL_NO_TARGET again, or the cooling device may - * not be reset to its initial state. - */ - if (old_target == THERMAL_NO_TARGET && - instance->target != THERMAL_NO_TARGET) - tz->passive++; - else if (old_target != THERMAL_NO_TARGET && - instance->target == THERMAL_NO_TARGET) - tz->passive--; - } - instance->initialized = true; mutex_lock(&instance->cdev->lock); From cea5589e958f8aef301ce9d004bc73fa5bb3b304 Mon Sep 17 00:00:00 2001 From: Jens Remus Date: Thu, 20 Jun 2024 11:12:49 +0200 Subject: [PATCH 1411/1578] s390/boot: Do not adjust GOT entries for undef weak sym Since commit 778666df60f0 ("s390: compile relocatable kernel without -fPIE") and commit 00cda11d3b2e ("s390: Compile kernel with -fPIC and link with -no-pie") the kernel on s390x may have a Global Offset Table (GOT) whose entries are adjusted for KASLR in kaslr_adjust_got(). The GOT may contain entries for undefined weak symbols that resolved to zero. That is the resulting GOT entry value is zero. Adjusting those entries unconditionally in kaslr_adjust_got() is wrong. Otherwise the following sample code would erroneously assume foo to be defined, due to the adjustment changing the zero-value to a non-zero one: extern int foo __attribute__((weak)); if (*foo) /* foo is defined [or undefined and erroneously adjusted] */ The vmlinux build at commit 00cda11d3b2e ("s390: Compile kernel with -fPIC and link with -no-pie") with defconfig actually had two GOT entries for the undefined weak symbols __start_BTF and __stop_BTF: $ objdump -tw vmlinux | grep -F "*UND*" 0000000000000000 w *UND* 0000000000000000 __stop_BTF 0000000000000000 w *UND* 0000000000000000 __start_BTF $ readelf -rw vmlinux | grep -E "R_390_GOTENT +0{16}" 000000345760 2776a0000001a R_390_GOTENT 0000000000000000 __stop_BTF + 2 000000345766 2d5480000001a R_390_GOTENT 0000000000000000 __start_BTF + 2 The s390-specific vmlinux linker script sets the section start to __START_KERNEL, which is currently defined as 0x100000 on s390x. Access to lowcore is performed via a pointer of 0 and not a symbol in a section starting at 0. The first 64K are reserved for the loader on s390x. Thus it is safe to assume that __START_KERNEL will never be 0. As a result there cannot be any defined symbols resolving to zero in the kernel. Note that the first three GOT entries are reserved for the dynamic loader on s390x. [1] In the kernel they are zero. Therefore no extra handling is required to skip these. Skip adjusting GOT entries with a value of zero in kaslr_adjust_got(). While at it update the comment when a GOT exists on s390x. Since commit 00cda11d3b2e ("s390: Compile kernel with -fPIC and link with -no-pie") it no longer only exists when compiling with Clang, but also with GCC. [1]: s390x ELF ABI, section "Global Offset Table", https://github.com/IBM/s390x-abi/releases Fixes: 778666df60f0 ("s390: compile relocatable kernel without -fPIE") Reviewed-by: Ilya Leoshkevich Acked-by: Sumanth Korikkar Acked-by: Alexander Gordeev Signed-off-by: Jens Remus Signed-off-by: Alexander Gordeev --- arch/s390/boot/startup.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 48ef5fe5c08aaa..5a36d5538dae86 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -170,11 +170,14 @@ static void kaslr_adjust_got(unsigned long offset) u64 *entry; /* - * Even without -fPIE, Clang still uses a global offset table for some - * reason. Adjust the GOT entries. + * Adjust GOT entries, except for ones for undefined weak symbols + * that resolved to zero. This also skips the first three reserved + * entries on s390x that are zero. */ - for (entry = (u64 *)vmlinux.got_start; entry < (u64 *)vmlinux.got_end; entry++) - *entry += offset - __START_KERNEL; + for (entry = (u64 *)vmlinux.got_start; entry < (u64 *)vmlinux.got_end; entry++) { + if (*entry) + *entry += offset - __START_KERNEL; + } } /* From d3882564a77c21eb746ba5364f3fa89b88de3d61 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 20 Jun 2024 14:16:37 +0200 Subject: [PATCH 1412/1578] syscalls: fix compat_sys_io_pgetevents_time64 usage Using sys_io_pgetevents() as the entry point for compat mode tasks works almost correctly, but misses the sign extension for the min_nr and nr arguments. This was addressed on parisc by switching to compat_sys_io_pgetevents_time64() in commit 6431e92fc827 ("parisc: io_pgetevents_time64() needs compat syscall in 32-bit compat mode"), as well as by using more sophisticated system call wrappers on x86 and s390. However, arm64, mips, powerpc, sparc and riscv still have the same bug. Change all of them over to use compat_sys_io_pgetevents_time64() like parisc already does. This was clearly the intention when the function was originally added, but it got hooked up incorrectly in the tables. Cc: stable@vger.kernel.org Fixes: 48166e6ea47d ("y2038: add 64-bit time_t syscalls to all 32-bit architectures") Acked-by: Heiko Carstens # s390 Signed-off-by: Arnd Bergmann --- arch/arm64/include/asm/unistd32.h | 2 +- arch/mips/kernel/syscalls/syscall_n32.tbl | 2 +- arch/mips/kernel/syscalls/syscall_o32.tbl | 2 +- arch/powerpc/kernel/syscalls/syscall.tbl | 2 +- arch/s390/kernel/syscalls/syscall.tbl | 2 +- arch/sparc/kernel/syscalls/syscall.tbl | 2 +- arch/x86/entry/syscalls/syscall_32.tbl | 2 +- include/uapi/asm-generic/unistd.h | 2 +- kernel/sys_ni.c | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 266b96acc01439..1386e8e751f247 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -840,7 +840,7 @@ __SYSCALL(__NR_pselect6_time64, compat_sys_pselect6_time64) #define __NR_ppoll_time64 414 __SYSCALL(__NR_ppoll_time64, compat_sys_ppoll_time64) #define __NR_io_pgetevents_time64 416 -__SYSCALL(__NR_io_pgetevents_time64, sys_io_pgetevents) +__SYSCALL(__NR_io_pgetevents_time64, compat_sys_io_pgetevents_time64) #define __NR_recvmmsg_time64 417 __SYSCALL(__NR_recvmmsg_time64, compat_sys_recvmmsg_time64) #define __NR_mq_timedsend_time64 418 diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index cc869f5d569318..953f5b7dc723f6 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -354,7 +354,7 @@ 412 n32 utimensat_time64 sys_utimensat 413 n32 pselect6_time64 compat_sys_pselect6_time64 414 n32 ppoll_time64 compat_sys_ppoll_time64 -416 n32 io_pgetevents_time64 sys_io_pgetevents +416 n32 io_pgetevents_time64 compat_sys_io_pgetevents_time64 417 n32 recvmmsg_time64 compat_sys_recvmmsg_time64 418 n32 mq_timedsend_time64 sys_mq_timedsend 419 n32 mq_timedreceive_time64 sys_mq_timedreceive diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 81428a2eb66048..2439a2491cffe3 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -403,7 +403,7 @@ 412 o32 utimensat_time64 sys_utimensat sys_utimensat 413 o32 pselect6_time64 sys_pselect6 compat_sys_pselect6_time64 414 o32 ppoll_time64 sys_ppoll compat_sys_ppoll_time64 -416 o32 io_pgetevents_time64 sys_io_pgetevents sys_io_pgetevents +416 o32 io_pgetevents_time64 sys_io_pgetevents compat_sys_io_pgetevents_time64 417 o32 recvmmsg_time64 sys_recvmmsg compat_sys_recvmmsg_time64 418 o32 mq_timedsend_time64 sys_mq_timedsend sys_mq_timedsend 419 o32 mq_timedreceive_time64 sys_mq_timedreceive sys_mq_timedreceive diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 3656f1ca7a21c6..c6b0546b284d96 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -502,7 +502,7 @@ 412 32 utimensat_time64 sys_utimensat sys_utimensat 413 32 pselect6_time64 sys_pselect6 compat_sys_pselect6_time64 414 32 ppoll_time64 sys_ppoll compat_sys_ppoll_time64 -416 32 io_pgetevents_time64 sys_io_pgetevents sys_io_pgetevents +416 32 io_pgetevents_time64 sys_io_pgetevents compat_sys_io_pgetevents_time64 417 32 recvmmsg_time64 sys_recvmmsg compat_sys_recvmmsg_time64 418 32 mq_timedsend_time64 sys_mq_timedsend sys_mq_timedsend 419 32 mq_timedreceive_time64 sys_mq_timedreceive sys_mq_timedreceive diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index bd0fee24ad10a3..01071182763e96 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -418,7 +418,7 @@ 412 32 utimensat_time64 - sys_utimensat 413 32 pselect6_time64 - compat_sys_pselect6_time64 414 32 ppoll_time64 - compat_sys_ppoll_time64 -416 32 io_pgetevents_time64 - sys_io_pgetevents +416 32 io_pgetevents_time64 - compat_sys_io_pgetevents_time64 417 32 recvmmsg_time64 - compat_sys_recvmmsg_time64 418 32 mq_timedsend_time64 - sys_mq_timedsend 419 32 mq_timedreceive_time64 - sys_mq_timedreceive diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index ac6c281ccfe029..b354139b40be52 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -461,7 +461,7 @@ 412 32 utimensat_time64 sys_utimensat sys_utimensat 413 32 pselect6_time64 sys_pselect6 compat_sys_pselect6_time64 414 32 ppoll_time64 sys_ppoll compat_sys_ppoll_time64 -416 32 io_pgetevents_time64 sys_io_pgetevents sys_io_pgetevents +416 32 io_pgetevents_time64 sys_io_pgetevents compat_sys_io_pgetevents_time64 417 32 recvmmsg_time64 sys_recvmmsg compat_sys_recvmmsg_time64 418 32 mq_timedsend_time64 sys_mq_timedsend sys_mq_timedsend 419 32 mq_timedreceive_time64 sys_mq_timedreceive sys_mq_timedreceive diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 7fd1f57ad3d30c..d6ebcab1d8b28b 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -420,7 +420,7 @@ 412 i386 utimensat_time64 sys_utimensat 413 i386 pselect6_time64 sys_pselect6 compat_sys_pselect6_time64 414 i386 ppoll_time64 sys_ppoll compat_sys_ppoll_time64 -416 i386 io_pgetevents_time64 sys_io_pgetevents +416 i386 io_pgetevents_time64 sys_io_pgetevents compat_sys_io_pgetevents_time64 417 i386 recvmmsg_time64 sys_recvmmsg compat_sys_recvmmsg_time64 418 i386 mq_timedsend_time64 sys_mq_timedsend 419 i386 mq_timedreceive_time64 sys_mq_timedreceive diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index d983c48a3b6af1..d4cc26932ff47b 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -737,7 +737,7 @@ __SC_COMP(__NR_pselect6_time64, sys_pselect6, compat_sys_pselect6_time64) #define __NR_ppoll_time64 414 __SC_COMP(__NR_ppoll_time64, sys_ppoll, compat_sys_ppoll_time64) #define __NR_io_pgetevents_time64 416 -__SYSCALL(__NR_io_pgetevents_time64, sys_io_pgetevents) +__SC_COMP(__NR_io_pgetevents_time64, sys_io_pgetevents, compat_sys_io_pgetevents_time64) #define __NR_recvmmsg_time64 417 __SC_COMP(__NR_recvmmsg_time64, sys_recvmmsg, compat_sys_recvmmsg_time64) #define __NR_mq_timedsend_time64 418 diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index d7eee421d4bc33..b696b85ac63e0b 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -46,8 +46,8 @@ COND_SYSCALL(io_getevents_time32); COND_SYSCALL(io_getevents); COND_SYSCALL(io_pgetevents_time32); COND_SYSCALL(io_pgetevents); -COND_SYSCALL_COMPAT(io_pgetevents_time32); COND_SYSCALL_COMPAT(io_pgetevents); +COND_SYSCALL_COMPAT(io_pgetevents_time64); COND_SYSCALL(io_uring_setup); COND_SYSCALL(io_uring_enter); COND_SYSCALL(io_uring_register); From bae6428a9fffb2023191b0723e276cf1377a7c9f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 19 Jun 2024 14:07:30 +0200 Subject: [PATCH 1413/1578] sparc: fix old compat_sys_select() sparc has two identical select syscalls at numbers 93 and 230, respectively. During the conversion to the modern syscall.tbl format, the older one of the two broke in compat mode, and now refers to the native 64-bit syscall. Restore the correct behavior. This has very little effect, as glibc has been using the newer number anyway. Fixes: 6ff645dd683a ("sparc: add system call table generation support") Signed-off-by: Arnd Bergmann --- arch/sparc/kernel/syscalls/syscall.tbl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index b354139b40be52..5e55f73f9880a2 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -117,7 +117,7 @@ 90 common dup2 sys_dup2 91 32 setfsuid32 sys_setfsuid 92 common fcntl sys_fcntl compat_sys_fcntl -93 common select sys_select +93 common select sys_select compat_sys_select 94 32 setfsgid32 sys_setfsgid 95 common fsync sys_fsync 96 common setpriority sys_setpriority From d6fbd26fb872ec518d25433a12e8ce8163e20909 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 19 Jun 2024 12:49:39 +0200 Subject: [PATCH 1414/1578] sparc: fix compat recv/recvfrom syscalls sparc has the wrong compat version of recv() and recvfrom() for both the direct syscalls and socketcall(). The direct syscalls just need to use the compat version. For socketcall, the same thing could be done, but it seems better to completely remove the custom assembler code for it and just use the same implementation that everyone else has. Fixes: 1dacc76d0014 ("net/compat/wext: send different messages to compat tasks") Signed-off-by: Arnd Bergmann --- arch/sparc/kernel/sys32.S | 221 ------------------------- arch/sparc/kernel/syscalls/syscall.tbl | 4 +- 2 files changed, 2 insertions(+), 223 deletions(-) diff --git a/arch/sparc/kernel/sys32.S b/arch/sparc/kernel/sys32.S index a45f0f31fe51ab..a3d308f2043e5f 100644 --- a/arch/sparc/kernel/sys32.S +++ b/arch/sparc/kernel/sys32.S @@ -18,224 +18,3 @@ sys32_mmap2: sethi %hi(sys_mmap), %g1 jmpl %g1 + %lo(sys_mmap), %g0 sllx %o5, 12, %o5 - - .align 32 - .globl sys32_socketcall -sys32_socketcall: /* %o0=call, %o1=args */ - cmp %o0, 1 - bl,pn %xcc, do_einval - cmp %o0, 18 - bg,pn %xcc, do_einval - sub %o0, 1, %o0 - sllx %o0, 5, %o0 - sethi %hi(__socketcall_table_begin), %g2 - or %g2, %lo(__socketcall_table_begin), %g2 - jmpl %g2 + %o0, %g0 - nop -do_einval: - retl - mov -EINVAL, %o0 - - .align 32 -__socketcall_table_begin: - - /* Each entry is exactly 32 bytes. */ -do_sys_socket: /* sys_socket(int, int, int) */ -1: ldswa [%o1 + 0x0] %asi, %o0 - sethi %hi(sys_socket), %g1 -2: ldswa [%o1 + 0x8] %asi, %o2 - jmpl %g1 + %lo(sys_socket), %g0 -3: ldswa [%o1 + 0x4] %asi, %o1 - nop - nop - nop -do_sys_bind: /* sys_bind(int fd, struct sockaddr *, int) */ -4: ldswa [%o1 + 0x0] %asi, %o0 - sethi %hi(sys_bind), %g1 -5: ldswa [%o1 + 0x8] %asi, %o2 - jmpl %g1 + %lo(sys_bind), %g0 -6: lduwa [%o1 + 0x4] %asi, %o1 - nop - nop - nop -do_sys_connect: /* sys_connect(int, struct sockaddr *, int) */ -7: ldswa [%o1 + 0x0] %asi, %o0 - sethi %hi(sys_connect), %g1 -8: ldswa [%o1 + 0x8] %asi, %o2 - jmpl %g1 + %lo(sys_connect), %g0 -9: lduwa [%o1 + 0x4] %asi, %o1 - nop - nop - nop -do_sys_listen: /* sys_listen(int, int) */ -10: ldswa [%o1 + 0x0] %asi, %o0 - sethi %hi(sys_listen), %g1 - jmpl %g1 + %lo(sys_listen), %g0 -11: ldswa [%o1 + 0x4] %asi, %o1 - nop - nop - nop - nop -do_sys_accept: /* sys_accept(int, struct sockaddr *, int *) */ -12: ldswa [%o1 + 0x0] %asi, %o0 - sethi %hi(sys_accept), %g1 -13: lduwa [%o1 + 0x8] %asi, %o2 - jmpl %g1 + %lo(sys_accept), %g0 -14: lduwa [%o1 + 0x4] %asi, %o1 - nop - nop - nop -do_sys_getsockname: /* sys_getsockname(int, struct sockaddr *, int *) */ -15: ldswa [%o1 + 0x0] %asi, %o0 - sethi %hi(sys_getsockname), %g1 -16: lduwa [%o1 + 0x8] %asi, %o2 - jmpl %g1 + %lo(sys_getsockname), %g0 -17: lduwa [%o1 + 0x4] %asi, %o1 - nop - nop - nop -do_sys_getpeername: /* sys_getpeername(int, struct sockaddr *, int *) */ -18: ldswa [%o1 + 0x0] %asi, %o0 - sethi %hi(sys_getpeername), %g1 -19: lduwa [%o1 + 0x8] %asi, %o2 - jmpl %g1 + %lo(sys_getpeername), %g0 -20: lduwa [%o1 + 0x4] %asi, %o1 - nop - nop - nop -do_sys_socketpair: /* sys_socketpair(int, int, int, int *) */ -21: ldswa [%o1 + 0x0] %asi, %o0 - sethi %hi(sys_socketpair), %g1 -22: ldswa [%o1 + 0x8] %asi, %o2 -23: lduwa [%o1 + 0xc] %asi, %o3 - jmpl %g1 + %lo(sys_socketpair), %g0 -24: ldswa [%o1 + 0x4] %asi, %o1 - nop - nop -do_sys_send: /* sys_send(int, void *, size_t, unsigned int) */ -25: ldswa [%o1 + 0x0] %asi, %o0 - sethi %hi(sys_send), %g1 -26: lduwa [%o1 + 0x8] %asi, %o2 -27: lduwa [%o1 + 0xc] %asi, %o3 - jmpl %g1 + %lo(sys_send), %g0 -28: lduwa [%o1 + 0x4] %asi, %o1 - nop - nop -do_sys_recv: /* sys_recv(int, void *, size_t, unsigned int) */ -29: ldswa [%o1 + 0x0] %asi, %o0 - sethi %hi(sys_recv), %g1 -30: lduwa [%o1 + 0x8] %asi, %o2 -31: lduwa [%o1 + 0xc] %asi, %o3 - jmpl %g1 + %lo(sys_recv), %g0 -32: lduwa [%o1 + 0x4] %asi, %o1 - nop - nop -do_sys_sendto: /* sys_sendto(int, u32, compat_size_t, unsigned int, u32, int) */ -33: ldswa [%o1 + 0x0] %asi, %o0 - sethi %hi(sys_sendto), %g1 -34: lduwa [%o1 + 0x8] %asi, %o2 -35: lduwa [%o1 + 0xc] %asi, %o3 -36: lduwa [%o1 + 0x10] %asi, %o4 -37: ldswa [%o1 + 0x14] %asi, %o5 - jmpl %g1 + %lo(sys_sendto), %g0 -38: lduwa [%o1 + 0x4] %asi, %o1 -do_sys_recvfrom: /* sys_recvfrom(int, u32, compat_size_t, unsigned int, u32, u32) */ -39: ldswa [%o1 + 0x0] %asi, %o0 - sethi %hi(sys_recvfrom), %g1 -40: lduwa [%o1 + 0x8] %asi, %o2 -41: lduwa [%o1 + 0xc] %asi, %o3 -42: lduwa [%o1 + 0x10] %asi, %o4 -43: lduwa [%o1 + 0x14] %asi, %o5 - jmpl %g1 + %lo(sys_recvfrom), %g0 -44: lduwa [%o1 + 0x4] %asi, %o1 -do_sys_shutdown: /* sys_shutdown(int, int) */ -45: ldswa [%o1 + 0x0] %asi, %o0 - sethi %hi(sys_shutdown), %g1 - jmpl %g1 + %lo(sys_shutdown), %g0 -46: ldswa [%o1 + 0x4] %asi, %o1 - nop - nop - nop - nop -do_sys_setsockopt: /* sys_setsockopt(int, int, int, char *, int) */ -47: ldswa [%o1 + 0x0] %asi, %o0 - sethi %hi(sys_setsockopt), %g1 -48: ldswa [%o1 + 0x8] %asi, %o2 -49: lduwa [%o1 + 0xc] %asi, %o3 -50: ldswa [%o1 + 0x10] %asi, %o4 - jmpl %g1 + %lo(sys_setsockopt), %g0 -51: ldswa [%o1 + 0x4] %asi, %o1 - nop -do_sys_getsockopt: /* sys_getsockopt(int, int, int, u32, u32) */ -52: ldswa [%o1 + 0x0] %asi, %o0 - sethi %hi(sys_getsockopt), %g1 -53: ldswa [%o1 + 0x8] %asi, %o2 -54: lduwa [%o1 + 0xc] %asi, %o3 -55: lduwa [%o1 + 0x10] %asi, %o4 - jmpl %g1 + %lo(sys_getsockopt), %g0 -56: ldswa [%o1 + 0x4] %asi, %o1 - nop -do_sys_sendmsg: /* compat_sys_sendmsg(int, struct compat_msghdr *, unsigned int) */ -57: ldswa [%o1 + 0x0] %asi, %o0 - sethi %hi(compat_sys_sendmsg), %g1 -58: lduwa [%o1 + 0x8] %asi, %o2 - jmpl %g1 + %lo(compat_sys_sendmsg), %g0 -59: lduwa [%o1 + 0x4] %asi, %o1 - nop - nop - nop -do_sys_recvmsg: /* compat_sys_recvmsg(int, struct compat_msghdr *, unsigned int) */ -60: ldswa [%o1 + 0x0] %asi, %o0 - sethi %hi(compat_sys_recvmsg), %g1 -61: lduwa [%o1 + 0x8] %asi, %o2 - jmpl %g1 + %lo(compat_sys_recvmsg), %g0 -62: lduwa [%o1 + 0x4] %asi, %o1 - nop - nop - nop -do_sys_accept4: /* sys_accept4(int, struct sockaddr *, int *, int) */ -63: ldswa [%o1 + 0x0] %asi, %o0 - sethi %hi(sys_accept4), %g1 -64: lduwa [%o1 + 0x8] %asi, %o2 -65: ldswa [%o1 + 0xc] %asi, %o3 - jmpl %g1 + %lo(sys_accept4), %g0 -66: lduwa [%o1 + 0x4] %asi, %o1 - nop - nop - - .section __ex_table,"a" - .align 4 - .word 1b, __retl_efault, 2b, __retl_efault - .word 3b, __retl_efault, 4b, __retl_efault - .word 5b, __retl_efault, 6b, __retl_efault - .word 7b, __retl_efault, 8b, __retl_efault - .word 9b, __retl_efault, 10b, __retl_efault - .word 11b, __retl_efault, 12b, __retl_efault - .word 13b, __retl_efault, 14b, __retl_efault - .word 15b, __retl_efault, 16b, __retl_efault - .word 17b, __retl_efault, 18b, __retl_efault - .word 19b, __retl_efault, 20b, __retl_efault - .word 21b, __retl_efault, 22b, __retl_efault - .word 23b, __retl_efault, 24b, __retl_efault - .word 25b, __retl_efault, 26b, __retl_efault - .word 27b, __retl_efault, 28b, __retl_efault - .word 29b, __retl_efault, 30b, __retl_efault - .word 31b, __retl_efault, 32b, __retl_efault - .word 33b, __retl_efault, 34b, __retl_efault - .word 35b, __retl_efault, 36b, __retl_efault - .word 37b, __retl_efault, 38b, __retl_efault - .word 39b, __retl_efault, 40b, __retl_efault - .word 41b, __retl_efault, 42b, __retl_efault - .word 43b, __retl_efault, 44b, __retl_efault - .word 45b, __retl_efault, 46b, __retl_efault - .word 47b, __retl_efault, 48b, __retl_efault - .word 49b, __retl_efault, 50b, __retl_efault - .word 51b, __retl_efault, 52b, __retl_efault - .word 53b, __retl_efault, 54b, __retl_efault - .word 55b, __retl_efault, 56b, __retl_efault - .word 57b, __retl_efault, 58b, __retl_efault - .word 59b, __retl_efault, 60b, __retl_efault - .word 61b, __retl_efault, 62b, __retl_efault - .word 63b, __retl_efault, 64b, __retl_efault - .word 65b, __retl_efault, 66b, __retl_efault - .previous diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 5e55f73f9880a2..cfdfb3707c167c 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -155,7 +155,7 @@ 123 32 fchown sys_fchown16 123 64 fchown sys_fchown 124 common fchmod sys_fchmod -125 common recvfrom sys_recvfrom +125 common recvfrom sys_recvfrom compat_sys_recvfrom 126 32 setreuid sys_setreuid16 126 64 setreuid sys_setreuid 127 32 setregid sys_setregid16 @@ -247,7 +247,7 @@ 204 32 readdir sys_old_readdir compat_sys_old_readdir 204 64 readdir sys_nis_syscall 205 common readahead sys_readahead compat_sys_readahead -206 common socketcall sys_socketcall sys32_socketcall +206 common socketcall sys_socketcall compat_sys_socketcall 207 common syslog sys_syslog 208 common lookup_dcookie sys_ni_syscall 209 common fadvise64 sys_fadvise64 compat_sys_fadvise64 From 20a50787349fadf66ac5c48f62e58d753878d2bb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 19 Jun 2024 14:27:55 +0200 Subject: [PATCH 1415/1578] parisc: use correct compat recv/recvfrom syscalls Johannes missed parisc back when he introduced the compat version of these syscalls, so receiving cmsg messages that require a compat conversion is still broken. Use the correct calls like the other architectures do. Fixes: 1dacc76d0014 ("net/compat/wext: send different messages to compat tasks") Acked-by: Helge Deller Signed-off-by: Arnd Bergmann --- arch/parisc/kernel/syscalls/syscall.tbl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index b13c21373974c8..39e67fab751508 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -108,7 +108,7 @@ 95 common fchown sys_fchown 96 common getpriority sys_getpriority 97 common setpriority sys_setpriority -98 common recv sys_recv +98 common recv sys_recv compat_sys_recv 99 common statfs sys_statfs compat_sys_statfs 100 common fstatfs sys_fstatfs compat_sys_fstatfs 101 common stat64 sys_stat64 @@ -135,7 +135,7 @@ 120 common clone sys_clone_wrapper 121 common setdomainname sys_setdomainname 122 common sendfile sys_sendfile compat_sys_sendfile -123 common recvfrom sys_recvfrom +123 common recvfrom sys_recvfrom compat_sys_recvfrom 124 32 adjtimex sys_adjtimex_time32 124 64 adjtimex sys_adjtimex 125 common mprotect sys_mprotect From 403f17a330732a666ae793f3b15bc75bb5540524 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 7 Jun 2024 13:40:45 +0200 Subject: [PATCH 1416/1578] parisc: use generic sys_fanotify_mark implementation The sys_fanotify_mark() syscall on parisc uses the reverse word order for the two halves of the 64-bit argument compared to all syscalls on all 32-bit architectures. As far as I can tell, the problem is that the function arguments on parisc are sorted backwards (26, 25, 24, 23, ...) compared to everyone else, so the calling conventions of using an even/odd register pair in native word order result in the lower word coming first in function arguments, matching the expected behavior on little-endian architectures. The system call conventions however ended up matching what the other 32-bit architectures do. A glibc cleanup in 2020 changed the userspace behavior in a way that handles all architectures consistently, but this inadvertently broke parisc32 by changing to the same method as everyone else. The change made it into glibc-2.35 and subsequently into debian 12 (bookworm), which is the latest stable release. This means we need to choose between reverting the glibc change or changing the kernel to match it again, but either hange will leave some systems broken. Pick the option that is more likely to help current and future users and change the kernel to match current glibc. This also means the behavior is now consistent across architectures, but it breaks running new kernels with old glibc builds before 2.35. Link: https://sourceware.org/git/?p=glibc.git;a=commitdiff;h=d150181d73d9 Link: https://git.kernel.org/pub/scm/linux/kernel/git/history/history.git/commit/arch/parisc/kernel/sys_parisc.c?h=57b1dfbd5b4a39d Cc: Adhemerval Zanella Tested-by: Helge Deller Acked-by: Helge Deller Signed-off-by: Arnd Bergmann --- I found this through code inspection, please double-check to make sure I got the bug and the fix right. The alternative is to fix this by reverting glibc back to the unusual behavior. --- arch/parisc/Kconfig | 1 + arch/parisc/kernel/sys_parisc32.c | 9 --------- arch/parisc/kernel/syscalls/syscall.tbl | 2 +- 3 files changed, 2 insertions(+), 10 deletions(-) diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index daafeb20f99370..dc9b902de8ea98 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -16,6 +16,7 @@ config PARISC select ARCH_HAS_UBSAN select ARCH_HAS_PTE_SPECIAL select ARCH_NO_SG_CHAIN + select ARCH_SPLIT_ARG64 if !64BIT select ARCH_SUPPORTS_HUGETLBFS if PA20 select ARCH_SUPPORTS_MEMORY_FAILURE select ARCH_STACKWALK diff --git a/arch/parisc/kernel/sys_parisc32.c b/arch/parisc/kernel/sys_parisc32.c index 2a12a547b447bd..826c8e51b5853b 100644 --- a/arch/parisc/kernel/sys_parisc32.c +++ b/arch/parisc/kernel/sys_parisc32.c @@ -23,12 +23,3 @@ asmlinkage long sys32_unimplemented(int r26, int r25, int r24, int r23, current->comm, current->pid, r20); return -ENOSYS; } - -asmlinkage long sys32_fanotify_mark(compat_int_t fanotify_fd, compat_uint_t flags, - compat_uint_t mask0, compat_uint_t mask1, compat_int_t dfd, - const char __user * pathname) -{ - return sys_fanotify_mark(fanotify_fd, flags, - ((__u64)mask1 << 32) | mask0, - dfd, pathname); -} diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 39e67fab751508..66dc406b12e448 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -364,7 +364,7 @@ 320 common accept4 sys_accept4 321 common prlimit64 sys_prlimit64 322 common fanotify_init sys_fanotify_init -323 common fanotify_mark sys_fanotify_mark sys32_fanotify_mark +323 common fanotify_mark sys_fanotify_mark compat_sys_fanotify_mark 324 32 clock_adjtime sys_clock_adjtime32 324 64 clock_adjtime sys_clock_adjtime 325 common name_to_handle_at sys_name_to_handle_at From b1e31c134a8ab2e8f5fd62323b6b45a950ac704d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 24 Apr 2024 16:36:13 +0200 Subject: [PATCH 1417/1578] powerpc: restore some missing spu syscalls A couple of system calls were inadventently removed from the table during a bugfix for 32-bit powerpc entry. Restore the original behavior. Fixes: e23750623835 ("powerpc/32: fix syscall wrappers with 64-bit arguments of unaligned register-pairs") Acked-by: Michael Ellerman Signed-off-by: Arnd Bergmann --- arch/powerpc/kernel/syscalls/syscall.tbl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index c6b0546b284d96..ebae8415dfbbab 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -230,8 +230,10 @@ 178 nospu rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend 179 32 pread64 sys_ppc_pread64 compat_sys_ppc_pread64 179 64 pread64 sys_pread64 +179 spu pread64 sys_pread64 180 32 pwrite64 sys_ppc_pwrite64 compat_sys_ppc_pwrite64 180 64 pwrite64 sys_pwrite64 +180 spu pwrite64 sys_pwrite64 181 common chown sys_chown 182 common getcwd sys_getcwd 183 common capget sys_capget @@ -246,6 +248,7 @@ 190 common ugetrlimit sys_getrlimit compat_sys_getrlimit 191 32 readahead sys_ppc_readahead compat_sys_ppc_readahead 191 64 readahead sys_readahead +191 spu readahead sys_readahead 192 32 mmap2 sys_mmap2 compat_sys_mmap2 193 32 truncate64 sys_ppc_truncate64 compat_sys_ppc_truncate64 194 32 ftruncate64 sys_ppc_ftruncate64 compat_sys_ppc_ftruncate64 @@ -293,6 +296,7 @@ 232 nospu set_tid_address sys_set_tid_address 233 32 fadvise64 sys_ppc32_fadvise64 compat_sys_ppc32_fadvise64 233 64 fadvise64 sys_fadvise64 +233 spu fadvise64 sys_fadvise64 234 nospu exit_group sys_exit_group 235 nospu lookup_dcookie sys_ni_syscall 236 common epoll_create sys_epoll_create From 30766f1105d6d2459c3b9fe34a3e52b637a72950 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 11 Jun 2024 22:12:43 +0200 Subject: [PATCH 1418/1578] sh: rework sync_file_range ABI The unusual function calling conventions on SuperH ended up causing sync_file_range to have the wrong argument order, with the 'flags' argument getting sorted before 'nbytes' by the compiler. In userspace, I found that musl, glibc, uclibc and strace all expect the normal calling conventions with 'nbytes' last, so changing the kernel to match them should make all of those work. In order to be able to also fix libc implementations to work with existing kernels, they need to be able to tell which ABI is used. An easy way to do this is to add yet another system call using the sync_file_range2 ABI that works the same on all architectures. Old user binaries can now work on new kernels, and new binaries can try the new sync_file_range2() to work with new kernels or fall back to the old sync_file_range() version if that doesn't exist. Cc: stable@vger.kernel.org Fixes: 75c92acdd5b1 ("sh: Wire up new syscalls.") Acked-by: John Paul Adrian Glaubitz Signed-off-by: Arnd Bergmann --- arch/sh/kernel/sys_sh32.c | 11 +++++++++++ arch/sh/kernel/syscalls/syscall.tbl | 3 ++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/sh/kernel/sys_sh32.c b/arch/sh/kernel/sys_sh32.c index 9dca568509a5e5..d6f4afcb0e8705 100644 --- a/arch/sh/kernel/sys_sh32.c +++ b/arch/sh/kernel/sys_sh32.c @@ -59,3 +59,14 @@ asmlinkage int sys_fadvise64_64_wrapper(int fd, u32 offset0, u32 offset1, (u64)len0 << 32 | len1, advice); #endif } + +/* + * swap the arguments the way that libc wants them instead of + * moving flags ahead of the 64-bit nbytes argument + */ +SYSCALL_DEFINE6(sh_sync_file_range6, int, fd, SC_ARG64(offset), + SC_ARG64(nbytes), unsigned int, flags) +{ + return ksys_sync_file_range(fd, SC_VAL64(loff_t, offset), + SC_VAL64(loff_t, nbytes), flags); +} diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index bbf83a2db9868d..c55fd7696d40fc 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -321,7 +321,7 @@ 311 common set_robust_list sys_set_robust_list 312 common get_robust_list sys_get_robust_list 313 common splice sys_splice -314 common sync_file_range sys_sync_file_range +314 common sync_file_range sys_sh_sync_file_range6 315 common tee sys_tee 316 common vmsplice sys_vmsplice 317 common move_pages sys_move_pages @@ -395,6 +395,7 @@ 385 common pkey_alloc sys_pkey_alloc 386 common pkey_free sys_pkey_free 387 common rseq sys_rseq +388 common sync_file_range2 sys_sync_file_range2 # room for arch specific syscalls 393 common semget sys_semget 394 common semctl sys_semctl From 3339b99ef6fe38dac43b534cba3a8a0e29fb2eff Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 14 Jun 2024 09:54:20 +0200 Subject: [PATCH 1419/1578] csky, hexagon: fix broken sys_sync_file_range Both of these architectures require u64 function arguments to be passed in even/odd pairs of registers or stack slots, which in case of sync_file_range would result in a seven-argument system call that is not currently possible. The system call is therefore incompatible with all existing binaries. While it would be possible to implement support for seven arguments like on mips, it seems better to use a six-argument version, either with the normal argument order but misaligned as on most architectures or with the reordered sync_file_range2() calling conventions as on arm and powerpc. Cc: stable@vger.kernel.org Acked-by: Guo Ren Signed-off-by: Arnd Bergmann --- arch/csky/include/uapi/asm/unistd.h | 1 + arch/hexagon/include/uapi/asm/unistd.h | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/csky/include/uapi/asm/unistd.h b/arch/csky/include/uapi/asm/unistd.h index 7ff6a2466af10d..e0594b6370a658 100644 --- a/arch/csky/include/uapi/asm/unistd.h +++ b/arch/csky/include/uapi/asm/unistd.h @@ -6,6 +6,7 @@ #define __ARCH_WANT_SYS_CLONE3 #define __ARCH_WANT_SET_GET_RLIMIT #define __ARCH_WANT_TIME32_SYSCALLS +#define __ARCH_WANT_SYNC_FILE_RANGE2 #include #define __NR_set_thread_area (__NR_arch_specific_syscall + 0) diff --git a/arch/hexagon/include/uapi/asm/unistd.h b/arch/hexagon/include/uapi/asm/unistd.h index 432c4db1b62392..21ae22306b5dce 100644 --- a/arch/hexagon/include/uapi/asm/unistd.h +++ b/arch/hexagon/include/uapi/asm/unistd.h @@ -36,5 +36,6 @@ #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_TIME32_SYSCALLS +#define __ARCH_WANT_SYNC_FILE_RANGE2 #include From 896842284c6ccba25ec9d78b7b6e62cdd507c083 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 20 Jun 2024 15:24:11 +0200 Subject: [PATCH 1420/1578] hexagon: fix fadvise64_64 calling conventions fadvise64_64() has two 64-bit arguments at the wrong alignment for hexagon, which turns them into a 7-argument syscall that is not supported by Linux. The downstream musl port for hexagon actually asks for a 6-argument version the same way we do it on arm, csky, powerpc, so make the kernel do it the same way to avoid having to change both. Link: https://github.com/quic/musl/blob/hexagon/arch/hexagon/syscall_arch.h#L78 Cc: stable@vger.kernel.org Signed-off-by: Arnd Bergmann --- arch/hexagon/include/asm/syscalls.h | 6 ++++++ arch/hexagon/kernel/syscalltab.c | 7 +++++++ 2 files changed, 13 insertions(+) create mode 100644 arch/hexagon/include/asm/syscalls.h diff --git a/arch/hexagon/include/asm/syscalls.h b/arch/hexagon/include/asm/syscalls.h new file mode 100644 index 00000000000000..40f2d08bec92cc --- /dev/null +++ b/arch/hexagon/include/asm/syscalls.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include + +asmlinkage long sys_hexagon_fadvise64_64(int fd, int advice, + u32 a2, u32 a3, u32 a4, u32 a5); diff --git a/arch/hexagon/kernel/syscalltab.c b/arch/hexagon/kernel/syscalltab.c index 0fadd582cfc77f..5d98bdc494ec29 100644 --- a/arch/hexagon/kernel/syscalltab.c +++ b/arch/hexagon/kernel/syscalltab.c @@ -14,6 +14,13 @@ #undef __SYSCALL #define __SYSCALL(nr, call) [nr] = (call), +SYSCALL_DEFINE6(hexagon_fadvise64_64, int, fd, int, advice, + SC_ARG64(offset), SC_ARG64(len)) +{ + return ksys_fadvise64_64(fd, SC_VAL64(loff_t, offset), SC_VAL64(loff_t, len), advice); +} +#define sys_fadvise64_64 sys_hexagon_fadvise64_64 + void *sys_call_table[__NR_syscalls] = { #include }; From 5daf62da52ecd5761d63cbb6489eb434645547df Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 4 Jun 2024 23:46:36 +0200 Subject: [PATCH 1421/1578] s390: remove native mmap2() syscall The mmap2() syscall has never been used on 64-bit s390x and should have been removed as part of 5a79859ae0f3 ("s390: remove 31 bit support"). Remove it now. Acked-by: Heiko Carstens Signed-off-by: Arnd Bergmann --- arch/s390/kernel/syscall.c | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/arch/s390/kernel/syscall.c b/arch/s390/kernel/syscall.c index dc2355c623d6ea..50cbcbbaa03d12 100644 --- a/arch/s390/kernel/syscall.c +++ b/arch/s390/kernel/syscall.c @@ -38,33 +38,6 @@ #include "entry.h" -/* - * Perform the mmap() system call. Linux for S/390 isn't able to handle more - * than 5 system call parameters, so this system call uses a memory block - * for parameter passing. - */ - -struct s390_mmap_arg_struct { - unsigned long addr; - unsigned long len; - unsigned long prot; - unsigned long flags; - unsigned long fd; - unsigned long offset; -}; - -SYSCALL_DEFINE1(mmap2, struct s390_mmap_arg_struct __user *, arg) -{ - struct s390_mmap_arg_struct a; - int error = -EFAULT; - - if (copy_from_user(&a, arg, sizeof(a))) - goto out; - error = ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); -out: - return error; -} - #ifdef CONFIG_SYSVIPC /* * sys_ipc() is the de-multiplexer for the SysV IPC calls. From 295f10061af024099440b46602bcc47364551db7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 4 Jun 2024 14:20:26 +0200 Subject: [PATCH 1422/1578] syscalls: mmap(): use unsigned offset type consistently Most architectures that implement the old-style mmap() with byte offset use 'unsigned long' as the type for that offset, but microblaze and riscv have the off_t type that is shared with userspace, matching the prototype in include/asm-generic/syscalls.h. Make this consistent by using an unsigned argument everywhere. This changes the behavior slightly, as the argument is shifted to a page number, and an user input with the top bit set would result in a negative page offset rather than a large one as we use elsewhere. For riscv, the 32-bit sys_mmap2() definition actually used a custom type that is different from the global declaration, but this was missed due to an incorrect type check. Signed-off-by: Arnd Bergmann --- arch/csky/kernel/syscall.c | 2 +- arch/loongarch/kernel/syscall.c | 2 +- arch/microblaze/kernel/sys_microblaze.c | 2 +- arch/riscv/kernel/sys_riscv.c | 4 ++-- include/asm-generic/syscalls.h | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/csky/kernel/syscall.c b/arch/csky/kernel/syscall.c index 3d30e58a45d21c..4540a271ee39a7 100644 --- a/arch/csky/kernel/syscall.c +++ b/arch/csky/kernel/syscall.c @@ -20,7 +20,7 @@ SYSCALL_DEFINE6(mmap2, unsigned long, prot, unsigned long, flags, unsigned long, fd, - off_t, offset) + unsigned long, offset) { if (unlikely(offset & (~PAGE_MASK >> 12))) return -EINVAL; diff --git a/arch/loongarch/kernel/syscall.c b/arch/loongarch/kernel/syscall.c index b4c5acd7aa3b32..8801611143ab34 100644 --- a/arch/loongarch/kernel/syscall.c +++ b/arch/loongarch/kernel/syscall.c @@ -22,7 +22,7 @@ #define __SYSCALL(nr, call) [nr] = (call), SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, - prot, unsigned long, flags, unsigned long, fd, off_t, offset) + prot, unsigned long, flags, unsigned long, fd, unsigned long, offset) { if (offset & ~PAGE_MASK) return -EINVAL; diff --git a/arch/microblaze/kernel/sys_microblaze.c b/arch/microblaze/kernel/sys_microblaze.c index ed9f34da1a2a49..0850b099f300ea 100644 --- a/arch/microblaze/kernel/sys_microblaze.c +++ b/arch/microblaze/kernel/sys_microblaze.c @@ -35,7 +35,7 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, - off_t, pgoff) + unsigned long, pgoff) { if (pgoff & ~PAGE_MASK) return -EINVAL; diff --git a/arch/riscv/kernel/sys_riscv.c b/arch/riscv/kernel/sys_riscv.c index 64155323cc923a..d77afe05578f23 100644 --- a/arch/riscv/kernel/sys_riscv.c +++ b/arch/riscv/kernel/sys_riscv.c @@ -23,7 +23,7 @@ static long riscv_sys_mmap(unsigned long addr, unsigned long len, #ifdef CONFIG_64BIT SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, - unsigned long, fd, off_t, offset) + unsigned long, fd, unsigned long, offset) { return riscv_sys_mmap(addr, len, prot, flags, fd, offset, 0); } @@ -32,7 +32,7 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, #if defined(CONFIG_32BIT) || defined(CONFIG_COMPAT) SYSCALL_DEFINE6(mmap2, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, - unsigned long, fd, off_t, offset) + unsigned long, fd, unsigned long, offset) { /* * Note that the shift for mmap2 is constant (12), diff --git a/include/asm-generic/syscalls.h b/include/asm-generic/syscalls.h index 933ca6581abace..fabcefe8a80a7d 100644 --- a/include/asm-generic/syscalls.h +++ b/include/asm-generic/syscalls.h @@ -19,7 +19,7 @@ asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, #ifndef sys_mmap asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, - unsigned long fd, off_t pgoff); + unsigned long fd, unsigned long off); #endif #ifndef sys_rt_sigreturn From 0fa8ab5f3533b307a7d0e438ab08ecd92725dad7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 20 Jun 2024 14:47:27 +0200 Subject: [PATCH 1423/1578] linux/syscalls.h: add missing __user annotations A couple of declarations in linux/syscalls.h are missing __user annotations on their pointers, which can lead to warnings from sparse because these don't match the implementation that have the correct address space annotations. Signed-off-by: Arnd Bergmann --- include/linux/syscalls.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index ba933770987860..63424af87bbab2 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -322,13 +322,13 @@ asmlinkage long sys_io_pgetevents(aio_context_t ctx_id, long nr, struct io_event __user *events, struct __kernel_timespec __user *timeout, - const struct __aio_sigset *sig); + const struct __aio_sigset __user *sig); asmlinkage long sys_io_pgetevents_time32(aio_context_t ctx_id, long min_nr, long nr, struct io_event __user *events, struct old_timespec32 __user *timeout, - const struct __aio_sigset *sig); + const struct __aio_sigset __user *sig); asmlinkage long sys_io_uring_setup(u32 entries, struct io_uring_params __user *p); asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit, @@ -441,7 +441,7 @@ asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group); asmlinkage long sys_openat(int dfd, const char __user *filename, int flags, umode_t mode); asmlinkage long sys_openat2(int dfd, const char __user *filename, - struct open_how *how, size_t size); + struct open_how __user *how, size_t size); asmlinkage long sys_close(unsigned int fd); asmlinkage long sys_close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); @@ -555,7 +555,7 @@ asmlinkage long sys_get_robust_list(int pid, asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, size_t len); -asmlinkage long sys_futex_waitv(struct futex_waitv *waiters, +asmlinkage long sys_futex_waitv(struct futex_waitv __user *waiters, unsigned int nr_futexes, unsigned int flags, struct __kernel_timespec __user *timeout, clockid_t clockid); @@ -907,7 +907,7 @@ asmlinkage long sys_seccomp(unsigned int op, unsigned int flags, asmlinkage long sys_getrandom(char __user *buf, size_t count, unsigned int flags); asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags); -asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); +asmlinkage long sys_bpf(int cmd, union bpf_attr __user *attr, unsigned int size); asmlinkage long sys_execveat(int dfd, const char __user *filename, const char __user *const __user *argv, const char __user *const __user *envp, int flags); @@ -960,11 +960,11 @@ asmlinkage long sys_cachestat(unsigned int fd, struct cachestat_range __user *cstat_range, struct cachestat __user *cstat, unsigned int flags); asmlinkage long sys_map_shadow_stack(unsigned long addr, unsigned long size, unsigned int flags); -asmlinkage long sys_lsm_get_self_attr(unsigned int attr, struct lsm_ctx *ctx, - u32 *size, u32 flags); -asmlinkage long sys_lsm_set_self_attr(unsigned int attr, struct lsm_ctx *ctx, +asmlinkage long sys_lsm_get_self_attr(unsigned int attr, struct lsm_ctx __user *ctx, + u32 __user *size, u32 flags); +asmlinkage long sys_lsm_set_self_attr(unsigned int attr, struct lsm_ctx __user *ctx, u32 size, u32 flags); -asmlinkage long sys_lsm_list_modules(u64 *ids, u32 *size, u32 flags); +asmlinkage long sys_lsm_list_modules(u64 __user *ids, u32 __user *size, u32 flags); /* * Architecture-specific system calls From e0011bca603c101f2a3c007bdb77f7006fa78fb1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 25 Jun 2024 09:04:56 +1000 Subject: [PATCH 1424/1578] nfsd: initialise nfsd_info.mutex early. nfsd_info.mutex can be dereferenced by svc_pool_stats_start() immediately after the new netns is created. Currently this can trigger an oops. Move the initialisation earlier before it can possibly be dereferenced. Fixes: 7b207ccd9833 ("svc: don't hold reference for poolstats, only mutex.") Reported-by: Sourabh Jain Closes: https://lore.kernel.org/all/c2e9f6de-1ec4-4d3a-b18d-d5a6ec0814a0@linux.ibm.com/ Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfsctl.c | 2 ++ fs/nfsd/nfssvc.c | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 533b65057e18ef..c848ebe5d08f12 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -2169,6 +2169,8 @@ static __net_init int nfsd_net_init(struct net *net) nn->nfsd_svcstats.program = &nfsd_program; nn->nfsd_versions = NULL; nn->nfsd4_minorversions = NULL; + nn->nfsd_info.mutex = &nfsd_mutex; + nn->nfsd_serv = NULL; nfsd4_init_leases_net(nn); get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key)); seqlock_init(&nn->writeverf_lock); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index cd9a6a1a9fc8f4..89d7918de7b1a2 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -672,7 +672,6 @@ int nfsd_create_serv(struct net *net) return error; } spin_lock(&nfsd_notifier_lock); - nn->nfsd_info.mutex = &nfsd_mutex; nn->nfsd_serv = serv; spin_unlock(&nfsd_notifier_lock); From ac03629b1612ad008ea6603a3d142e291e3de9bb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 25 Jun 2024 09:04:57 +1000 Subject: [PATCH 1425/1578] Revert "nfsd: fix oops when reading pool_stats before server is started" This reverts commit 8e948c365d9c10b685d1deb946bd833d6a9b43e0. The reverted commit moves a test on a field protected by a mutex outside of the protection of that mutex, and so is obviously racey. Depending on how the race goes, si->serv might be NULL when dereferenced in svc_pool_stats_start(), or svc_pool_stats_stop() might unlock a mutex that hadn't been locked. This bug that the commit tried to fix has been addressed by initialising ->mutex earlier. Fixes: 8e948c365d9c ("nfsd: fix oops when reading pool_stats before server is started") Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- net/sunrpc/svc_xprt.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 49a3bea33f9d5f..dd86d7f1e97e92 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1421,13 +1421,12 @@ static void *svc_pool_stats_start(struct seq_file *m, loff_t *pos) dprintk("svc_pool_stats_start, *pidx=%u\n", pidx); - if (!si->serv) - return NULL; - mutex_lock(si->mutex); if (!pidx) return SEQ_START_TOKEN; + if (!si->serv) + return NULL; return pidx > si->serv->sv_nrpools ? NULL : &si->serv->sv_pools[pidx - 1]; } @@ -1459,8 +1458,7 @@ static void svc_pool_stats_stop(struct seq_file *m, void *p) { struct svc_info *si = m->private; - if (si->serv) - mutex_unlock(si->mutex); + mutex_unlock(si->mutex); } static int svc_pool_stats_show(struct seq_file *m, void *p) From 5a830bbce3af16833fe0092dec47b6dd30279825 Mon Sep 17 00:00:00 2001 From: Phil Chang Date: Mon, 10 Jun 2024 21:31:36 +0800 Subject: [PATCH 1426/1578] hrtimer: Prevent queuing of hrtimer without a function callback The hrtimer function callback must not be NULL. It has to be specified by the call side but it is not validated by the hrtimer code. When a hrtimer is queued without a function callback, the kernel crashes with a null pointer dereference when trying to execute the callback in __run_hrtimer(). Introduce a validation before queuing the hrtimer in hrtimer_start_range_ns(). [anna-maria: Rephrase commit message] Signed-off-by: Phil Chang Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Anna-Maria Behnsen --- kernel/time/hrtimer.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 492c14aac642bb..b8ee320208d411 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1285,6 +1285,8 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, struct hrtimer_clock_base *base; unsigned long flags; + if (WARN_ON_ONCE(!timer->function)) + return; /* * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard From 9cc5f3bf63aa98bd7cc7ce8a8599077fde13283e Mon Sep 17 00:00:00 2001 From: Dragan Simic Date: Mon, 10 Jun 2024 07:21:12 +0200 Subject: [PATCH 1427/1578] kbuild: Install dtb files as 0644 in Makefile.dtbinst The compiled dtb files aren't executable, so install them with 0644 as their permission mode, instead of defaulting to 0755 for the permission mode and installing them with the executable bits set. Some Linux distributions, including Debian, [1][2][3] already include fixes in their kernel package build recipes to change the dtb file permissions to 0644 in their kernel packages. These changes, when additionally propagated into the long-term kernel versions, will allow such distributions to remove their downstream fixes. [1] https://salsa.debian.org/kernel-team/linux/-/merge_requests/642 [2] https://salsa.debian.org/kernel-team/linux/-/merge_requests/749 [3] https://salsa.debian.org/kernel-team/linux/-/blob/debian/6.8.12-1/debian/rules.real#L193 Cc: Diederik de Haas Cc: Fixes: aefd80307a05 ("kbuild: refactor Makefile.dtbinst more") Signed-off-by: Dragan Simic Reviewed-by: Nicolas Schier Signed-off-by: Masahiro Yamada --- scripts/Makefile.dtbinst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/Makefile.dtbinst b/scripts/Makefile.dtbinst index 67956f6496a5c0..9d920419a62cf4 100644 --- a/scripts/Makefile.dtbinst +++ b/scripts/Makefile.dtbinst @@ -17,7 +17,7 @@ include $(srctree)/scripts/Kbuild.include dst := $(INSTALL_DTBS_PATH) quiet_cmd_dtb_install = INSTALL $@ - cmd_dtb_install = install -D $< $@ + cmd_dtb_install = install -D -m 0644 $< $@ $(dst)/%: $(obj)/% $(call cmd,dtb_install) From 07d4cc2e7444356faac6552d0688a1670cc9d749 Mon Sep 17 00:00:00 2001 From: Mark-PK Tsai Date: Fri, 14 Jun 2024 15:15:02 +0800 Subject: [PATCH 1428/1578] kbuild: doc: Update default INSTALL_MOD_DIR from extra to updates The default INSTALL_MOD_DIR was changed from 'extra' to 'updates' in commit b74d7bb7ca24 ("kbuild: Modify default INSTALL_MOD_DIR from extra to updates"). This commit updates the documentation to align with the latest kernel. Fixes: b74d7bb7ca24 ("kbuild: Modify default INSTALL_MOD_DIR from extra to updates") Signed-off-by: Mark-PK Tsai Signed-off-by: Masahiro Yamada --- Documentation/kbuild/modules.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/kbuild/modules.rst b/Documentation/kbuild/modules.rst index a1f3eb7a43e235..131863142cbb35 100644 --- a/Documentation/kbuild/modules.rst +++ b/Documentation/kbuild/modules.rst @@ -128,7 +128,7 @@ executed to make module versioning work. modules_install Install the external module(s). The default location is - /lib/modules//extra/, but a prefix may + /lib/modules//updates/, but a prefix may be added with INSTALL_MOD_PATH (discussed in section 5). clean @@ -417,7 +417,7 @@ directory: And external modules are installed in: - /lib/modules/$(KERNELRELEASE)/extra/ + /lib/modules/$(KERNELRELEASE)/updates/ 5.1 INSTALL_MOD_PATH -------------------- @@ -438,10 +438,10 @@ And external modules are installed in: ------------------- External modules are by default installed to a directory under - /lib/modules/$(KERNELRELEASE)/extra/, but you may wish to + /lib/modules/$(KERNELRELEASE)/updates/, but you may wish to locate modules for a specific functionality in a separate directory. For this purpose, use INSTALL_MOD_DIR to specify an - alternative name to "extra.":: + alternative name to "updates.":: $ make INSTALL_MOD_DIR=gandalf -C $KDIR \ M=$PWD modules_install From c61566538968ffb040acc411246fd7ad38c7e8c9 Mon Sep 17 00:00:00 2001 From: Thayne Harbaugh Date: Sat, 15 Jun 2024 23:34:54 -0600 Subject: [PATCH 1429/1578] kbuild: Fix build target deb-pkg: ln: failed to create hard link The make deb-pkg target calls debian-orig which attempts to either hard link the source .tar to the build-output location or copy the source .tar to the build-output location. The test to determine whether to ln or cp is incorrectly expanded by Make and consequently always attempts to ln the source .tar. This fix corrects the escaping of '$' so that the test is expanded by the shell rather than by Make and appropriately selects between ln and cp. Fixes: b44aa8c96e9e ("kbuild: deb-pkg: make .orig tarball a hard link if possible") Signed-off-by: Thayne Harbaugh Signed-off-by: Masahiro Yamada --- scripts/Makefile.package | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/Makefile.package b/scripts/Makefile.package index 38653f3e810889..bf016af8bf8ada 100644 --- a/scripts/Makefile.package +++ b/scripts/Makefile.package @@ -103,7 +103,7 @@ debian-orig: private version = $(shell dpkg-parsechangelog -S Version | sed 's/- debian-orig: private orig-name = $(source)_$(version).orig.tar$(debian-orig-suffix) debian-orig: mkdebian-opts = --need-source debian-orig: linux.tar$(debian-orig-suffix) debian - $(Q)if [ "$(df --output=target .. 2>/dev/null)" = "$(df --output=target $< 2>/dev/null)" ]; then \ + $(Q)if [ "$$(df --output=target .. 2>/dev/null)" = "$$(df --output=target $< 2>/dev/null)" ]; then \ ln -f $< ../$(orig-name); \ else \ cp $< ../$(orig-name); \ From 8d1001f7bdd0553a796998f4fff07ee13e1c1cad Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 18 Jun 2024 20:08:43 +0900 Subject: [PATCH 1430/1578] kbuild: rpm-pkg: fix build error with CONFIG_MODULES=n When CONFIG_MODULES is disabled, 'make (bin)rpm-pkg' fails: $ make allnoconfig binrpm-pkg [ snip ] error: File not found: .../linux/rpmbuild/BUILDROOT/kernel-6.10.0_rc3-1.i386/lib/modules/6.10.0-rc3/kernel error: File not found: .../linux/rpmbuild/BUILDROOT/kernel-6.10.0_rc3-1.i386/lib/modules/6.10.0-rc3/modules.order To make it work irrespective of CONFIG_MODULES, this commit specifies the directory path, /lib/modules/%{KERNELRELEASE}, instead of individual files. However, doing so would cause new warnings: warning: File listed twice: /lib/modules/6.10.0-rc3-dirty/modules.alias warning: File listed twice: /lib/modules/6.10.0-rc3-dirty/modules.alias.bin warning: File listed twice: /lib/modules/6.10.0-rc3-dirty/modules.builtin.alias.bin warning: File listed twice: /lib/modules/6.10.0-rc3-dirty/modules.builtin.bin warning: File listed twice: /lib/modules/6.10.0-rc3-dirty/modules.dep warning: File listed twice: /lib/modules/6.10.0-rc3-dirty/modules.dep.bin warning: File listed twice: /lib/modules/6.10.0-rc3-dirty/modules.devname warning: File listed twice: /lib/modules/6.10.0-rc3-dirty/modules.softdep warning: File listed twice: /lib/modules/6.10.0-rc3-dirty/modules.symbols warning: File listed twice: /lib/modules/6.10.0-rc3-dirty/modules.symbols.bin These files exist in /lib/modules/%{KERNELRELEASE} and are also explicitly marked as %ghost. Suppress depmod because depmod-generated files are not packaged. Fixes: 615b3a3d2d41 ("kbuild: rpm-pkg: do not include depmod-generated files") Signed-off-by: Masahiro Yamada Reviewed-by: Nathan Chancellor --- scripts/package/kernel.spec | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/scripts/package/kernel.spec b/scripts/package/kernel.spec index e095eb1e290ec0..fffc8af8deb174 100644 --- a/scripts/package/kernel.spec +++ b/scripts/package/kernel.spec @@ -57,7 +57,8 @@ patch -p1 < %{SOURCE2} %install mkdir -p %{buildroot}/lib/modules/%{KERNELRELEASE} cp $(%{make} %{makeflags} -s image_name) %{buildroot}/lib/modules/%{KERNELRELEASE}/vmlinuz -%{make} %{makeflags} INSTALL_MOD_PATH=%{buildroot} modules_install +# DEPMOD=true makes depmod no-op. We do not package depmod-generated files. +%{make} %{makeflags} INSTALL_MOD_PATH=%{buildroot} DEPMOD=true modules_install %{make} %{makeflags} INSTALL_HDR_PATH=%{buildroot}/usr headers_install cp System.map %{buildroot}/lib/modules/%{KERNELRELEASE} cp .config %{buildroot}/lib/modules/%{KERNELRELEASE}/config @@ -70,10 +71,7 @@ ln -fns /usr/src/kernels/%{KERNELRELEASE} %{buildroot}/lib/modules/%{KERNELRELEA %endif { - for x in System.map config kernel modules.builtin \ - modules.builtin.modinfo modules.order vmlinuz; do - echo "/lib/modules/%{KERNELRELEASE}/${x}" - done + echo "/lib/modules/%{KERNELRELEASE}" for x in alias alias.bin builtin.alias.bin builtin.bin dep dep.bin \ devname softdep symbols symbols.bin; do From 7ed9d1318c127b3aec77099802a9fdf2480250b4 Mon Sep 17 00:00:00 2001 From: Nicolas Schier Date: Mon, 24 Jun 2024 13:12:14 +0200 Subject: [PATCH 1431/1578] kbuild: Use $(obj)/%.cc to fix host C++ module builds Use $(obj)/ instead of $(src)/ prefix when building C++ modules for host, as explained in commit b1992c3772e6 ("kbuild: use $(src) instead of $(srctree)/$(src) for source directory"). This fixes build failures of 'xconfig': $ make O=build/ xconfig make[1]: Entering directory '/data/linux/kbuild-review/build' GEN Makefile make[3]: *** No rule to make target '../scripts/kconfig/qconf-moc.cc', needed by 'scripts/kconfig/qconf-moc.o'. Stop. Fixes: b1992c3772e6 ("kbuild: use $(src) instead of $(srctree)/$(src) for source directory") Reported-by: Rolf Eike Beer Signed-off-by: Nicolas Schier Tested-by: Rolf Eike Beer Signed-off-by: Masahiro Yamada --- scripts/Makefile.host | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/Makefile.host b/scripts/Makefile.host index d35f55e0d141e5..e85be7721a4804 100644 --- a/scripts/Makefile.host +++ b/scripts/Makefile.host @@ -146,7 +146,7 @@ $(call multi_depend, $(host-cxxmulti), , -objs -cxxobjs) # Create .o file from a single .cc (C++) file quiet_cmd_host-cxxobjs = HOSTCXX $@ cmd_host-cxxobjs = $(HOSTCXX) $(hostcxx_flags) -c -o $@ $< -$(host-cxxobjs): $(obj)/%.o: $(src)/%.cc FORCE +$(host-cxxobjs): $(obj)/%.o: $(obj)/%.cc FORCE $(call if_changed_dep,host-cxxobjs) # Create executable from a single Rust crate (which may consist of From 04a2aef59cfe192aa99020601d922359978cc72a Mon Sep 17 00:00:00 2001 From: Jesse Taube Date: Thu, 6 Jun 2024 14:28:00 -0400 Subject: [PATCH 1432/1578] RISC-V: fix vector insn load/store width mask RVFDQ_FL_FS_WIDTH_MASK should be 3 bits [14-12], shifted down by 12 bits. Replace GENMASK(3, 0) with GENMASK(2, 0). Fixes: cd054837243b ("riscv: Allocate user's vector context in the first-use trap") Signed-off-by: Jesse Taube Reviewed-by: Charlie Jenkins Link: https://lore.kernel.org/r/20240606182800.415831-1-jesse@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/insn.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/insn.h b/arch/riscv/include/asm/insn.h index 06e439eeef9ada..09fde95a5e8f75 100644 --- a/arch/riscv/include/asm/insn.h +++ b/arch/riscv/include/asm/insn.h @@ -145,7 +145,7 @@ /* parts of opcode for RVF, RVD and RVQ */ #define RVFDQ_FL_FS_WIDTH_OFF 12 -#define RVFDQ_FL_FS_WIDTH_MASK GENMASK(3, 0) +#define RVFDQ_FL_FS_WIDTH_MASK GENMASK(2, 0) #define RVFDQ_FL_FS_WIDTH_W 2 #define RVFDQ_FL_FS_WIDTH_D 3 #define RVFDQ_LS_FS_WIDTH_Q 4 From c223376b3019a00a0241faea0bc8c966738d1cc5 Mon Sep 17 00:00:00 2001 From: Li Ma Date: Thu, 6 Jun 2024 20:25:34 +0800 Subject: [PATCH 1433/1578] drm/amd/swsmu: add MALL init support workaround for smu_v14_0_1 [Why] SMU firmware has not supported MALL PG. [How] Disable MALL PG and make it always on until SMU firmware is ready. Signed-off-by: Li Ma Reviewed-by: Tim Huang Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 13 ++++ drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 5 ++ .../pm/swsmu/inc/pmfw_if/smu_v14_0_0_ppsmc.h | 4 +- drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h | 4 +- .../drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c | 73 +++++++++++++++++++ 5 files changed, 96 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index 7789b313285c4b..e1796ecf9c05c4 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -324,6 +324,18 @@ static int smu_dpm_set_umsch_mm_enable(struct smu_context *smu, return ret; } +static int smu_set_mall_enable(struct smu_context *smu) +{ + int ret = 0; + + if (!smu->ppt_funcs->set_mall_enable) + return 0; + + ret = smu->ppt_funcs->set_mall_enable(smu); + + return ret; +} + /** * smu_dpm_set_power_gate - power gate/ungate the specific IP block * @@ -1791,6 +1803,7 @@ static int smu_hw_init(void *handle) smu_dpm_set_jpeg_enable(smu, true); smu_dpm_set_vpe_enable(smu, true); smu_dpm_set_umsch_mm_enable(smu, true); + smu_set_mall_enable(smu); smu_set_gfx_cgpg(smu, true); } diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h index 0917dec8efe35c..64ccdb5f14eaf4 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h @@ -1394,6 +1394,11 @@ struct pptable_funcs { */ int (*dpm_set_umsch_mm_enable)(struct smu_context *smu, bool enable); + /** + * @set_mall_enable: Init MALL power gating control. + */ + int (*set_mall_enable)(struct smu_context *smu); + /** * @notify_rlc_state: Notify RLC power state to SMU. */ diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_ppsmc.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_ppsmc.h index c4dc5881d8df09..e7f5ef49049f9e 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_ppsmc.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_ppsmc.h @@ -106,8 +106,8 @@ #define PPSMC_MSG_DisableLSdma 0x35 ///< Disable LSDMA #define PPSMC_MSG_SetSoftMaxVpe 0x36 ///< #define PPSMC_MSG_SetSoftMinVpe 0x37 ///< -#define PPSMC_MSG_AllocMALLCache 0x38 ///< Allocating MALL Cache -#define PPSMC_MSG_ReleaseMALLCache 0x39 ///< Releasing MALL Cache +#define PPSMC_MSG_MALLPowerController 0x38 ///< Set MALL control +#define PPSMC_MSG_MALLPowerState 0x39 ///< Enter/Exit MALL PG #define PPSMC_Message_Count 0x3A ///< Total number of PPSMC messages /** @}*/ diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h index c48214e3dc8e42..2e32b085824ae9 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h @@ -272,7 +272,9 @@ __SMU_DUMMY_MAP(SetSoftMinVpe), \ __SMU_DUMMY_MAP(GetMetricsVersion), \ __SMU_DUMMY_MAP(EnableUCLKShadow), \ - __SMU_DUMMY_MAP(RmaDueToBadPageThreshold), + __SMU_DUMMY_MAP(RmaDueToBadPageThreshold), \ + __SMU_DUMMY_MAP(MALLPowerController), \ + __SMU_DUMMY_MAP(MALLPowerState), #undef __SMU_DUMMY_MAP #define __SMU_DUMMY_MAP(type) SMU_MSG_##type diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c index e4419e1561ef6e..18abfbd6d059ef 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c @@ -52,6 +52,19 @@ #define mmMP1_SMN_C2PMSG_90 0x029a #define mmMP1_SMN_C2PMSG_90_BASE_IDX 0 +/* MALLPowerController message arguments (Defines for the Cache mode control) */ +#define SMU_MALL_PMFW_CONTROL 0 +#define SMU_MALL_DRIVER_CONTROL 1 + +/* + * MALLPowerState message arguments + * (Defines for the Allocate/Release Cache mode if in driver mode) + */ +#define SMU_MALL_EXIT_PG 0 +#define SMU_MALL_ENTER_PG 1 + +#define SMU_MALL_PG_CONFIG_DEFAULT SMU_MALL_PG_CONFIG_DRIVER_CONTROL_ALWAYS_ON + #define FEATURE_MASK(feature) (1ULL << feature) #define SMC_DPM_FEATURE ( \ FEATURE_MASK(FEATURE_CCLK_DPM_BIT) | \ @@ -66,6 +79,12 @@ FEATURE_MASK(FEATURE_GFX_DPM_BIT) | \ FEATURE_MASK(FEATURE_VPE_DPM_BIT)) +enum smu_mall_pg_config { + SMU_MALL_PG_CONFIG_PMFW_CONTROL = 0, + SMU_MALL_PG_CONFIG_DRIVER_CONTROL_ALWAYS_ON = 1, + SMU_MALL_PG_CONFIG_DRIVER_CONTROL_ALWAYS_OFF = 2, +}; + static struct cmn2asic_msg_mapping smu_v14_0_0_message_map[SMU_MSG_MAX_COUNT] = { MSG_MAP(TestMessage, PPSMC_MSG_TestMessage, 1), MSG_MAP(GetSmuVersion, PPSMC_MSG_GetPmfwVersion, 1), @@ -113,6 +132,8 @@ static struct cmn2asic_msg_mapping smu_v14_0_0_message_map[SMU_MSG_MAX_COUNT] = MSG_MAP(PowerDownUmsch, PPSMC_MSG_PowerDownUmsch, 1), MSG_MAP(SetSoftMaxVpe, PPSMC_MSG_SetSoftMaxVpe, 1), MSG_MAP(SetSoftMinVpe, PPSMC_MSG_SetSoftMinVpe, 1), + MSG_MAP(MALLPowerController, PPSMC_MSG_MALLPowerController, 1), + MSG_MAP(MALLPowerState, PPSMC_MSG_MALLPowerState, 1), }; static struct cmn2asic_mapping smu_v14_0_0_feature_mask_map[SMU_FEATURE_COUNT] = { @@ -1423,6 +1444,57 @@ static int smu_v14_0_common_get_dpm_table(struct smu_context *smu, struct dpm_cl return 0; } +static int smu_v14_0_1_init_mall_power_gating(struct smu_context *smu, enum smu_mall_pg_config pg_config) +{ + struct amdgpu_device *adev = smu->adev; + int ret = 0; + + if (pg_config == SMU_MALL_PG_CONFIG_PMFW_CONTROL) { + ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_MALLPowerController, + SMU_MALL_PMFW_CONTROL, NULL); + if (ret) { + dev_err(adev->dev, "Init MALL PMFW CONTROL Failure\n"); + return ret; + } + } else { + ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_MALLPowerController, + SMU_MALL_DRIVER_CONTROL, NULL); + if (ret) { + dev_err(adev->dev, "Init MALL Driver CONTROL Failure\n"); + return ret; + } + + if (pg_config == SMU_MALL_PG_CONFIG_DRIVER_CONTROL_ALWAYS_ON) { + ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_MALLPowerState, + SMU_MALL_EXIT_PG, NULL); + if (ret) { + dev_err(adev->dev, "EXIT MALL PG Failure\n"); + return ret; + } + } else if (pg_config == SMU_MALL_PG_CONFIG_DRIVER_CONTROL_ALWAYS_OFF) { + ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_MALLPowerState, + SMU_MALL_ENTER_PG, NULL); + if (ret) { + dev_err(adev->dev, "Enter MALL PG Failure\n"); + return ret; + } + } + } + + return ret; +} + +static int smu_v14_0_common_set_mall_enable(struct smu_context *smu) +{ + enum smu_mall_pg_config pg_config = SMU_MALL_PG_CONFIG_DEFAULT; + int ret = 0; + + if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 1)) + ret = smu_v14_0_1_init_mall_power_gating(smu, pg_config); + + return ret; +} + static const struct pptable_funcs smu_v14_0_0_ppt_funcs = { .check_fw_status = smu_v14_0_check_fw_status, .check_fw_version = smu_v14_0_check_fw_version, @@ -1454,6 +1526,7 @@ static const struct pptable_funcs smu_v14_0_0_ppt_funcs = { .dpm_set_vpe_enable = smu_v14_0_0_set_vpe_enable, .dpm_set_umsch_mm_enable = smu_v14_0_0_set_umsch_mm_enable, .get_dpm_clock_table = smu_v14_0_common_get_dpm_table, + .set_mall_enable = smu_v14_0_common_set_mall_enable, }; static void smu_v14_0_0_set_smu_mailbox_registers(struct smu_context *smu) From f6f49dda49db72e7a0b4ca32c77391d5ff5ce232 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 14 Jun 2024 13:48:26 -0400 Subject: [PATCH 1434/1578] drm/amdgpu/atomfirmware: fix parsing of vram_info v3.x changed the how vram width was encoded. The previous implementation actually worked correctly for most boards. Fix the implementation to work correctly everywhere. This fixes the vram width reported in the kernel log on some boards. Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c index 108003bdf1e911..2e13c7c4b2b411 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c @@ -400,7 +400,7 @@ amdgpu_atomfirmware_get_vram_info(struct amdgpu_device *adev, mem_channel_number = vram_info->v30.channel_num; mem_channel_width = vram_info->v30.channel_width; if (vram_width) - *vram_width = mem_channel_number * (1 << mem_channel_width); + *vram_width = mem_channel_number * 16; break; default: return -EINVAL; From 74fa02c4a5ea1ade5156a6ce494d3ea83881c2d8 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Tue, 18 Jun 2024 14:04:38 +0530 Subject: [PATCH 1435/1578] drm/amdgpu: Fix pci state save during mode-1 reset Cache the PCI state before bus master is disabled. The saved state is later used for other cases like restoring config space after mode-2 reset. Fixes: 5c03e5843e6b ("drm/amdgpu:add smu mode1/2 support for aldebaran") Signed-off-by: Lijo Lazar Reviewed-by: Feifei Xu Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 932dc93b2e631d..33f791d92ddf3d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5220,11 +5220,14 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev) dev_info(adev->dev, "GPU mode1 reset\n"); + /* Cache the state before bus master disable. The saved config space + * values are used in other cases like restore after mode-2 reset. + */ + amdgpu_device_cache_pci_state(adev->pdev); + /* disable BM */ pci_clear_master(adev->pdev); - amdgpu_device_cache_pci_state(adev->pdev); - if (amdgpu_dpm_is_mode1_reset_supported(adev)) { dev_info(adev->dev, "GPU smu mode1 reset\n"); ret = amdgpu_dpm_mode1_reset(adev); From 2ec6c7f802332d1eff16f03e7c757f1543ee1183 Mon Sep 17 00:00:00 2001 From: Michael Strauss Date: Tue, 28 Nov 2023 10:31:12 -0500 Subject: [PATCH 1436/1578] drm/amd/display: Send DP_TOTAL_LTTPR_CNT during detection if LTTPR is present [WHY] New register field added in DP2.1 SCR, needed for auxless ALPM [HOW] Echo value read from 0xF0007 back to sink Reviewed-by: Wenjing Liu Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Signed-off-by: Alex Hung Signed-off-by: Michael Strauss Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../amd/display/dc/link/protocols/link_dp_capability.c | 10 +++++++++- drivers/gpu/drm/amd/display/include/dpcd_defs.h | 5 +++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c index a01d0842bf8e5d..d487dfcd219b02 100644 --- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c +++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c @@ -1590,9 +1590,17 @@ static bool retrieve_link_cap(struct dc_link *link) return false; } - if (dp_is_lttpr_present(link)) + if (dp_is_lttpr_present(link)) { configure_lttpr_mode_transparent(link); + // Echo TOTAL_LTTPR_CNT back downstream + core_link_write_dpcd( + link, + DP_TOTAL_LTTPR_CNT, + &link->dpcd_caps.lttpr_caps.phy_repeater_cnt, + sizeof(link->dpcd_caps.lttpr_caps.phy_repeater_cnt)); + } + /* Read DP tunneling information. */ status = dpcd_get_tunneling_device_data(link); diff --git a/drivers/gpu/drm/amd/display/include/dpcd_defs.h b/drivers/gpu/drm/amd/display/include/dpcd_defs.h index 914f28e9f22426..aee5170f5fb231 100644 --- a/drivers/gpu/drm/amd/display/include/dpcd_defs.h +++ b/drivers/gpu/drm/amd/display/include/dpcd_defs.h @@ -177,4 +177,9 @@ enum dpcd_psr_sink_states { #define DP_SINK_PR_PIXEL_DEVIATION_PER_LINE 0x379 #define DP_SINK_PR_MAX_NUMBER_OF_DEVIATION_LINE 0x37A +/* Remove once drm_dp_helper.h is updated upstream */ +#ifndef DP_TOTAL_LTTPR_CNT +#define DP_TOTAL_LTTPR_CNT 0xF000A /* 2.1 */ +#endif + #endif /* __DAL_DPCD_DEFS_H__ */ From bcfa48ff785bd121316592b131ff6531e3e696bb Mon Sep 17 00:00:00 2001 From: Julia Zhang Date: Mon, 3 Jun 2024 19:31:09 +0800 Subject: [PATCH 1437/1578] drm/amdgpu: avoid using null object of framebuffer Instead of using state->fb->obj[0] directly, get object from framebuffer by calling drm_gem_fb_get_obj() and return error code when object is null to avoid using null object of framebuffer. Reported-by: Fusheng Huang Signed-off-by: Julia Zhang Reviewed-by: Huang Rui Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c index e30eecd02ae1c1..fde66225c481a3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include "amdgpu.h" @@ -314,7 +315,13 @@ static int amdgpu_vkms_prepare_fb(struct drm_plane *plane, return 0; } afb = to_amdgpu_framebuffer(new_state->fb); - obj = new_state->fb->obj[0]; + + obj = drm_gem_fb_get_obj(new_state->fb, 0); + if (!obj) { + DRM_ERROR("Failed to get obj from framebuffer\n"); + return -EINVAL; + } + rbo = gem_to_amdgpu_bo(obj); adev = amdgpu_ttm_adev(rbo->tbo.bdev); @@ -368,12 +375,19 @@ static void amdgpu_vkms_cleanup_fb(struct drm_plane *plane, struct drm_plane_state *old_state) { struct amdgpu_bo *rbo; + struct drm_gem_object *obj; int r; if (!old_state->fb) return; - rbo = gem_to_amdgpu_bo(old_state->fb->obj[0]); + obj = drm_gem_fb_get_obj(old_state->fb, 0); + if (!obj) { + DRM_ERROR("Failed to get obj from framebuffer\n"); + return; + } + + rbo = gem_to_amdgpu_bo(obj); r = amdgpu_bo_reserve(rbo, false); if (unlikely(r)) { DRM_ERROR("failed to reserve rbo before unpin\n"); From 48880f9686b1ac2ea0831f65df953a63d1437fc0 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Mon, 3 Jun 2024 12:12:18 +0530 Subject: [PATCH 1438/1578] drm/amdgpu: Don't show false warning for reg list If reg list is already loaded on PSP 13.0.2 SOCs, psp will give TEE_ERR_CANCEL response on second time load. Avoid printing warn message for it. Signed-off-by: Lijo Lazar Reviewed-by: Feifei Xu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 25 +++++++++++++++++++++---- drivers/gpu/drm/amd/amdgpu/psp_gfx_if.h | 5 +++-- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 4bd4602d11b1e4..cef9dd0a012b54 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -640,6 +640,20 @@ static const char *psp_gfx_cmd_name(enum psp_gfx_cmd_id cmd_id) } } +static bool psp_err_warn(struct psp_context *psp) +{ + struct psp_gfx_cmd_resp *cmd = psp->cmd_buf_mem; + + /* This response indicates reg list is already loaded */ + if (amdgpu_ip_version(psp->adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 2) && + cmd->cmd_id == GFX_CMD_ID_LOAD_IP_FW && + cmd->cmd.cmd_load_ip_fw.fw_type == GFX_FW_TYPE_REG_LIST && + cmd->resp.status == TEE_ERROR_CANCEL) + return false; + + return true; +} + static int psp_cmd_submit_buf(struct psp_context *psp, struct amdgpu_firmware_info *ucode, @@ -699,10 +713,13 @@ psp_cmd_submit_buf(struct psp_context *psp, dev_warn(psp->adev->dev, "failed to load ucode %s(0x%X) ", amdgpu_ucode_name(ucode->ucode_id), ucode->ucode_id); - dev_warn(psp->adev->dev, - "psp gfx command %s(0x%X) failed and response status is (0x%X)\n", - psp_gfx_cmd_name(psp->cmd_buf_mem->cmd_id), psp->cmd_buf_mem->cmd_id, - psp->cmd_buf_mem->resp.status); + if (psp_err_warn(psp)) + dev_warn( + psp->adev->dev, + "psp gfx command %s(0x%X) failed and response status is (0x%X)\n", + psp_gfx_cmd_name(psp->cmd_buf_mem->cmd_id), + psp->cmd_buf_mem->cmd_id, + psp->cmd_buf_mem->resp.status); /* If any firmware (including CAP) load fails under SRIOV, it should * return failure to stop the VF from initializing. * Also return failure in case of timeout diff --git a/drivers/gpu/drm/amd/amdgpu/psp_gfx_if.h b/drivers/gpu/drm/amd/amdgpu/psp_gfx_if.h index 7566973ed8f596..37b5ddd6f13b33 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_gfx_if.h +++ b/drivers/gpu/drm/amd/amdgpu/psp_gfx_if.h @@ -464,8 +464,9 @@ struct psp_gfx_rb_frame #define PSP_ERR_UNKNOWN_COMMAND 0x00000100 enum tee_error_code { - TEE_SUCCESS = 0x00000000, - TEE_ERROR_NOT_SUPPORTED = 0xFFFF000A, + TEE_SUCCESS = 0x00000000, + TEE_ERROR_CANCEL = 0xFFFF0002, + TEE_ERROR_NOT_SUPPORTED = 0xFFFF000A, }; #endif /* _PSP_TEE_GFX_IF_H_ */ From 6d411c8ccc0137a612e0044489030a194ff5c843 Mon Sep 17 00:00:00 2001 From: Ma Ke Date: Tue, 25 Jun 2024 16:10:29 +0800 Subject: [PATCH 1439/1578] drm/nouveau/dispnv04: fix null pointer dereference in nv17_tv_get_hd_modes In nv17_tv_get_hd_modes(), the return value of drm_mode_duplicate() is assigned to mode, which will lead to a possible NULL pointer dereference on failure of drm_mode_duplicate(). The same applies to drm_cvt_mode(). Add a check to avoid null pointer dereference. Cc: stable@vger.kernel.org Signed-off-by: Ma Ke Signed-off-by: Lyude Paul Link: https://patchwork.freedesktop.org/patch/msgid/20240625081029.2619437-1-make24@iscas.ac.cn --- drivers/gpu/drm/nouveau/dispnv04/tvnv17.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/nouveau/dispnv04/tvnv17.c b/drivers/gpu/drm/nouveau/dispnv04/tvnv17.c index 670c9739e5e18c..9c3dc9a5bb46e5 100644 --- a/drivers/gpu/drm/nouveau/dispnv04/tvnv17.c +++ b/drivers/gpu/drm/nouveau/dispnv04/tvnv17.c @@ -258,6 +258,8 @@ static int nv17_tv_get_hd_modes(struct drm_encoder *encoder, if (modes[i].hdisplay == output_mode->hdisplay && modes[i].vdisplay == output_mode->vdisplay) { mode = drm_mode_duplicate(encoder->dev, output_mode); + if (!mode) + continue; mode->type |= DRM_MODE_TYPE_PREFERRED; } else { @@ -265,6 +267,8 @@ static int nv17_tv_get_hd_modes(struct drm_encoder *encoder, modes[i].vdisplay, 60, false, (output_mode->flags & DRM_MODE_FLAG_INTERLACE), false); + if (!mode) + continue; } /* CVT modes are sometimes unsuitable... */ From 66edf3fb331b6c55439b10f9862987b0916b3726 Mon Sep 17 00:00:00 2001 From: Ma Ke Date: Tue, 25 Jun 2024 16:18:28 +0800 Subject: [PATCH 1440/1578] drm/nouveau/dispnv04: fix null pointer dereference in nv17_tv_get_ld_modes In nv17_tv_get_ld_modes(), the return value of drm_mode_duplicate() is assigned to mode, which will lead to a possible NULL pointer dereference on failure of drm_mode_duplicate(). Add a check to avoid npd. Cc: stable@vger.kernel.org Signed-off-by: Ma Ke Signed-off-by: Lyude Paul Link: https://patchwork.freedesktop.org/patch/msgid/20240625081828.2620794-1-make24@iscas.ac.cn --- drivers/gpu/drm/nouveau/dispnv04/tvnv17.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/nouveau/dispnv04/tvnv17.c b/drivers/gpu/drm/nouveau/dispnv04/tvnv17.c index 9c3dc9a5bb46e5..2033214c4b7848 100644 --- a/drivers/gpu/drm/nouveau/dispnv04/tvnv17.c +++ b/drivers/gpu/drm/nouveau/dispnv04/tvnv17.c @@ -209,6 +209,8 @@ static int nv17_tv_get_ld_modes(struct drm_encoder *encoder, struct drm_display_mode *mode; mode = drm_mode_duplicate(encoder->dev, tv_mode); + if (!mode) + continue; mode->clock = tv_norm->tv_enc_mode.vrefresh * mode->htotal / 1000 * From 211c581de28e7741898720b5f74da4e62f37f972 Mon Sep 17 00:00:00 2001 From: Pei Li Date: Tue, 25 Jun 2024 13:04:59 -0700 Subject: [PATCH 1441/1578] bcachefs: slab-use-after-free Read in bch2_sb_errors_from_cpu Acquire fsck_error_counts_lock before accessing the critical section protected by this lock. syzbot has tested the proposed patch and the reproducer did not trigger any issue. Reported-by: syzbot+a2bc0e838efd7663f4d9@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=a2bc0e838efd7663f4d9 Signed-off-by: Pei Li Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-errors.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c index bda33e59e2264b..c1270d790e43bf 100644 --- a/fs/bcachefs/sb-errors.c +++ b/fs/bcachefs/sb-errors.c @@ -110,19 +110,25 @@ void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err) void bch2_sb_errors_from_cpu(struct bch_fs *c) { bch_sb_errors_cpu *src = &c->fsck_error_counts; - struct bch_sb_field_errors *dst = - bch2_sb_field_resize(&c->disk_sb, errors, - bch2_sb_field_errors_u64s(src->nr)); + struct bch_sb_field_errors *dst; unsigned i; + mutex_lock(&c->fsck_error_counts_lock); + + dst = bch2_sb_field_resize(&c->disk_sb, errors, + bch2_sb_field_errors_u64s(src->nr)); + if (!dst) - return; + goto err; for (i = 0; i < src->nr; i++) { SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id); SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr); dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time); } + +err: + mutex_unlock(&c->fsck_error_counts_lock); } static int bch2_sb_errors_to_cpu(struct bch_fs *c) From 472237b69d071c877e97bf0bc3eab1be865fad29 Mon Sep 17 00:00:00 2001 From: Pei Li Date: Tue, 25 Jun 2024 11:41:29 -0700 Subject: [PATCH 1442/1578] bcachefs: Fix shift-out-of-bounds in bch2_blacklist_entries_gc This series fix the shift-out-of-bounds issue in bch2_blacklist_entries_gc(). Instead of passing 0 to eytzinger0_first() when iterating the entries, we explicitly check 0 and initialize i to be 0. syzbot has tested the proposed patch and the reproducer did not trigger any issue: Reported-and-tested-by: syzbot+835d255ad6bc7f29ee12@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=835d255ad6bc7f29ee12 Signed-off-by: Pei Li Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_seq_blacklist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index ed484670961152..1f25c111c54cdd 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -232,7 +232,7 @@ bool bch2_blacklist_entries_gc(struct bch_fs *c) BUG_ON(nr != t->nr); unsigned i; - for (src = bl->start, i = eytzinger0_first(t->nr); + for (src = bl->start, i = t->nr == 0 ? 0 : eytzinger0_first(t->nr); src < bl->start + nr; src++, i = eytzinger0_next(i, nr)) { BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); From 64ee1431cc7d11e01a1007ead0afe737781cbbab Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 23 Jun 2024 00:53:44 -0400 Subject: [PATCH 1443/1578] bcachefs: Discard, invalidate workers are now per device There's no reason for discards to be single threaded across all devices; this will improve performance on multi device setups. Additionally, making them per-device simplifies the refcounting on bch_dev->io_ref; we now hold it for the duration that the discard path is running, which fixes a race between the discard path and device removal. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 263 +++++++++++++++++---------------- fs/bcachefs/alloc_background.h | 6 +- fs/bcachefs/alloc_foreground.c | 4 +- fs/bcachefs/bcachefs.h | 16 +- fs/bcachefs/super.c | 5 +- 5 files changed, 161 insertions(+), 133 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 8dec2c6cbb7eb0..1de9fac3bcf4f7 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -29,7 +29,7 @@ #include #include -static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket); +static void bch2_discard_one_bucket_fast(struct bch_dev *, u64); /* Persistent alloc info: */ @@ -893,12 +893,12 @@ int bch2_trigger_alloc(struct btree_trans *trans, if (statechange(a->data_type == BCH_DATA_need_discard) && !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) && bucket_flushed(new_a)) - bch2_discard_one_bucket_fast(c, new.k->p); + bch2_discard_one_bucket_fast(ca, new.k->p.offset); if (statechange(a->data_type == BCH_DATA_cached) && !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) - bch2_do_invalidates(c); + bch2_dev_do_invalidates(ca); if (statechange(a->data_type == BCH_DATA_need_gc_gens)) bch2_gc_gens_async(c); @@ -1636,34 +1636,38 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) return ret; } -static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket) +static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress) { int ret; - mutex_lock(&c->discard_buckets_in_flight_lock); - darray_for_each(c->discard_buckets_in_flight, i) - if (bkey_eq(*i, bucket)) { + mutex_lock(&ca->discard_buckets_in_flight_lock); + darray_for_each(ca->discard_buckets_in_flight, i) + if (i->bucket == bucket) { ret = -BCH_ERR_EEXIST_discard_in_flight_add; goto out; } - ret = darray_push(&c->discard_buckets_in_flight, bucket); + ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) { + .in_progress = in_progress, + .bucket = bucket, + })); out: - mutex_unlock(&c->discard_buckets_in_flight_lock); + mutex_unlock(&ca->discard_buckets_in_flight_lock); return ret; } -static void discard_in_flight_remove(struct bch_fs *c, struct bpos bucket) +static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket) { - mutex_lock(&c->discard_buckets_in_flight_lock); - darray_for_each(c->discard_buckets_in_flight, i) - if (bkey_eq(*i, bucket)) { - darray_remove_item(&c->discard_buckets_in_flight, i); + mutex_lock(&ca->discard_buckets_in_flight_lock); + darray_for_each(ca->discard_buckets_in_flight, i) + if (i->bucket == bucket) { + BUG_ON(!i->in_progress); + darray_remove_item(&ca->discard_buckets_in_flight, i); goto found; } BUG(); found: - mutex_unlock(&c->discard_buckets_in_flight_lock); + mutex_unlock(&ca->discard_buckets_in_flight_lock); } struct discard_buckets_state { @@ -1671,26 +1675,11 @@ struct discard_buckets_state { u64 open; u64 need_journal_commit; u64 discarded; - struct bch_dev *ca; u64 need_journal_commit_this_dev; }; -static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca) -{ - if (s->ca == ca) - return; - - if (s->ca && s->need_journal_commit_this_dev > - bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets) - bch2_journal_flush_async(&c->journal, NULL); - - if (s->ca) - percpu_ref_put(&s->ca->io_ref); - s->ca = ca; - s->need_journal_commit_this_dev = 0; -} - static int bch2_discard_one_bucket(struct btree_trans *trans, + struct bch_dev *ca, struct btree_iter *need_discard_iter, struct bpos *discard_pos_done, struct discard_buckets_state *s) @@ -1704,16 +1693,6 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, bool discard_locked = false; int ret = 0; - struct bch_dev *ca = s->ca && s->ca->dev_idx == pos.inode - ? s->ca - : bch2_dev_get_ioref(c, pos.inode, WRITE); - if (!ca) { - bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0)); - return 0; - } - - discard_buckets_next_dev(c, s, ca); - if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { s->open++; goto out; @@ -1773,7 +1752,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto out; } - if (discard_in_flight_add(c, SPOS(iter.pos.inode, iter.pos.offset, true))) + if (discard_in_flight_add(ca, iter.pos.offset, true)) goto out; discard_locked = true; @@ -1811,7 +1790,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, s->discarded++; out: if (discard_locked) - discard_in_flight_remove(c, iter.pos); + discard_in_flight_remove(ca, iter.pos.offset); s->seen++; bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); @@ -1820,7 +1799,8 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, static void bch2_do_discards_work(struct work_struct *work) { - struct bch_fs *c = container_of(work, struct bch_fs, discard_work); + struct bch_dev *ca = container_of(work, struct bch_dev, discard_work); + struct bch_fs *c = ca->fs; struct discard_buckets_state s = {}; struct bpos discard_pos_done = POS_MAX; int ret; @@ -1831,23 +1811,41 @@ static void bch2_do_discards_work(struct work_struct *work) * successful commit: */ ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, - BTREE_ID_need_discard, POS_MIN, 0, k, - bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s))); - - discard_buckets_next_dev(c, &s, NULL); + for_each_btree_key_upto(trans, iter, + BTREE_ID_need_discard, + POS(ca->dev_idx, 0), + POS(ca->dev_idx, U64_MAX), 0, k, + bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s))); trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); bch2_write_ref_put(c, BCH_WRITE_REF_discard); + percpu_ref_put(&ca->io_ref); +} + +void bch2_dev_do_discards(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + return; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard)) + goto put_ioref; + + if (queue_work(c->write_ref_wq, &ca->discard_work)) + return; + + bch2_write_ref_put(c, BCH_WRITE_REF_discard); +put_ioref: + percpu_ref_put(&ca->io_ref); } void bch2_do_discards(struct bch_fs *c) { - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) && - !queue_work(c->write_ref_wq, &c->discard_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_discard); + for_each_member_device(c, ca) + bch2_dev_do_discards(ca); } static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket) @@ -1876,68 +1874,69 @@ static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpo static void bch2_do_discards_fast_work(struct work_struct *work) { - struct bch_fs *c = container_of(work, struct bch_fs, discard_fast_work); + struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work); + struct bch_fs *c = ca->fs; while (1) { bool got_bucket = false; - struct bpos bucket; - struct bch_dev *ca; + u64 bucket; - mutex_lock(&c->discard_buckets_in_flight_lock); - darray_for_each(c->discard_buckets_in_flight, i) { - if (i->snapshot) + mutex_lock(&ca->discard_buckets_in_flight_lock); + darray_for_each(ca->discard_buckets_in_flight, i) { + if (i->in_progress) continue; - ca = bch2_dev_get_ioref(c, i->inode, WRITE); - if (!ca) { - darray_remove_item(&c->discard_buckets_in_flight, i); - continue; - } - got_bucket = true; - bucket = *i; - i->snapshot = true; + bucket = i->bucket; + i->in_progress = true; break; } - mutex_unlock(&c->discard_buckets_in_flight_lock); + mutex_unlock(&ca->discard_buckets_in_flight_lock); if (!got_bucket) break; if (ca->mi.discard && !c->opts.nochanges) blkdev_issue_discard(ca->disk_sb.bdev, - bucket.offset * ca->mi.bucket_size, + bucket_to_sector(ca, bucket), ca->mi.bucket_size, GFP_KERNEL); int ret = bch2_trans_do(c, NULL, NULL, - BCH_WATERMARK_btree| - BCH_TRANS_COMMIT_no_enospc, - bch2_clear_bucket_needs_discard(trans, bucket)); + BCH_WATERMARK_btree| + BCH_TRANS_COMMIT_no_enospc, + bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket))); bch_err_fn(c, ret); - percpu_ref_put(&ca->io_ref); - discard_in_flight_remove(c, bucket); + discard_in_flight_remove(ca, bucket); if (ret) break; } bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); + percpu_ref_put(&ca->io_ref); } -static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket) +static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) { - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, bucket.inode); - bool dead = !ca || percpu_ref_is_dying(&ca->io_ref); - rcu_read_unlock(); + struct bch_fs *c = ca->fs; + + if (discard_in_flight_add(ca, bucket, false)) + return; + + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + return; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast)) + goto put_ioref; + + if (queue_work(c->write_ref_wq, &ca->discard_fast_work)) + return; - if (!dead && - !discard_in_flight_add(c, bucket) && - bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) && - !queue_work(c->write_ref_wq, &c->discard_fast_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); + bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); +put_ioref: + percpu_ref_put(&ca->io_ref); } static int invalidate_one_bucket(struct btree_trans *trans, @@ -2038,7 +2037,8 @@ static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter static void bch2_do_invalidates_work(struct work_struct *work) { - struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); + struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work); + struct bch_fs *c = ca->fs; struct btree_trans *trans = bch2_trans_get(c); int ret = 0; @@ -2046,52 +2046,63 @@ static void bch2_do_invalidates_work(struct work_struct *work) if (ret) goto err; - for_each_member_device(c, ca) { - s64 nr_to_invalidate = - should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); - struct btree_iter iter; - bool wrapped = false; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, - lru_pos(ca->dev_idx, 0, - ((bch2_current_io_time(c, READ) + U32_MAX) & - LRU_TIME_MAX)), 0); - - while (true) { - bch2_trans_begin(trans); - - struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped); - ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - if (!k.k) - break; + s64 nr_to_invalidate = + should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); + struct btree_iter iter; + bool wrapped = false; - ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate); - if (ret) - break; + bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, + lru_pos(ca->dev_idx, 0, + ((bch2_current_io_time(c, READ) + U32_MAX) & + LRU_TIME_MAX)), 0); - bch2_btree_iter_advance(&iter); - } - bch2_trans_iter_exit(trans, &iter); + while (true) { + bch2_trans_begin(trans); - if (ret < 0) { - bch2_dev_put(ca); + struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped); + ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) break; - } + if (!k.k) + break; + + ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate); + if (ret) + break; + + bch2_btree_iter_advance(&iter); } + bch2_trans_iter_exit(trans, &iter); err: bch2_trans_put(trans); bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); + percpu_ref_put(&ca->io_ref); +} + +void bch2_dev_do_invalidates(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + return; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate)) + goto put_ioref; + + if (queue_work(c->write_ref_wq, &ca->invalidate_work)) + return; + + bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); +put_ioref: + percpu_ref_put(&ca->io_ref); } void bch2_do_invalidates(struct bch_fs *c) { - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) && - !queue_work(c->write_ref_wq, &c->invalidate_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); + for_each_member_device(c, ca) + bch2_dev_do_invalidates(ca); } int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, @@ -2407,16 +2418,20 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) set_bit(ca->dev_idx, c->rw_devs[i].d); } -void bch2_fs_allocator_background_exit(struct bch_fs *c) +void bch2_dev_allocator_background_exit(struct bch_dev *ca) +{ + darray_exit(&ca->discard_buckets_in_flight); +} + +void bch2_dev_allocator_background_init(struct bch_dev *ca) { - darray_exit(&c->discard_buckets_in_flight); + mutex_init(&ca->discard_buckets_in_flight_lock); + INIT_WORK(&ca->discard_work, bch2_do_discards_work); + INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work); + INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work); } void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); - mutex_init(&c->discard_buckets_in_flight_lock); - INIT_WORK(&c->discard_work, bch2_do_discards_work); - INIT_WORK(&c->discard_fast_work, bch2_do_discards_fast_work); - INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work); } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index c3cc3c5ba5b63f..ba2c5557a3f0e5 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -275,6 +275,7 @@ int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned, enum btree_iter_update_trigger_flags); int bch2_check_alloc_info(struct bch_fs *); int bch2_check_alloc_to_lru_refs(struct bch_fs *); +void bch2_dev_do_discards(struct bch_dev *); void bch2_do_discards(struct bch_fs *); static inline u64 should_invalidate_buckets(struct bch_dev *ca, @@ -289,6 +290,7 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca, return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets); } +void bch2_dev_do_invalidates(struct bch_dev *); void bch2_do_invalidates(struct bch_fs *); static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a) @@ -312,7 +314,9 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *); void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); -void bch2_fs_allocator_background_exit(struct bch_fs *); +void bch2_dev_allocator_background_exit(struct bch_dev *); +void bch2_dev_allocator_background_init(struct bch_dev *); + void bch2_fs_allocator_background_init(struct bch_fs *); #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 927a5f300b30e8..9d3d64746a5be6 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -621,13 +621,13 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, avail = dev_buckets_free(ca, *usage, watermark); if (usage->d[BCH_DATA_need_discard].buckets > avail) - bch2_do_discards(c); + bch2_dev_do_discards(ca); if (usage->d[BCH_DATA_need_gc_gens].buckets > avail) bch2_gc_gens_async(c); if (should_invalidate_buckets(ca, *usage)) - bch2_do_invalidates(c); + bch2_dev_do_invalidates(ca); if (!avail) { if (cl && !waiting) { diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index a6b83ecab7ce50..1106fec6e155e4 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -493,6 +493,11 @@ struct io_count { u64 sectors[2][BCH_DATA_NR]; }; +struct discard_in_flight { + bool in_progress:1; + u64 bucket:63; +}; + struct bch_dev { struct kobject kobj; #ifdef CONFIG_BCACHEFS_DEBUG @@ -554,6 +559,12 @@ struct bch_dev { size_t inc_gen_really_needs_gc; size_t buckets_waiting_on_journal; + struct work_struct invalidate_work; + struct work_struct discard_work; + struct mutex discard_buckets_in_flight_lock; + DARRAY(struct discard_in_flight) discard_buckets_in_flight; + struct work_struct discard_fast_work; + atomic64_t rebalance_work; struct journal_device journal; @@ -915,11 +926,6 @@ struct bch_fs { unsigned write_points_nr; struct buckets_waiting_for_journal buckets_waiting_for_journal; - struct work_struct invalidate_work; - struct work_struct discard_work; - struct mutex discard_buckets_in_flight_lock; - DARRAY(struct bpos) discard_buckets_in_flight; - struct work_struct discard_fast_work; /* GARBAGE COLLECTION */ struct work_struct gc_gens_work; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 641f2975177b61..fb906467201e92 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -536,7 +536,6 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_find_btree_nodes_exit(&c->found_btree_nodes); bch2_free_pending_node_rewrites(c); - bch2_fs_allocator_background_exit(c); bch2_fs_sb_errors_exit(c); bch2_fs_counters_exit(c); bch2_fs_snapshots_exit(c); @@ -1195,6 +1194,7 @@ static void bch2_dev_free(struct bch_dev *ca) kfree(ca->buckets_nouse); bch2_free_super(&ca->disk_sb); + bch2_dev_allocator_background_exit(ca); bch2_dev_journal_exit(ca); free_percpu(ca->io_done); @@ -1317,6 +1317,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, atomic_long_set(&ca->ref, 1); #endif + bch2_dev_allocator_background_init(ca); + if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || @@ -1541,6 +1543,7 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); + bch2_dev_do_discards(ca); } int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, From 84b767f9e34fdb143c09e66a2a20722fc2921821 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Mon, 24 Jun 2024 10:50:15 -0700 Subject: [PATCH 1444/1578] ionic: use dev_consume_skb_any outside of napi If we're not in a NAPI softirq context, we need to be careful about how we call napi_consume_skb(), specifically we need to call it with budget==0 to signal to it that we're not in a safe context. This was found while running some configuration stress testing of traffic and a change queue config loop running, and this curious note popped out: [ 4371.402645] BUG: using smp_processor_id() in preemptible [00000000] code: ethtool/20545 [ 4371.402897] caller is napi_skb_cache_put+0x16/0x80 [ 4371.403120] CPU: 25 PID: 20545 Comm: ethtool Kdump: loaded Tainted: G OE 6.10.0-rc3-netnext+ #8 [ 4371.403302] Hardware name: HPE ProLiant DL360 Gen10/ProLiant DL360 Gen10, BIOS U32 01/23/2021 [ 4371.403460] Call Trace: [ 4371.403613] [ 4371.403758] dump_stack_lvl+0x4f/0x70 [ 4371.403904] check_preemption_disabled+0xc1/0xe0 [ 4371.404051] napi_skb_cache_put+0x16/0x80 [ 4371.404199] ionic_tx_clean+0x18a/0x240 [ionic] [ 4371.404354] ionic_tx_cq_service+0xc4/0x200 [ionic] [ 4371.404505] ionic_tx_flush+0x15/0x70 [ionic] [ 4371.404653] ? ionic_lif_qcq_deinit.isra.23+0x5b/0x70 [ionic] [ 4371.404805] ionic_txrx_deinit+0x71/0x190 [ionic] [ 4371.404956] ionic_reconfigure_queues+0x5f5/0xff0 [ionic] [ 4371.405111] ionic_set_ringparam+0x2e8/0x3e0 [ionic] [ 4371.405265] ethnl_set_rings+0x1f1/0x300 [ 4371.405418] ethnl_default_set_doit+0xbb/0x160 [ 4371.405571] genl_family_rcv_msg_doit+0xff/0x130 [...] I found that ionic_tx_clean() calls napi_consume_skb() which calls napi_skb_cache_put(), but before that last call is the note /* Zero budget indicate non-NAPI context called us, like netpoll */ and DEBUG_NET_WARN_ON_ONCE(!in_softirq()); Those are pretty big hints that we're doing it wrong. We can pass a context hint down through the calls to let ionic_tx_clean() know what we're doing so it can call napi_consume_skb() correctly. Fixes: 386e69865311 ("ionic: Make use napi_consume_skb") Signed-off-by: Shannon Nelson Link: https://patch.msgid.link/20240624175015.4520-1-shannon.nelson@amd.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/pensando/ionic/ionic_dev.h | 4 ++- .../net/ethernet/pensando/ionic/ionic_lif.c | 2 +- .../net/ethernet/pensando/ionic/ionic_txrx.c | 28 +++++++++++-------- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_dev.h b/drivers/net/ethernet/pensando/ionic/ionic_dev.h index f30eee4a5a80e4..b6c01a88098dc5 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_dev.h +++ b/drivers/net/ethernet/pensando/ionic/ionic_dev.h @@ -375,7 +375,9 @@ typedef void (*ionic_cq_done_cb)(void *done_arg); unsigned int ionic_cq_service(struct ionic_cq *cq, unsigned int work_to_do, ionic_cq_cb cb, ionic_cq_done_cb done_cb, void *done_arg); -unsigned int ionic_tx_cq_service(struct ionic_cq *cq, unsigned int work_to_do); +unsigned int ionic_tx_cq_service(struct ionic_cq *cq, + unsigned int work_to_do, + bool in_napi); int ionic_q_init(struct ionic_lif *lif, struct ionic_dev *idev, struct ionic_queue *q, unsigned int index, const char *name, diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index 1934e9d6d9e4fb..1837a30ba08ac4 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -1189,7 +1189,7 @@ static int ionic_adminq_napi(struct napi_struct *napi, int budget) ionic_rx_service, NULL, NULL); if (lif->hwstamp_txq) - tx_work = ionic_tx_cq_service(&lif->hwstamp_txq->cq, budget); + tx_work = ionic_tx_cq_service(&lif->hwstamp_txq->cq, budget, !!budget); work_done = max(max(n_work, a_work), max(rx_work, tx_work)); if (work_done < budget && napi_complete_done(napi, work_done)) { diff --git a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c index aed7d9cbce038f..9fdd7cd3ef19d4 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c @@ -23,7 +23,8 @@ static void ionic_tx_desc_unmap_bufs(struct ionic_queue *q, static void ionic_tx_clean(struct ionic_queue *q, struct ionic_tx_desc_info *desc_info, - struct ionic_txq_comp *comp); + struct ionic_txq_comp *comp, + bool in_napi); static inline void ionic_txq_post(struct ionic_queue *q, bool ring_dbell) { @@ -944,7 +945,7 @@ int ionic_tx_napi(struct napi_struct *napi, int budget) u32 work_done = 0; u32 flags = 0; - work_done = ionic_tx_cq_service(cq, budget); + work_done = ionic_tx_cq_service(cq, budget, !!budget); if (unlikely(!budget)) return budget; @@ -1028,7 +1029,7 @@ int ionic_txrx_napi(struct napi_struct *napi, int budget) txqcq = lif->txqcqs[qi]; txcq = &lif->txqcqs[qi]->cq; - tx_work_done = ionic_tx_cq_service(txcq, IONIC_TX_BUDGET_DEFAULT); + tx_work_done = ionic_tx_cq_service(txcq, IONIC_TX_BUDGET_DEFAULT, !!budget); if (unlikely(!budget)) return budget; @@ -1161,7 +1162,8 @@ static void ionic_tx_desc_unmap_bufs(struct ionic_queue *q, static void ionic_tx_clean(struct ionic_queue *q, struct ionic_tx_desc_info *desc_info, - struct ionic_txq_comp *comp) + struct ionic_txq_comp *comp, + bool in_napi) { struct ionic_tx_stats *stats = q_to_tx_stats(q); struct ionic_qcq *qcq = q_to_qcq(q); @@ -1213,11 +1215,13 @@ static void ionic_tx_clean(struct ionic_queue *q, desc_info->bytes = skb->len; stats->clean++; - napi_consume_skb(skb, 1); + napi_consume_skb(skb, likely(in_napi) ? 1 : 0); } static bool ionic_tx_service(struct ionic_cq *cq, - unsigned int *total_pkts, unsigned int *total_bytes) + unsigned int *total_pkts, + unsigned int *total_bytes, + bool in_napi) { struct ionic_tx_desc_info *desc_info; struct ionic_queue *q = cq->bound_q; @@ -1239,7 +1243,7 @@ static bool ionic_tx_service(struct ionic_cq *cq, desc_info->bytes = 0; index = q->tail_idx; q->tail_idx = (q->tail_idx + 1) & (q->num_descs - 1); - ionic_tx_clean(q, desc_info, comp); + ionic_tx_clean(q, desc_info, comp, in_napi); if (desc_info->skb) { pkts++; bytes += desc_info->bytes; @@ -1253,7 +1257,9 @@ static bool ionic_tx_service(struct ionic_cq *cq, return true; } -unsigned int ionic_tx_cq_service(struct ionic_cq *cq, unsigned int work_to_do) +unsigned int ionic_tx_cq_service(struct ionic_cq *cq, + unsigned int work_to_do, + bool in_napi) { unsigned int work_done = 0; unsigned int bytes = 0; @@ -1262,7 +1268,7 @@ unsigned int ionic_tx_cq_service(struct ionic_cq *cq, unsigned int work_to_do) if (work_to_do == 0) return 0; - while (ionic_tx_service(cq, &pkts, &bytes)) { + while (ionic_tx_service(cq, &pkts, &bytes, in_napi)) { if (cq->tail_idx == cq->num_descs - 1) cq->done_color = !cq->done_color; cq->tail_idx = (cq->tail_idx + 1) & (cq->num_descs - 1); @@ -1288,7 +1294,7 @@ void ionic_tx_flush(struct ionic_cq *cq) { u32 work_done; - work_done = ionic_tx_cq_service(cq, cq->num_descs); + work_done = ionic_tx_cq_service(cq, cq->num_descs, false); if (work_done) ionic_intr_credits(cq->idev->intr_ctrl, cq->bound_intr->index, work_done, IONIC_INTR_CRED_RESET_COALESCE); @@ -1305,7 +1311,7 @@ void ionic_tx_empty(struct ionic_queue *q) desc_info = &q->tx_info[q->tail_idx]; desc_info->bytes = 0; q->tail_idx = (q->tail_idx + 1) & (q->num_descs - 1); - ionic_tx_clean(q, desc_info, NULL); + ionic_tx_clean(q, desc_info, NULL, false); if (desc_info->skb) { pkts++; bytes += desc_info->bytes; From 5dfe9d273932c647bdc9d664f939af9a5a398cbc Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 24 Jun 2024 14:43:23 +0000 Subject: [PATCH 1445/1578] tcp: fix tcp_rcv_fastopen_synack() to enter TCP_CA_Loss for failed TFO Testing determined that the recent commit 9e046bb111f1 ("tcp: clear tp->retrans_stamp in tcp_rcv_fastopen_synack()") has a race, and does not always ensure retrans_stamp is 0 after a TFO payload retransmit. If transmit completion for the SYN+data skb happens after the client TCP stack receives the SYNACK (which sometimes happens), then retrans_stamp can erroneously remain non-zero for the lifetime of the connection, causing a premature ETIMEDOUT later. Testing and tracing showed that the buggy scenario is the following somewhat tricky sequence: + Client attempts a TFO handshake. tcp_send_syn_data() sends SYN + TFO cookie + data in a single packet in the syn_data skb. It hands the syn_data skb to tcp_transmit_skb(), which makes a clone. Crucially, it then reuses the same original (non-clone) syn_data skb, transforming it by advancing the seq by one byte and removing the FIN bit, and enques the resulting payload-only skb in the sk->tcp_rtx_queue. + Client sets retrans_stamp to the start time of the three-way handshake. + Cookie mismatches or server has TFO disabled, and server only ACKs SYN. + tcp_ack() sees SYN is acked, tcp_clean_rtx_queue() clears retrans_stamp. + Since the client SYN was acked but not the payload, the TFO failure code path in tcp_rcv_fastopen_synack() tries to retransmit the payload skb. However, in some cases the transmit completion for the clone of the syn_data (which had SYN + TFO cookie + data) hasn't happened. In those cases, skb_still_in_host_queue() returns true for the retransmitted TFO payload, because the clone of the syn_data skb has not had its tx completetion. + Because skb_still_in_host_queue() finds skb_fclone_busy() is true, it sets the TSQ_THROTTLED bit and the retransmit does not happen in the tcp_rcv_fastopen_synack() call chain. + The tcp_rcv_fastopen_synack() code next implicitly assumes the retransmit process is finished, and sets retrans_stamp to 0 to clear it, but this is later overwritten (see below). + Later, upon tx completion, tcp_tsq_write() calls tcp_xmit_retransmit_queue(), which puts the retransmit in flight and sets retrans_stamp to a non-zero value. + The client receives an ACK for the retransmitted TFO payload data. + Since we're in CA_Open and there are no dupacks/SACKs/DSACKs/ECN to make tcp_ack_is_dubious() true and make us call tcp_fastretrans_alert() and reach a code path that clears retrans_stamp, retrans_stamp stays nonzero. + Later, if there is a TLP, RTO, RTO sequence, then the connection will suffer an early ETIMEDOUT due to the erroneously ancient retrans_stamp. The fix: this commit refactors the code to have tcp_rcv_fastopen_synack() retransmit by reusing the relevant parts of tcp_simple_retransmit() that enter CA_Loss (without changing cwnd) and call tcp_xmit_retransmit_queue(). We have tcp_simple_retransmit() and tcp_rcv_fastopen_synack() share code in this way because in both cases we get a packet indicating non-congestion loss (MTU reduction or TFO failure) and thus in both cases we want to retransmit as many packets as cwnd allows, without reducing cwnd. And given that retransmits will set retrans_stamp to a non-zero value (and may do so in a later calling context due to TSQ), we also want to enter CA_Loss so that we track when all retransmitted packets are ACked and clear retrans_stamp when that happens (to ensure later recurring RTOs are using the correct retrans_stamp and don't declare ETIMEDOUT prematurely). Fixes: 9e046bb111f1 ("tcp: clear tp->retrans_stamp in tcp_rcv_fastopen_synack()") Fixes: a7abf3cd76e1 ("tcp: consider using standard rtx logic in tcp_rcv_fastopen_synack()") Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Link: https://patch.msgid.link/20240624144323.2371403-1-ncardwell.sw@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b6d7666ac912e6..2e39cb881e2097 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2782,13 +2782,37 @@ static void tcp_mtup_probe_success(struct sock *sk) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS); } +/* Sometimes we deduce that packets have been dropped due to reasons other than + * congestion, like path MTU reductions or failed client TFO attempts. In these + * cases we call this function to retransmit as many packets as cwnd allows, + * without reducing cwnd. Given that retransmits will set retrans_stamp to a + * non-zero value (and may do so in a later calling context due to TSQ), we + * also enter CA_Loss so that we track when all retransmitted packets are ACKed + * and clear retrans_stamp when that happens (to ensure later recurring RTOs + * are using the correct retrans_stamp and don't declare ETIMEDOUT + * prematurely). + */ +static void tcp_non_congestion_loss_retransmit(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + + if (icsk->icsk_ca_state != TCP_CA_Loss) { + tp->high_seq = tp->snd_nxt; + tp->snd_ssthresh = tcp_current_ssthresh(sk); + tp->prior_ssthresh = 0; + tp->undo_marker = 0; + tcp_set_ca_state(sk, TCP_CA_Loss); + } + tcp_xmit_retransmit_queue(sk); +} + /* Do a simple retransmit without using the backoff mechanisms in * tcp_timer. This is used for path mtu discovery. * The socket is already locked here. */ void tcp_simple_retransmit(struct sock *sk) { - const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int mss; @@ -2828,14 +2852,7 @@ void tcp_simple_retransmit(struct sock *sk) * in network, but units changed and effective * cwnd/ssthresh really reduced now. */ - if (icsk->icsk_ca_state != TCP_CA_Loss) { - tp->high_seq = tp->snd_nxt; - tp->snd_ssthresh = tcp_current_ssthresh(sk); - tp->prior_ssthresh = 0; - tp->undo_marker = 0; - tcp_set_ca_state(sk, TCP_CA_Loss); - } - tcp_xmit_retransmit_queue(sk); + tcp_non_congestion_loss_retransmit(sk); } EXPORT_SYMBOL(tcp_simple_retransmit); @@ -6295,8 +6312,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, tp->fastopen_client_fail = TFO_DATA_NOT_ACKED; skb_rbtree_walk_from(data) tcp_mark_skb_lost(sk, data); - tcp_xmit_retransmit_queue(sk); - tp->retrans_stamp = 0; + tcp_non_congestion_loss_retransmit(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); return true; From 64cd7de998f393e73981e2aa4ee13e4e887f01ea Mon Sep 17 00:00:00 2001 From: Pei Li Date: Tue, 25 Jun 2024 17:39:56 -0700 Subject: [PATCH 1446/1578] bcachefs: Fix kmalloc bug in __snapshot_t_mut When allocating too huge a snapshot table, we should fail gracefully in __snapshot_t_mut() instead of fail in kmalloc(). Reported-by: syzbot+770e99b65e26fa023ab1@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=770e99b65e26fa023ab1 Tested-by: syzbot+770e99b65e26fa023ab1@syzkaller.appspotmail.com Signed-off-by: Pei Li Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 4ef98e696673fd..24023d6a9698bf 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -168,6 +168,9 @@ static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1)); size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]); + if (unlikely(new_bytes > INT_MAX)) + return NULL; + new = kvzalloc(new_bytes, GFP_KERNEL); if (!new) return NULL; From d3710853fd4a7020904a16686986cf5541ad1c38 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 25 Jun 2024 17:52:12 +0200 Subject: [PATCH 1447/1578] ALSA: hda/realtek: Fix conflicting quirk for PCI SSID 17aa:3820 The recent fix for Lenovo IdeaPad 330-17IKB replaced the quirk entry, and this eventually breaks the existing quirk for Lenovo Yoga Duet 7 13ITL6 equipped with the same PCI SSID 17aa:3820. For applying a proper quirk for each model, check the codec SSID additionally. Fortunately Yoga Duet has a different codec SSID, 0x17aa3802. (Interestingly, 17aa:3802 has another conflict of SSID between another Yoga model vs 14IRP8 which we had to work around similarly.) Fixes: b1fd0d1285b1 ("ALSA: hda/realtek: Enable headset mic on IdeaPad 330-17IKB 81DM") Link: https://patch.msgid.link/20240625155217.18767-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index cb9b11da5581e2..c901daf8fd458e 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -7525,6 +7525,7 @@ enum { ALC287_FIXUP_LENOVO_THKPAD_WH_ALC1318, ALC256_FIXUP_CHROME_BOOK, ALC287_FIXUP_LENOVO_14ARP8_LEGION_IAH7, + ALC287_FIXUP_LENOVO_SSID_17AA3820, }; /* A special fixup for Lenovo C940 and Yoga Duet 7; @@ -7596,6 +7597,20 @@ static void alc287_fixup_lenovo_legion_7(struct hda_codec *codec, __snd_hda_apply_fixup(codec, id, action, 0); } +/* Yet more conflicting PCI SSID (17aa:3820) on two Lenovo models */ +static void alc287_fixup_lenovo_ssid_17aa3820(struct hda_codec *codec, + const struct hda_fixup *fix, + int action) +{ + int id; + + if (codec->core.subsystem_id == 0x17aa3820) + id = ALC269_FIXUP_ASPIRE_HEADSET_MIC; /* IdeaPad 330-17IKB 81DM */ + else /* 0x17aa3802 */ + id = ALC287_FIXUP_YOGA7_14ITL_SPEAKERS; /* "Yoga Duet 7 13ITL6 */ + __snd_hda_apply_fixup(codec, id, action, 0); +} + static const struct hda_fixup alc269_fixups[] = { [ALC269_FIXUP_GPIO2] = { .type = HDA_FIXUP_FUNC, @@ -9832,6 +9847,10 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC225_FIXUP_HEADSET_JACK }, + [ALC287_FIXUP_LENOVO_SSID_17AA3820] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc287_fixup_lenovo_ssid_17aa3820, + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -10531,7 +10550,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x3813, "Legion 7i 15IMHG05", ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3818, "Lenovo C940 / Yoga Duet 7", ALC298_FIXUP_LENOVO_C940_DUET7), SND_PCI_QUIRK(0x17aa, 0x3819, "Lenovo 13s Gen2 ITL", ALC287_FIXUP_13S_GEN2_SPEAKERS), - SND_PCI_QUIRK(0x17aa, 0x3820, "IdeaPad 330-17IKB 81DM", ALC269_FIXUP_ASPIRE_HEADSET_MIC), + SND_PCI_QUIRK(0x17aa, 0x3820, "IdeaPad 330 / Yoga Duet 7", ALC287_FIXUP_LENOVO_SSID_17AA3820), SND_PCI_QUIRK(0x17aa, 0x3824, "Legion Y9000X 2020", ALC285_FIXUP_LEGION_Y9000X_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3827, "Ideapad S740", ALC285_FIXUP_IDEAPAD_S740_COEF), SND_PCI_QUIRK(0x17aa, 0x3834, "Lenovo IdeaPad Slim 9i 14ITL5", ALC287_FIXUP_YOGA7_14ITL_SPEAKERS), From 3cd59d8ef8df7d7a079f54d56502dae8f716b39b Mon Sep 17 00:00:00 2001 From: Dirk Su Date: Wed, 26 Jun 2024 10:14:36 +0800 Subject: [PATCH 1448/1578] ALSA: hda/realtek: fix mute/micmute LEDs don't work for EliteBook 645/665 G11. HP EliteBook 645/665 G11 needs ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF quirk to make mic-mute/audio-mute working. Signed-off-by: Dirk Su Cc: Link: https://patch.msgid.link/20240626021437.77039-1-dirk.su@canonical.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index c901daf8fd458e..811e82474200fd 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10242,6 +10242,9 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8c7c, "HP ProBook 445 G11", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x8c7d, "HP ProBook 465 G11", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x8c7e, "HP ProBook 465 G11", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), + SND_PCI_QUIRK(0x103c, 0x8c7f, "HP EliteBook 645 G11", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), + SND_PCI_QUIRK(0x103c, 0x8c80, "HP EliteBook 645 G11", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), + SND_PCI_QUIRK(0x103c, 0x8c81, "HP EliteBook 665 G11", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x8c89, "HP ProBook 460 G11", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8c8a, "HP EliteBook 630", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8c8c, "HP EliteBook 660", ALC236_FIXUP_HP_GPIO_LED), From 888119571b7c9518faeeeb2b431ffc4455e0028d Mon Sep 17 00:00:00 2001 From: Aapo Vienamo Date: Tue, 25 Jun 2024 16:53:43 +0300 Subject: [PATCH 1449/1578] gpio: graniterapids: Add missing raw_spinlock_init() Add the missing raw_spin_lock_init() call to gnr_gpio_probe(). Fixes: ecc4b1418e23 ("gpio: Add Intel Granite Rapids-D vGPIO driver") Signed-off-by: Aapo Vienamo Acked-by: Mika Westerberg Link: https://lore.kernel.org/r/20240625135343.673745-1-aapo.vienamo@linux.intel.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-graniterapids.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpio/gpio-graniterapids.c b/drivers/gpio/gpio-graniterapids.c index c693fe05d50fdd..f2e911a3d2ca02 100644 --- a/drivers/gpio/gpio-graniterapids.c +++ b/drivers/gpio/gpio-graniterapids.c @@ -296,6 +296,8 @@ static int gnr_gpio_probe(struct platform_device *pdev) if (!priv) return -ENOMEM; + raw_spin_lock_init(&priv->lock); + regs = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(regs)) return PTR_ERR(regs); From 610b29161b0aa9feb59b78dc867553274f17fb01 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 19 Jun 2024 10:32:43 -0700 Subject: [PATCH 1450/1578] xfs: fix freeing speculative preallocations for preallocated files xfs_can_free_eofblocks returns false for files that have persistent preallocations unless the force flag is passed and there are delayed blocks. This means it won't free delalloc reservations for files with persistent preallocations unless the force flag is set, and it will also free the persistent preallocations if the force flag is set and the file happens to have delayed allocations. Both of these are bad, so do away with the force flag and always free only post-EOF delayed allocations for files with the XFS_DIFLAG_PREALLOC or APPEND flags set. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/xfs_bmap_util.c | 30 ++++++++++++++++++++++-------- fs/xfs/xfs_bmap_util.h | 2 +- fs/xfs/xfs_icache.c | 2 +- fs/xfs/xfs_inode.c | 14 ++++---------- 4 files changed, 28 insertions(+), 20 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index ac2e77ebb54c73..a4d9fbc21b8343 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -486,13 +486,11 @@ xfs_bmap_punch_delalloc_range( /* * Test whether it is appropriate to check an inode for and free post EOF - * blocks. The 'force' parameter determines whether we should also consider - * regular files that are marked preallocated or append-only. + * blocks. */ bool xfs_can_free_eofblocks( - struct xfs_inode *ip, - bool force) + struct xfs_inode *ip) { struct xfs_bmbt_irec imap; struct xfs_mount *mp = ip->i_mount; @@ -526,11 +524,11 @@ xfs_can_free_eofblocks( return false; /* - * Do not free real preallocated or append-only files unless the file - * has delalloc blocks and we are forced to remove them. + * Only free real extents for inodes with persistent preallocations or + * the append-only flag. */ if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) - if (!force || ip->i_delayed_blks == 0) + if (ip->i_delayed_blks == 0) return false; /* @@ -584,6 +582,22 @@ xfs_free_eofblocks( /* Wait on dio to ensure i_size has settled. */ inode_dio_wait(VFS_I(ip)); + /* + * For preallocated files only free delayed allocations. + * + * Note that this means we also leave speculative preallocations in + * place for preallocated files. + */ + if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) { + if (ip->i_delayed_blks) { + xfs_bmap_punch_delalloc_range(ip, + round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize), + LLONG_MAX); + } + xfs_inode_clear_eofblocks_tag(ip); + return 0; + } + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) { ASSERT(xfs_is_shutdown(mp)); @@ -891,7 +905,7 @@ xfs_prepare_shift( * Trim eofblocks to avoid shifting uninitialized post-eof preallocation * into the accessible region of the file. */ - if (xfs_can_free_eofblocks(ip, true)) { + if (xfs_can_free_eofblocks(ip)) { error = xfs_free_eofblocks(ip); if (error) return error; diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 51f84d8ff372fa..eb0895bfb9dae4 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -63,7 +63,7 @@ int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset, xfs_off_t len); /* EOF block manipulation functions */ -bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force); +bool xfs_can_free_eofblocks(struct xfs_inode *ip); int xfs_free_eofblocks(struct xfs_inode *ip); int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 0953163a2d8492..9967334ea99f1a 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1155,7 +1155,7 @@ xfs_inode_free_eofblocks( } *lockflags |= XFS_IOLOCK_EXCL; - if (xfs_can_free_eofblocks(ip, false)) + if (xfs_can_free_eofblocks(ip)) return xfs_free_eofblocks(ip); /* inode could be preallocated or append-only */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index f36091e1e7f50b..38f946e3be2da3 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1595,7 +1595,7 @@ xfs_release( if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) return 0; - if (xfs_can_free_eofblocks(ip, false)) { + if (xfs_can_free_eofblocks(ip)) { /* * Check if the inode is being opened, written and closed * frequently and we have delayed allocation blocks outstanding @@ -1856,15 +1856,13 @@ xfs_inode_needs_inactive( /* * This file isn't being freed, so check if there are post-eof blocks - * to free. @force is true because we are evicting an inode from the - * cache. Post-eof blocks must be freed, lest we end up with broken - * free space accounting. + * to free. * * Note: don't bother with iolock here since lockdep complains about * acquiring it in reclaim context. We have the only reference to the * inode at this point anyways. */ - return xfs_can_free_eofblocks(ip, true); + return xfs_can_free_eofblocks(ip); } /* @@ -1947,15 +1945,11 @@ xfs_inactive( if (VFS_I(ip)->i_nlink != 0) { /* - * force is true because we are evicting an inode from the - * cache. Post-eof blocks must be freed, lest we end up with - * broken free space accounting. - * * Note: don't bother with iolock here since lockdep complains * about acquiring it in reclaim context. We have the only * reference to the inode at this point anyways. */ - if (xfs_can_free_eofblocks(ip, true)) + if (xfs_can_free_eofblocks(ip)) error = xfs_free_eofblocks(ip); goto out; From 288e1f693f04e66be99f27e7cbe4a45936a66745 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 19 Jun 2024 10:32:44 -0700 Subject: [PATCH 1451/1578] xfs: restrict when we try to align cow fork delalloc to cowextsz hints xfs/205 produces the following failure when always_cow is enabled: --- a/tests/xfs/205.out 2024-02-28 16:20:24.437887970 -0800 +++ b/tests/xfs/205.out.bad 2024-06-03 21:13:40.584000000 -0700 @@ -1,4 +1,5 @@ QA output created by 205 *** one file + !!! disk full (expected) *** one file, a few bytes at a time *** done This is the result of overly aggressive attempts to align cow fork delalloc reservations to the CoW extent size hint. Looking at the trace data, we're trying to append a single fsblock to the "fred" file. Trying to create a speculative post-eof reservation fails because there's not enough space. We then set @prealloc_blocks to zero and try again, but the cowextsz alignment code triggers, which expands our request for a 1-fsblock reservation into a 39-block reservation. There's not enough space for that, so the whole write fails with ENOSPC even though there's sufficient space in the filesystem to allocate the single block that we need to land the write. There are two things wrong here -- first, we shouldn't be attempting speculative preallocations beyond what was requested when we're low on space. Second, if we've already computed a posteof preallocation, we shouldn't bother trying to align that to the cowextsize hint. Fix both of these problems by adding a flag that only enables the expansion of the delalloc reservation to the cowextsize if we're doing a non-extending write, and only if we're not doing an ENOSPC retry. This requires us to move the ENOSPC retry logic to xfs_bmapi_reserve_delalloc. I probably should have caught this six years ago when 6ca30729c206d was being reviewed, but oh well. Update the comments to reflect what the code does now. Fixes: 6ca30729c206d ("xfs: bmap code cleanup") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_bmap.c | 31 +++++++++++++++++++++++++++---- fs/xfs/xfs_iomap.c | 34 ++++++++++++---------------------- 2 files changed, 39 insertions(+), 26 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index c101cf266bc4db..6af6f744fdd6a2 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4058,20 +4058,32 @@ xfs_bmapi_reserve_delalloc( xfs_extlen_t indlen; uint64_t fdblocks; int error; - xfs_fileoff_t aoff = off; + xfs_fileoff_t aoff; + bool use_cowextszhint = + whichfork == XFS_COW_FORK && !prealloc; +retry: /* * Cap the alloc length. Keep track of prealloc so we know whether to * tag the inode before we return. */ + aoff = off; alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN); if (!eof) alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); if (prealloc && alen >= len) prealloc = alen - len; - /* Figure out the extent size, adjust alen */ - if (whichfork == XFS_COW_FORK) { + /* + * If we're targetting the COW fork but aren't creating a speculative + * posteof preallocation, try to expand the reservation to align with + * the COW extent size hint if there's sufficient free space. + * + * Unlike the data fork, the CoW cancellation functions will free all + * the reservations at inactivation, so we don't require that every + * delalloc reservation have a dirty pagecache. + */ + if (use_cowextszhint) { struct xfs_bmbt_irec prev; xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip); @@ -4090,7 +4102,7 @@ xfs_bmapi_reserve_delalloc( */ error = xfs_quota_reserve_blkres(ip, alen); if (error) - return error; + goto out; /* * Split changing sb for alen and indlen since they could be coming @@ -4140,6 +4152,17 @@ xfs_bmapi_reserve_delalloc( out_unreserve_quota: if (XFS_IS_QUOTA_ON(mp)) xfs_quota_unreserve_blkres(ip, alen); +out: + if (error == -ENOSPC || error == -EDQUOT) { + trace_xfs_delalloc_enospc(ip, off, len); + + if (prealloc || use_cowextszhint) { + /* retry without any preallocation */ + use_cowextszhint = false; + prealloc = 0; + goto retry; + } + } return error; } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 3783426739258c..414903885ab905 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1148,33 +1148,23 @@ xfs_buffered_write_iomap_begin( } } -retry: - error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb, - end_fsb - offset_fsb, prealloc_blocks, - allocfork == XFS_DATA_FORK ? &imap : &cmap, - allocfork == XFS_DATA_FORK ? &icur : &ccur, - allocfork == XFS_DATA_FORK ? eof : cow_eof); - switch (error) { - case 0: - break; - case -ENOSPC: - case -EDQUOT: - /* retry without any preallocation */ - trace_xfs_delalloc_enospc(ip, offset, count); - if (prealloc_blocks) { - prealloc_blocks = 0; - goto retry; - } - fallthrough; - default: - goto out_unlock; - } - if (allocfork == XFS_COW_FORK) { + error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb, + end_fsb - offset_fsb, prealloc_blocks, &cmap, + &ccur, cow_eof); + if (error) + goto out_unlock; + trace_xfs_iomap_alloc(ip, offset, count, allocfork, &cmap); goto found_cow; } + error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb, + end_fsb - offset_fsb, prealloc_blocks, &imap, &icur, + eof); + if (error) + goto out_unlock; + /* * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch * them out if the write happens to fail. From 1ec9307fc066dd8a140d5430f8a7576aa9d78cd3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 19 Jun 2024 10:32:45 -0700 Subject: [PATCH 1452/1578] xfs: allow unlinked symlinks and dirs with zero size For a very very long time, inode inactivation has set the inode size to zero before unmapping the extents associated with the data fork. Unfortunately, commit 3c6f46eacd876 changed the inode verifier to prohibit zero-length symlinks and directories. If an inode happens to get logged in this state and the system crashes before freeing the inode, log recovery will also fail on the broken inode. Therefore, allow zero-size symlinks and directories as long as the link count is zero; nobody will be able to open these files by handle so there isn't any risk of data exposure. Fixes: 3c6f46eacd876 ("xfs: sanity check directory inode di_size") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_inode_buf.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index e7a7bfbe75b46a..513b50da6215f7 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -379,10 +379,13 @@ xfs_dinode_verify_fork( /* * A directory small enough to fit in the inode must be stored * in local format. The directory sf <-> extents conversion - * code updates the directory size accordingly. + * code updates the directory size accordingly. Directories + * being truncated have zero size and are not subject to this + * check. */ if (S_ISDIR(mode)) { - if (be64_to_cpu(dip->di_size) <= fork_size && + if (dip->di_size && + be64_to_cpu(dip->di_size) <= fork_size && fork_format != XFS_DINODE_FMT_LOCAL) return __this_address; } @@ -528,9 +531,19 @@ xfs_dinode_verify( if (mode && xfs_mode_to_ftype(mode) == XFS_DIR3_FT_UNKNOWN) return __this_address; - /* No zero-length symlinks/dirs. */ - if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) - return __this_address; + /* + * No zero-length symlinks/dirs unless they're unlinked and hence being + * inactivated. + */ + if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) { + if (dip->di_version > 1) { + if (dip->di_nlink) + return __this_address; + } else { + if (dip->di_onlink) + return __this_address; + } + } fa = xfs_dinode_verify_nrext64(mp, dip); if (fa) From dc5e1cbae270b625dcb978f8ea762eb16a93a016 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 20 Jun 2024 15:05:26 -0700 Subject: [PATCH 1453/1578] xfs: fix direction in XFS_IOC_EXCHANGE_RANGE The kernel reads userspace's buffer but does not write it back. Therefore this is really an _IOW ioctl. Change this before 6.10 final releases. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 97996cb79aaaa8..454b63ef720162 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -996,7 +996,7 @@ struct xfs_getparents_by_handle { #define XFS_IOC_FSGEOMETRY _IOR ('X', 126, struct xfs_fsop_geom) #define XFS_IOC_BULKSTAT _IOR ('X', 127, struct xfs_bulkstat_req) #define XFS_IOC_INUMBERS _IOR ('X', 128, struct xfs_inumbers_req) -#define XFS_IOC_EXCHANGE_RANGE _IOWR('X', 129, struct xfs_exchange_range) +#define XFS_IOC_EXCHANGE_RANGE _IOW ('X', 129, struct xfs_exchange_range) /* XFS_IOC_GETFSUUID ---------- deprecated 140 */ From 673cd885bbbfd873aa6983ce2363a813b7826425 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 19 Jun 2024 10:32:46 -0700 Subject: [PATCH 1454/1578] xfs: honor init_xattrs in xfs_init_new_inode for !ATTR fs xfs_init_new_inode ignores the init_xattrs parameter for filesystems that do not have ATTR enabled. As a result, the first init_xattrs file to be created by the kernel will not have an attr fork created to store acls. Storing that first acl will add ATTR to the superblock flags, so subsequent files will be created with attr forks. The overhead of this is so small that chances are that nobody has noticed this behavior. However, this is disastrous on a filesystem with parent pointers because it requires that a new linkable file /must/ have a pre-existing attr fork, and the parent pointers code uses init_xattrs to create that fork. The preproduction version of mkfs.xfs used to set this, but the V5 sb verifier only requires ATTR2, not ATTR. There is no guard for filesystems with (PARENT && !ATTR). It turns out that I misunderstood the two flags -- ATTR means that we at some point created an attr fork to store xattrs in a file; ATTR2 apparently means only that inodes have dynamic fork offsets or that the filesystem was mounted with the "attr2" option. Fixes: 2442ee15bb1e ("xfs: eager inode attr fork init needs attr feature awareness") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/xfs_inode.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 38f946e3be2da3..a4e3cd8971fc6e 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -42,6 +42,7 @@ #include "xfs_pnfs.h" #include "xfs_parent.h" #include "xfs_xattr.h" +#include "xfs_sb.h" struct kmem_cache *xfs_inode_cache; @@ -870,9 +871,16 @@ xfs_init_new_inode( * this saves us from needing to run a separate transaction to set the * fork offset in the immediate future. */ - if (init_xattrs && xfs_has_attr(mp)) { + if (init_xattrs) { ip->i_forkoff = xfs_default_attroffset(ip) >> 3; xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0); + + if (!xfs_has_attr(mp)) { + spin_lock(&mp->m_sb_lock); + xfs_add_attr(mp); + spin_unlock(&mp->m_sb_lock); + xfs_log_sb(tp); + } } /* From 63b47f026cc841bd3d3438dd6fccbc394dfead87 Mon Sep 17 00:00:00 2001 From: Vyacheslav Frantsishko Date: Wed, 26 Jun 2024 10:03:34 +0300 Subject: [PATCH 1455/1578] ASoC: amd: yc: Fix non-functional mic on ASUS M5602RA The Vivobook S 16X IPS needs a quirks-table entry for the internal microphone to function properly. Signed-off-by: Vyacheslav Frantsishko Reviewed-by: Mario Limonciello Link: https://patch.msgid.link/20240626070334.45633-1-itmymaill@gmail.com Signed-off-by: Mark Brown --- sound/soc/amd/yc/acp6x-mach.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sound/soc/amd/yc/acp6x-mach.c b/sound/soc/amd/yc/acp6x-mach.c index 1760b5d42460af..4e3a8ce690a458 100644 --- a/sound/soc/amd/yc/acp6x-mach.c +++ b/sound/soc/amd/yc/acp6x-mach.c @@ -283,6 +283,13 @@ static const struct dmi_system_id yc_acp_quirk_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "M5402RA"), } }, + { + .driver_data = &acp6x_card, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_PRODUCT_NAME, "M5602RA"), + } + }, { .driver_data = &acp6x_card, .matches = { From 103458874baca0bbc8ae0b66d50201d5faa8c17b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 28 May 2024 14:06:30 +0200 Subject: [PATCH 1456/1578] i2c: viai2c: turn common code into a proper module The i2c-viai2c-common.c file is used by two drivers, but is not a proper abstraction and can get linked into both modules in the same configuration, which results in a warning: scripts/Makefile.build:236: drivers/i2c/busses/Makefile: i2c-viai2c-common.o is added to multiple modules: i2c-wmt i2c-zhaoxin The other problems with this include the incorrect use of a __weak function when both are built-in, and the fact that the "common" module is sprinked with 'if (i2c->plat == ...)' checks that have knowledge about the differences between the drivers using it. Avoid the link time warning by making the common driver a proper module with MODULE_LICENCE()/MODULE_AUTHOR() tags, and remove the __weak function by slightly rearranging the code. This adds a little more duplication between the two main drivers, but those versions get more readable in the process. Fixes: a06b80e83011 ("i2c: add zhaoxin i2c controller driver") Signed-off-by: Arnd Bergmann Tested-by: Hans Hu Signed-off-by: Andi Shyti --- drivers/i2c/busses/Makefile | 6 +- drivers/i2c/busses/i2c-viai2c-common.c | 71 ++------------- drivers/i2c/busses/i2c-viai2c-common.h | 2 +- drivers/i2c/busses/i2c-viai2c-wmt.c | 36 ++++++++ drivers/i2c/busses/i2c-viai2c-zhaoxin.c | 113 +++++++++++++++++++----- 5 files changed, 139 insertions(+), 89 deletions(-) diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile index 3d65934f5eb486..78d0561339e5be 100644 --- a/drivers/i2c/busses/Makefile +++ b/drivers/i2c/busses/Makefile @@ -29,8 +29,7 @@ obj-$(CONFIG_I2C_SIS630) += i2c-sis630.o obj-$(CONFIG_I2C_SIS96X) += i2c-sis96x.o obj-$(CONFIG_I2C_VIA) += i2c-via.o obj-$(CONFIG_I2C_VIAPRO) += i2c-viapro.o -i2c-zhaoxin-objs := i2c-viai2c-zhaoxin.o i2c-viai2c-common.o -obj-$(CONFIG_I2C_ZHAOXIN) += i2c-zhaoxin.o +obj-$(CONFIG_I2C_ZHAOXIN) += i2c-viai2c-zhaoxin.o i2c-viai2c-common.o # Mac SMBus host controller drivers obj-$(CONFIG_I2C_HYDRA) += i2c-hydra.o @@ -120,8 +119,7 @@ obj-$(CONFIG_I2C_TEGRA_BPMP) += i2c-tegra-bpmp.o obj-$(CONFIG_I2C_UNIPHIER) += i2c-uniphier.o obj-$(CONFIG_I2C_UNIPHIER_F) += i2c-uniphier-f.o obj-$(CONFIG_I2C_VERSATILE) += i2c-versatile.o -i2c-wmt-objs := i2c-viai2c-wmt.o i2c-viai2c-common.o -obj-$(CONFIG_I2C_WMT) += i2c-wmt.o +obj-$(CONFIG_I2C_WMT) += i2c-viai2c-wmt.o i2c-viai2c-common.o i2c-octeon-objs := i2c-octeon-core.o i2c-octeon-platdrv.o obj-$(CONFIG_I2C_OCTEON) += i2c-octeon.o i2c-thunderx-objs := i2c-octeon-core.o i2c-thunderx-pcidrv.o diff --git a/drivers/i2c/busses/i2c-viai2c-common.c b/drivers/i2c/busses/i2c-viai2c-common.c index 1844d13f1f7986..162b31306cba8d 100644 --- a/drivers/i2c/busses/i2c-viai2c-common.c +++ b/drivers/i2c/busses/i2c-viai2c-common.c @@ -17,6 +17,7 @@ int viai2c_wait_bus_not_busy(struct viai2c *i2c) return 0; } +EXPORT_SYMBOL_GPL(viai2c_wait_bus_not_busy); static int viai2c_write(struct viai2c *i2c, struct i2c_msg *pmsg, int last) { @@ -121,6 +122,7 @@ int viai2c_xfer(struct i2c_adapter *adap, struct i2c_msg msgs[], int num) return (ret < 0) ? ret : i; } +EXPORT_SYMBOL_GPL(viai2c_xfer); /* * Main process of the byte mode xfer @@ -130,7 +132,7 @@ int viai2c_xfer(struct i2c_adapter *adap, struct i2c_msg msgs[], int num) * 0: there is still data that needs to be transferred * -EIO: error occurred */ -static int viai2c_irq_xfer(struct viai2c *i2c) +int viai2c_irq_xfer(struct viai2c *i2c) { u16 val; struct i2c_msg *msg = i2c->msg; @@ -171,51 +173,11 @@ static int viai2c_irq_xfer(struct viai2c *i2c) return i2c->xfered_len == msg->len; } - -int __weak viai2c_fifo_irq_xfer(struct viai2c *i2c, bool irq) -{ - return 0; -} - -static irqreturn_t viai2c_isr(int irq, void *data) -{ - struct viai2c *i2c = data; - u8 status; - - /* save the status and write-clear it */ - status = readw(i2c->base + VIAI2C_REG_ISR); - if (!status && i2c->platform == VIAI2C_PLAT_ZHAOXIN) - return IRQ_NONE; - - writew(status, i2c->base + VIAI2C_REG_ISR); - - i2c->ret = 0; - if (status & VIAI2C_ISR_NACK_ADDR) - i2c->ret = -EIO; - - if (i2c->platform == VIAI2C_PLAT_WMT && (status & VIAI2C_ISR_SCL_TIMEOUT)) - i2c->ret = -ETIMEDOUT; - - if (!i2c->ret) { - if (i2c->mode == VIAI2C_BYTE_MODE) - i2c->ret = viai2c_irq_xfer(i2c); - else - i2c->ret = viai2c_fifo_irq_xfer(i2c, true); - } - - /* All the data has been successfully transferred or error occurred */ - if (i2c->ret) - complete(&i2c->complete); - - return IRQ_HANDLED; -} +EXPORT_SYMBOL_GPL(viai2c_irq_xfer); int viai2c_init(struct platform_device *pdev, struct viai2c **pi2c, int plat) { - int err; - int irq_flags; struct viai2c *i2c; - struct device_node *np = pdev->dev.of_node; i2c = devm_kzalloc(&pdev->dev, sizeof(*i2c), GFP_KERNEL); if (!i2c) @@ -225,28 +187,8 @@ int viai2c_init(struct platform_device *pdev, struct viai2c **pi2c, int plat) if (IS_ERR(i2c->base)) return PTR_ERR(i2c->base); - if (plat == VIAI2C_PLAT_WMT) { - irq_flags = 0; - i2c->irq = irq_of_parse_and_map(np, 0); - if (!i2c->irq) - return -EINVAL; - } else if (plat == VIAI2C_PLAT_ZHAOXIN) { - irq_flags = IRQF_SHARED; - i2c->irq = platform_get_irq(pdev, 0); - if (i2c->irq < 0) - return i2c->irq; - } else { - return dev_err_probe(&pdev->dev, -EINVAL, "wrong platform type\n"); - } - i2c->platform = plat; - err = devm_request_irq(&pdev->dev, i2c->irq, viai2c_isr, - irq_flags, pdev->name, i2c); - if (err) - return dev_err_probe(&pdev->dev, err, - "failed to request irq %i\n", i2c->irq); - i2c->dev = &pdev->dev; init_completion(&i2c->complete); platform_set_drvdata(pdev, i2c); @@ -254,3 +196,8 @@ int viai2c_init(struct platform_device *pdev, struct viai2c **pi2c, int plat) *pi2c = i2c; return 0; } +EXPORT_SYMBOL_GPL(viai2c_init); + +MODULE_DESCRIPTION("Via/Wondermedia/Zhaoxin I2C master-mode bus adapter"); +MODULE_AUTHOR("Tony Prisk "); +MODULE_LICENSE("GPL"); diff --git a/drivers/i2c/busses/i2c-viai2c-common.h b/drivers/i2c/busses/i2c-viai2c-common.h index 81e827c544347d..00f17733223c57 100644 --- a/drivers/i2c/busses/i2c-viai2c-common.h +++ b/drivers/i2c/busses/i2c-viai2c-common.h @@ -80,6 +80,6 @@ struct viai2c { int viai2c_wait_bus_not_busy(struct viai2c *i2c); int viai2c_xfer(struct i2c_adapter *adap, struct i2c_msg msgs[], int num); int viai2c_init(struct platform_device *pdev, struct viai2c **pi2c, int plat); -int viai2c_fifo_irq_xfer(struct viai2c *i2c, bool irq); +int viai2c_irq_xfer(struct viai2c *i2c); #endif diff --git a/drivers/i2c/busses/i2c-viai2c-wmt.c b/drivers/i2c/busses/i2c-viai2c-wmt.c index e1988f94602662..420fd10fe3aa55 100644 --- a/drivers/i2c/busses/i2c-viai2c-wmt.c +++ b/drivers/i2c/busses/i2c-viai2c-wmt.c @@ -72,6 +72,32 @@ static int wmt_i2c_reset_hardware(struct viai2c *i2c) return 0; } +static irqreturn_t wmt_i2c_isr(int irq, void *data) +{ + struct viai2c *i2c = data; + u8 status; + + /* save the status and write-clear it */ + status = readw(i2c->base + VIAI2C_REG_ISR); + writew(status, i2c->base + VIAI2C_REG_ISR); + + i2c->ret = 0; + if (status & VIAI2C_ISR_NACK_ADDR) + i2c->ret = -EIO; + + if (status & VIAI2C_ISR_SCL_TIMEOUT) + i2c->ret = -ETIMEDOUT; + + if (!i2c->ret) + i2c->ret = viai2c_irq_xfer(i2c); + + /* All the data has been successfully transferred or error occurred */ + if (i2c->ret) + complete(&i2c->complete); + + return IRQ_HANDLED; +} + static int wmt_i2c_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; @@ -84,6 +110,16 @@ static int wmt_i2c_probe(struct platform_device *pdev) if (err) return err; + i2c->irq = platform_get_irq(pdev, 0); + if (i2c->irq < 0) + return i2c->irq; + + err = devm_request_irq(&pdev->dev, i2c->irq, wmt_i2c_isr, + 0, pdev->name, i2c); + if (err) + return dev_err_probe(&pdev->dev, err, + "failed to request irq %i\n", i2c->irq); + i2c->clk = of_clk_get(np, 0); if (IS_ERR(i2c->clk)) { dev_err(&pdev->dev, "unable to request clock\n"); diff --git a/drivers/i2c/busses/i2c-viai2c-zhaoxin.c b/drivers/i2c/busses/i2c-viai2c-zhaoxin.c index 7e3ac2a3e1fd73..ab3e44e147e91a 100644 --- a/drivers/i2c/busses/i2c-viai2c-zhaoxin.c +++ b/drivers/i2c/busses/i2c-viai2c-zhaoxin.c @@ -49,8 +49,7 @@ struct viai2c_zhaoxin { u16 xfer_len; }; -/* 'irq == true' means in interrupt context */ -int viai2c_fifo_irq_xfer(struct viai2c *i2c, bool irq) +static int viai2c_fifo_xfer(struct viai2c *i2c) { u16 i; u8 tmp; @@ -59,17 +58,6 @@ int viai2c_fifo_irq_xfer(struct viai2c *i2c, bool irq) bool read = !!(msg->flags & I2C_M_RD); struct viai2c_zhaoxin *priv = i2c->pltfm_priv; - if (irq) { - /* get the received data */ - if (read) - for (i = 0; i < priv->xfer_len; i++) - msg->buf[i2c->xfered_len + i] = ioread8(base + ZXI2C_REG_HRDR); - - i2c->xfered_len += priv->xfer_len; - if (i2c->xfered_len == msg->len) - return 1; - } - /* reset fifo buffer */ tmp = ioread8(base + ZXI2C_REG_HCR); iowrite8(tmp | ZXI2C_HCR_RST_FIFO, base + ZXI2C_REG_HCR); @@ -92,18 +80,59 @@ int viai2c_fifo_irq_xfer(struct viai2c *i2c, bool irq) iowrite8(tmp, base + VIAI2C_REG_CR); } - if (irq) { - /* continue transmission */ - tmp = ioread8(base + VIAI2C_REG_CR); - iowrite8(tmp |= VIAI2C_CR_CPU_RDY, base + VIAI2C_REG_CR); + u16 tcr_val = i2c->tcr; + + /* start transmission */ + tcr_val |= read ? VIAI2C_TCR_READ : 0; + writew(tcr_val | msg->addr, base + VIAI2C_REG_TCR); + + return 0; +} + +static int viai2c_fifo_irq_xfer(struct viai2c *i2c) +{ + u16 i; + u8 tmp; + struct i2c_msg *msg = i2c->msg; + void __iomem *base = i2c->base; + bool read = !!(msg->flags & I2C_M_RD); + struct viai2c_zhaoxin *priv = i2c->pltfm_priv; + + /* get the received data */ + if (read) + for (i = 0; i < priv->xfer_len; i++) + msg->buf[i2c->xfered_len + i] = ioread8(base + ZXI2C_REG_HRDR); + + i2c->xfered_len += priv->xfer_len; + if (i2c->xfered_len == msg->len) + return 1; + + /* reset fifo buffer */ + tmp = ioread8(base + ZXI2C_REG_HCR); + iowrite8(tmp | ZXI2C_HCR_RST_FIFO, base + ZXI2C_REG_HCR); + + /* set xfer len */ + priv->xfer_len = min_t(u16, msg->len - i2c->xfered_len, ZXI2C_FIFO_SIZE); + if (read) { + iowrite8(priv->xfer_len - 1, base + ZXI2C_REG_HRLR); } else { - u16 tcr_val = i2c->tcr; + iowrite8(priv->xfer_len - 1, base + ZXI2C_REG_HTLR); + /* set write data */ + for (i = 0; i < priv->xfer_len; i++) + iowrite8(msg->buf[i2c->xfered_len + i], base + ZXI2C_REG_HTDR); + } - /* start transmission */ - tcr_val |= read ? VIAI2C_TCR_READ : 0; - writew(tcr_val | msg->addr, base + VIAI2C_REG_TCR); + /* prepare to stop transmission */ + if (priv->hrv && msg->len == (i2c->xfered_len + priv->xfer_len)) { + tmp = ioread8(base + VIAI2C_REG_CR); + tmp |= read ? VIAI2C_CR_RX_END : VIAI2C_CR_TX_END; + iowrite8(tmp, base + VIAI2C_REG_CR); } + /* continue transmission */ + tmp = ioread8(base + VIAI2C_REG_CR); + iowrite8(tmp |= VIAI2C_CR_CPU_RDY, base + VIAI2C_REG_CR); + return 0; } @@ -135,7 +164,7 @@ static int zxi2c_master_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int priv->xfer_len = 0; i2c->xfered_len = 0; - viai2c_fifo_irq_xfer(i2c, 0); + viai2c_fifo_xfer(i2c); if (!wait_for_completion_timeout(&i2c->complete, VIAI2C_TIMEOUT)) return -ETIMEDOUT; @@ -228,6 +257,36 @@ static void zxi2c_get_bus_speed(struct viai2c *i2c) dev_info(i2c->dev, "speed mode is %s\n", i2c_freq_mode_string(params[0])); } +static irqreturn_t zxi2c_isr(int irq, void *data) +{ + struct viai2c *i2c = data; + u8 status; + + /* save the status and write-clear it */ + status = readw(i2c->base + VIAI2C_REG_ISR); + if (!status) + return IRQ_NONE; + + writew(status, i2c->base + VIAI2C_REG_ISR); + + i2c->ret = 0; + if (status & VIAI2C_ISR_NACK_ADDR) + i2c->ret = -EIO; + + if (!i2c->ret) { + if (i2c->mode == VIAI2C_BYTE_MODE) + i2c->ret = viai2c_irq_xfer(i2c); + else + i2c->ret = viai2c_fifo_irq_xfer(i2c); + } + + /* All the data has been successfully transferred or error occurred */ + if (i2c->ret) + complete(&i2c->complete); + + return IRQ_HANDLED; +} + static int zxi2c_probe(struct platform_device *pdev) { int error; @@ -239,6 +298,16 @@ static int zxi2c_probe(struct platform_device *pdev) if (error) return error; + i2c->irq = platform_get_irq(pdev, 0); + if (i2c->irq < 0) + return i2c->irq; + + error = devm_request_irq(&pdev->dev, i2c->irq, zxi2c_isr, + IRQF_SHARED, pdev->name, i2c); + if (error) + return dev_err_probe(&pdev->dev, error, + "failed to request irq %i\n", i2c->irq); + priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; From 77453e2b015b5ced5b3f45364dd5a72dfc3bdecb Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Tue, 25 Jun 2024 12:22:36 +0200 Subject: [PATCH 1457/1578] net: usb: qmi_wwan: add Telit FN912 compositions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the following Telit FN912 compositions: 0x3000: rmnet + tty (AT/NMEA) + tty (AT) + tty (diag) T: Bus=03 Lev=01 Prnt=03 Port=07 Cnt=01 Dev#= 8 Spd=480 MxCh= 0 D: Ver= 2.01 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=3000 Rev=05.15 S: Manufacturer=Telit Cinterion S: Product=FN912 S: SerialNumber=92c4c4d8 C: #Ifs= 4 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=60 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x3001: rmnet + tty (AT) + tty (diag) + DPL (data packet logging) + adb T: Bus=03 Lev=01 Prnt=03 Port=07 Cnt=01 Dev#= 7 Spd=480 MxCh= 0 D: Ver= 2.01 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=3001 Rev=05.15 S: Manufacturer=Telit Cinterion S: Product=FN912 S: SerialNumber=92c4c4d8 C: #Ifs= 5 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=80 Driver=(none) E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=usbfs E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Daniele Palmas Acked-by: Bjørn Mork Link: https://patch.msgid.link/20240625102236.69539-1-dnlplm@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/qmi_wwan.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 663e46348ce333..386d62769dedb1 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1372,6 +1372,8 @@ static const struct usb_device_id products[] = { {QMI_QUIRK_SET_DTR(0x1bc7, 0x1260, 2)}, /* Telit LE910Cx */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1261, 2)}, /* Telit LE910Cx */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1900, 1)}, /* Telit LN940 series */ + {QMI_QUIRK_SET_DTR(0x1bc7, 0x3000, 0)}, /* Telit FN912 series */ + {QMI_QUIRK_SET_DTR(0x1bc7, 0x3001, 0)}, /* Telit FN912 series */ {QMI_FIXED_INTF(0x1c9e, 0x9801, 3)}, /* Telewell TW-3G HSPA+ */ {QMI_FIXED_INTF(0x1c9e, 0x9803, 4)}, /* Telewell TW-3G HSPA+ */ {QMI_FIXED_INTF(0x1c9e, 0x9b01, 3)}, /* XS Stick W100-2 from 4G Systems */ From edf2d546bfd6f5c4d143715cef1b1e7ce5718c4e Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 24 Jun 2024 10:21:41 +0200 Subject: [PATCH 1458/1578] riscv: patch: Flush the icache right after patching to avoid illegal insns We cannot delay the icache flush after patching some functions as we may have patched a function that will get called before the icache flush. The only way to completely avoid such scenario is by flushing the icache as soon as we patch a function. This will probably be costly as we don't batch the icache maintenance anymore. Fixes: 6ca445d8af0e ("riscv: Fix early ftrace nop patching") Reported-by: Conor Dooley Closes: https://lore.kernel.org/linux-riscv/20240613-lubricant-breath-061192a9489a@wendy/ Signed-off-by: Alexandre Ghiti Reviewed-by: Andy Chiu Link: https://lore.kernel.org/r/20240624082141.153871-1-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/ftrace.c | 7 ++----- arch/riscv/kernel/patch.c | 26 ++++++++++++++++++-------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c index 87cbd86576b281..4b95c574fd0457 100644 --- a/arch/riscv/kernel/ftrace.c +++ b/arch/riscv/kernel/ftrace.c @@ -120,9 +120,6 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) out = ftrace_make_nop(mod, rec, MCOUNT_ADDR); mutex_unlock(&text_mutex); - if (!mod) - local_flush_icache_range(rec->ip, rec->ip + MCOUNT_INSN_SIZE); - return out; } @@ -156,9 +153,9 @@ static int __ftrace_modify_code(void *data) } else { while (atomic_read(¶m->cpu_count) <= num_online_cpus()) cpu_relax(); - } - local_flush_icache_all(); + local_flush_icache_all(); + } return 0; } diff --git a/arch/riscv/kernel/patch.c b/arch/riscv/kernel/patch.c index 4007563fb60735..ab03732d06c46f 100644 --- a/arch/riscv/kernel/patch.c +++ b/arch/riscv/kernel/patch.c @@ -89,6 +89,14 @@ static int __patch_insn_set(void *addr, u8 c, size_t len) memset(waddr, c, len); + /* + * We could have just patched a function that is about to be + * called so make sure we don't execute partially patched + * instructions by flushing the icache as soon as possible. + */ + local_flush_icache_range((unsigned long)waddr, + (unsigned long)waddr + len); + patch_unmap(FIX_TEXT_POKE0); if (across_pages) @@ -135,6 +143,14 @@ static int __patch_insn_write(void *addr, const void *insn, size_t len) ret = copy_to_kernel_nofault(waddr, insn, len); + /* + * We could have just patched a function that is about to be + * called so make sure we don't execute partially patched + * instructions by flushing the icache as soon as possible. + */ + local_flush_icache_range((unsigned long)waddr, + (unsigned long)waddr + len); + patch_unmap(FIX_TEXT_POKE0); if (across_pages) @@ -189,9 +205,6 @@ int patch_text_set_nosync(void *addr, u8 c, size_t len) ret = patch_insn_set(tp, c, len); - if (!ret) - flush_icache_range((uintptr_t)tp, (uintptr_t)tp + len); - return ret; } NOKPROBE_SYMBOL(patch_text_set_nosync); @@ -224,9 +237,6 @@ int patch_text_nosync(void *addr, const void *insns, size_t len) ret = patch_insn_write(tp, insns, len); - if (!ret) - flush_icache_range((uintptr_t) tp, (uintptr_t) tp + len); - return ret; } NOKPROBE_SYMBOL(patch_text_nosync); @@ -253,9 +263,9 @@ static int patch_text_cb(void *data) } else { while (atomic_read(&patch->cpu_count) <= num_online_cpus()) cpu_relax(); - } - local_flush_icache_all(); + local_flush_icache_all(); + } return ret; } From 23b2188920a25e88d447dd7d819a0b0f62fb4455 Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Thu, 13 Jun 2024 15:11:06 +0800 Subject: [PATCH 1459/1578] riscv: stacktrace: convert arch_stack_walk() to noinstr arch_stack_walk() is called intensively in function_graph when the kernel is compiled with CONFIG_TRACE_IRQFLAGS. As a result, the kernel logs a lot of arch_stack_walk and its sub-functions into the ftrace buffer. However, these functions should not appear on the trace log because they are part of the ftrace itself. This patch references what arm64 does for the smae function. So it further prevent the re-enter kprobe issue, which is also possible on riscv. Related-to: commit 0fbcd8abf337 ("arm64: Prohibit instrumentation on arch_stack_walk()") Fixes: 680341382da5 ("riscv: add CALLER_ADDRx support") Signed-off-by: Andy Chiu Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20240613-dev-andyc-dyn-ftrace-v4-v1-1-1a538e12c01e@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/stacktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index 528ec7cc9a6225..0d3f00eb0baee3 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -156,7 +156,7 @@ unsigned long __get_wchan(struct task_struct *task) return pc; } -noinline void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, +noinline noinstr void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, struct task_struct *task, struct pt_regs *regs) { walk_stackframe(task, regs, consume_entry, cookie); From 9d65ab6050d25f17c13f4195aa8e160c6ac638f6 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 26 Jun 2024 16:51:13 +0200 Subject: [PATCH 1460/1578] ALSA: seq: Fix missing MSB in MIDI2 SPP conversion The conversion of SPP to MIDI2 UMP called a wrong function, and the secondary argument wasn't taken. As a result, MSB of SPP was always zero. Fix to call the right function. Fixes: e9e02819a98a ("ALSA: seq: Automatic conversion of UMP events") Link: https://patch.msgid.link/20240626145141.16648-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/seq/seq_ump_convert.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/core/seq/seq_ump_convert.c b/sound/core/seq/seq_ump_convert.c index 6687efdceea13c..e90b27a135e6f1 100644 --- a/sound/core/seq/seq_ump_convert.c +++ b/sound/core/seq/seq_ump_convert.c @@ -1020,7 +1020,7 @@ static int system_2p_ev_to_ump_midi2(const struct snd_seq_event *event, union snd_ump_midi2_msg *data, unsigned char status) { - return system_1p_ev_to_ump_midi1(event, dest_port, + return system_2p_ev_to_ump_midi1(event, dest_port, (union snd_ump_midi1_msg *)data, status); } From cf546dd289e0f6d2594c25e2fb4e19ee67c6d988 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 27 May 2024 17:40:10 +0200 Subject: [PATCH 1461/1578] block: change rq_integrity_vec to respect the iterator If we allocate a bio that is larger than NVMe maximum request size, attach integrity metadata to it and send it to the NVMe subsystem, the integrity metadata will be corrupted. Splitting the bio works correctly. The function bio_split will clone the bio, trim the iterator of the first bio and advance the iterator of the second bio. However, the function rq_integrity_vec has a bug - it returns the first vector of the bio's metadata and completely disregards the metadata iterator that was advanced when the bio was split. Thus, the second bio uses the same metadata as the first bio and this leads to metadata corruption. This commit changes rq_integrity_vec, so that it calls mp_bvec_iter_bvec instead of returning the first vector. mp_bvec_iter_bvec reads the iterator and uses it to build a bvec for the current position in the iterator. The "queue_max_integrity_segments(rq->q) > 1" check was removed, because the updated rq_integrity_vec function works correctly with multiple segments. Signed-off-by: Mikulas Patocka Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/49d1afaa-f934-6ed2-a678-e0d428c63a65@redhat.com Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 6 +++--- include/linux/blk-integrity.h | 14 +++++++------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 102a9fb0c65fff..5d8035218de9bd 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -826,9 +826,9 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req, struct nvme_command *cmnd) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct bio_vec bv = rq_integrity_vec(req); - iod->meta_dma = dma_map_bvec(dev->dev, rq_integrity_vec(req), - rq_dma_dir(req), 0); + iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0); if (dma_mapping_error(dev->dev, iod->meta_dma)) return BLK_STS_IOERR; cmnd->rw.metadata = cpu_to_le64(iod->meta_dma); @@ -967,7 +967,7 @@ static __always_inline void nvme_pci_unmap_rq(struct request *req) struct nvme_iod *iod = blk_mq_rq_to_pdu(req); dma_unmap_page(dev->dev, iod->meta_dma, - rq_integrity_vec(req)->bv_len, rq_dma_dir(req)); + rq_integrity_vec(req).bv_len, rq_dma_dir(req)); } if (blk_rq_nr_phys_segments(req)) diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index d201140d77a336..0fdd62e6d4b02e 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -90,14 +90,13 @@ static inline bool blk_integrity_rq(struct request *rq) } /* - * Return the first bvec that contains integrity data. Only drivers that are - * limited to a single integrity segment should use this helper. + * Return the current bvec that contains the integrity data. bip_iter may be + * advanced to iterate over the integrity data. */ -static inline struct bio_vec *rq_integrity_vec(struct request *rq) +static inline struct bio_vec rq_integrity_vec(struct request *rq) { - if (WARN_ON_ONCE(queue_max_integrity_segments(rq->q) > 1)) - return NULL; - return rq->bio->bi_integrity->bip_vec; + return mp_bvec_iter_bvec(rq->bio->bi_integrity->bip_vec, + rq->bio->bi_integrity->bip_iter); } #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline int blk_rq_count_integrity_sg(struct request_queue *q, @@ -148,7 +147,8 @@ static inline int blk_integrity_rq(struct request *rq) static inline struct bio_vec *rq_integrity_vec(struct request *rq) { - return NULL; + /* the optimizer will remove all calls to this function */ + return (struct bio_vec){ }; } #endif /* CONFIG_BLK_DEV_INTEGRITY */ From 68f97fe330e01450ace63da0ce5cab676fc97f9a Mon Sep 17 00:00:00 2001 From: Jack Yu Date: Wed, 26 Jun 2024 08:25:34 +0000 Subject: [PATCH 1462/1578] ASoC: rt5645: fix issue of random interrupt from push-button Modify register setting sequence of enabling inline command to fix issue of random interrupt from push-button. Signed-off-by: Jack Yu Link: https://patch.msgid.link/9a7a3a66cbcb426487ca6f558f45e922@realtek.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt5645.c | 24 ++++++++++++++++++------ sound/soc/codecs/rt5645.h | 6 ++++++ 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/sound/soc/codecs/rt5645.c b/sound/soc/codecs/rt5645.c index cdb7ff7020e931..51187b1e0ed2f7 100644 --- a/sound/soc/codecs/rt5645.c +++ b/sound/soc/codecs/rt5645.c @@ -81,7 +81,7 @@ static const struct reg_sequence init_list[] = { static const struct reg_sequence rt5650_init_list[] = { {0xf6, 0x0100}, {RT5645_PWR_ANLG1, 0x02}, - {RT5645_IL_CMD3, 0x0018}, + {RT5645_IL_CMD3, 0x6728}, }; static const struct reg_default rt5645_reg[] = { @@ -3130,20 +3130,32 @@ static void rt5645_enable_push_button_irq(struct snd_soc_component *component, bool enable) { struct snd_soc_dapm_context *dapm = snd_soc_component_get_dapm(component); + int ret; if (enable) { snd_soc_dapm_force_enable_pin(dapm, "ADC L power"); snd_soc_dapm_force_enable_pin(dapm, "ADC R power"); snd_soc_dapm_sync(dapm); + snd_soc_component_update_bits(component, RT5650_4BTN_IL_CMD2, + RT5645_EN_4BTN_IL_MASK | RT5645_RST_4BTN_IL_MASK, + RT5645_EN_4BTN_IL_EN | RT5645_RST_4BTN_IL_RST); + usleep_range(10000, 15000); + snd_soc_component_update_bits(component, RT5650_4BTN_IL_CMD2, + RT5645_EN_4BTN_IL_MASK | RT5645_RST_4BTN_IL_MASK, + RT5645_EN_4BTN_IL_EN | RT5645_RST_4BTN_IL_NORM); + msleep(50); + ret = snd_soc_component_read(component, RT5645_INT_IRQ_ST); + pr_debug("%s read %x = %x\n", __func__, RT5645_INT_IRQ_ST, + snd_soc_component_read(component, RT5645_INT_IRQ_ST)); + snd_soc_component_write(component, RT5645_INT_IRQ_ST, ret); + ret = snd_soc_component_read(component, RT5650_4BTN_IL_CMD1); + pr_debug("%s read %x = %x\n", __func__, RT5650_4BTN_IL_CMD1, + snd_soc_component_read(component, RT5650_4BTN_IL_CMD1)); + snd_soc_component_write(component, RT5650_4BTN_IL_CMD1, ret); snd_soc_component_update_bits(component, RT5650_4BTN_IL_CMD1, 0x3, 0x3); snd_soc_component_update_bits(component, RT5645_INT_IRQ_ST, 0x8, 0x8); - snd_soc_component_update_bits(component, - RT5650_4BTN_IL_CMD2, 0x8000, 0x8000); - snd_soc_component_read(component, RT5650_4BTN_IL_CMD1); - pr_debug("%s read %x = %x\n", __func__, RT5650_4BTN_IL_CMD1, - snd_soc_component_read(component, RT5650_4BTN_IL_CMD1)); } else { snd_soc_component_update_bits(component, RT5650_4BTN_IL_CMD2, 0x8000, 0x0); snd_soc_component_update_bits(component, RT5645_INT_IRQ_ST, 0x8, 0x0); diff --git a/sound/soc/codecs/rt5645.h b/sound/soc/codecs/rt5645.h index 90816b2c5489fa..bef74b29fd541b 100644 --- a/sound/soc/codecs/rt5645.h +++ b/sound/soc/codecs/rt5645.h @@ -2011,6 +2011,12 @@ #define RT5645_ZCD_HP_DIS (0x0 << 15) #define RT5645_ZCD_HP_EN (0x1 << 15) +/* Buttons Inline Command Function 2 (0xe0) */ +#define RT5645_EN_4BTN_IL_MASK (0x1 << 15) +#define RT5645_EN_4BTN_IL_EN (0x1 << 15) +#define RT5645_RST_4BTN_IL_MASK (0x1 << 14) +#define RT5645_RST_4BTN_IL_RST (0x0 << 14) +#define RT5645_RST_4BTN_IL_NORM (0x1 << 14) /* Codec Private Register definition */ /* DAC ADC Digital Volume (0x00) */ From 573d5abf3df00c879fbd25774e4cf3e22c9cabd0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2024 16:26:22 +0200 Subject: [PATCH 1463/1578] md: set md-specific flags for all queue limits The md driver wants to enforce a number of flags for all devices, even when not inheriting them from the underlying devices. To make sure these flags survive the queue_limits_set calls that md uses to update the queue limits without deriving them form the previous limits add a new md_init_stacking_limits helper that calls blk_set_stacking_limits and sets these flags. Fixes: 1122c0c1cc71 ("block: move cache control settings out of queue->flags") Reported-by: kernel test robot Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20240626142637.300624-2-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/md.c | 14 +++++++++----- drivers/md/md.h | 1 + drivers/md/raid0.c | 2 +- drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 2 +- drivers/md/raid5.c | 2 +- 6 files changed, 14 insertions(+), 9 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 69ea54aedd99a1..0ff26a547f1afc 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5853,6 +5853,14 @@ static void mddev_delayed_delete(struct work_struct *ws) kobject_put(&mddev->kobj); } +void md_init_stacking_limits(struct queue_limits *lim) +{ + blk_set_stacking_limits(lim); + lim->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | + BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; +} +EXPORT_SYMBOL_GPL(md_init_stacking_limits); + struct mddev *md_alloc(dev_t dev, char *name) { /* @@ -5871,10 +5879,6 @@ struct mddev *md_alloc(dev_t dev, char *name) int shift; int unit; int error; - struct queue_limits lim = { - .features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | - BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT, - }; /* * Wait for any previous instance of this device to be completely @@ -5914,7 +5918,7 @@ struct mddev *md_alloc(dev_t dev, char *name) */ mddev->hold_active = UNTIL_STOP; - disk = blk_alloc_disk(&lim, NUMA_NO_NODE); + disk = blk_alloc_disk(NULL, NUMA_NO_NODE); if (IS_ERR(disk)) { error = PTR_ERR(disk); goto out_free_mddev; diff --git a/drivers/md/md.h b/drivers/md/md.h index c4d7ebf9587d07..28cb4b0b6c1740 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -893,6 +893,7 @@ extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); extern int mddev_init(struct mddev *mddev); extern void mddev_destroy(struct mddev *mddev); +void md_init_stacking_limits(struct queue_limits *lim); struct mddev *md_alloc(dev_t dev, char *name); void mddev_put(struct mddev *mddev); extern int md_run(struct mddev *mddev); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 62634e2a33bd0f..32d58752477847 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -379,7 +379,7 @@ static int raid0_set_limits(struct mddev *mddev) struct queue_limits lim; int err; - blk_set_stacking_limits(&lim); + md_init_stacking_limits(&lim); lim.max_hw_sectors = mddev->chunk_sectors; lim.max_write_zeroes_sectors = mddev->chunk_sectors; lim.io_min = mddev->chunk_sectors << 9; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 1a0eba65b8a92b..04a0c2ca173245 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -3194,7 +3194,7 @@ static int raid1_set_limits(struct mddev *mddev) struct queue_limits lim; int err; - blk_set_stacking_limits(&lim); + md_init_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); if (err) { diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 3334aa803c8380..2a9c4ee982e023 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3974,7 +3974,7 @@ static int raid10_set_queue_limits(struct mddev *mddev) struct queue_limits lim; int err; - blk_set_stacking_limits(&lim); + md_init_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * raid10_nr_stripes(conf); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 0192a6323f09ba..10219205160bbf 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7708,7 +7708,7 @@ static int raid5_set_limits(struct mddev *mddev) */ stripe = roundup_pow_of_two(data_disks * (mddev->chunk_sectors << 9)); - blk_set_stacking_limits(&lim); + md_init_stacking_limits(&lim); lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded); lim.features |= BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE; From 78887d004fb2bb03233122a048eaf46e850dabf4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2024 16:26:23 +0200 Subject: [PATCH 1464/1578] block: correctly report cache type Check the features flag and the override flag using the blk_queue_write_cache, helper otherwise we're going to always report "write through". Fixes: 1122c0c1cc71 ("block: move cache control settings out of queue->flags") Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: John Garry Link: https://lore.kernel.org/r/20240626142637.300624-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 1a984179f3acc5..3a167abecdceae 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -453,9 +453,9 @@ static ssize_t queue_io_timeout_store(struct request_queue *q, const char *page, static ssize_t queue_wc_show(struct request_queue *q, char *page) { - if (q->limits.features & BLK_FLAG_WRITE_CACHE_DISABLED) - return sprintf(page, "write through\n"); - return sprintf(page, "write back\n"); + if (blk_queue_write_cache(q)) + return sprintf(page, "write back\n"); + return sprintf(page, "write through\n"); } static ssize_t queue_wc_store(struct request_queue *q, const char *page, From ec9b1cf0b0ebfb52274971a8a0e74e0a133f64fb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2024 16:26:24 +0200 Subject: [PATCH 1465/1578] block: rename BLK_FEAT_MISALIGNED This is a flag for ->flags and not a feature for ->features. And fix the one place that actually incorrectly cleared it from ->features. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20240626142637.300624-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 18 +++++++++--------- include/linux/blkdev.h | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index ec7dbe93d5c324..ed39a55c5bae7c 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -351,7 +351,7 @@ static int blk_validate_limits(struct queue_limits *lim) if (lim->alignment_offset) { lim->alignment_offset &= (lim->physical_block_size - 1); - lim->features &= ~BLK_FEAT_MISALIGNED; + lim->flags &= ~BLK_FLAG_MISALIGNED; } if (!(lim->features & BLK_FEAT_WRITE_CACHE)) @@ -564,7 +564,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, if (!(b->features & BLK_FEAT_POLL)) t->features &= ~BLK_FEAT_POLL; - t->flags |= (b->flags & BLK_FEAT_MISALIGNED); + t->flags |= (b->flags & BLK_FLAG_MISALIGNED); t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_user_sectors = min_not_zero(t->max_user_sectors, @@ -603,7 +603,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, /* Verify that top and bottom intervals line up */ if (max(top, bottom) % min(top, bottom)) { - t->flags |= BLK_FEAT_MISALIGNED; + t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } } @@ -625,28 +625,28 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, /* Physical block size a multiple of the logical block size? */ if (t->physical_block_size & (t->logical_block_size - 1)) { t->physical_block_size = t->logical_block_size; - t->flags |= BLK_FEAT_MISALIGNED; + t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } /* Minimum I/O a multiple of the physical block size? */ if (t->io_min & (t->physical_block_size - 1)) { t->io_min = t->physical_block_size; - t->flags |= BLK_FEAT_MISALIGNED; + t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } /* Optimal I/O a multiple of the physical block size? */ if (t->io_opt & (t->physical_block_size - 1)) { t->io_opt = 0; - t->flags |= BLK_FEAT_MISALIGNED; + t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } /* chunk_sectors a multiple of the physical block size? */ if ((t->chunk_sectors << 9) & (t->physical_block_size - 1)) { t->chunk_sectors = 0; - t->flags |= BLK_FEAT_MISALIGNED; + t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } @@ -656,7 +656,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, /* Verify that new alignment_offset is on a logical block boundary */ if (t->alignment_offset & (t->logical_block_size - 1)) { - t->flags |= BLK_FEAT_MISALIGNED; + t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } @@ -809,7 +809,7 @@ int bdev_alignment_offset(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); - if (q->limits.flags & BLK_FEAT_MISALIGNED) + if (q->limits.flags & BLK_FLAG_MISALIGNED) return -1; if (bdev_is_partition(bdev)) return queue_limit_alignment_offset(&q->limits, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b2f1362c46814f..1a7e9d9c16d78b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -347,7 +347,7 @@ enum { BLK_FLAG_WRITE_CACHE_DISABLED = (1u << 0), /* I/O topology is misaligned */ - BLK_FEAT_MISALIGNED = (1u << 1), + BLK_FLAG_MISALIGNED = (1u << 1), }; struct queue_limits { From fcf865e357f80285af12c0c9a49f89d71acb7f4b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2024 16:26:25 +0200 Subject: [PATCH 1466/1578] block: convert features and flags to __bitwise types ... and let sparse help us catch mismatches or abuses. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20240626142637.300624-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 6 +-- include/linux/blkdev.h | 85 +++++++++++++++++++++--------------------- 2 files changed, 46 insertions(+), 45 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 3a167abecdceae..2e6d9b918127fe 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -288,7 +288,7 @@ static ssize_t queue_dma_alignment_show(struct request_queue *q, char *page) } static ssize_t queue_feature_store(struct request_queue *q, const char *page, - size_t count, unsigned int feature) + size_t count, blk_features_t feature) { struct queue_limits lim; unsigned long val; @@ -418,7 +418,7 @@ static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page, static ssize_t queue_poll_show(struct request_queue *q, char *page) { - return queue_var_show(q->limits.features & BLK_FEAT_POLL, page); + return queue_var_show(!!(q->limits.features & BLK_FEAT_POLL), page); } static ssize_t queue_poll_store(struct request_queue *q, const char *page, @@ -492,7 +492,7 @@ static ssize_t queue_fua_show(struct request_queue *q, char *page) static ssize_t queue_dax_show(struct request_queue *q, char *page) { - return queue_var_show(blk_queue_dax(q), page); + return queue_var_show(!!blk_queue_dax(q), page); } #define QUEUE_RO_ENTRY(_prefix, _name) \ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1a7e9d9c16d78b..b37826b350a2e3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -283,55 +283,56 @@ static inline bool blk_op_is_passthrough(blk_opf_t op) } /* flags set by the driver in queue_limits.features */ -enum { - /* supports a volatile write cache */ - BLK_FEAT_WRITE_CACHE = (1u << 0), +typedef unsigned int __bitwise blk_features_t; - /* supports passing on the FUA bit */ - BLK_FEAT_FUA = (1u << 1), +/* supports a volatile write cache */ +#define BLK_FEAT_WRITE_CACHE ((__force blk_features_t)(1u << 0)) - /* rotational device (hard drive or floppy) */ - BLK_FEAT_ROTATIONAL = (1u << 2), +/* supports passing on the FUA bit */ +#define BLK_FEAT_FUA ((__force blk_features_t)(1u << 1)) - /* contributes to the random number pool */ - BLK_FEAT_ADD_RANDOM = (1u << 3), +/* rotational device (hard drive or floppy) */ +#define BLK_FEAT_ROTATIONAL ((__force blk_features_t)(1u << 2)) - /* do disk/partitions IO accounting */ - BLK_FEAT_IO_STAT = (1u << 4), +/* contributes to the random number pool */ +#define BLK_FEAT_ADD_RANDOM ((__force blk_features_t)(1u << 3)) - /* don't modify data until writeback is done */ - BLK_FEAT_STABLE_WRITES = (1u << 5), +/* do disk/partitions IO accounting */ +#define BLK_FEAT_IO_STAT ((__force blk_features_t)(1u << 4)) - /* always completes in submit context */ - BLK_FEAT_SYNCHRONOUS = (1u << 6), +/* don't modify data until writeback is done */ +#define BLK_FEAT_STABLE_WRITES ((__force blk_features_t)(1u << 5)) - /* supports REQ_NOWAIT */ - BLK_FEAT_NOWAIT = (1u << 7), +/* always completes in submit context */ +#define BLK_FEAT_SYNCHRONOUS ((__force blk_features_t)(1u << 6)) - /* supports DAX */ - BLK_FEAT_DAX = (1u << 8), +/* supports REQ_NOWAIT */ +#define BLK_FEAT_NOWAIT ((__force blk_features_t)(1u << 7)) - /* supports I/O polling */ - BLK_FEAT_POLL = (1u << 9), +/* supports DAX */ +#define BLK_FEAT_DAX ((__force blk_features_t)(1u << 8)) - /* is a zoned device */ - BLK_FEAT_ZONED = (1u << 10), +/* supports I/O polling */ +#define BLK_FEAT_POLL ((__force blk_features_t)(1u << 9)) - /* supports Zone Reset All */ - BLK_FEAT_ZONE_RESETALL = (1u << 11), +/* is a zoned device */ +#define BLK_FEAT_ZONED ((__force blk_features_t)(1u << 10)) - /* supports PCI(e) p2p requests */ - BLK_FEAT_PCI_P2PDMA = (1u << 12), +/* supports Zone Reset All */ +#define BLK_FEAT_ZONE_RESETALL ((__force blk_features_t)(1u << 11)) - /* skip this queue in blk_mq_(un)quiesce_tagset */ - BLK_FEAT_SKIP_TAGSET_QUIESCE = (1u << 13), +/* supports PCI(e) p2p requests */ +#define BLK_FEAT_PCI_P2PDMA ((__force blk_features_t)(1u << 12)) - /* bounce all highmem pages */ - BLK_FEAT_BOUNCE_HIGH = (1u << 14), +/* skip this queue in blk_mq_(un)quiesce_tagset */ +#define BLK_FEAT_SKIP_TAGSET_QUIESCE ((__force blk_features_t)(1u << 13)) - /* undocumented magic for bcache */ - BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE = (1u << 15), -}; +/* bounce all highmem pages */ +#define BLK_FEAT_BOUNCE_HIGH ((__force blk_features_t)(1u << 14)) + +/* undocumented magic for bcache */ +#define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \ + ((__force blk_features_t)(1u << 15)) /* * Flags automatically inherited when stacking limits. @@ -342,17 +343,17 @@ enum { BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE) /* internal flags in queue_limits.flags */ -enum { - /* do not send FLUSH/FUA commands despite advertising a write cache */ - BLK_FLAG_WRITE_CACHE_DISABLED = (1u << 0), +typedef unsigned int __bitwise blk_flags_t; - /* I/O topology is misaligned */ - BLK_FLAG_MISALIGNED = (1u << 1), -}; +/* do not send FLUSH/FUA commands despite advertising a write cache */ +#define BLK_FLAG_WRITE_CACHE_DISABLED ((__force blk_flags_t)(1u << 0)) + +/* I/O topology is misaligned */ +#define BLK_FLAG_MISALIGNED ((__force blk_flags_t)(1u << 1)) struct queue_limits { - unsigned int features; - unsigned int flags; + blk_features_t features; + blk_flags_t flags; unsigned long seg_boundary_mask; unsigned long virt_boundary_mask; From 3302f6f09052274945f877beeb83f74641de2418 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2024 16:26:26 +0200 Subject: [PATCH 1467/1578] block: conding style fixup for blk_queue_max_guaranteed_bio "static" never goes on a line of its own. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20240626142637.300624-6-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index ed39a55c5bae7c..c2221b7406d46a 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -142,8 +142,7 @@ static int blk_validate_integrity_limits(struct queue_limits *lim) * so we assume that we can fit in at least PAGE_SIZE in a segment, apart from * the first and last segments. */ -static -unsigned int blk_queue_max_guaranteed_bio(struct queue_limits *lim) +static unsigned int blk_queue_max_guaranteed_bio(struct queue_limits *lim) { unsigned int max_segments = min(BIO_MAX_VECS, lim->max_segments); unsigned int length; From 73781b3b81e76583708a652c853d54d03dce031d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2024 16:26:27 +0200 Subject: [PATCH 1468/1578] block: remove disk_update_readahead Mark blk_apply_bdi_limits non-static and open code disk_update_readahead in the only caller. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20240626142637.300624-7-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 8 +------- block/blk.h | 2 ++ block/genhd.c | 2 +- include/linux/blkdev.h | 1 - 4 files changed, 4 insertions(+), 9 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index c2221b7406d46a..c692e80bb4f890 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -55,7 +55,7 @@ void blk_set_stacking_limits(struct queue_limits *lim) } EXPORT_SYMBOL(blk_set_stacking_limits); -static void blk_apply_bdi_limits(struct backing_dev_info *bdi, +void blk_apply_bdi_limits(struct backing_dev_info *bdi, struct queue_limits *lim) { /* @@ -434,12 +434,6 @@ int queue_limits_set(struct request_queue *q, struct queue_limits *lim) } EXPORT_SYMBOL_GPL(queue_limits_set); -void disk_update_readahead(struct gendisk *disk) -{ - blk_apply_bdi_limits(disk->bdi, &disk->queue->limits); -} -EXPORT_SYMBOL_GPL(disk_update_readahead); - /** * blk_limits_io_min - set minimum request size for a device * @limits: the queue limits diff --git a/block/blk.h b/block/blk.h index d0a986d8ee507e..95e5a4f81693c4 100644 --- a/block/blk.h +++ b/block/blk.h @@ -358,6 +358,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio); enum elv_merge blk_try_merge(struct request *rq, struct bio *bio); int blk_set_default_limits(struct queue_limits *lim); +void blk_apply_bdi_limits(struct backing_dev_info *bdi, + struct queue_limits *lim); int blk_dev_init(void); /* diff --git a/block/genhd.c b/block/genhd.c index 8f1f3c6b4d6729..4dc95a46350532 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -524,7 +524,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, disk->part0->bd_dev = MKDEV(disk->major, disk->first_minor); } - disk_update_readahead(disk); + blk_apply_bdi_limits(disk->bdi, &disk->queue->limits); disk_add_events(disk); set_bit(GD_ADDED, &disk->state); return 0; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b37826b350a2e3..6b88382012e958 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -973,7 +973,6 @@ static inline void blk_queue_disable_write_zeroes(struct request_queue *q) /* * Access functions for manipulating queue properties */ -void disk_update_readahead(struct gendisk *disk); extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); From abfc9d810926dfbf5645c7755c8d5ab96273f27d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2024 16:26:28 +0200 Subject: [PATCH 1469/1578] block: remove the fallback case in queue_dma_alignment Now that all updates go through blk_validate_limits the default of 511 is set at initialization time. Also remove the unused NULL check as calling this helper on a NULL queue can't happen (and doesn't make much sense to start with). Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20240626142637.300624-8-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6b88382012e958..94fcbc91231208 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1394,7 +1394,7 @@ static inline bool bdev_is_zone_start(struct block_device *bdev, static inline int queue_dma_alignment(const struct request_queue *q) { - return q ? q->limits.dma_alignment : 511; + return q->limits.dma_alignment; } static inline unsigned int From e94b45d08b5d1c230c0f59c3eed758d28658851e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2024 16:26:29 +0200 Subject: [PATCH 1470/1578] block: move dma_pad_mask into queue_limits dma_pad_mask is a queue_limits by all ways of looking at it, so move it there and set it through the atomic queue limits APIs. Add a little helper that takes the alignment and pad into account to simplify the code that is touched a bit. Note that there never was any need for the > check in blk_queue_update_dma_pad, this probably was just copy and paste from dma_update_dma_alignment. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20240626142637.300624-9-hch@lst.de Signed-off-by: Jens Axboe --- block/bio-integrity.c | 2 +- block/blk-map.c | 2 +- block/blk-settings.c | 17 ----------------- drivers/ata/libata-scsi.c | 3 +-- drivers/ata/pata_macio.c | 4 ++-- drivers/scsi/scsi_lib.c | 4 ++-- drivers/ufs/core/ufshcd.c | 10 ++++++---- include/linux/blkdev.h | 12 ++++++++---- 8 files changed, 21 insertions(+), 33 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 173ffd4d623788..356ca0d3d62f5a 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -312,7 +312,7 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes, u32 seed) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - unsigned int align = q->dma_pad_mask | queue_dma_alignment(q); + unsigned int align = blk_lim_dma_alignment_and_pad(&q->limits); struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages; struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec; unsigned int direction, nr_bvecs; diff --git a/block/blk-map.c b/block/blk-map.c index 71210cdb34426d..bce144091128f6 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -634,7 +634,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, const struct iov_iter *iter, gfp_t gfp_mask) { bool copy = false, map_bvec = false; - unsigned long align = q->dma_pad_mask | queue_dma_alignment(q); + unsigned long align = blk_lim_dma_alignment_and_pad(&q->limits); struct bio *bio = NULL; struct iov_iter i; int ret = -EINVAL; diff --git a/block/blk-settings.c b/block/blk-settings.c index c692e80bb4f890..2e559cf97cc834 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -768,23 +768,6 @@ bool queue_limits_stack_integrity(struct queue_limits *t, } EXPORT_SYMBOL_GPL(queue_limits_stack_integrity); -/** - * blk_queue_update_dma_pad - update pad mask - * @q: the request queue for the device - * @mask: pad mask - * - * Update dma pad mask. - * - * Appending pad buffer to a request modifies the last entry of a - * scatter list such that it includes the pad buffer. - **/ -void blk_queue_update_dma_pad(struct request_queue *q, unsigned int mask) -{ - if (mask > q->dma_pad_mask) - q->dma_pad_mask = mask; -} -EXPORT_SYMBOL(blk_queue_update_dma_pad); - /** * blk_set_queue_depth - tell the block layer about the device queue depth * @q: the request queue for the device diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index cdf29b178ddc1e..682971c4cbe418 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -1024,7 +1024,6 @@ EXPORT_SYMBOL_GPL(ata_scsi_dma_need_drain); int ata_scsi_dev_config(struct scsi_device *sdev, struct queue_limits *lim, struct ata_device *dev) { - struct request_queue *q = sdev->request_queue; int depth = 1; if (!ata_id_has_unload(dev->id)) @@ -1038,7 +1037,7 @@ int ata_scsi_dev_config(struct scsi_device *sdev, struct queue_limits *lim, sdev->sector_size = ATA_SECT_SIZE; /* set DMA padding */ - blk_queue_update_dma_pad(q, ATA_DMA_PAD_SZ - 1); + lim->dma_pad_mask = ATA_DMA_PAD_SZ - 1; /* make room for appending the drain */ lim->max_segments--; diff --git a/drivers/ata/pata_macio.c b/drivers/ata/pata_macio.c index 3cb455a32d9266..1b85e8bf4ef91b 100644 --- a/drivers/ata/pata_macio.c +++ b/drivers/ata/pata_macio.c @@ -816,7 +816,7 @@ static int pata_macio_device_configure(struct scsi_device *sdev, /* OHare has issues with non cache aligned DMA on some chipsets */ if (priv->kind == controller_ohare) { lim->dma_alignment = 31; - blk_queue_update_dma_pad(sdev->request_queue, 31); + lim->dma_pad_mask = 31; /* Tell the world about it */ ata_dev_info(dev, "OHare alignment limits applied\n"); @@ -831,7 +831,7 @@ static int pata_macio_device_configure(struct scsi_device *sdev, if (priv->kind == controller_sh_ata6 || priv->kind == controller_k2_ata6) { /* Allright these are bad, apply restrictions */ lim->dma_alignment = 15; - blk_queue_update_dma_pad(sdev->request_queue, 15); + lim->dma_pad_mask = 15; /* We enable MWI and hack cache line size directly here, this * is specific to this chipset and not normal values, we happen diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index e2f7bfb2b9e450..3958a6d14bf457 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1139,9 +1139,9 @@ blk_status_t scsi_alloc_sgtables(struct scsi_cmnd *cmd) */ count = __blk_rq_map_sg(rq->q, rq, cmd->sdb.table.sgl, &last_sg); - if (blk_rq_bytes(rq) & rq->q->dma_pad_mask) { + if (blk_rq_bytes(rq) & rq->q->limits.dma_pad_mask) { unsigned int pad_len = - (rq->q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1; + (rq->q->limits.dma_pad_mask & ~blk_rq_bytes(rq)) + 1; last_sg->length += pad_len; cmd->extra_len += pad_len; diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 0cf07194bbe89d..b7957a431589dd 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -5193,17 +5193,19 @@ static int ufshcd_change_queue_depth(struct scsi_device *sdev, int depth) } /** - * ufshcd_slave_configure - adjust SCSI device configurations + * ufshcd_device_configure - adjust SCSI device configurations * @sdev: pointer to SCSI device + * @lim: queue limits * * Return: 0 (success). */ -static int ufshcd_slave_configure(struct scsi_device *sdev) +static int ufshcd_device_configure(struct scsi_device *sdev, + struct queue_limits *lim) { struct ufs_hba *hba = shost_priv(sdev->host); struct request_queue *q = sdev->request_queue; - blk_queue_update_dma_pad(q, PRDT_DATA_BYTE_COUNT_PAD - 1); + lim->dma_pad_mask = PRDT_DATA_BYTE_COUNT_PAD - 1; /* * Block runtime-pm until all consumers are added. @@ -8907,7 +8909,7 @@ static const struct scsi_host_template ufshcd_driver_template = { .queuecommand = ufshcd_queuecommand, .mq_poll = ufshcd_poll, .slave_alloc = ufshcd_slave_alloc, - .slave_configure = ufshcd_slave_configure, + .device_configure = ufshcd_device_configure, .slave_destroy = ufshcd_slave_destroy, .change_queue_depth = ufshcd_change_queue_depth, .eh_abort_handler = ufshcd_abort, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 94fcbc91231208..a53e3434e1a28c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -401,6 +401,7 @@ struct queue_limits { * due to possible offsets. */ unsigned int dma_alignment; + unsigned int dma_pad_mask; struct blk_integrity integrity; }; @@ -509,8 +510,6 @@ struct request_queue { */ int id; - unsigned int dma_pad_mask; - /* * queue settings */ @@ -981,7 +980,6 @@ extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset); void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, sector_t offset, const char *pfx); -extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); struct blk_independent_access_ranges * @@ -1433,10 +1431,16 @@ static inline bool bdev_iter_is_aligned(struct block_device *bdev, bdev_logical_block_size(bdev) - 1); } +static inline int blk_lim_dma_alignment_and_pad(struct queue_limits *lim) +{ + return lim->dma_alignment | lim->dma_pad_mask; +} + static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr, unsigned int len) { - unsigned int alignment = queue_dma_alignment(q) | q->dma_pad_mask; + unsigned int alignment = blk_lim_dma_alignment_and_pad(&q->limits); + return !(addr & alignment) && !(len & alignment); } From 440e2051c577896275c0e0513ec26964e04c7810 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 29 May 2024 14:42:40 -0700 Subject: [PATCH 1471/1578] nvmet-fc: Remove __counted_by from nvmet_fc_tgt_queue.fod[] Work for __counted_by on generic pointers in structures (not just flexible array members) has started landing in Clang 19 (current tip of tree). During the development of this feature, a restriction was added to __counted_by to prevent the flexible array member's element type from including a flexible array member itself such as: struct foo { int count; char buf[]; }; struct bar { int count; struct foo data[] __counted_by(count); }; because the size of data cannot be calculated with the standard array size formula: sizeof(struct foo) * count This restriction was downgraded to a warning but due to CONFIG_WERROR, it can still break the build. The application of __counted_by on the fod member of 'struct nvmet_fc_tgt_queue' triggers this restriction, resulting in: drivers/nvme/target/fc.c:151:2: error: 'counted_by' should not be applied to an array with element of unknown size because 'struct nvmet_fc_fcp_iod' is a struct type with a flexible array member. This will be an error in a future compiler version [-Werror,-Wbounds-safety-counted-by-elt-type-unknown-size] 151 | struct nvmet_fc_fcp_iod fod[] __counted_by(sqsize); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1 error generated. Remove this use of __counted_by to fix the warning/error. However, rather than remove it altogether, leave it commented, as it may be possible to support this in future compiler releases. Cc: stable@vger.kernel.org Closes: https://github.com/ClangBuiltLinux/linux/issues/2027 Fixes: ccd3129aca28 ("nvmet-fc: Annotate struct nvmet_fc_tgt_queue with __counted_by") Signed-off-by: Nathan Chancellor Signed-off-by: Keith Busch --- drivers/nvme/target/fc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 337ee1cb09ae64..381b4394731f1d 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -148,7 +148,7 @@ struct nvmet_fc_tgt_queue { struct workqueue_struct *work_q; struct kref ref; /* array of fcp_iods */ - struct nvmet_fc_fcp_iod fod[] __counted_by(sqsize); + struct nvmet_fc_fcp_iod fod[] /* __counted_by(sqsize) */; } __aligned(sizeof(unsigned long long)); struct nvmet_fc_hostport { From a11aaf6d0bb4282ce1989e388b13f8d87154ba75 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Wed, 26 Jun 2024 14:06:16 +0200 Subject: [PATCH 1472/1578] kbuild: scripts/gdb: bring the "abspath" back Use the "abspath" call when symlinking the gdb python scripts in scripts/gdb/linux. This call is needed to avoid broken links when running the scripts_gdb target on a build directory located directly under the source tree (e.g., O=builddir). Fixes: 659bbf7e1b08 ("kbuild: scripts/gdb: Replace missed $(srctree)/$(src) w/ $(src)") Signed-off-by: Joel Granados Reviewed-by: Douglas Anderson Tested-by: Douglas Anderson Signed-off-by: Masahiro Yamada --- scripts/gdb/linux/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gdb/linux/Makefile b/scripts/gdb/linux/Makefile index fd1402c0a1a18f..fcd32fcf3ae0ba 100644 --- a/scripts/gdb/linux/Makefile +++ b/scripts/gdb/linux/Makefile @@ -5,7 +5,7 @@ ifdef building_out_of_srctree symlinks := $(patsubst $(src)/%,%,$(wildcard $(src)/*.py)) quiet_cmd_symlink = SYMLINK $@ - cmd_symlink = ln -fsn $(patsubst $(obj)/%,$(src)/%,$@) $@ + cmd_symlink = ln -fsn $(patsubst $(obj)/%,$(abspath $(src))/%,$@) $@ always-y += $(symlinks) $(addprefix $(obj)/, $(symlinks)): FORCE From 7931d32955e09d0a11b1fe0b6aac1bfa061c005c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 26 Jun 2024 23:15:38 +0200 Subject: [PATCH 1473/1578] netfilter: nf_tables: fully validate NFT_DATA_VALUE on store to data registers register store validation for NFT_DATA_VALUE is conditional, however, the datatype is always either NFT_DATA_VALUE or NFT_DATA_VERDICT. This only requires a new helper function to infer the register type from the set datatype so this conditional check can be removed. Otherwise, pointer to chain object can be leaked through the registers. Fixes: 96518518cc41 ("netfilter: add nftables") Reported-by: Linus Torvalds Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 +++++ net/netfilter/nf_tables_api.c | 8 ++++---- net/netfilter/nft_lookup.c | 3 ++- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 2796153b03dad3..188d41da1a40af 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -619,6 +619,11 @@ static inline void *nft_set_priv(const struct nft_set *set) return (void *)set->data; } +static inline enum nft_data_types nft_set_datatype(const struct nft_set *set) +{ + return set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE; +} + static inline bool nft_set_gc_is_pending(const struct nft_set *s) { return refcount_read(&s->refs) != 1; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index be3b4c90d2eda1..e8dcf41d360d9b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5740,8 +5740,7 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && nft_data_dump(skb, NFTA_SET_ELEM_DATA, nft_set_ext_data(ext), - set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE, - set->dlen) < 0) + nft_set_datatype(set), set->dlen) < 0) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS) && @@ -11073,6 +11072,9 @@ static int nft_validate_register_store(const struct nft_ctx *ctx, return 0; default: + if (type != NFT_DATA_VALUE) + return -EINVAL; + if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE) return -EINVAL; if (len == 0) @@ -11081,8 +11083,6 @@ static int nft_validate_register_store(const struct nft_ctx *ctx, sizeof_field(struct nft_regs, data)) return -ERANGE; - if (data != NULL && type != NFT_DATA_VALUE) - return -EINVAL; return 0; } } diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index b314ca728a2912..f3080fa1b22631 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -132,7 +132,8 @@ static int nft_lookup_init(const struct nft_ctx *ctx, return -EINVAL; err = nft_parse_register_store(ctx, tb[NFTA_LOOKUP_DREG], - &priv->dreg, NULL, set->dtype, + &priv->dreg, NULL, + nft_set_datatype(set), set->dlen); if (err < 0) return err; From 69b6517687a4b1fb250bd8c9c193a0a304c8ba17 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 26 Jun 2024 19:01:58 -0600 Subject: [PATCH 1474/1578] block: use the right type for stub rq_integrity_vec() For !CONFIG_BLK_DEV_INTEGRITY, rq_integrity_vec() wasn't updated properly. Fix it up. Fixes: cf546dd289e0 ("block: change rq_integrity_vec to respect the iterator") Signed-off-by: Jens Axboe --- include/linux/blk-integrity.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index 0fdd62e6d4b02e..c58634924782d0 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -145,7 +145,7 @@ static inline int blk_integrity_rq(struct request *rq) return 0; } -static inline struct bio_vec *rq_integrity_vec(struct request *rq) +static inline struct bio_vec rq_integrity_vec(struct request *rq) { /* the optimizer will remove all calls to this function */ return (struct bio_vec){ }; From bab4923132feb3e439ae45962979c5d9d5c7c1f1 Mon Sep 17 00:00:00 2001 From: Yunseong Kim Date: Tue, 25 Jun 2024 02:33:23 +0900 Subject: [PATCH 1475/1578] tracing/net_sched: NULL pointer dereference in perf_trace_qdisc_reset() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the TRACE_EVENT(qdisc_reset) NULL dereference occurred from qdisc->dev_queue->dev ->name This situation simulated from bunch of veths and Bluetooth disconnection and reconnection. During qdisc initialization, qdisc was being set to noop_queue. In veth_init_queue, the initial tx_num was reduced back to one, causing the qdisc reset to be called with noop, which led to the kernel panic. I've attached the GitHub gist link that C converted syz-execprogram source code and 3 log of reproduced vmcore-dmesg. https://gist.github.com/yskelg/cc64562873ce249cdd0d5a358b77d740 Yeoreum and I use two fuzzing tool simultaneously. One process with syz-executor : https://github.com/google/syzkaller $ ./syz-execprog -executor=./syz-executor -repeat=1 -sandbox=setuid \ -enable=none -collide=false log1 The other process with perf fuzzer: https://github.com/deater/perf_event_tests/tree/master/fuzzer $ perf_event_tests/fuzzer/perf_fuzzer I think this will happen on the kernel version. Linux kernel version +v6.7.10, +v6.8, +v6.9 and it could happen in v6.10. This occurred from 51270d573a8d. I think this patch is absolutely necessary. Previously, It was showing not intended string value of name. I've reproduced 3 time from my fedora 40 Debug Kernel with any other module or patched. version: 6.10.0-0.rc2.20240608gitdc772f8237f9.29.fc41.aarch64+debug [ 5287.164555] veth0_vlan: left promiscuous mode [ 5287.164929] veth1_macvtap: left promiscuous mode [ 5287.164950] veth0_macvtap: left promiscuous mode [ 5287.164983] veth1_vlan: left promiscuous mode [ 5287.165008] veth0_vlan: left promiscuous mode [ 5287.165450] veth1_macvtap: left promiscuous mode [ 5287.165472] veth0_macvtap: left promiscuous mode [ 5287.165502] veth1_vlan: left promiscuous mode … [ 5297.598240] bridge0: port 2(bridge_slave_1) entered blocking state [ 5297.598262] bridge0: port 2(bridge_slave_1) entered forwarding state [ 5297.598296] bridge0: port 1(bridge_slave_0) entered blocking state [ 5297.598313] bridge0: port 1(bridge_slave_0) entered forwarding state [ 5297.616090] 8021q: adding VLAN 0 to HW filter on device bond0 [ 5297.620405] bridge0: port 1(bridge_slave_0) entered disabled state [ 5297.620730] bridge0: port 2(bridge_slave_1) entered disabled state [ 5297.627247] 8021q: adding VLAN 0 to HW filter on device team0 [ 5297.629636] bridge0: port 1(bridge_slave_0) entered blocking state … [ 5298.002798] bridge_slave_0: left promiscuous mode [ 5298.002869] bridge0: port 1(bridge_slave_0) entered disabled state [ 5298.309444] bond0 (unregistering): (slave bond_slave_0): Releasing backup interface [ 5298.315206] bond0 (unregistering): (slave bond_slave_1): Releasing backup interface [ 5298.320207] bond0 (unregistering): Released all slaves [ 5298.354296] hsr_slave_0: left promiscuous mode [ 5298.360750] hsr_slave_1: left promiscuous mode [ 5298.374889] veth1_macvtap: left promiscuous mode [ 5298.374931] veth0_macvtap: left promiscuous mode [ 5298.374988] veth1_vlan: left promiscuous mode [ 5298.375024] veth0_vlan: left promiscuous mode [ 5299.109741] team0 (unregistering): Port device team_slave_1 removed [ 5299.185870] team0 (unregistering): Port device team_slave_0 removed … [ 5300.155443] Bluetooth: hci3: unexpected cc 0x0c03 length: 249 > 1 [ 5300.155724] Bluetooth: hci3: unexpected cc 0x1003 length: 249 > 9 [ 5300.155988] Bluetooth: hci3: unexpected cc 0x1001 length: 249 > 9 …. [ 5301.075531] team0: Port device team_slave_1 added [ 5301.085515] bridge0: port 1(bridge_slave_0) entered blocking state [ 5301.085531] bridge0: port 1(bridge_slave_0) entered disabled state [ 5301.085588] bridge_slave_0: entered allmulticast mode [ 5301.085800] bridge_slave_0: entered promiscuous mode [ 5301.095617] bridge0: port 1(bridge_slave_0) entered blocking state [ 5301.095633] bridge0: port 1(bridge_slave_0) entered disabled state … [ 5301.149734] bond0: (slave bond_slave_0): Enslaving as an active interface with an up link [ 5301.173234] bond0: (slave bond_slave_0): Enslaving as an active interface with an up link [ 5301.180517] bond0: (slave bond_slave_1): Enslaving as an active interface with an up link [ 5301.193481] hsr_slave_0: entered promiscuous mode [ 5301.204425] hsr_slave_1: entered promiscuous mode [ 5301.210172] debugfs: Directory 'hsr0' with parent 'hsr' already present! [ 5301.210185] Cannot create hsr debugfs directory [ 5301.224061] bond0: (slave bond_slave_1): Enslaving as an active interface with an up link [ 5301.246901] bond0: (slave bond_slave_0): Enslaving as an active interface with an up link [ 5301.255934] team0: Port device team_slave_0 added [ 5301.256480] team0: Port device team_slave_1 added [ 5301.256948] team0: Port device team_slave_0 added … [ 5301.435928] hsr_slave_0: entered promiscuous mode [ 5301.446029] hsr_slave_1: entered promiscuous mode [ 5301.455872] debugfs: Directory 'hsr0' with parent 'hsr' already present! [ 5301.455884] Cannot create hsr debugfs directory [ 5301.502664] hsr_slave_0: entered promiscuous mode [ 5301.513675] hsr_slave_1: entered promiscuous mode [ 5301.526155] debugfs: Directory 'hsr0' with parent 'hsr' already present! [ 5301.526164] Cannot create hsr debugfs directory [ 5301.563662] hsr_slave_0: entered promiscuous mode [ 5301.576129] hsr_slave_1: entered promiscuous mode [ 5301.580259] debugfs: Directory 'hsr0' with parent 'hsr' already present! [ 5301.580270] Cannot create hsr debugfs directory [ 5301.590269] 8021q: adding VLAN 0 to HW filter on device bond0 [ 5301.595872] KASAN: null-ptr-deref in range [0x0000000000000130-0x0000000000000137] [ 5301.595877] Mem abort info: [ 5301.595881] ESR = 0x0000000096000006 [ 5301.595885] EC = 0x25: DABT (current EL), IL = 32 bits [ 5301.595889] SET = 0, FnV = 0 [ 5301.595893] EA = 0, S1PTW = 0 [ 5301.595896] FSC = 0x06: level 2 translation fault [ 5301.595900] Data abort info: [ 5301.595903] ISV = 0, ISS = 0x00000006, ISS2 = 0x00000000 [ 5301.595907] CM = 0, WnR = 0, TnD = 0, TagAccess = 0 [ 5301.595911] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 5301.595915] [dfff800000000026] address between user and kernel address ranges [ 5301.595971] Internal error: Oops: 0000000096000006 [#1] SMP … [ 5301.596076] CPU: 2 PID: 102769 Comm: syz-executor.3 Kdump: loaded Tainted: G W ------- --- 6.10.0-0.rc2.20240608gitdc772f8237f9.29.fc41.aarch64+debug #1 [ 5301.596080] Hardware name: VMware, Inc. VMware20,1/VBSA, BIOS VMW201.00V.21805430.BA64.2305221830 05/22/2023 [ 5301.596082] pstate: 01400005 (nzcv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--) [ 5301.596085] pc : strnlen+0x40/0x88 [ 5301.596114] lr : trace_event_get_offsets_qdisc_reset+0x6c/0x2b0 [ 5301.596124] sp : ffff8000beef6b40 [ 5301.596126] x29: ffff8000beef6b40 x28: dfff800000000000 x27: 0000000000000001 [ 5301.596131] x26: 6de1800082c62bd0 x25: 1ffff000110aa9e0 x24: ffff800088554f00 [ 5301.596136] x23: ffff800088554ec0 x22: 0000000000000130 x21: 0000000000000140 [ 5301.596140] x20: dfff800000000000 x19: ffff8000beef6c60 x18: ffff7000115106d8 [ 5301.596143] x17: ffff800121bad000 x16: ffff800080020000 x15: 0000000000000006 [ 5301.596147] x14: 0000000000000002 x13: ffff0001f3ed8d14 x12: ffff700017ddeda5 [ 5301.596151] x11: 1ffff00017ddeda4 x10: ffff700017ddeda4 x9 : ffff800082cc5eec [ 5301.596155] x8 : 0000000000000004 x7 : 00000000f1f1f1f1 x6 : 00000000f2f2f200 [ 5301.596158] x5 : 00000000f3f3f3f3 x4 : ffff700017dded80 x3 : 00000000f204f1f1 [ 5301.596162] x2 : 0000000000000026 x1 : 0000000000000000 x0 : 0000000000000130 [ 5301.596166] Call trace: [ 5301.596175] strnlen+0x40/0x88 [ 5301.596179] trace_event_get_offsets_qdisc_reset+0x6c/0x2b0 [ 5301.596182] perf_trace_qdisc_reset+0xb0/0x538 [ 5301.596184] __traceiter_qdisc_reset+0x68/0xc0 [ 5301.596188] qdisc_reset+0x43c/0x5e8 [ 5301.596190] netif_set_real_num_tx_queues+0x288/0x770 [ 5301.596194] veth_init_queues+0xfc/0x130 [veth] [ 5301.596198] veth_newlink+0x45c/0x850 [veth] [ 5301.596202] rtnl_newlink_create+0x2c8/0x798 [ 5301.596205] __rtnl_newlink+0x92c/0xb60 [ 5301.596208] rtnl_newlink+0xd8/0x130 [ 5301.596211] rtnetlink_rcv_msg+0x2e0/0x890 [ 5301.596214] netlink_rcv_skb+0x1c4/0x380 [ 5301.596225] rtnetlink_rcv+0x20/0x38 [ 5301.596227] netlink_unicast+0x3c8/0x640 [ 5301.596231] netlink_sendmsg+0x658/0xa60 [ 5301.596234] __sock_sendmsg+0xd0/0x180 [ 5301.596243] __sys_sendto+0x1c0/0x280 [ 5301.596246] __arm64_sys_sendto+0xc8/0x150 [ 5301.596249] invoke_syscall+0xdc/0x268 [ 5301.596256] el0_svc_common.constprop.0+0x16c/0x240 [ 5301.596259] do_el0_svc+0x48/0x68 [ 5301.596261] el0_svc+0x50/0x188 [ 5301.596265] el0t_64_sync_handler+0x120/0x130 [ 5301.596268] el0t_64_sync+0x194/0x198 [ 5301.596272] Code: eb15001f 54000120 d343fc02 12000801 (38f46842) [ 5301.596285] SMP: stopping secondary CPUs [ 5301.597053] Starting crashdump kernel... [ 5301.597057] Bye! After applying our patch, I didn't find any kernel panic errors. We've found a simple reproducer # echo 1 > /sys/kernel/debug/tracing/events/qdisc/qdisc_reset/enable # ip link add veth0 type veth peer name veth1 Error: Unknown device type. However, without our patch applied, I tested upstream 6.10.0-rc3 kernel using the qdisc_reset event and the ip command on my qemu virtual machine. This 2 commands makes always kernel panic. Linux version: 6.10.0-rc3 [ 0.000000] Linux version 6.10.0-rc3-00164-g44ef20baed8e-dirty (paran@fedora) (gcc (GCC) 14.1.1 20240522 (Red Hat 14.1.1-4), GNU ld version 2.41-34.fc40) #20 SMP PREEMPT Sat Jun 15 16:51:25 KST 2024 Kernel panic message: [ 615.236484] Internal error: Oops: 0000000096000005 [#1] PREEMPT SMP [ 615.237250] Dumping ftrace buffer: [ 615.237679] (ftrace buffer empty) [ 615.238097] Modules linked in: veth crct10dif_ce virtio_gpu virtio_dma_buf drm_shmem_helper drm_kms_helper zynqmp_fpga xilinx_can xilinx_spi xilinx_selectmap xilinx_core xilinx_pr_decoupler versal_fpga uvcvideo uvc videobuf2_vmalloc videobuf2_memops videobuf2_v4l2 videodev videobuf2_common mc usbnet deflate zstd ubifs ubi rcar_canfd rcar_can omap_mailbox ntb_msi_test ntb_hw_epf lattice_sysconfig_spi lattice_sysconfig ice40_spi gpio_xilinx dwmac_altr_socfpga mdio_regmap stmmac_platform stmmac pcs_xpcs dfl_fme_region dfl_fme_mgr dfl_fme_br dfl_afu dfl fpga_region fpga_bridge can can_dev br_netfilter bridge stp llc atl1c ath11k_pci mhi ath11k_ahb ath11k qmi_helpers ath10k_sdio ath10k_pci ath10k_core ath mac80211 libarc4 cfg80211 drm fuse backlight ipv6 Jun 22 02:36:5[3 6k152.62-4sm98k4-0k]v kCePUr:n e1l :P IUDn:a b4le6 8t oC ohmma: nidpl eN oketr nteali nptaedg i6n.g1 0re.0q-urecs3t- 0at0 1v6i4r-tgu4a4le fa2d0dbraeeds0se-dir tyd f#f2f08 615.252376] Hardware name: linux,dummy-virt (DT) [ 615.253220] pstate: 80400005 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 615.254433] pc : strnlen+0x6c/0xe0 [ 615.255096] lr : trace_event_get_offsets_qdisc_reset+0x94/0x3d0 [ 615.256088] sp : ffff800080b269a0 [ 615.256615] x29: ffff800080b269a0 x28: ffffc070f3f98500 x27: 0000000000000001 [ 615.257831] x26: 0000000000000010 x25: ffffc070f3f98540 x24: ffffc070f619cf60 [ 615.259020] x23: 0000000000000128 x22: 0000000000000138 x21: dfff800000000000 [ 615.260241] x20: ffffc070f631ad00 x19: 0000000000000128 x18: ffffc070f448b800 [ 615.261454] x17: 0000000000000000 x16: 0000000000000001 x15: ffffc070f4ba2a90 [ 615.262635] x14: ffff700010164d73 x13: 1ffff80e1e8d5eb3 x12: 1ffff00010164d72 [ 615.263877] x11: ffff700010164d72 x10: dfff800000000000 x9 : ffffc070e85d6184 [ 615.265047] x8 : ffffc070e4402070 x7 : 000000000000f1f1 x6 : 000000001504a6d3 [ 615.266336] x5 : ffff28ca21122140 x4 : ffffc070f5043ea8 x3 : 0000000000000000 [ 615.267528] x2 : 0000000000000025 x1 : 0000000000000000 x0 : 0000000000000000 [ 615.268747] Call trace: [ 615.269180] strnlen+0x6c/0xe0 [ 615.269767] trace_event_get_offsets_qdisc_reset+0x94/0x3d0 [ 615.270716] trace_event_raw_event_qdisc_reset+0xe8/0x4e8 [ 615.271667] __traceiter_qdisc_reset+0xa0/0x140 [ 615.272499] qdisc_reset+0x554/0x848 [ 615.273134] netif_set_real_num_tx_queues+0x360/0x9a8 [ 615.274050] veth_init_queues+0x110/0x220 [veth] [ 615.275110] veth_newlink+0x538/0xa50 [veth] [ 615.276172] __rtnl_newlink+0x11e4/0x1bc8 [ 615.276944] rtnl_newlink+0xac/0x120 [ 615.277657] rtnetlink_rcv_msg+0x4e4/0x1370 [ 615.278409] netlink_rcv_skb+0x25c/0x4f0 [ 615.279122] rtnetlink_rcv+0x48/0x70 [ 615.279769] netlink_unicast+0x5a8/0x7b8 [ 615.280462] netlink_sendmsg+0xa70/0x1190 Yeoreum and I don't know if the patch we wrote will fix the underlying cause, but we think that priority is to prevent kernel panic happening. So, we're sending this patch. Fixes: 51270d573a8d ("tracing/net_sched: Fix tracepoints that save qdisc_dev() as a string") Link: https://lore.kernel.org/lkml/20240229143432.273b4871@gandalf.local.home/t/ Cc: netdev@vger.kernel.org Tested-by: Yunseong Kim Signed-off-by: Yunseong Kim Signed-off-by: Yeoreum Yun Link: https://lore.kernel.org/r/20240624173320.24945-4-yskelg@gmail.com Signed-off-by: Paolo Abeni --- include/trace/events/qdisc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/qdisc.h b/include/trace/events/qdisc.h index f1b5e816e7e5e9..ff33f41a9db706 100644 --- a/include/trace/events/qdisc.h +++ b/include/trace/events/qdisc.h @@ -81,7 +81,7 @@ TRACE_EVENT(qdisc_reset, TP_ARGS(q), TP_STRUCT__entry( - __string( dev, qdisc_dev(q)->name ) + __string( dev, qdisc_dev(q) ? qdisc_dev(q)->name : "(null)" ) __string( kind, q->ops->id ) __field( u32, parent ) __field( u32, handle ) From 7d139181a8912d8bf0ede4f38d37688449b9af23 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 24 Jun 2024 18:36:35 -0700 Subject: [PATCH 1476/1578] selftest: af_unix: Remove test_unix_oob.c. test_unix_oob.c does not fully cover AF_UNIX's MSG_OOB functionality, thus there are discrepancies between TCP behaviour. Also, the test uses fork() to create message producer, and it's not easy to understand and add more test cases. Let's remove test_unix_oob.c and rewrite a new test. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/.gitignore | 1 - tools/testing/selftests/net/af_unix/Makefile | 2 +- .../selftests/net/af_unix/test_unix_oob.c | 436 ------------------ 3 files changed, 1 insertion(+), 438 deletions(-) delete mode 100644 tools/testing/selftests/net/af_unix/test_unix_oob.c diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 49a56eb5d03684..666ab7d9390b17 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -43,7 +43,6 @@ tap tcp_fastopen_backup_key tcp_inq tcp_mmap -test_unix_oob timestamping tls toeplitz diff --git a/tools/testing/selftests/net/af_unix/Makefile b/tools/testing/selftests/net/af_unix/Makefile index 3b83c797650d4b..a25845251eed7d 100644 --- a/tools/testing/selftests/net/af_unix/Makefile +++ b/tools/testing/selftests/net/af_unix/Makefile @@ -1,4 +1,4 @@ CFLAGS += $(KHDR_INCLUDES) -TEST_GEN_PROGS := diag_uid test_unix_oob unix_connect scm_pidfd scm_rights +TEST_GEN_PROGS := diag_uid scm_pidfd scm_rights unix_connect include ../../lib.mk diff --git a/tools/testing/selftests/net/af_unix/test_unix_oob.c b/tools/testing/selftests/net/af_unix/test_unix_oob.c deleted file mode 100644 index a7c51889acd5a2..00000000000000 --- a/tools/testing/selftests/net/af_unix/test_unix_oob.c +++ /dev/null @@ -1,436 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static int pipefd[2]; -static int signal_recvd; -static pid_t producer_id; -static char sock_name[32]; - -static void sig_hand(int sn, siginfo_t *si, void *p) -{ - signal_recvd = sn; -} - -static int set_sig_handler(int signal) -{ - struct sigaction sa; - - sa.sa_sigaction = sig_hand; - sigemptyset(&sa.sa_mask); - sa.sa_flags = SA_SIGINFO | SA_RESTART; - - return sigaction(signal, &sa, NULL); -} - -static void set_filemode(int fd, int set) -{ - int flags = fcntl(fd, F_GETFL, 0); - - if (set) - flags &= ~O_NONBLOCK; - else - flags |= O_NONBLOCK; - fcntl(fd, F_SETFL, flags); -} - -static void signal_producer(int fd) -{ - char cmd; - - cmd = 'S'; - write(fd, &cmd, sizeof(cmd)); -} - -static void wait_for_signal(int fd) -{ - char buf[5]; - - read(fd, buf, 5); -} - -static void die(int status) -{ - fflush(NULL); - unlink(sock_name); - kill(producer_id, SIGTERM); - exit(status); -} - -int is_sioctatmark(int fd) -{ - int ans = -1; - - if (ioctl(fd, SIOCATMARK, &ans, sizeof(ans)) < 0) { -#ifdef DEBUG - perror("SIOCATMARK Failed"); -#endif - } - return ans; -} - -void read_oob(int fd, char *c) -{ - - *c = ' '; - if (recv(fd, c, sizeof(*c), MSG_OOB) < 0) { -#ifdef DEBUG - perror("Reading MSG_OOB Failed"); -#endif - } -} - -int read_data(int pfd, char *buf, int size) -{ - int len = 0; - - memset(buf, size, '0'); - len = read(pfd, buf, size); -#ifdef DEBUG - if (len < 0) - perror("read failed"); -#endif - return len; -} - -static void wait_for_data(int pfd, int event) -{ - struct pollfd pfds[1]; - - pfds[0].fd = pfd; - pfds[0].events = event; - poll(pfds, 1, -1); -} - -void producer(struct sockaddr_un *consumer_addr) -{ - int cfd; - char buf[64]; - int i; - - memset(buf, 'x', sizeof(buf)); - cfd = socket(AF_UNIX, SOCK_STREAM, 0); - - wait_for_signal(pipefd[0]); - if (connect(cfd, (struct sockaddr *)consumer_addr, - sizeof(*consumer_addr)) != 0) { - perror("Connect failed"); - kill(0, SIGTERM); - exit(1); - } - - for (i = 0; i < 2; i++) { - /* Test 1: Test for SIGURG and OOB */ - wait_for_signal(pipefd[0]); - memset(buf, 'x', sizeof(buf)); - buf[63] = '@'; - send(cfd, buf, sizeof(buf), MSG_OOB); - - wait_for_signal(pipefd[0]); - - /* Test 2: Test for OOB being overwitten */ - memset(buf, 'x', sizeof(buf)); - buf[63] = '%'; - send(cfd, buf, sizeof(buf), MSG_OOB); - - memset(buf, 'x', sizeof(buf)); - buf[63] = '#'; - send(cfd, buf, sizeof(buf), MSG_OOB); - - wait_for_signal(pipefd[0]); - - /* Test 3: Test for SIOCATMARK */ - memset(buf, 'x', sizeof(buf)); - buf[63] = '@'; - send(cfd, buf, sizeof(buf), MSG_OOB); - - memset(buf, 'x', sizeof(buf)); - buf[63] = '%'; - send(cfd, buf, sizeof(buf), MSG_OOB); - - memset(buf, 'x', sizeof(buf)); - send(cfd, buf, sizeof(buf), 0); - - wait_for_signal(pipefd[0]); - - /* Test 4: Test for 1byte OOB msg */ - memset(buf, 'x', sizeof(buf)); - buf[0] = '@'; - send(cfd, buf, 1, MSG_OOB); - } -} - -int -main(int argc, char **argv) -{ - int lfd, pfd; - struct sockaddr_un consumer_addr, paddr; - socklen_t len = sizeof(consumer_addr); - char buf[1024]; - int on = 0; - char oob; - int atmark; - - lfd = socket(AF_UNIX, SOCK_STREAM, 0); - memset(&consumer_addr, 0, sizeof(consumer_addr)); - consumer_addr.sun_family = AF_UNIX; - sprintf(sock_name, "unix_oob_%d", getpid()); - unlink(sock_name); - strcpy(consumer_addr.sun_path, sock_name); - - if ((bind(lfd, (struct sockaddr *)&consumer_addr, - sizeof(consumer_addr))) != 0) { - perror("socket bind failed"); - exit(1); - } - - pipe(pipefd); - - listen(lfd, 1); - - producer_id = fork(); - if (producer_id == 0) { - producer(&consumer_addr); - exit(0); - } - - set_sig_handler(SIGURG); - signal_producer(pipefd[1]); - - pfd = accept(lfd, (struct sockaddr *) &paddr, &len); - fcntl(pfd, F_SETOWN, getpid()); - - signal_recvd = 0; - signal_producer(pipefd[1]); - - /* Test 1: - * veriyf that SIGURG is - * delivered, 63 bytes are - * read, oob is '@', and POLLPRI works. - */ - wait_for_data(pfd, POLLPRI); - read_oob(pfd, &oob); - len = read_data(pfd, buf, 1024); - if (!signal_recvd || len != 63 || oob != '@') { - fprintf(stderr, "Test 1 failed sigurg %d len %d %c\n", - signal_recvd, len, oob); - die(1); - } - - signal_recvd = 0; - signal_producer(pipefd[1]); - - /* Test 2: - * Verify that the first OOB is over written by - * the 2nd one and the first OOB is returned as - * part of the read, and sigurg is received. - */ - wait_for_data(pfd, POLLIN | POLLPRI); - len = 0; - while (len < 70) - len = recv(pfd, buf, 1024, MSG_PEEK); - len = read_data(pfd, buf, 1024); - read_oob(pfd, &oob); - if (!signal_recvd || len != 127 || oob != '#') { - fprintf(stderr, "Test 2 failed, sigurg %d len %d OOB %c\n", - signal_recvd, len, oob); - die(1); - } - - signal_recvd = 0; - signal_producer(pipefd[1]); - - /* Test 3: - * verify that 2nd oob over writes - * the first one and read breaks at - * oob boundary returning 127 bytes - * and sigurg is received and atmark - * is set. - * oob is '%' and second read returns - * 64 bytes. - */ - len = 0; - wait_for_data(pfd, POLLIN | POLLPRI); - while (len < 150) - len = recv(pfd, buf, 1024, MSG_PEEK); - len = read_data(pfd, buf, 1024); - atmark = is_sioctatmark(pfd); - read_oob(pfd, &oob); - - if (!signal_recvd || len != 127 || oob != '%' || atmark != 1) { - fprintf(stderr, - "Test 3 failed, sigurg %d len %d OOB %c atmark %d\n", - signal_recvd, len, oob, atmark); - die(1); - } - - signal_recvd = 0; - - len = read_data(pfd, buf, 1024); - if (len != 64) { - fprintf(stderr, "Test 3.1 failed, sigurg %d len %d OOB %c\n", - signal_recvd, len, oob); - die(1); - } - - signal_recvd = 0; - signal_producer(pipefd[1]); - - /* Test 4: - * verify that a single byte - * oob message is delivered. - * set non blocking mode and - * check proper error is - * returned and sigurg is - * received and correct - * oob is read. - */ - - set_filemode(pfd, 0); - - wait_for_data(pfd, POLLIN | POLLPRI); - len = read_data(pfd, buf, 1024); - if ((len == -1) && (errno == 11)) - len = 0; - - read_oob(pfd, &oob); - - if (!signal_recvd || len != 0 || oob != '@') { - fprintf(stderr, "Test 4 failed, sigurg %d len %d OOB %c\n", - signal_recvd, len, oob); - die(1); - } - - set_filemode(pfd, 1); - - /* Inline Testing */ - - on = 1; - if (setsockopt(pfd, SOL_SOCKET, SO_OOBINLINE, &on, sizeof(on))) { - perror("SO_OOBINLINE"); - die(1); - } - - signal_recvd = 0; - signal_producer(pipefd[1]); - - /* Test 1 -- Inline: - * Check that SIGURG is - * delivered and 63 bytes are - * read and oob is '@' - */ - - wait_for_data(pfd, POLLIN | POLLPRI); - len = read_data(pfd, buf, 1024); - - if (!signal_recvd || len != 63) { - fprintf(stderr, "Test 1 Inline failed, sigurg %d len %d\n", - signal_recvd, len); - die(1); - } - - len = read_data(pfd, buf, 1024); - - if (len != 1) { - fprintf(stderr, - "Test 1.1 Inline failed, sigurg %d len %d oob %c\n", - signal_recvd, len, oob); - die(1); - } - - signal_recvd = 0; - signal_producer(pipefd[1]); - - /* Test 2 -- Inline: - * Verify that the first OOB is over written by - * the 2nd one and read breaks correctly on - * 2nd OOB boundary with the first OOB returned as - * part of the read, and sigurg is delivered and - * siocatmark returns true. - * next read returns one byte, the oob byte - * and siocatmark returns false. - */ - len = 0; - wait_for_data(pfd, POLLIN | POLLPRI); - while (len < 70) - len = recv(pfd, buf, 1024, MSG_PEEK); - len = read_data(pfd, buf, 1024); - atmark = is_sioctatmark(pfd); - if (len != 127 || atmark != 1 || !signal_recvd) { - fprintf(stderr, "Test 2 Inline failed, len %d atmark %d\n", - len, atmark); - die(1); - } - - len = read_data(pfd, buf, 1024); - atmark = is_sioctatmark(pfd); - if (len != 1 || buf[0] != '#' || atmark == 1) { - fprintf(stderr, "Test 2.1 Inline failed, len %d data %c atmark %d\n", - len, buf[0], atmark); - die(1); - } - - signal_recvd = 0; - signal_producer(pipefd[1]); - - /* Test 3 -- Inline: - * verify that 2nd oob over writes - * the first one and read breaks at - * oob boundary returning 127 bytes - * and sigurg is received and siocatmark - * is true after the read. - * subsequent read returns 65 bytes - * because of oob which should be '%'. - */ - len = 0; - wait_for_data(pfd, POLLIN | POLLPRI); - while (len < 126) - len = recv(pfd, buf, 1024, MSG_PEEK); - len = read_data(pfd, buf, 1024); - atmark = is_sioctatmark(pfd); - if (!signal_recvd || len != 127 || !atmark) { - fprintf(stderr, - "Test 3 Inline failed, sigurg %d len %d data %c\n", - signal_recvd, len, buf[0]); - die(1); - } - - len = read_data(pfd, buf, 1024); - atmark = is_sioctatmark(pfd); - if (len != 65 || buf[0] != '%' || atmark != 0) { - fprintf(stderr, - "Test 3.1 Inline failed, len %d oob %c atmark %d\n", - len, buf[0], atmark); - die(1); - } - - signal_recvd = 0; - signal_producer(pipefd[1]); - - /* Test 4 -- Inline: - * verify that a single - * byte oob message is delivered - * and read returns one byte, the oob - * byte and sigurg is received - */ - wait_for_data(pfd, POLLIN | POLLPRI); - len = read_data(pfd, buf, 1024); - if (!signal_recvd || len != 1 || buf[0] != '@') { - fprintf(stderr, - "Test 4 Inline failed, signal %d len %d data %c\n", - signal_recvd, len, buf[0]); - die(1); - } - die(0); -} From d098d77232c375cb2cded4a7099f0a763016ee0d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 24 Jun 2024 18:36:36 -0700 Subject: [PATCH 1477/1578] selftest: af_unix: Add msg_oob.c. AF_UNIX's MSG_OOB functionality lacked thorough testing, and we found some bizarre behaviour. The new selftest validates every MSG_OOB operation against TCP as a reference implementation. This patch adds only a few tests with basic send() and recv() that do not fail. The following patches will add more test cases for SO_OOBINLINE, SIGURG, EPOLLPRI, and SIOCATMARK. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/af_unix/Makefile | 2 +- tools/testing/selftests/net/af_unix/msg_oob.c | 220 ++++++++++++++++++ 2 files changed, 221 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/net/af_unix/msg_oob.c diff --git a/tools/testing/selftests/net/af_unix/Makefile b/tools/testing/selftests/net/af_unix/Makefile index a25845251eed7d..50584479540b09 100644 --- a/tools/testing/selftests/net/af_unix/Makefile +++ b/tools/testing/selftests/net/af_unix/Makefile @@ -1,4 +1,4 @@ CFLAGS += $(KHDR_INCLUDES) -TEST_GEN_PROGS := diag_uid scm_pidfd scm_rights unix_connect +TEST_GEN_PROGS := diag_uid msg_oob scm_pidfd scm_rights unix_connect include ../../lib.mk diff --git a/tools/testing/selftests/net/af_unix/msg_oob.c b/tools/testing/selftests/net/af_unix/msg_oob.c new file mode 100644 index 00000000000000..d427d39d080638 --- /dev/null +++ b/tools/testing/selftests/net/af_unix/msg_oob.c @@ -0,0 +1,220 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright Amazon.com Inc. or its affiliates. */ + +#include +#include +#include + +#include +#include + +#include "../../kselftest_harness.h" + +#define BUF_SZ 32 + +FIXTURE(msg_oob) +{ + int fd[4]; /* 0: AF_UNIX sender + * 1: AF_UNIX receiver + * 2: TCP sender + * 3: TCP receiver + */ +}; + +static void create_unix_socketpair(struct __test_metadata *_metadata, + FIXTURE_DATA(msg_oob) *self) +{ + int ret; + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0, self->fd); + ASSERT_EQ(ret, 0); +} + +static void create_tcp_socketpair(struct __test_metadata *_metadata, + FIXTURE_DATA(msg_oob) *self) +{ + struct sockaddr_in addr; + socklen_t addrlen; + int listen_fd; + int ret; + + listen_fd = socket(AF_INET, SOCK_STREAM, 0); + ASSERT_GE(listen_fd, 0); + + ret = listen(listen_fd, -1); + ASSERT_EQ(ret, 0); + + addrlen = sizeof(addr); + ret = getsockname(listen_fd, (struct sockaddr *)&addr, &addrlen); + ASSERT_EQ(ret, 0); + + self->fd[2] = socket(AF_INET, SOCK_STREAM, 0); + ASSERT_GE(self->fd[2], 0); + + ret = connect(self->fd[2], (struct sockaddr *)&addr, addrlen); + ASSERT_EQ(ret, 0); + + self->fd[3] = accept(listen_fd, (struct sockaddr *)&addr, &addrlen); + ASSERT_GE(self->fd[3], 0); + + ret = fcntl(self->fd[3], F_SETFL, O_NONBLOCK); + ASSERT_EQ(ret, 0); +} + +static void close_sockets(FIXTURE_DATA(msg_oob) *self) +{ + int i; + + for (i = 0; i < 4; i++) + close(self->fd[i]); +} + +FIXTURE_SETUP(msg_oob) +{ + create_unix_socketpair(_metadata, self); + create_tcp_socketpair(_metadata, self); +} + +FIXTURE_TEARDOWN(msg_oob) +{ + close_sockets(self); +} + +static void __sendpair(struct __test_metadata *_metadata, + FIXTURE_DATA(msg_oob) *self, + const void *buf, size_t len, int flags) +{ + int i, ret[2]; + + for (i = 0; i < 2; i++) + ret[i] = send(self->fd[i * 2], buf, len, flags); + + ASSERT_EQ(ret[0], len); + ASSERT_EQ(ret[0], ret[1]); +} + +static void __recvpair(struct __test_metadata *_metadata, + FIXTURE_DATA(msg_oob) *self, + const void *expected_buf, int expected_len, + int buf_len, int flags) +{ + int i, ret[2], recv_errno[2], expected_errno = 0; + char recv_buf[2][BUF_SZ] = {}; + + ASSERT_GE(BUF_SZ, buf_len); + + errno = 0; + + for (i = 0; i < 2; i++) { + ret[i] = recv(self->fd[i * 2 + 1], recv_buf[i], buf_len, flags); + recv_errno[i] = errno; + } + + if (expected_len < 0) { + expected_errno = -expected_len; + expected_len = -1; + } + + if (ret[0] != expected_len || recv_errno[0] != expected_errno) { + TH_LOG("AF_UNIX :%s", ret[0] < 0 ? strerror(recv_errno[0]) : recv_buf[0]); + TH_LOG("Expected:%s", expected_errno ? strerror(expected_errno) : expected_buf); + + ASSERT_EQ(ret[0], expected_len); + ASSERT_EQ(recv_errno[0], expected_errno); + } + + if (ret[0] != ret[1] || recv_errno[0] != recv_errno[1]) { + TH_LOG("AF_UNIX :%s", ret[0] < 0 ? strerror(recv_errno[0]) : recv_buf[0]); + TH_LOG("TCP :%s", ret[1] < 0 ? strerror(recv_errno[1]) : recv_buf[1]); + + ASSERT_EQ(ret[0], ret[1]); + ASSERT_EQ(recv_errno[0], recv_errno[1]); + } + + if (expected_len >= 0) { + int cmp; + + cmp = strncmp(expected_buf, recv_buf[0], expected_len); + if (cmp) { + TH_LOG("AF_UNIX :%s", ret[0] < 0 ? strerror(recv_errno[0]) : recv_buf[0]); + TH_LOG("Expected:%s", expected_errno ? strerror(expected_errno) : expected_buf); + + ASSERT_EQ(cmp, 0); + } + + cmp = strncmp(recv_buf[0], recv_buf[1], expected_len); + if (cmp) { + TH_LOG("AF_UNIX :%s", ret[0] < 0 ? strerror(recv_errno[0]) : recv_buf[0]); + TH_LOG("TCP :%s", ret[1] < 0 ? strerror(recv_errno[1]) : recv_buf[1]); + + ASSERT_EQ(cmp, 0); + } + } +} + +#define sendpair(buf, len, flags) \ + __sendpair(_metadata, self, buf, len, flags) + +#define recvpair(expected_buf, expected_len, buf_len, flags) \ + __recvpair(_metadata, self, \ + expected_buf, expected_len, buf_len, flags) + +TEST_F(msg_oob, non_oob) +{ + sendpair("x", 1, 0); + + recvpair("", -EINVAL, 1, MSG_OOB); +} + +TEST_F(msg_oob, oob) +{ + sendpair("x", 1, MSG_OOB); + + recvpair("x", 1, 1, MSG_OOB); +} + +TEST_F(msg_oob, oob_drop) +{ + sendpair("x", 1, MSG_OOB); + + recvpair("", -EAGAIN, 1, 0); /* Drop OOB. */ + recvpair("", -EINVAL, 1, MSG_OOB); +} + +TEST_F(msg_oob, oob_ahead) +{ + sendpair("hello", 5, MSG_OOB); + + recvpair("o", 1, 1, MSG_OOB); + recvpair("hell", 4, 4, 0); +} + +TEST_F(msg_oob, oob_break) +{ + sendpair("hello", 5, MSG_OOB); + + recvpair("hell", 4, 5, 0); /* Break at OOB even with enough buffer. */ + recvpair("o", 1, 1, MSG_OOB); +} + +TEST_F(msg_oob, oob_ahead_break) +{ + sendpair("hello", 5, MSG_OOB); + sendpair("world", 5, 0); + + recvpair("o", 1, 1, MSG_OOB); + recvpair("hell", 4, 9, 0); /* Break at OOB even after it's recv()ed. */ + recvpair("world", 5, 5, 0); +} + +TEST_F(msg_oob, oob_break_drop) +{ + sendpair("hello", 5, MSG_OOB); + sendpair("world", 5, 0); + + recvpair("hell", 4, 10, 0); /* Break at OOB even with enough buffer. */ + recvpair("world", 5, 10, 0); /* Drop OOB and recv() the next skb. */ + recvpair("", -EINVAL, 1, MSG_OOB); +} + +TEST_HARNESS_MAIN From b94038d841a91d0e3f59cfe4d073e210910366ee Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 24 Jun 2024 18:36:37 -0700 Subject: [PATCH 1478/1578] af_unix: Stop recv(MSG_PEEK) at consumed OOB skb. After consuming OOB data, recv() reading the preceding data must break at the OOB skb regardless of MSG_PEEK. Currently, MSG_PEEK does not stop recv() for AF_UNIX, and the behaviour is not compliant with TCP. >>> from socket import * >>> c1, c2 = socketpair(AF_UNIX) >>> c1.send(b'hello', MSG_OOB) 5 >>> c1.send(b'world') 5 >>> c2.recv(1, MSG_OOB) b'o' >>> c2.recv(9, MSG_PEEK) # This should return b'hell' b'hellworld' # even with enough buffer. Let's fix it by returning NULL for consumed skb and unlinking it only if MSG_PEEK is not specified. This patch also adds test cases that add recv(MSG_PEEK) before each recv(). Without fix: # RUN msg_oob.peek.oob_ahead_break ... # msg_oob.c:134:oob_ahead_break:AF_UNIX :hellworld # msg_oob.c:135:oob_ahead_break:Expected:hell # msg_oob.c:137:oob_ahead_break:Expected ret[0] (9) == expected_len (4) # oob_ahead_break: Test terminated by assertion # FAIL msg_oob.peek.oob_ahead_break not ok 13 msg_oob.peek.oob_ahead_break With fix: # RUN msg_oob.peek.oob_ahead_break ... # OK msg_oob.peek.oob_ahead_break ok 13 msg_oob.peek.oob_ahead_break Fixes: 314001f0bf92 ("af_unix: Add OOB support") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 9 ++++--- tools/testing/selftests/net/af_unix/msg_oob.c | 25 +++++++++++++++++-- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5e695a9a609c26..2eaecf9d78a495 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2613,9 +2613,12 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, { struct unix_sock *u = unix_sk(sk); - if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) { - skb_unlink(skb, &sk->sk_receive_queue); - consume_skb(skb); + if (!unix_skb_len(skb)) { + if (!(flags & MSG_PEEK)) { + skb_unlink(skb, &sk->sk_receive_queue); + consume_skb(skb); + } + skb = NULL; } else { struct sk_buff *unlinked_skb = NULL; diff --git a/tools/testing/selftests/net/af_unix/msg_oob.c b/tools/testing/selftests/net/af_unix/msg_oob.c index d427d39d080638..de8d1fcde88377 100644 --- a/tools/testing/selftests/net/af_unix/msg_oob.c +++ b/tools/testing/selftests/net/af_unix/msg_oob.c @@ -21,6 +21,21 @@ FIXTURE(msg_oob) */ }; +FIXTURE_VARIANT(msg_oob) +{ + bool peek; +}; + +FIXTURE_VARIANT_ADD(msg_oob, no_peek) +{ + .peek = false, +}; + +FIXTURE_VARIANT_ADD(msg_oob, peek) +{ + .peek = true +}; + static void create_unix_socketpair(struct __test_metadata *_metadata, FIXTURE_DATA(msg_oob) *self) { @@ -156,8 +171,14 @@ static void __recvpair(struct __test_metadata *_metadata, __sendpair(_metadata, self, buf, len, flags) #define recvpair(expected_buf, expected_len, buf_len, flags) \ - __recvpair(_metadata, self, \ - expected_buf, expected_len, buf_len, flags) + do { \ + if (variant->peek) \ + __recvpair(_metadata, self, \ + expected_buf, expected_len, \ + buf_len, (flags) | MSG_PEEK); \ + __recvpair(_metadata, self, \ + expected_buf, expected_len, buf_len, flags); \ + } while (0) TEST_F(msg_oob, non_oob) { From 93c99f21db360957d49853e5666b5c147f593bda Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 24 Jun 2024 18:36:38 -0700 Subject: [PATCH 1479/1578] af_unix: Don't stop recv(MSG_DONTWAIT) if consumed OOB skb is at the head. Let's say a socket send()s "hello" with MSG_OOB and "world" without flags, >>> from socket import * >>> c1, c2 = socketpair(AF_UNIX) >>> c1.send(b'hello', MSG_OOB) 5 >>> c1.send(b'world') 5 and its peer recv()s "hell" and "o". >>> c2.recv(10) b'hell' >>> c2.recv(1, MSG_OOB) b'o' Now the consumed OOB skb stays at the head of recvq to return a correct value for ioctl(SIOCATMARK), which is broken now and fixed by a later patch. Then, if peer issues recv() with MSG_DONTWAIT, manage_oob() returns NULL, so recv() ends up with -EAGAIN. >>> c2.setblocking(False) # This causes -EAGAIN even with available data >>> c2.recv(5) Traceback (most recent call last): File "", line 1, in BlockingIOError: [Errno 11] Resource temporarily unavailable However, next recv() will return the following available data, "world". >>> c2.recv(5) b'world' When the consumed OOB skb is at the head of the queue, we need to fetch the next skb to fix the weird behaviour. Note that the issue does not happen without MSG_DONTWAIT because we can retry after manage_oob(). This patch also adds a test case that covers the issue. Without fix: # RUN msg_oob.no_peek.ex_oob_break ... # msg_oob.c:134:ex_oob_break:AF_UNIX :Resource temporarily unavailable # msg_oob.c:135:ex_oob_break:Expected:ld # msg_oob.c:137:ex_oob_break:Expected ret[0] (-1) == expected_len (2) # ex_oob_break: Test terminated by assertion # FAIL msg_oob.no_peek.ex_oob_break not ok 8 msg_oob.no_peek.ex_oob_break With fix: # RUN msg_oob.no_peek.ex_oob_break ... # OK msg_oob.no_peek.ex_oob_break ok 8 msg_oob.no_peek.ex_oob_break Fixes: 314001f0bf92 ("af_unix: Add OOB support") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 19 +++++++++++++++---- tools/testing/selftests/net/af_unix/msg_oob.c | 11 +++++++++++ 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 2eaecf9d78a495..b0b97f8d0d0985 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2614,12 +2614,23 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, struct unix_sock *u = unix_sk(sk); if (!unix_skb_len(skb)) { - if (!(flags & MSG_PEEK)) { - skb_unlink(skb, &sk->sk_receive_queue); - consume_skb(skb); + struct sk_buff *unlinked_skb = NULL; + + spin_lock(&sk->sk_receive_queue.lock); + + if (copied) { + skb = NULL; + } else if (flags & MSG_PEEK) { + skb = skb_peek_next(skb, &sk->sk_receive_queue); + } else { + unlinked_skb = skb; + skb = skb_peek_next(skb, &sk->sk_receive_queue); + __skb_unlink(unlinked_skb, &sk->sk_receive_queue); } - skb = NULL; + spin_unlock(&sk->sk_receive_queue.lock); + + consume_skb(unlinked_skb); } else { struct sk_buff *unlinked_skb = NULL; diff --git a/tools/testing/selftests/net/af_unix/msg_oob.c b/tools/testing/selftests/net/af_unix/msg_oob.c index de8d1fcde88377..b5226ccec3ecef 100644 --- a/tools/testing/selftests/net/af_unix/msg_oob.c +++ b/tools/testing/selftests/net/af_unix/msg_oob.c @@ -238,4 +238,15 @@ TEST_F(msg_oob, oob_break_drop) recvpair("", -EINVAL, 1, MSG_OOB); } +TEST_F(msg_oob, ex_oob_break) +{ + sendpair("hello", 5, MSG_OOB); + sendpair("wor", 3, MSG_OOB); + sendpair("ld", 2, 0); + + recvpair("hellowo", 7, 10, 0); /* Break at OOB but not at ex-OOB. */ + recvpair("r", 1, 1, MSG_OOB); + recvpair("ld", 2, 2, 0); +} + TEST_HARNESS_MAIN From f5ea0768a2554152cac0a6202fcefb597b77486d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 24 Jun 2024 18:36:39 -0700 Subject: [PATCH 1480/1578] selftest: af_unix: Add non-TCP-compliant test cases in msg_oob.c. While testing, I found some weird behaviour on the TCP side as well. For example, TCP drops the preceding OOB data when queueing a new OOB data if the old OOB data is at the head of recvq. # RUN msg_oob.no_peek.ex_oob_drop ... # msg_oob.c:146:ex_oob_drop:AF_UNIX :x # msg_oob.c:147:ex_oob_drop:TCP :Resource temporarily unavailable # msg_oob.c:146:ex_oob_drop:AF_UNIX :y # msg_oob.c:147:ex_oob_drop:TCP :Invalid argument # OK msg_oob.no_peek.ex_oob_drop ok 9 msg_oob.no_peek.ex_oob_drop # RUN msg_oob.no_peek.ex_oob_drop_2 ... # msg_oob.c:146:ex_oob_drop_2:AF_UNIX :x # msg_oob.c:147:ex_oob_drop_2:TCP :Resource temporarily unavailable # OK msg_oob.no_peek.ex_oob_drop_2 ok 10 msg_oob.no_peek.ex_oob_drop_2 This patch allows AF_UNIX's MSG_OOB implementation to produce different results from TCP when operations are guarded with tcp_incompliant{}. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/af_unix/msg_oob.c | 49 +++++++++++++++++-- 1 file changed, 44 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/net/af_unix/msg_oob.c b/tools/testing/selftests/net/af_unix/msg_oob.c index b5226ccec3ecef..46e92d06b0a399 100644 --- a/tools/testing/selftests/net/af_unix/msg_oob.c +++ b/tools/testing/selftests/net/af_unix/msg_oob.c @@ -19,6 +19,7 @@ FIXTURE(msg_oob) * 2: TCP sender * 3: TCP receiver */ + bool tcp_compliant; }; FIXTURE_VARIANT(msg_oob) @@ -88,6 +89,8 @@ FIXTURE_SETUP(msg_oob) { create_unix_socketpair(_metadata, self); create_tcp_socketpair(_metadata, self); + + self->tcp_compliant = true; } FIXTURE_TEARDOWN(msg_oob) @@ -115,6 +118,7 @@ static void __recvpair(struct __test_metadata *_metadata, { int i, ret[2], recv_errno[2], expected_errno = 0; char recv_buf[2][BUF_SZ] = {}; + bool printed = false; ASSERT_GE(BUF_SZ, buf_len); @@ -142,8 +146,12 @@ static void __recvpair(struct __test_metadata *_metadata, TH_LOG("AF_UNIX :%s", ret[0] < 0 ? strerror(recv_errno[0]) : recv_buf[0]); TH_LOG("TCP :%s", ret[1] < 0 ? strerror(recv_errno[1]) : recv_buf[1]); - ASSERT_EQ(ret[0], ret[1]); - ASSERT_EQ(recv_errno[0], recv_errno[1]); + printed = true; + + if (self->tcp_compliant) { + ASSERT_EQ(ret[0], ret[1]); + ASSERT_EQ(recv_errno[0], recv_errno[1]); + } } if (expected_len >= 0) { @@ -159,10 +167,13 @@ static void __recvpair(struct __test_metadata *_metadata, cmp = strncmp(recv_buf[0], recv_buf[1], expected_len); if (cmp) { - TH_LOG("AF_UNIX :%s", ret[0] < 0 ? strerror(recv_errno[0]) : recv_buf[0]); - TH_LOG("TCP :%s", ret[1] < 0 ? strerror(recv_errno[1]) : recv_buf[1]); + if (!printed) { + TH_LOG("AF_UNIX :%s", ret[0] < 0 ? strerror(recv_errno[0]) : recv_buf[0]); + TH_LOG("TCP :%s", ret[1] < 0 ? strerror(recv_errno[1]) : recv_buf[1]); + } - ASSERT_EQ(cmp, 0); + if (self->tcp_compliant) + ASSERT_EQ(cmp, 0); } } } @@ -180,6 +191,11 @@ static void __recvpair(struct __test_metadata *_metadata, expected_buf, expected_len, buf_len, flags); \ } while (0) +#define tcp_incompliant \ + for (self->tcp_compliant = false; \ + self->tcp_compliant == false; \ + self->tcp_compliant = true) + TEST_F(msg_oob, non_oob) { sendpair("x", 1, 0); @@ -249,4 +265,27 @@ TEST_F(msg_oob, ex_oob_break) recvpair("ld", 2, 2, 0); } +TEST_F(msg_oob, ex_oob_drop) +{ + sendpair("x", 1, MSG_OOB); + sendpair("y", 1, MSG_OOB); /* TCP drops "x" at this moment. */ + + tcp_incompliant { + recvpair("x", 1, 1, 0); /* TCP drops "y" by passing through it. */ + recvpair("y", 1, 1, MSG_OOB); /* TCP returns -EINVAL. */ + } +} + +TEST_F(msg_oob, ex_oob_drop_2) +{ + sendpair("x", 1, MSG_OOB); + sendpair("y", 1, MSG_OOB); /* TCP drops "x" at this moment. */ + + recvpair("y", 1, 1, MSG_OOB); + + tcp_incompliant { + recvpair("x", 1, 1, 0); /* TCP returns -EAGAIN. */ + } +} + TEST_HARNESS_MAIN From 36893ef0b661671ee64eb37bf5f345f33d2cabb7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 24 Jun 2024 18:36:40 -0700 Subject: [PATCH 1481/1578] af_unix: Don't stop recv() at consumed ex-OOB skb. Currently, recv() is stopped at a consumed OOB skb even if a new OOB skb is queued and we can ignore the old OOB skb. >>> from socket import * >>> c1, c2 = socket(AF_UNIX, SOCK_STREAM) >>> c1.send(b'hellowor', MSG_OOB) 8 >>> c2.recv(1, MSG_OOB) # consume OOB data stays at middle of recvq. b'r' >>> c1.send(b'ld', MSG_OOB) 2 >>> c2.recv(10) # recv() stops at the old consumed OOB b'hellowo' # should be 'hellowol' manage_oob() should not stop recv() at the old consumed OOB skb if there is a new OOB data queued. Note that TCP behaviour is apparently wrong in this test case because we can recv() the same OOB data twice. Without fix: # RUN msg_oob.no_peek.ex_oob_ahead_break ... # msg_oob.c:138:ex_oob_ahead_break:AF_UNIX :hellowo # msg_oob.c:139:ex_oob_ahead_break:Expected:hellowol # msg_oob.c:141:ex_oob_ahead_break:Expected ret[0] (7) == expected_len (8) # ex_oob_ahead_break: Test terminated by assertion # FAIL msg_oob.no_peek.ex_oob_ahead_break not ok 11 msg_oob.no_peek.ex_oob_ahead_break With fix: # RUN msg_oob.no_peek.ex_oob_ahead_break ... # msg_oob.c:146:ex_oob_ahead_break:AF_UNIX :hellowol # msg_oob.c:147:ex_oob_ahead_break:TCP :helloworl # OK msg_oob.no_peek.ex_oob_ahead_break ok 11 msg_oob.no_peek.ex_oob_ahead_break Fixes: 314001f0bf92 ("af_unix: Add OOB support") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 2 +- tools/testing/selftests/net/af_unix/msg_oob.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index b0b97f8d0d0985..07f5eaa04b5be3 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2618,7 +2618,7 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, spin_lock(&sk->sk_receive_queue.lock); - if (copied) { + if (copied && (!u->oob_skb || skb == u->oob_skb)) { skb = NULL; } else if (flags & MSG_PEEK) { skb = skb_peek_next(skb, &sk->sk_receive_queue); diff --git a/tools/testing/selftests/net/af_unix/msg_oob.c b/tools/testing/selftests/net/af_unix/msg_oob.c index 46e92d06b0a399..acf4bd0afe1722 100644 --- a/tools/testing/selftests/net/af_unix/msg_oob.c +++ b/tools/testing/selftests/net/af_unix/msg_oob.c @@ -288,4 +288,20 @@ TEST_F(msg_oob, ex_oob_drop_2) } } +TEST_F(msg_oob, ex_oob_ahead_break) +{ + sendpair("hello", 5, MSG_OOB); + sendpair("wor", 3, MSG_OOB); + + recvpair("r", 1, 1, MSG_OOB); + + sendpair("ld", 2, MSG_OOB); + + tcp_incompliant { + recvpair("hellowol", 8, 10, 0); /* TCP recv()s "helloworl", why "r" ?? */ + } + + recvpair("d", 1, 1, MSG_OOB); +} + TEST_HARNESS_MAIN From 436352e8e57e219aae83d28568d9bd9857311e5a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 24 Jun 2024 18:36:41 -0700 Subject: [PATCH 1482/1578] selftest: af_unix: Add SO_OOBINLINE test cases in msg_oob.c When SO_OOBINLINE is enabled on a socket, MSG_OOB can be recv()ed without MSG_OOB flag, and ioctl(SIOCATMARK) will behaves differently. This patch adds some test cases for SO_OOBINLINE. Note the new test cases found two bugs in TCP. 1) After reading OOB data with non-inline mode, we can re-read the data by setting SO_OOBINLINE. # RUN msg_oob.no_peek.inline_oob_ahead_break ... # msg_oob.c:146:inline_oob_ahead_break:AF_UNIX :world # msg_oob.c:147:inline_oob_ahead_break:TCP :oworld # OK msg_oob.no_peek.inline_oob_ahead_break ok 14 msg_oob.no_peek.inline_oob_ahead_break 2) The head OOB data is dropped if SO_OOBINLINE is disabled if a new OOB data is queued. # RUN msg_oob.no_peek.inline_ex_oob_drop ... # msg_oob.c:171:inline_ex_oob_drop:AF_UNIX :x # msg_oob.c:172:inline_ex_oob_drop:TCP :y # msg_oob.c:146:inline_ex_oob_drop:AF_UNIX :y # msg_oob.c:147:inline_ex_oob_drop:TCP :Resource temporarily unavailable # OK msg_oob.no_peek.inline_ex_oob_drop ok 17 msg_oob.no_peek.inline_ex_oob_drop Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/af_unix/msg_oob.c | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/tools/testing/selftests/net/af_unix/msg_oob.c b/tools/testing/selftests/net/af_unix/msg_oob.c index acf4bd0afe1722..62361b5e98c313 100644 --- a/tools/testing/selftests/net/af_unix/msg_oob.c +++ b/tools/testing/selftests/net/af_unix/msg_oob.c @@ -178,6 +178,20 @@ static void __recvpair(struct __test_metadata *_metadata, } } +static void __setinlinepair(struct __test_metadata *_metadata, + FIXTURE_DATA(msg_oob) *self) +{ + int i, oob_inline = 1; + + for (i = 0; i < 2; i++) { + int ret; + + ret = setsockopt(self->fd[i * 2 + 1], SOL_SOCKET, SO_OOBINLINE, + &oob_inline, sizeof(oob_inline)); + ASSERT_EQ(ret, 0); + } +} + #define sendpair(buf, len, flags) \ __sendpair(_metadata, self, buf, len, flags) @@ -191,6 +205,9 @@ static void __recvpair(struct __test_metadata *_metadata, expected_buf, expected_len, buf_len, flags); \ } while (0) +#define setinlinepair() \ + __setinlinepair(_metadata, self) + #define tcp_incompliant \ for (self->tcp_compliant = false; \ self->tcp_compliant == false; \ @@ -304,4 +321,78 @@ TEST_F(msg_oob, ex_oob_ahead_break) recvpair("d", 1, 1, MSG_OOB); } +TEST_F(msg_oob, inline_oob) +{ + setinlinepair(); + + sendpair("x", 1, MSG_OOB); + + recvpair("", -EINVAL, 1, MSG_OOB); + recvpair("x", 1, 1, 0); +} + +TEST_F(msg_oob, inline_oob_break) +{ + setinlinepair(); + + sendpair("hello", 5, MSG_OOB); + + recvpair("", -EINVAL, 1, MSG_OOB); + recvpair("hell", 4, 5, 0); /* Break at OOB but not at ex-OOB. */ + recvpair("o", 1, 1, 0); +} + +TEST_F(msg_oob, inline_oob_ahead_break) +{ + sendpair("hello", 5, MSG_OOB); + sendpair("world", 5, 0); + + recvpair("o", 1, 1, MSG_OOB); + + setinlinepair(); + + recvpair("hell", 4, 9, 0); /* Break at OOB even with enough buffer. */ + + tcp_incompliant { + recvpair("world", 5, 6, 0); /* TCP recv()s "oworld", ... "o" ??? */ + } +} + +TEST_F(msg_oob, inline_ex_oob_break) +{ + sendpair("hello", 5, MSG_OOB); + sendpair("wor", 3, MSG_OOB); + sendpair("ld", 2, 0); + + setinlinepair(); + + recvpair("hellowo", 7, 10, 0); /* Break at OOB but not at ex-OOB. */ + recvpair("rld", 3, 3, 0); +} + +TEST_F(msg_oob, inline_ex_oob_no_drop) +{ + sendpair("x", 1, MSG_OOB); + + setinlinepair(); + + sendpair("y", 1, MSG_OOB); /* TCP does NOT drops "x" at this moment. */ + + recvpair("x", 1, 1, 0); + recvpair("y", 1, 1, 0); +} + +TEST_F(msg_oob, inline_ex_oob_drop) +{ + sendpair("x", 1, MSG_OOB); + sendpair("y", 1, MSG_OOB); /* TCP drops "x" at this moment. */ + + setinlinepair(); + + tcp_incompliant { + recvpair("x", 1, 1, 0); /* TCP recv()s "y". */ + recvpair("y", 1, 1, 0); /* TCP returns -EAGAIN. */ + } +} + TEST_HARNESS_MAIN From d02689e6860df0f7eff066f268bfb53ef6993ea7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 24 Jun 2024 18:36:42 -0700 Subject: [PATCH 1483/1578] selftest: af_unix: Check SIGURG after every send() in msg_oob.c When data is sent with MSG_OOB, SIGURG is sent to a process if the receiver socket has set its owner to the process by ioctl(FIOSETOWN) or fcntl(F_SETOWN). This patch adds SIGURG check after every send(MSG_OOB) call. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/af_unix/msg_oob.c | 51 ++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/af_unix/msg_oob.c b/tools/testing/selftests/net/af_unix/msg_oob.c index 62361b5e98c313..123dee0b6739fe 100644 --- a/tools/testing/selftests/net/af_unix/msg_oob.c +++ b/tools/testing/selftests/net/af_unix/msg_oob.c @@ -6,6 +6,8 @@ #include #include +#include +#include #include #include "../../kselftest_harness.h" @@ -19,6 +21,7 @@ FIXTURE(msg_oob) * 2: TCP sender * 3: TCP receiver */ + int signal_fd; bool tcp_compliant; }; @@ -77,6 +80,35 @@ static void create_tcp_socketpair(struct __test_metadata *_metadata, ASSERT_EQ(ret, 0); } +static void setup_sigurg(struct __test_metadata *_metadata, + FIXTURE_DATA(msg_oob) *self) +{ + struct signalfd_siginfo siginfo; + int pid = getpid(); + sigset_t mask; + int i, ret; + + for (i = 0; i < 2; i++) { + ret = ioctl(self->fd[i * 2 + 1], FIOSETOWN, &pid); + ASSERT_EQ(ret, 0); + } + + ret = sigemptyset(&mask); + ASSERT_EQ(ret, 0); + + ret = sigaddset(&mask, SIGURG); + ASSERT_EQ(ret, 0); + + ret = sigprocmask(SIG_BLOCK, &mask, NULL); + ASSERT_EQ(ret, 0); + + self->signal_fd = signalfd(-1, &mask, SFD_NONBLOCK); + ASSERT_GE(self->signal_fd, 0); + + ret = read(self->signal_fd, &siginfo, sizeof(siginfo)); + ASSERT_EQ(ret, -1); +} + static void close_sockets(FIXTURE_DATA(msg_oob) *self) { int i; @@ -90,6 +122,8 @@ FIXTURE_SETUP(msg_oob) create_unix_socketpair(_metadata, self); create_tcp_socketpair(_metadata, self); + setup_sigurg(_metadata, self); + self->tcp_compliant = true; } @@ -104,9 +138,24 @@ static void __sendpair(struct __test_metadata *_metadata, { int i, ret[2]; - for (i = 0; i < 2; i++) + for (i = 0; i < 2; i++) { + struct signalfd_siginfo siginfo = {}; + int bytes; + ret[i] = send(self->fd[i * 2], buf, len, flags); + bytes = read(self->signal_fd, &siginfo, sizeof(siginfo)); + + if (flags & MSG_OOB) { + ASSERT_EQ(bytes, sizeof(siginfo)); + ASSERT_EQ(siginfo.ssi_signo, SIGURG); + + bytes = read(self->signal_fd, &siginfo, sizeof(siginfo)); + } + + ASSERT_EQ(bytes, -1); + } + ASSERT_EQ(ret[0], len); ASSERT_EQ(ret[0], ret[1]); } From 48a998373090f33e558a20a976c6028e11e93184 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 24 Jun 2024 18:36:43 -0700 Subject: [PATCH 1484/1578] selftest: af_unix: Check EPOLLPRI after every send()/recv() in msg_oob.c When OOB data is in recvq, we can detect it with epoll by checking EPOLLPRI. This patch add checks for EPOLLPRI after every send() and recv() in all test cases. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/af_unix/msg_oob.c | 147 ++++++++++++++++++ 1 file changed, 147 insertions(+) diff --git a/tools/testing/selftests/net/af_unix/msg_oob.c b/tools/testing/selftests/net/af_unix/msg_oob.c index 123dee0b6739fe..28b09b36a2f16b 100644 --- a/tools/testing/selftests/net/af_unix/msg_oob.c +++ b/tools/testing/selftests/net/af_unix/msg_oob.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -22,6 +23,9 @@ FIXTURE(msg_oob) * 3: TCP receiver */ int signal_fd; + int epoll_fd[2]; /* 0: AF_UNIX receiver + * 1: TCP receiver + */ bool tcp_compliant; }; @@ -109,6 +113,25 @@ static void setup_sigurg(struct __test_metadata *_metadata, ASSERT_EQ(ret, -1); } +static void setup_epollpri(struct __test_metadata *_metadata, + FIXTURE_DATA(msg_oob) *self) +{ + struct epoll_event event = { + .events = EPOLLPRI, + }; + int i; + + for (i = 0; i < 2; i++) { + int ret; + + self->epoll_fd[i] = epoll_create1(0); + ASSERT_GE(self->epoll_fd[i], 0); + + ret = epoll_ctl(self->epoll_fd[i], EPOLL_CTL_ADD, self->fd[i * 2 + 1], &event); + ASSERT_EQ(ret, 0); + } +} + static void close_sockets(FIXTURE_DATA(msg_oob) *self) { int i; @@ -123,6 +146,7 @@ FIXTURE_SETUP(msg_oob) create_tcp_socketpair(_metadata, self); setup_sigurg(_metadata, self); + setup_epollpri(_metadata, self); self->tcp_compliant = true; } @@ -132,6 +156,29 @@ FIXTURE_TEARDOWN(msg_oob) close_sockets(self); } +static void __epollpair(struct __test_metadata *_metadata, + FIXTURE_DATA(msg_oob) *self, + bool oob_remaining) +{ + struct epoll_event event[2] = {}; + int i, ret[2]; + + for (i = 0; i < 2; i++) + ret[i] = epoll_wait(self->epoll_fd[i], &event[i], 1, 0); + + ASSERT_EQ(ret[0], oob_remaining); + + if (self->tcp_compliant) + ASSERT_EQ(ret[0], ret[1]); + + if (oob_remaining) { + ASSERT_EQ(event[0].events, EPOLLPRI); + + if (self->tcp_compliant) + ASSERT_EQ(event[0].events, event[1].events); + } +} + static void __sendpair(struct __test_metadata *_metadata, FIXTURE_DATA(msg_oob) *self, const void *buf, size_t len, int flags) @@ -254,6 +301,9 @@ static void __setinlinepair(struct __test_metadata *_metadata, expected_buf, expected_len, buf_len, flags); \ } while (0) +#define epollpair(oob_remaining) \ + __epollpair(_metadata, self, oob_remaining) + #define setinlinepair() \ __setinlinepair(_metadata, self) @@ -265,109 +315,170 @@ static void __setinlinepair(struct __test_metadata *_metadata, TEST_F(msg_oob, non_oob) { sendpair("x", 1, 0); + epollpair(false); recvpair("", -EINVAL, 1, MSG_OOB); + epollpair(false); } TEST_F(msg_oob, oob) { sendpair("x", 1, MSG_OOB); + epollpair(true); recvpair("x", 1, 1, MSG_OOB); + epollpair(false); } TEST_F(msg_oob, oob_drop) { sendpair("x", 1, MSG_OOB); + epollpair(true); recvpair("", -EAGAIN, 1, 0); /* Drop OOB. */ + epollpair(false); + recvpair("", -EINVAL, 1, MSG_OOB); + epollpair(false); } TEST_F(msg_oob, oob_ahead) { sendpair("hello", 5, MSG_OOB); + epollpair(true); recvpair("o", 1, 1, MSG_OOB); + epollpair(false); + recvpair("hell", 4, 4, 0); + epollpair(false); } TEST_F(msg_oob, oob_break) { sendpair("hello", 5, MSG_OOB); + epollpair(true); recvpair("hell", 4, 5, 0); /* Break at OOB even with enough buffer. */ + epollpair(true); + recvpair("o", 1, 1, MSG_OOB); + epollpair(false); } TEST_F(msg_oob, oob_ahead_break) { sendpair("hello", 5, MSG_OOB); + epollpair(true); + sendpair("world", 5, 0); + epollpair(true); recvpair("o", 1, 1, MSG_OOB); + epollpair(false); + recvpair("hell", 4, 9, 0); /* Break at OOB even after it's recv()ed. */ + epollpair(false); + recvpair("world", 5, 5, 0); + epollpair(false); } TEST_F(msg_oob, oob_break_drop) { sendpair("hello", 5, MSG_OOB); + epollpair(true); + sendpair("world", 5, 0); + epollpair(true); recvpair("hell", 4, 10, 0); /* Break at OOB even with enough buffer. */ + epollpair(true); + recvpair("world", 5, 10, 0); /* Drop OOB and recv() the next skb. */ + epollpair(false); + recvpair("", -EINVAL, 1, MSG_OOB); + epollpair(false); } TEST_F(msg_oob, ex_oob_break) { sendpair("hello", 5, MSG_OOB); + epollpair(true); + sendpair("wor", 3, MSG_OOB); + epollpair(true); + sendpair("ld", 2, 0); + epollpair(true); recvpair("hellowo", 7, 10, 0); /* Break at OOB but not at ex-OOB. */ + epollpair(true); + recvpair("r", 1, 1, MSG_OOB); + epollpair(false); + recvpair("ld", 2, 2, 0); + epollpair(false); } TEST_F(msg_oob, ex_oob_drop) { sendpair("x", 1, MSG_OOB); + epollpair(true); + sendpair("y", 1, MSG_OOB); /* TCP drops "x" at this moment. */ + epollpair(true); tcp_incompliant { recvpair("x", 1, 1, 0); /* TCP drops "y" by passing through it. */ + epollpair(true); + recvpair("y", 1, 1, MSG_OOB); /* TCP returns -EINVAL. */ + epollpair(false); } } TEST_F(msg_oob, ex_oob_drop_2) { sendpair("x", 1, MSG_OOB); + epollpair(true); + sendpair("y", 1, MSG_OOB); /* TCP drops "x" at this moment. */ + epollpair(true); recvpair("y", 1, 1, MSG_OOB); + epollpair(false); tcp_incompliant { recvpair("x", 1, 1, 0); /* TCP returns -EAGAIN. */ + epollpair(false); } } TEST_F(msg_oob, ex_oob_ahead_break) { sendpair("hello", 5, MSG_OOB); + epollpair(true); + sendpair("wor", 3, MSG_OOB); + epollpair(true); recvpair("r", 1, 1, MSG_OOB); + epollpair(false); sendpair("ld", 2, MSG_OOB); + epollpair(true); tcp_incompliant { recvpair("hellowol", 8, 10, 0); /* TCP recv()s "helloworl", why "r" ?? */ } + epollpair(true); + recvpair("d", 1, 1, MSG_OOB); + epollpair(false); } TEST_F(msg_oob, inline_oob) @@ -375,9 +486,13 @@ TEST_F(msg_oob, inline_oob) setinlinepair(); sendpair("x", 1, MSG_OOB); + epollpair(true); recvpair("", -EINVAL, 1, MSG_OOB); + epollpair(true); + recvpair("x", 1, 1, 0); + epollpair(false); } TEST_F(msg_oob, inline_oob_break) @@ -385,62 +500,94 @@ TEST_F(msg_oob, inline_oob_break) setinlinepair(); sendpair("hello", 5, MSG_OOB); + epollpair(true); recvpair("", -EINVAL, 1, MSG_OOB); + epollpair(true); + recvpair("hell", 4, 5, 0); /* Break at OOB but not at ex-OOB. */ + epollpair(true); + recvpair("o", 1, 1, 0); + epollpair(false); } TEST_F(msg_oob, inline_oob_ahead_break) { sendpair("hello", 5, MSG_OOB); + epollpair(true); + sendpair("world", 5, 0); + epollpair(true); recvpair("o", 1, 1, MSG_OOB); + epollpair(false); setinlinepair(); recvpair("hell", 4, 9, 0); /* Break at OOB even with enough buffer. */ + epollpair(false); tcp_incompliant { recvpair("world", 5, 6, 0); /* TCP recv()s "oworld", ... "o" ??? */ } + + epollpair(false); } TEST_F(msg_oob, inline_ex_oob_break) { sendpair("hello", 5, MSG_OOB); + epollpair(true); + sendpair("wor", 3, MSG_OOB); + epollpair(true); + sendpair("ld", 2, 0); + epollpair(true); setinlinepair(); recvpair("hellowo", 7, 10, 0); /* Break at OOB but not at ex-OOB. */ + epollpair(true); + recvpair("rld", 3, 3, 0); + epollpair(false); } TEST_F(msg_oob, inline_ex_oob_no_drop) { sendpair("x", 1, MSG_OOB); + epollpair(true); setinlinepair(); sendpair("y", 1, MSG_OOB); /* TCP does NOT drops "x" at this moment. */ + epollpair(true); recvpair("x", 1, 1, 0); + epollpair(true); + recvpair("y", 1, 1, 0); + epollpair(false); } TEST_F(msg_oob, inline_ex_oob_drop) { sendpair("x", 1, MSG_OOB); + epollpair(true); + sendpair("y", 1, MSG_OOB); /* TCP drops "x" at this moment. */ + epollpair(true); setinlinepair(); tcp_incompliant { recvpair("x", 1, 1, 0); /* TCP recv()s "y". */ + epollpair(true); + recvpair("y", 1, 1, 0); /* TCP returns -EAGAIN. */ + epollpair(false); } } From e400cfa38bb0419cf1313e5494ea2b7d114e86d7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 24 Jun 2024 18:36:44 -0700 Subject: [PATCH 1485/1578] af_unix: Fix wrong ioctl(SIOCATMARK) when consumed OOB skb is at the head. Even if OOB data is recv()ed, ioctl(SIOCATMARK) must return 1 when the OOB skb is at the head of the receive queue and no new OOB data is queued. Without fix: # RUN msg_oob.no_peek.oob ... # msg_oob.c:305:oob:Expected answ[0] (0) == oob_head (1) # oob: Test terminated by assertion # FAIL msg_oob.no_peek.oob not ok 2 msg_oob.no_peek.oob With fix: # RUN msg_oob.no_peek.oob ... # OK msg_oob.no_peek.oob ok 2 msg_oob.no_peek.oob Fixes: 314001f0bf92 ("af_unix: Add OOB support") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 15 +++- tools/testing/selftests/net/af_unix/msg_oob.c | 68 +++++++++++++++++++ 2 files changed, 81 insertions(+), 2 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 07f5eaa04b5be3..142f56770b77f7 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -3107,12 +3107,23 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) #if IS_ENABLED(CONFIG_AF_UNIX_OOB) case SIOCATMARK: { + struct unix_sock *u = unix_sk(sk); struct sk_buff *skb; int answ = 0; + mutex_lock(&u->iolock); + skb = skb_peek(&sk->sk_receive_queue); - if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb)) - answ = 1; + if (skb) { + struct sk_buff *oob_skb = READ_ONCE(u->oob_skb); + + if (skb == oob_skb || + (!oob_skb && !unix_skb_len(skb))) + answ = 1; + } + + mutex_unlock(&u->iolock); + err = put_user(answ, (int __user *)arg); } break; diff --git a/tools/testing/selftests/net/af_unix/msg_oob.c b/tools/testing/selftests/net/af_unix/msg_oob.c index 28b09b36a2f16b..2d0024329437aa 100644 --- a/tools/testing/selftests/net/af_unix/msg_oob.c +++ b/tools/testing/selftests/net/af_unix/msg_oob.c @@ -288,6 +288,26 @@ static void __setinlinepair(struct __test_metadata *_metadata, } } +static void __siocatmarkpair(struct __test_metadata *_metadata, + FIXTURE_DATA(msg_oob) *self, + bool oob_head) +{ + int answ[2] = {}; + int i; + + for (i = 0; i < 2; i++) { + int ret; + + ret = ioctl(self->fd[i * 2 + 1], SIOCATMARK, &answ[i]); + ASSERT_EQ(ret, 0); + } + + ASSERT_EQ(answ[0], oob_head); + + if (self->tcp_compliant) + ASSERT_EQ(answ[0], answ[1]); +} + #define sendpair(buf, len, flags) \ __sendpair(_metadata, self, buf, len, flags) @@ -304,6 +324,9 @@ static void __setinlinepair(struct __test_metadata *_metadata, #define epollpair(oob_remaining) \ __epollpair(_metadata, self, oob_remaining) +#define siocatmarkpair(oob_head) \ + __siocatmarkpair(_metadata, self, oob_head) + #define setinlinepair() \ __setinlinepair(_metadata, self) @@ -325,9 +348,11 @@ TEST_F(msg_oob, oob) { sendpair("x", 1, MSG_OOB); epollpair(true); + siocatmarkpair(true); recvpair("x", 1, 1, MSG_OOB); epollpair(false); + siocatmarkpair(true); } TEST_F(msg_oob, oob_drop) @@ -481,18 +506,40 @@ TEST_F(msg_oob, ex_oob_ahead_break) epollpair(false); } +TEST_F(msg_oob, ex_oob_siocatmark) +{ + sendpair("hello", 5, MSG_OOB); + epollpair(true); + siocatmarkpair(false); + + recvpair("o", 1, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(false); + + sendpair("world", 5, MSG_OOB); + epollpair(true); + siocatmarkpair(false); + + recvpair("hell", 4, 4, 0); /* Intentionally stop at ex-OOB. */ + epollpair(true); + siocatmarkpair(false); +} + TEST_F(msg_oob, inline_oob) { setinlinepair(); sendpair("x", 1, MSG_OOB); epollpair(true); + siocatmarkpair(true); recvpair("", -EINVAL, 1, MSG_OOB); epollpair(true); + siocatmarkpair(true); recvpair("x", 1, 1, 0); epollpair(false); + siocatmarkpair(false); } TEST_F(msg_oob, inline_oob_break) @@ -591,4 +638,25 @@ TEST_F(msg_oob, inline_ex_oob_drop) } } +TEST_F(msg_oob, inline_ex_oob_siocatmark) +{ + sendpair("hello", 5, MSG_OOB); + epollpair(true); + siocatmarkpair(false); + + recvpair("o", 1, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(false); + + setinlinepair(); + + sendpair("world", 5, MSG_OOB); + epollpair(true); + siocatmarkpair(false); + + recvpair("hell", 4, 4, 0); /* Intentionally stop at ex-OOB. */ + epollpair(true); + siocatmarkpair(false); +} + TEST_HARNESS_MAIN From 91b7186c8d141fb89412930d6b9974c9cba24af7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 24 Jun 2024 18:36:45 -0700 Subject: [PATCH 1486/1578] selftest: af_unix: Check SIOCATMARK after every send()/recv() in msg_oob.c. To catch regression, let's check ioctl(SIOCATMARK) after every send() and recv() calls. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/af_unix/msg_oob.c | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/tools/testing/selftests/net/af_unix/msg_oob.c b/tools/testing/selftests/net/af_unix/msg_oob.c index 2d0024329437aa..16d0c172eaebec 100644 --- a/tools/testing/selftests/net/af_unix/msg_oob.c +++ b/tools/testing/selftests/net/af_unix/msg_oob.c @@ -339,9 +339,11 @@ TEST_F(msg_oob, non_oob) { sendpair("x", 1, 0); epollpair(false); + siocatmarkpair(false); recvpair("", -EINVAL, 1, MSG_OOB); epollpair(false); + siocatmarkpair(false); } TEST_F(msg_oob, oob) @@ -359,109 +361,142 @@ TEST_F(msg_oob, oob_drop) { sendpair("x", 1, MSG_OOB); epollpair(true); + siocatmarkpair(true); recvpair("", -EAGAIN, 1, 0); /* Drop OOB. */ epollpair(false); + siocatmarkpair(false); recvpair("", -EINVAL, 1, MSG_OOB); epollpair(false); + siocatmarkpair(false); } TEST_F(msg_oob, oob_ahead) { sendpair("hello", 5, MSG_OOB); epollpair(true); + siocatmarkpair(false); recvpair("o", 1, 1, MSG_OOB); epollpair(false); + siocatmarkpair(false); recvpair("hell", 4, 4, 0); epollpair(false); + siocatmarkpair(true); } TEST_F(msg_oob, oob_break) { sendpair("hello", 5, MSG_OOB); epollpair(true); + siocatmarkpair(false); recvpair("hell", 4, 5, 0); /* Break at OOB even with enough buffer. */ epollpair(true); + siocatmarkpair(true); recvpair("o", 1, 1, MSG_OOB); epollpair(false); + siocatmarkpair(true); + + recvpair("", -EAGAIN, 1, 0); + siocatmarkpair(false); } TEST_F(msg_oob, oob_ahead_break) { sendpair("hello", 5, MSG_OOB); epollpair(true); + siocatmarkpair(false); sendpair("world", 5, 0); epollpair(true); + siocatmarkpair(false); recvpair("o", 1, 1, MSG_OOB); epollpair(false); + siocatmarkpair(false); recvpair("hell", 4, 9, 0); /* Break at OOB even after it's recv()ed. */ epollpair(false); + siocatmarkpair(true); recvpair("world", 5, 5, 0); epollpair(false); + siocatmarkpair(false); } TEST_F(msg_oob, oob_break_drop) { sendpair("hello", 5, MSG_OOB); epollpair(true); + siocatmarkpair(false); sendpair("world", 5, 0); epollpair(true); + siocatmarkpair(false); recvpair("hell", 4, 10, 0); /* Break at OOB even with enough buffer. */ epollpair(true); + siocatmarkpair(true); recvpair("world", 5, 10, 0); /* Drop OOB and recv() the next skb. */ epollpair(false); + siocatmarkpair(false); recvpair("", -EINVAL, 1, MSG_OOB); epollpair(false); + siocatmarkpair(false); } TEST_F(msg_oob, ex_oob_break) { sendpair("hello", 5, MSG_OOB); epollpair(true); + siocatmarkpair(false); sendpair("wor", 3, MSG_OOB); epollpair(true); + siocatmarkpair(false); sendpair("ld", 2, 0); epollpair(true); + siocatmarkpair(false); recvpair("hellowo", 7, 10, 0); /* Break at OOB but not at ex-OOB. */ epollpair(true); + siocatmarkpair(true); recvpair("r", 1, 1, MSG_OOB); epollpair(false); + siocatmarkpair(true); recvpair("ld", 2, 2, 0); epollpair(false); + siocatmarkpair(false); } TEST_F(msg_oob, ex_oob_drop) { sendpair("x", 1, MSG_OOB); epollpair(true); + siocatmarkpair(true); sendpair("y", 1, MSG_OOB); /* TCP drops "x" at this moment. */ epollpair(true); tcp_incompliant { + siocatmarkpair(false); + recvpair("x", 1, 1, 0); /* TCP drops "y" by passing through it. */ epollpair(true); + siocatmarkpair(true); recvpair("y", 1, 1, MSG_OOB); /* TCP returns -EINVAL. */ epollpair(false); + siocatmarkpair(true); } } @@ -469,16 +504,24 @@ TEST_F(msg_oob, ex_oob_drop_2) { sendpair("x", 1, MSG_OOB); epollpair(true); + siocatmarkpair(true); sendpair("y", 1, MSG_OOB); /* TCP drops "x" at this moment. */ epollpair(true); + tcp_incompliant { + siocatmarkpair(false); + } + recvpair("y", 1, 1, MSG_OOB); epollpair(false); tcp_incompliant { + siocatmarkpair(false); + recvpair("x", 1, 1, 0); /* TCP returns -EAGAIN. */ epollpair(false); + siocatmarkpair(true); } } @@ -486,24 +529,30 @@ TEST_F(msg_oob, ex_oob_ahead_break) { sendpair("hello", 5, MSG_OOB); epollpair(true); + siocatmarkpair(false); sendpair("wor", 3, MSG_OOB); epollpair(true); + siocatmarkpair(false); recvpair("r", 1, 1, MSG_OOB); epollpair(false); + siocatmarkpair(false); sendpair("ld", 2, MSG_OOB); epollpair(true); + siocatmarkpair(false); tcp_incompliant { recvpair("hellowol", 8, 10, 0); /* TCP recv()s "helloworl", why "r" ?? */ } epollpair(true); + siocatmarkpair(true); recvpair("d", 1, 1, MSG_OOB); epollpair(false); + siocatmarkpair(true); } TEST_F(msg_oob, ex_oob_siocatmark) @@ -548,81 +597,100 @@ TEST_F(msg_oob, inline_oob_break) sendpair("hello", 5, MSG_OOB); epollpair(true); + siocatmarkpair(false); recvpair("", -EINVAL, 1, MSG_OOB); epollpair(true); + siocatmarkpair(false); recvpair("hell", 4, 5, 0); /* Break at OOB but not at ex-OOB. */ epollpair(true); + siocatmarkpair(true); recvpair("o", 1, 1, 0); epollpair(false); + siocatmarkpair(false); } TEST_F(msg_oob, inline_oob_ahead_break) { sendpair("hello", 5, MSG_OOB); epollpair(true); + siocatmarkpair(false); sendpair("world", 5, 0); epollpair(true); + siocatmarkpair(false); recvpair("o", 1, 1, MSG_OOB); epollpair(false); + siocatmarkpair(false); setinlinepair(); recvpair("hell", 4, 9, 0); /* Break at OOB even with enough buffer. */ epollpair(false); + siocatmarkpair(true); tcp_incompliant { recvpair("world", 5, 6, 0); /* TCP recv()s "oworld", ... "o" ??? */ } epollpair(false); + siocatmarkpair(false); } TEST_F(msg_oob, inline_ex_oob_break) { sendpair("hello", 5, MSG_OOB); epollpair(true); + siocatmarkpair(false); sendpair("wor", 3, MSG_OOB); epollpair(true); + siocatmarkpair(false); sendpair("ld", 2, 0); epollpair(true); + siocatmarkpair(false); setinlinepair(); recvpair("hellowo", 7, 10, 0); /* Break at OOB but not at ex-OOB. */ epollpair(true); + siocatmarkpair(true); recvpair("rld", 3, 3, 0); epollpair(false); + siocatmarkpair(false); } TEST_F(msg_oob, inline_ex_oob_no_drop) { sendpair("x", 1, MSG_OOB); epollpair(true); + siocatmarkpair(true); setinlinepair(); sendpair("y", 1, MSG_OOB); /* TCP does NOT drops "x" at this moment. */ epollpair(true); + siocatmarkpair(false); recvpair("x", 1, 1, 0); epollpair(true); + siocatmarkpair(true); recvpair("y", 1, 1, 0); epollpair(false); + siocatmarkpair(false); } TEST_F(msg_oob, inline_ex_oob_drop) { sendpair("x", 1, MSG_OOB); epollpair(true); + siocatmarkpair(true); sendpair("y", 1, MSG_OOB); /* TCP drops "x" at this moment. */ epollpair(true); @@ -630,11 +698,15 @@ TEST_F(msg_oob, inline_ex_oob_drop) setinlinepair(); tcp_incompliant { + siocatmarkpair(false); + recvpair("x", 1, 1, 0); /* TCP recv()s "y". */ epollpair(true); + siocatmarkpair(true); recvpair("y", 1, 1, 0); /* TCP returns -EAGAIN. */ epollpair(false); + siocatmarkpair(false); } } From c362f32a59a84fe4453abecc6b53f5f70894a6d5 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 20 Jun 2024 06:05:52 +0000 Subject: [PATCH 1487/1578] iommu/amd: Invalidate cache before removing device from domain list Commit 87a6f1f22c97 ("iommu/amd: Introduce per-device domain ID to fix potential TLB aliasing issue") introduced per device domain ID when domain is configured with v2 page table. And in invalidation path, it uses per device structure (dev_data->gcr3_info.domid) to get the domain ID. In detach_device() path, current code tries to invalidate IOMMU cache after removing dev_data from domain device list. This means when domain is configured with v2 page table, amd_iommu_domain_flush_all() will not be able to invalidate cache as device is already removed from domain device list. This is causing change domain tests (changing domain type from identity to DMA) to fail with IO_PAGE_FAULT issue. Hence invalidate cache and update DTE before updating data structures. Reported-by: FahHean Lee Reported-by: Dheeraj Kumar Srivastava Fixes: 87a6f1f22c97 ("iommu/amd: Introduce per-device domain ID to fix potential TLB aliasing issue") Tested-by: Dheeraj Kumar Srivastava Tested-by: Sairaj Arun Kodilkar Tested-by: FahHean Lee Signed-off-by: Vasant Hegde Reviewed-by: Jerry Snitselaar Link: https://lore.kernel.org/r/20240620060552.13984-1-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index c2703599bb1668..b19e8c0f48fa25 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2061,6 +2061,12 @@ static void do_detach(struct iommu_dev_data *dev_data) struct protection_domain *domain = dev_data->domain; struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); + /* Clear DTE and flush the entry */ + amd_iommu_dev_update_dte(dev_data, false); + + /* Flush IOTLB and wait for the flushes to finish */ + amd_iommu_domain_flush_all(domain); + /* Clear GCR3 table */ if (pdom_is_sva_capable(domain)) destroy_gcr3_table(dev_data, domain); @@ -2069,12 +2075,6 @@ static void do_detach(struct iommu_dev_data *dev_data) dev_data->domain = NULL; list_del(&dev_data->list); - /* Clear DTE and flush the entry */ - amd_iommu_dev_update_dte(dev_data, false); - - /* Flush IOTLB and wait for the flushes to finish */ - amd_iommu_domain_flush_all(domain); - /* decrease reference counters - needs to happen after the flushes */ domain->dev_iommu[iommu->index] -= 1; domain->dev_cnt -= 1; From 041be2717b198dd65032f726648401ba293c1bba Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 20 Jun 2024 14:29:40 +0800 Subject: [PATCH 1488/1578] iommu/vt-d: Fix missed device TLB cache tag When a domain is attached to a device, the required cache tags are assigned to the domain so that the related caches can be flushed whenever it is needed. The device TLB cache tag is created based on whether the ats_enabled field of the device's iommu data is set. This creates an ordered dependency between cache tag assignment and ATS enabling. The device TLB cache tag would not be created if device's ATS is enabled after the cache tag assignment. This causes devices with PCI ATS support to malfunction. The ATS control is exclusively owned by the iommu driver. Hence, move cache_tag_assign_domain() after PCI ATS enabling to make sure that the device TLB cache tag is created for the domain. Fixes: 3b1d9e2b2d68 ("iommu/vt-d: Add cache tag assignment interface") Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240620062940.201786-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 2e9811bf2a4e55..fd11a080380c84 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2114,12 +2114,6 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, if (ret) return ret; - ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID); - if (ret) { - domain_detach_iommu(domain, iommu); - return ret; - } - info->domain = domain; spin_lock_irqsave(&domain->lock, flags); list_add(&info->link, &domain->devices); @@ -2137,15 +2131,21 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, else ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID); - if (ret) { - device_block_translation(dev); - return ret; - } + if (ret) + goto out_block_translation; if (sm_supported(info->iommu) || !domain_type_is_si(info->domain)) iommu_enable_pci_caps(info); + ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID); + if (ret) + goto out_block_translation; + return 0; + +out_block_translation: + device_block_translation(dev); + return ret; } /** From 150bdf5f8d8f805d70bebbbfd07697bd2416771a Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Fri, 21 Jun 2024 10:15:33 +0000 Subject: [PATCH 1489/1578] iommu/amd: Fix GT feature enablement again MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Current code configures GCR3 even when device is attached to identity domain. So that we can support SVA with identity domain. This means in attach device path it updates Guest Translation related bits in DTE. Commit de111f6b4f6a ("iommu/amd: Enable Guest Translation after reading IOMMU feature register") missed to enable Control[GT] bit in resume path. Its causing certain laptop to fail to resume after suspend. This is because we have inconsistency between between control register (GT is disabled) and DTE (where we have enabled guest translation related bits) in resume path. And IOMMU hardware throws ILLEGAL_DEV_TABLE_ENTRY. Fix it by enabling GT bit in resume path. Reported-by: Błażej Szczygieł Link: https://bugzilla.kernel.org/show_bug.cgi?id=218975 Fixes: de111f6b4f6a ("iommu/amd: Enable Guest Translation after reading IOMMU feature register") Tested-by: Błażej Szczygieł Signed-off-by: Vasant Hegde Reviewed-by: Jerry Snitselaar Link: https://lore.kernel.org/r/20240621101533.20216-1-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 16124806777647..c89d85b54a1a58 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -2743,6 +2743,7 @@ static void early_enable_iommu(struct amd_iommu *iommu) iommu_enable_command_buffer(iommu); iommu_enable_event_buffer(iommu); iommu_set_exclusion_range(iommu); + iommu_enable_gt(iommu); iommu_enable_ga(iommu); iommu_enable_xt(iommu); iommu_enable_irtcachedis(iommu); From 1864b8224195d0e43ddb92a8151f54f6562090cc Mon Sep 17 00:00:00 2001 From: Ma Ke Date: Tue, 25 Jun 2024 21:03:14 +0800 Subject: [PATCH 1490/1578] net: mana: Fix possible double free in error handling path When auxiliary_device_add() returns error and then calls auxiliary_device_uninit(), callback function adev_release calls kfree(madev). We shouldn't call kfree(madev) again in the error handling path. Set 'madev' to NULL. Fixes: a69839d4327d ("net: mana: Add support for auxiliary device") Signed-off-by: Ma Ke Link: https://patch.msgid.link/20240625130314.2661257-1-make24@iscas.ac.cn Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microsoft/mana/mana_en.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index d087cf954f7552..608ad31a97022b 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -2798,6 +2798,8 @@ static int add_adev(struct gdma_dev *gd) if (ret) goto init_fail; + /* madev is owned by the auxiliary device */ + madev = NULL; ret = auxiliary_device_add(adev); if (ret) goto add_fail; From e269537e491da6336776b5548a3c73f62273aa15 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Tue, 25 Jun 2024 19:55:17 +0800 Subject: [PATCH 1491/1578] block: clean up the check in blkdev_iomap_begin() It is odd to check the offset amidst a series of assignments. Moving this check to the beginning of the function makes the code look better. Signed-off-by: Li Nan Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240625115517.1472120-1-linan666@huaweicloud.com Signed-off-by: Jens Axboe --- block/fops.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/block/fops.c b/block/fops.c index be36c9fbd500bb..9825c1713a49a9 100644 --- a/block/fops.c +++ b/block/fops.c @@ -394,10 +394,11 @@ static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, struct block_device *bdev = I_BDEV(inode); loff_t isize = i_size_read(inode); - iomap->bdev = bdev; - iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev)); if (offset >= isize) return -EIO; + + iomap->bdev = bdev; + iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev)); iomap->type = IOMAP_MAPPED; iomap->addr = iomap->offset; iomap->length = isize - iomap->offset; From fcdd7b7bda3c21d1ba1247419e4a1eb8e2d0bfbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bernhard=20Rosenkr=C3=A4nzer?= Date: Thu, 27 Jun 2024 14:44:19 +0200 Subject: [PATCH 1492/1578] staging: vchiq_debugfs: Fix build if CONFIG_DEBUG_FS is not set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 42a2f6664e18 ("staging: vc04_services: Move global g_state to vchiq_state") adds a parameter to vchiq_debugfs_init, but leaves the dummy implementation in the !CONFIG_DEBUG_FS case untouched, causing a compile time error. Fixes: c3552ab19aeb ("staging: vchiq_debugfs: Fix NPD in vchiq_dump_state") Signed-off-by: Bernhard Rosenkränzer Reviewed-by: Stefan Wahren Link: https://lore.kernel.org/r/20240627124419.2498642-1-bero@baylibre.com Signed-off-by: Greg Kroah-Hartman --- .../staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.c index 1f74d0bb33bae6..d5f7f61c562693 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.c @@ -138,7 +138,7 @@ void vchiq_debugfs_deinit(void) #else /* CONFIG_DEBUG_FS */ -void vchiq_debugfs_init(void) +void vchiq_debugfs_init(struct vchiq_state *state) { } From 24bf27b92b1c6a322faa88977de2207aa8c26509 Mon Sep 17 00:00:00 2001 From: Ferry Toth Date: Thu, 20 Jun 2024 22:46:41 +0200 Subject: [PATCH 1493/1578] Revert "usb: gadget: u_ether: Re-attach netif device to mirror detachment" This reverts commit 76c945730cdffb572c7767073cc6515fd3f646b4. Prerequisite revert for the reverting of the original commit f49449fbc21e. Fixes: 76c945730cdf ("usb: gadget: u_ether: Re-attach netif device to mirror detachment") Fixes: f49449fbc21e ("usb: gadget: u_ether: Replace netif_stop_queue with netif_device_detach") Reported-by: Ferry Toth Cc: stable@vger.kernel.org Signed-off-by: Ferry Toth Link: https://lore.kernel.org/r/20240620204832.24518-2-ftoth@exalondelft.nl Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/u_ether.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/usb/gadget/function/u_ether.c b/drivers/usb/gadget/function/u_ether.c index 11dd0b9e847f0d..aa0511c3a62c21 100644 --- a/drivers/usb/gadget/function/u_ether.c +++ b/drivers/usb/gadget/function/u_ether.c @@ -1163,8 +1163,6 @@ struct net_device *gether_connect(struct gether *link) if (netif_running(dev->net)) eth_start(dev, GFP_ATOMIC); - netif_device_attach(dev->net); - /* on error, disable any endpoints */ } else { (void) usb_ep_disable(link->out_ep); From c50814a288dcee687285abc0cf935e9fe8928e59 Mon Sep 17 00:00:00 2001 From: Ferry Toth Date: Thu, 20 Jun 2024 22:46:42 +0200 Subject: [PATCH 1494/1578] Revert "usb: gadget: u_ether: Replace netif_stop_queue with netif_device_detach" This reverts commit f49449fbc21e7e9550a5203902d69c8ae7dfd918. This commit breaks u_ether on some setups (at least Merrifield). The fix "usb: gadget: u_ether: Re-attach netif device to mirror detachment" party restores u-ether. However the netif usb: remains up even usb is switched from device to host mode. This creates problems for user space as the interface remains in the routing table while not realy present and network managers (connman) not detecting a network change. Various attempts to find the root cause were unsuccesful up to now. Therefore revert until a solution is found. Link: https://lore.kernel.org/linux-usb/20231006141231.7220-1-hgajjar@de.adit-jv.com/ Reported-by: Andy Shevchenko Reported-by: Ferry Toth Fixes: f49449fbc21e ("usb: gadget: u_ether: Replace netif_stop_queue with netif_device_detach") Cc: stable@vger.kernel.org Signed-off-by: Ferry Toth Link: https://lore.kernel.org/r/20240620204832.24518-3-ftoth@exalondelft.nl Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/u_ether.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/function/u_ether.c b/drivers/usb/gadget/function/u_ether.c index aa0511c3a62c21..95191083b4556b 100644 --- a/drivers/usb/gadget/function/u_ether.c +++ b/drivers/usb/gadget/function/u_ether.c @@ -1200,7 +1200,7 @@ void gether_disconnect(struct gether *link) DBG(dev, "%s\n", __func__); - netif_device_detach(dev->net); + netif_stop_queue(dev->net); netif_carrier_off(dev->net); /* disable endpoints, forcing (synchronous) completion From fc1d1a712b517bbcb383b1f1f7ef478e7d0579f2 Mon Sep 17 00:00:00 2001 From: Jos Wang Date: Wed, 19 Jun 2024 19:45:29 +0800 Subject: [PATCH 1495/1578] usb: dwc3: core: Workaround for CSR read timeout This is a workaround for STAR 4846132, which only affects DWC_usb31 version2.00a operating in host mode. There is a problem in DWC_usb31 version 2.00a operating in host mode that would cause a CSR read timeout When CSR read coincides with RAM Clock Gating Entry. By disable Clock Gating, sacrificing power consumption for normal operation. Cc: stable # 5.10.x: 1e43c86d: usb: dwc3: core: Add DWC31 version 2.00a controller Signed-off-by: Jos Wang Acked-by: Thinh Nguyen Link: https://lore.kernel.org/r/20240619114529.3441-1-joswang1221@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/core.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c index 9d47c3aa577710..cb82557678ddd4 100644 --- a/drivers/usb/dwc3/core.c +++ b/drivers/usb/dwc3/core.c @@ -957,12 +957,16 @@ static bool dwc3_core_is_valid(struct dwc3 *dwc) static void dwc3_core_setup_global_control(struct dwc3 *dwc) { + unsigned int power_opt; + unsigned int hw_mode; u32 reg; reg = dwc3_readl(dwc->regs, DWC3_GCTL); reg &= ~DWC3_GCTL_SCALEDOWN_MASK; + hw_mode = DWC3_GHWPARAMS0_MODE(dwc->hwparams.hwparams0); + power_opt = DWC3_GHWPARAMS1_EN_PWROPT(dwc->hwparams.hwparams1); - switch (DWC3_GHWPARAMS1_EN_PWROPT(dwc->hwparams.hwparams1)) { + switch (power_opt) { case DWC3_GHWPARAMS1_EN_PWROPT_CLK: /** * WORKAROUND: DWC3 revisions between 2.10a and 2.50a have an @@ -995,6 +999,20 @@ static void dwc3_core_setup_global_control(struct dwc3 *dwc) break; } + /* + * This is a workaround for STAR#4846132, which only affects + * DWC_usb31 version2.00a operating in host mode. + * + * There is a problem in DWC_usb31 version 2.00a operating + * in host mode that would cause a CSR read timeout When CSR + * read coincides with RAM Clock Gating Entry. By disable + * Clock Gating, sacrificing power consumption for normal + * operation. + */ + if (power_opt != DWC3_GHWPARAMS1_EN_PWROPT_NO && + hw_mode != DWC3_GHWPARAMS0_MODE_GADGET && DWC3_VER_IS(DWC31, 200A)) + reg |= DWC3_GCTL_DSBLCLKGTNG; + /* check if current dwc3 is on simulation board */ if (dwc->hwparams.hwparams6 & DWC3_GHWPARAMS6_EN_FPGA) { dev_info(dwc->dev, "Running with FPGA optimizations\n"); From 9919cce62f68e6ab68dc2a975b5dc670f8ca7d40 Mon Sep 17 00:00:00 2001 From: Kent Gibson Date: Wed, 26 Jun 2024 13:29:22 +0800 Subject: [PATCH 1496/1578] gpiolib: cdev: Disallow reconfiguration without direction (uAPI v1) linehandle_set_config() behaves badly when direction is not set. The configuration validation is borrowed from linehandle_create(), where, to verify the intent of the user, the direction must be set to in order to effect a change to the electrical configuration of a line. But, when applied to reconfiguration, that validation does not allow for the unset direction case, making it possible to clear flags set previously without specifying the line direction. Adding to the inconsistency, those changes are not immediately applied by linehandle_set_config(), but will take effect when the line value is next get or set. For example, by requesting a configuration with no flags set, an output line with GPIOHANDLE_REQUEST_ACTIVE_LOW and GPIOHANDLE_REQUEST_OPEN_DRAIN requested could have those flags cleared, inverting the sense of the line and changing the line drive to push-pull on the next line value set. Ensure the intent of the user by disallowing configurations which do not have direction set, returning an error to userspace to indicate that the configuration is invalid. And, for clarity, use lflags, a local copy of gcnf.flags, throughout when dealing with the requested flags, rather than a mixture of both. Fixes: e588bb1eae31 ("gpio: add new SET_CONFIG ioctl() to gpio chardev") Signed-off-by: Kent Gibson Link: https://lore.kernel.org/r/20240626052925.174272-2-warthog618@gmail.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-cdev.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/gpio/gpiolib-cdev.c b/drivers/gpio/gpiolib-cdev.c index 9dad67ea25974e..04261adf320b85 100644 --- a/drivers/gpio/gpiolib-cdev.c +++ b/drivers/gpio/gpiolib-cdev.c @@ -89,6 +89,10 @@ struct linehandle_state { GPIOHANDLE_REQUEST_OPEN_DRAIN | \ GPIOHANDLE_REQUEST_OPEN_SOURCE) +#define GPIOHANDLE_REQUEST_DIRECTION_FLAGS \ + (GPIOHANDLE_REQUEST_INPUT | \ + GPIOHANDLE_REQUEST_OUTPUT) + static int linehandle_validate_flags(u32 flags) { /* Return an error if an unknown flag is set */ @@ -169,21 +173,21 @@ static long linehandle_set_config(struct linehandle_state *lh, if (ret) return ret; + /* Lines must be reconfigured explicitly as input or output. */ + if (!(lflags & GPIOHANDLE_REQUEST_DIRECTION_FLAGS)) + return -EINVAL; + for (i = 0; i < lh->num_descs; i++) { desc = lh->descs[i]; - linehandle_flags_to_desc_flags(gcnf.flags, &desc->flags); + linehandle_flags_to_desc_flags(lflags, &desc->flags); - /* - * Lines have to be requested explicitly for input - * or output, else the line will be treated "as is". - */ if (lflags & GPIOHANDLE_REQUEST_OUTPUT) { int val = !!gcnf.default_values[i]; ret = gpiod_direction_output(desc, val); if (ret) return ret; - } else if (lflags & GPIOHANDLE_REQUEST_INPUT) { + } else { ret = gpiod_direction_input(desc); if (ret) return ret; From b440396387418fe2feaacd41ca16080e7a8bc9ad Mon Sep 17 00:00:00 2001 From: Kent Gibson Date: Wed, 26 Jun 2024 13:29:23 +0800 Subject: [PATCH 1497/1578] gpiolib: cdev: Ignore reconfiguration without direction linereq_set_config() behaves badly when direction is not set. The configuration validation is borrowed from linereq_create(), where, to verify the intent of the user, the direction must be set to in order to effect a change to the electrical configuration of a line. But, when applied to reconfiguration, that validation does not allow for the unset direction case, making it possible to clear flags set previously without specifying the line direction. Adding to the inconsistency, those changes are not immediately applied by linereq_set_config(), but will take effect when the line value is next get or set. For example, by requesting a configuration with no flags set, an output line with GPIO_V2_LINE_FLAG_ACTIVE_LOW and GPIO_V2_LINE_FLAG_OPEN_DRAIN set could have those flags cleared, inverting the sense of the line and changing the line drive to push-pull on the next line value set. Skip the reconfiguration of lines for which the direction is not set, and only reconfigure the lines for which direction is set. Fixes: a54756cb24ea ("gpiolib: cdev: support GPIO_V2_LINE_SET_CONFIG_IOCTL") Signed-off-by: Kent Gibson Link: https://lore.kernel.org/r/20240626052925.174272-3-warthog618@gmail.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-cdev.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/gpio/gpiolib-cdev.c b/drivers/gpio/gpiolib-cdev.c index 04261adf320b85..5639abce6ec577 100644 --- a/drivers/gpio/gpiolib-cdev.c +++ b/drivers/gpio/gpiolib-cdev.c @@ -1534,12 +1534,14 @@ static long linereq_set_config(struct linereq *lr, void __user *ip) line = &lr->lines[i]; desc = lr->lines[i].desc; flags = gpio_v2_line_config_flags(&lc, i); - gpio_v2_line_config_flags_to_desc_flags(flags, &desc->flags); - edflags = flags & GPIO_V2_LINE_EDGE_DETECTOR_FLAGS; /* - * Lines have to be requested explicitly for input - * or output, else the line will be treated "as is". + * Lines not explicitly reconfigured as input or output + * are left unchanged. */ + if (!(flags & GPIO_V2_LINE_DIRECTION_FLAGS)) + continue; + gpio_v2_line_config_flags_to_desc_flags(flags, &desc->flags); + edflags = flags & GPIO_V2_LINE_EDGE_DETECTOR_FLAGS; if (flags & GPIO_V2_LINE_FLAG_OUTPUT) { int val = gpio_v2_line_config_output_value(&lc, i); @@ -1547,7 +1549,7 @@ static long linereq_set_config(struct linereq *lr, void __user *ip) ret = gpiod_direction_output(desc, val); if (ret) return ret; - } else if (flags & GPIO_V2_LINE_FLAG_INPUT) { + } else { ret = gpiod_direction_input(desc); if (ret) return ret; From 7e1f4eb9a60d40dd17a97d9b76818682a024a127 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 4 Apr 2024 12:04:54 +0200 Subject: [PATCH 1498/1578] kallsyms: rework symbol lookup return codes Building with W=1 in some configurations produces a false positive warning for kallsyms: kernel/kallsyms.c: In function '__sprint_symbol.isra': kernel/kallsyms.c:503:17: error: 'strcpy' source argument is the same as destination [-Werror=restrict] 503 | strcpy(buffer, name); | ^~~~~~~~~~~~~~~~~~~~ This originally showed up while building with -O3, but later started happening in other configurations as well, depending on inlining decisions. The underlying issue is that the local 'name' variable is always initialized to the be the same as 'buffer' in the called functions that fill the buffer, which gcc notices while inlining, though it could see that the address check always skips the copy. The calling conventions here are rather unusual, as all of the internal lookup functions (bpf_address_lookup, ftrace_mod_address_lookup, ftrace_func_address_lookup, module_address_lookup and kallsyms_lookup_buildid) already use the provided buffer and either return the address of that buffer to indicate success, or NULL for failure, but the callers are written to also expect an arbitrary other buffer to be returned. Rework the calling conventions to return the length of the filled buffer instead of its address, which is simpler and easier to follow as well as avoiding the warning. Leave only the kallsyms_lookup() calling conventions unchanged, since that is called from 16 different functions and adapting this would be a much bigger change. Link: https://lore.kernel.org/lkml/20200107214042.855757-1-arnd@arndb.de/ Link: https://lore.kernel.org/lkml/20240326130647.7bfb1d92@gandalf.local.home/ Tested-by: Geert Uytterhoeven Reviewed-by: Luis Chamberlain Acked-by: Steven Rostedt (Google) Signed-off-by: Arnd Bergmann --- include/linux/filter.h | 14 +++++++------- include/linux/ftrace.h | 6 +++--- include/linux/module.h | 14 +++++++------- kernel/bpf/core.c | 7 +++---- kernel/kallsyms.c | 23 ++++++++++++----------- kernel/module/kallsyms.c | 25 ++++++++++++------------- kernel/trace/ftrace.c | 13 +++++-------- 7 files changed, 49 insertions(+), 53 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 0f12cf01070e34..5669da513cd7cc 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1208,18 +1208,18 @@ static inline bool bpf_jit_kallsyms_enabled(void) return false; } -const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, +int __bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym); bool is_bpf_text_address(unsigned long addr); int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym); struct bpf_prog *bpf_prog_ksym_find(unsigned long addr); -static inline const char * +static inline int bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym) { - const char *ret = __bpf_address_lookup(addr, size, off, sym); + int ret = __bpf_address_lookup(addr, size, off, sym); if (ret && modname) *modname = NULL; @@ -1263,11 +1263,11 @@ static inline bool bpf_jit_kallsyms_enabled(void) return false; } -static inline const char * +static inline int __bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym) { - return NULL; + return 0; } static inline bool is_bpf_text_address(unsigned long addr) @@ -1286,11 +1286,11 @@ static inline struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) return NULL; } -static inline const char * +static inline int bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym) { - return NULL; + return 0; } static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 800995c425e086..b792274189a3b0 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -86,15 +86,15 @@ struct ftrace_hash; #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_MODULES) && \ defined(CONFIG_DYNAMIC_FTRACE) -const char * +int ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym); #else -static inline const char * +static inline int ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym) { - return NULL; + return 0; } #endif diff --git a/include/linux/module.h b/include/linux/module.h index ffa1c603163c29..330ffb59efe511 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -931,11 +931,11 @@ int module_kallsyms_on_each_symbol(const char *modname, * least KSYM_NAME_LEN long: a pointer to namebuf is returned if * found, otherwise NULL. */ -const char *module_address_lookup(unsigned long addr, - unsigned long *symbolsize, - unsigned long *offset, - char **modname, const unsigned char **modbuildid, - char *namebuf); +int module_address_lookup(unsigned long addr, + unsigned long *symbolsize, + unsigned long *offset, + char **modname, const unsigned char **modbuildid, + char *namebuf); int lookup_module_symbol_name(unsigned long addr, char *symname); int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, @@ -964,14 +964,14 @@ static inline int module_kallsyms_on_each_symbol(const char *modname, } /* For kallsyms to ask for address resolution. NULL means not found. */ -static inline const char *module_address_lookup(unsigned long addr, +static inline int module_address_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, const unsigned char **modbuildid, char *namebuf) { - return NULL; + return 0; } static inline int lookup_module_symbol_name(unsigned long addr, char *symname) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1a6c3faa6e4a19..695a0fb2cd4df4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -736,11 +736,11 @@ static struct bpf_ksym *bpf_ksym_find(unsigned long addr) return n ? container_of(n, struct bpf_ksym, tnode) : NULL; } -const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, +int __bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym) { struct bpf_ksym *ksym; - char *ret = NULL; + int ret = 0; rcu_read_lock(); ksym = bpf_ksym_find(addr); @@ -748,9 +748,8 @@ const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long symbol_start = ksym->start; unsigned long symbol_end = ksym->end; - strscpy(sym, ksym->name, KSYM_NAME_LEN); + ret = strscpy(sym, ksym->name, KSYM_NAME_LEN); - ret = sym; if (size) *size = symbol_end - symbol_start; if (off) diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 22ea19a36e6e67..98b9622d372e42 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -388,12 +388,12 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); } -static const char *kallsyms_lookup_buildid(unsigned long addr, +static int kallsyms_lookup_buildid(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, const unsigned char **modbuildid, char *namebuf) { - const char *ret; + int ret; namebuf[KSYM_NAME_LEN - 1] = 0; namebuf[0] = 0; @@ -410,7 +410,7 @@ static const char *kallsyms_lookup_buildid(unsigned long addr, if (modbuildid) *modbuildid = NULL; - ret = namebuf; + ret = strlen(namebuf); goto found; } @@ -442,8 +442,13 @@ const char *kallsyms_lookup(unsigned long addr, unsigned long *offset, char **modname, char *namebuf) { - return kallsyms_lookup_buildid(addr, symbolsize, offset, modname, - NULL, namebuf); + int ret = kallsyms_lookup_buildid(addr, symbolsize, offset, modname, + NULL, namebuf); + + if (!ret) + return NULL; + + return namebuf; } int lookup_symbol_name(unsigned long addr, char *symname) @@ -478,19 +483,15 @@ static int __sprint_symbol(char *buffer, unsigned long address, { char *modname; const unsigned char *buildid; - const char *name; unsigned long offset, size; int len; address += symbol_offset; - name = kallsyms_lookup_buildid(address, &size, &offset, &modname, &buildid, + len = kallsyms_lookup_buildid(address, &size, &offset, &modname, &buildid, buffer); - if (!name) + if (!len) return sprintf(buffer, "0x%lx", address - symbol_offset); - if (name != buffer) - strcpy(buffer, name); - len = strlen(buffer); offset -= symbol_offset; if (add_offset) diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index 62fb57bb9f161c..bf65e0c3c86fc0 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -321,14 +321,15 @@ void * __weak dereference_module_function_descriptor(struct module *mod, * For kallsyms to ask for address resolution. NULL means not found. Careful * not to lock to avoid deadlock on oopses, simply disable preemption. */ -const char *module_address_lookup(unsigned long addr, - unsigned long *size, - unsigned long *offset, - char **modname, - const unsigned char **modbuildid, - char *namebuf) +int module_address_lookup(unsigned long addr, + unsigned long *size, + unsigned long *offset, + char **modname, + const unsigned char **modbuildid, + char *namebuf) { - const char *ret = NULL; + const char *sym; + int ret = 0; struct module *mod; preempt_disable(); @@ -344,12 +345,10 @@ const char *module_address_lookup(unsigned long addr, #endif } - ret = find_kallsyms_symbol(mod, addr, size, offset); - } - /* Make a copy in here where it's safe */ - if (ret) { - strscpy(namebuf, ret, KSYM_NAME_LEN); - ret = namebuf; + sym = find_kallsyms_symbol(mod, addr, size, offset); + + if (sym) + ret = strscpy(namebuf, sym, KSYM_NAME_LEN); } preempt_enable(); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 65208d3b5ed983..eacab4020508ee 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6969,7 +6969,7 @@ allocate_ftrace_mod_map(struct module *mod, return mod_map; } -static const char * +static int ftrace_func_address_lookup(struct ftrace_mod_map *mod_map, unsigned long addr, unsigned long *size, unsigned long *off, char *sym) @@ -6990,21 +6990,18 @@ ftrace_func_address_lookup(struct ftrace_mod_map *mod_map, *size = found_func->size; if (off) *off = addr - found_func->ip; - if (sym) - strscpy(sym, found_func->name, KSYM_NAME_LEN); - - return found_func->name; + return strscpy(sym, found_func->name, KSYM_NAME_LEN); } - return NULL; + return 0; } -const char * +int ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym) { struct ftrace_mod_map *mod_map; - const char *ret = NULL; + int ret = 0; /* mod_map is freed via call_rcu() */ preempt_disable(); From 63db4a1f795a19e4e12f036a12a5f61c48b03e5c Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 27 Jun 2024 16:07:35 +0000 Subject: [PATCH 1499/1578] block: Delete blk_queue_flag_test_and_set() Since commit 70200574cc22 ("block: remove QUEUE_FLAG_DISCARD"), blk_queue_flag_test_and_set() has not been used, so delete it. Signed-off-by: John Garry Link: https://lore.kernel.org/r/20240627160735.842189-1-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- block/blk-core.c | 14 -------------- include/linux/blkdev.h | 1 - 2 files changed, 15 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 6fc1a5a1980db3..71b7622c523a30 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -94,20 +94,6 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q) } EXPORT_SYMBOL(blk_queue_flag_clear); -/** - * blk_queue_flag_test_and_set - atomically test and set a queue flag - * @flag: flag to be set - * @q: request queue - * - * Returns the previous value of @flag - 0 if the flag was not set and 1 if - * the flag was already set. - */ -bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q) -{ - return test_and_set_bit(flag, &q->queue_flags); -} -EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set); - #define REQ_OP_NAME(name) [REQ_OP_##name] = #name static const char *const blk_op_name[] = { REQ_OP_NAME(READ), diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a53e3434e1a28c..53c41ef4222c3d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -609,7 +609,6 @@ struct request_queue { void blk_queue_flag_set(unsigned int flag, struct request_queue *q); void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); -bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) #define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags) From 18048c1af7836b8e31739d9eaefebc2bf76261f7 Mon Sep 17 00:00:00 2001 From: Gulam Mohamed Date: Tue, 18 Jun 2024 16:40:42 +0000 Subject: [PATCH 1500/1578] loop: Fix a race between loop detach and loop open 1. Userspace sends the command "losetup -d" which uses the open() call to open the device 2. Kernel receives the ioctl command "LOOP_CLR_FD" which calls the function loop_clr_fd() 3. If LOOP_CLR_FD is the first command received at the time, then the AUTOCLEAR flag is not set and deletion of the loop device proceeds ahead and scans the partitions (drop/add partitions) if (disk_openers(lo->lo_disk) > 1) { lo->lo_flags |= LO_FLAGS_AUTOCLEAR; loop_global_unlock(lo, true); return 0; } 4. Before scanning partitions, it will check to see if any partition of the loop device is currently opened 5. If any partition is opened, then it will return EBUSY: if (disk->open_partitions) return -EBUSY; 6. So, after receiving the "LOOP_CLR_FD" command and just before the above check for open_partitions, if any other command (like blkid) opens any partition of the loop device, then the partition scan will not proceed and EBUSY is returned as shown in above code 7. But in "__loop_clr_fd()", this EBUSY error is not propagated 8. We have noticed that this is causing the partitions of the loop to remain stale even after the loop device is detached resulting in the IO errors on the partitions Fix: Defer the detach of loop device to release function, which is called when the last close happens, by setting the lo_flags to LO_FLAGS_AUTOCLEAR at the time of detach i.e in loop_clr_fd() function. Test case involves the following two scripts: script1.sh: while [ 1 ]; do losetup -P -f /home/opt/looptest/test10.img blkid /dev/loop0p1 done script2.sh: while [ 1 ]; do losetup -d /dev/loop0 done Without fix, the following IO errors have been observed: kernel: __loop_clr_fd: partition scan of loop0 failed (rc=-16) kernel: I/O error, dev loop0, sector 20971392 op 0x0:(READ) flags 0x80700 phys_seg 1 prio class 0 kernel: I/O error, dev loop0, sector 108868 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 0 kernel: Buffer I/O error on dev loop0p1, logical block 27201, async page read Signed-off-by: Gulam Mohamed Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240618164042.343777-1-gulam.mohamed@oracle.com Signed-off-by: Jens Axboe --- drivers/block/loop.c | 75 +++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 39 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 86b5d956dc4e02..cbda971d22b40b 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1123,20 +1123,12 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, return error; } -static void __loop_clr_fd(struct loop_device *lo, bool release) +static void __loop_clr_fd(struct loop_device *lo) { struct queue_limits lim; struct file *filp; gfp_t gfp = lo->old_gfp_mask; - /* - * Freeze the request queue when unbinding on a live file descriptor and - * thus an open device. When called from ->release we are guaranteed - * that there is no I/O in progress already. - */ - if (!release) - blk_mq_freeze_queue(lo->lo_queue); - spin_lock_irq(&lo->lo_lock); filp = lo->lo_backing_file; lo->lo_backing_file = NULL; @@ -1161,8 +1153,6 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) mapping_set_gfp_mask(filp->f_mapping, gfp); /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); - if (!release) - blk_mq_unfreeze_queue(lo->lo_queue); disk_force_media_change(lo->lo_disk); @@ -1177,11 +1167,7 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) * must be at least one and it can only become zero when the * current holder is released. */ - if (!release) - mutex_lock(&lo->lo_disk->open_mutex); err = bdev_disk_changed(lo->lo_disk, false); - if (!release) - mutex_unlock(&lo->lo_disk->open_mutex); if (err) pr_warn("%s: partition scan of loop%d failed (rc=%d)\n", __func__, lo->lo_number, err); @@ -1230,24 +1216,16 @@ static int loop_clr_fd(struct loop_device *lo) return -ENXIO; } /* - * If we've explicitly asked to tear down the loop device, - * and it has an elevated reference count, set it for auto-teardown when - * the last reference goes away. This stops $!~#$@ udev from - * preventing teardown because it decided that it needs to run blkid on - * the loopback device whenever they appear. xfstests is notorious for - * failing tests because blkid via udev races with a losetup - * /do something like mkfs/losetup -d causing the losetup -d - * command to fail with EBUSY. + * Mark the device for removing the backing device on last close. + * If we are the only opener, also switch the state to roundown here to + * prevent new openers from coming in. */ - if (disk_openers(lo->lo_disk) > 1) { - lo->lo_flags |= LO_FLAGS_AUTOCLEAR; - loop_global_unlock(lo, true); - return 0; - } - lo->lo_state = Lo_rundown; + + lo->lo_flags |= LO_FLAGS_AUTOCLEAR; + if (disk_openers(lo->lo_disk) == 1) + lo->lo_state = Lo_rundown; loop_global_unlock(lo, true); - __loop_clr_fd(lo, false); return 0; } @@ -1714,25 +1692,43 @@ static int lo_compat_ioctl(struct block_device *bdev, blk_mode_t mode, } #endif +static int lo_open(struct gendisk *disk, blk_mode_t mode) +{ + struct loop_device *lo = disk->private_data; + int err; + + err = mutex_lock_killable(&lo->lo_mutex); + if (err) + return err; + + if (lo->lo_state == Lo_deleting || lo->lo_state == Lo_rundown) + err = -ENXIO; + mutex_unlock(&lo->lo_mutex); + return err; +} + static void lo_release(struct gendisk *disk) { struct loop_device *lo = disk->private_data; + bool need_clear = false; if (disk_openers(disk) > 0) return; + /* + * Clear the backing device information if this is the last close of + * a device that's been marked for auto clear, or on which LOOP_CLR_FD + * has been called. + */ mutex_lock(&lo->lo_mutex); - if (lo->lo_state == Lo_bound && (lo->lo_flags & LO_FLAGS_AUTOCLEAR)) { + if (lo->lo_state == Lo_bound && (lo->lo_flags & LO_FLAGS_AUTOCLEAR)) lo->lo_state = Lo_rundown; - mutex_unlock(&lo->lo_mutex); - /* - * In autoclear mode, stop the loop thread - * and remove configuration after last close. - */ - __loop_clr_fd(lo, true); - return; - } + + need_clear = (lo->lo_state == Lo_rundown); mutex_unlock(&lo->lo_mutex); + + if (need_clear) + __loop_clr_fd(lo); } static void lo_free_disk(struct gendisk *disk) @@ -1749,6 +1745,7 @@ static void lo_free_disk(struct gendisk *disk) static const struct block_device_operations lo_fops = { .owner = THIS_MODULE, + .open = lo_open, .release = lo_release, .ioctl = lo_ioctl, #ifdef CONFIG_COMPAT From 4f2a129b33a2054e62273edd5a051c34c08d96e9 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 27 Jun 2024 11:26:00 +1000 Subject: [PATCH 1501/1578] drm/drm_file: Fix pid refcounting race , Maxime Ripard , Thomas Zimmermann filp->pid is supposed to be a refcounted pointer; however, before this patch, drm_file_update_pid() only increments the refcount of a struct pid after storing a pointer to it in filp->pid and dropping the dev->filelist_mutex, making the following race possible: process A process B ========= ========= begin drm_file_update_pid mutex_lock(&dev->filelist_mutex) rcu_replace_pointer(filp->pid, , 1) mutex_unlock(&dev->filelist_mutex) begin drm_file_update_pid mutex_lock(&dev->filelist_mutex) rcu_replace_pointer(filp->pid, , 1) mutex_unlock(&dev->filelist_mutex) get_pid() synchronize_rcu() put_pid() *** pid B reaches refcount 0 and is freed here *** get_pid() *** UAF *** synchronize_rcu() put_pid() As far as I know, this race can only occur with CONFIG_PREEMPT_RCU=y because it requires RCU to detect a quiescent state in code that is not explicitly calling into the scheduler. This race leads to use-after-free of a "struct pid". It is probably somewhat hard to hit because process A has to pass through a synchronize_rcu() operation while process B is between mutex_unlock() and get_pid(). Fix it by ensuring that by the time a pointer to the current task's pid is stored in the file, an extra reference to the pid has been taken. This fix also removes the condition for synchronize_rcu(); I think that optimization is unnecessary complexity, since in that case we would usually have bailed out on the lockless check above. Fixes: 1c7a387ffef8 ("drm: Update file owner during use") Cc: Signed-off-by: Jann Horn Signed-off-by: Dave Airlie --- drivers/gpu/drm/drm_file.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c index 638ffa4444f51a..714e42b0510800 100644 --- a/drivers/gpu/drm/drm_file.c +++ b/drivers/gpu/drm/drm_file.c @@ -469,14 +469,12 @@ void drm_file_update_pid(struct drm_file *filp) dev = filp->minor->dev; mutex_lock(&dev->filelist_mutex); + get_pid(pid); old = rcu_replace_pointer(filp->pid, pid, 1); mutex_unlock(&dev->filelist_mutex); - if (pid != old) { - get_pid(pid); - synchronize_rcu(); - put_pid(old); - } + synchronize_rcu(); + put_pid(old); } /** From ebb5b260af67c677700cd51be6845c2cab3edfbd Mon Sep 17 00:00:00 2001 From: David Arcari Date: Mon, 20 May 2024 14:57:49 -0400 Subject: [PATCH 1502/1578] tools/power turbostat: option '-n' is ambiguous In some cases specifying the '-n' command line argument will cause turbostat to fail. For instance 'turbostat -n 1' works fine; however, 'turbostat -n 1 -d' will fail. This is the result of the first call to getopt_long_only() where "MP" is specified as the optstring. This can be easily fixed by changing the optstring from "MP" to "MPn:" to remove ambiguity between the arguments. tools/power turbostat: option '-n' is ambiguous; possibilities: '-num_iterations' '-no-msr' '-no-perf' Fixes: a0e86c90b83c ("tools/power turbostat: Add --no-perf option") Signed-off-by: David Arcari Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 8cdf41906e9818..12c1872aa42e30 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -8424,7 +8424,7 @@ void cmdline(int argc, char **argv) * Parse some options early, because they may make other options invalid, * like adding the MSR counter with --add and at the same time using --no-msr. */ - while ((opt = getopt_long_only(argc, argv, "MP", long_options, &option_index)) != -1) { + while ((opt = getopt_long_only(argc, argv, "MPn:", long_options, &option_index)) != -1) { switch (opt) { case 'M': no_msr = 1; From c5120a3356755f9e9d2d592c1347f3b9ff4022a7 Mon Sep 17 00:00:00 2001 From: Adam Hawley Date: Wed, 22 May 2024 16:27:21 +0300 Subject: [PATCH 1503/1578] tools/power turbostat: Fix unc freq columns not showing with '-q' or '-l' Commit 78464d7681f7 ("tools/power turbostat: Add columns for clustered uncore frequency") introduced 'probe_intel_uncore_frequency_cluster()' in a way which prevents printing uncore frequency columns if either of the '-q' or '-l' options are used. Systems which do not have multiple uncore frequencies per package are unaffected by this regression. Fix the function so that uncore frequency columns are shown when either the '-l' or '-q' option is used by checking if 'quiet' is true after adding counters for the uncore frequency columns. Fixes: 78464d7681f7 ("tools/power turbostat: Add columns for clustered uncore frequency") Signed-off-by: Adam Hawley Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 12c1872aa42e30..ad10feb9fafd97 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5695,9 +5695,6 @@ static void probe_intel_uncore_frequency_cluster(void) if (access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00/current_freq_khz", R_OK)) return; - if (quiet) - return; - for (uncore_max_id = 0;; ++uncore_max_id) { sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", uncore_max_id); @@ -5727,6 +5724,14 @@ static void probe_intel_uncore_frequency_cluster(void) sprintf(path, "%s/fabric_cluster_id", path_base); cluster_id = read_sysfs_int(path); + sprintf(path, "%s/current_freq_khz", path_base); + sprintf(name_buf, "UMHz%d.%d", domain_id, cluster_id); + + add_counter(0, path, name_buf, 0, SCOPE_PACKAGE, COUNTER_K2M, FORMAT_AVERAGE, 0, package_id); + + if (quiet) + continue; + sprintf(path, "%s/min_freq_khz", path_base); k = read_sysfs_int(path); sprintf(path, "%s/max_freq_khz", path_base); @@ -5743,11 +5748,6 @@ static void probe_intel_uncore_frequency_cluster(void) sprintf(path, "%s/current_freq_khz", path_base); k = read_sysfs_int(path); fprintf(outf, " %d MHz\n", k / 1000); - - sprintf(path, "%s/current_freq_khz", path_base); - sprintf(name_buf, "UMHz%d.%d", domain_id, cluster_id); - - add_counter(0, path, name_buf, 0, SCOPE_PACKAGE, COUNTER_K2M, FORMAT_AVERAGE, 0, package_id); } } From b15943c4b3351173d5f3b0d87362d2994a89bb66 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 30 May 2024 09:16:39 +0200 Subject: [PATCH 1504/1578] tools/power turbostat: Add local build_bug.h header for snapshot target Fixes compilation errors for Makefile snapshot target described in: commit 231ce08b662a ("tools/power turbostat: Add "snapshot:" Makefile target") Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/Makefile | 6 +++++- tools/power/x86/turbostat/turbostat.c | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/Makefile b/tools/power/x86/turbostat/Makefile index 2d6dce2c8f77fa..b1e6817f1e54ab 100644 --- a/tools/power/x86/turbostat/Makefile +++ b/tools/power/x86/turbostat/Makefile @@ -14,6 +14,7 @@ turbostat : turbostat.c override CFLAGS += -O2 -Wall -Wextra -I../../../include override CFLAGS += -DMSRHEADER='"../../../../arch/x86/include/asm/msr-index.h"' override CFLAGS += -DINTEL_FAMILY_HEADER='"../../../../arch/x86/include/asm/intel-family.h"' +override CFLAGS += -DBUILD_BUG_HEADER='"../../../../include/linux/build_bug.h"' override CFLAGS += -D_FILE_OFFSET_BITS=64 override CFLAGS += -D_FORTIFY_SOURCE=2 @@ -44,10 +45,13 @@ snapshot: turbostat @echo "#define GENMASK(h, l) (((~0UL) << (l)) & (~0UL >> (sizeof(long) * 8 - 1 - (h))))" >> $(SNAPSHOT)/bits.h @echo "#define GENMASK_ULL(h, l) (((~0ULL) << (l)) & (~0ULL >> (sizeof(long long) * 8 - 1 - (h))))" >> $(SNAPSHOT)/bits.h + @echo '#define BUILD_BUG_ON(cond) do { enum { compile_time_check ## __COUNTER__ = 1/(!(cond)) }; } while (0)' > $(SNAPSHOT)/build_bug.h + @echo PWD=. > $(SNAPSHOT)/Makefile @echo "CFLAGS += -DMSRHEADER='\"msr-index.h\"'" >> $(SNAPSHOT)/Makefile @echo "CFLAGS += -DINTEL_FAMILY_HEADER='\"intel-family.h\"'" >> $(SNAPSHOT)/Makefile - @sed -e's/.*MSRHEADER.*//' -e's/.*INTEL_FAMILY_HEADER.*//' Makefile >> $(SNAPSHOT)/Makefile + @echo "CFLAGS += -DBUILD_BUG_HEADER='\"build_bug.h\"'" >> $(SNAPSHOT)/Makefile + @sed -e's/.*MSRHEADER.*//' -e's/.*INTEL_FAMILY_HEADER.*//' -e's/.*BUILD_BUG_HEADER.*//' Makefile >> $(SNAPSHOT)/Makefile @rm -f $(SNAPSHOT).tar.gz tar cvzf $(SNAPSHOT).tar.gz $(SNAPSHOT) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index ad10feb9fafd97..9f5d053d4bc639 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -10,6 +10,7 @@ #define _GNU_SOURCE #include MSRHEADER #include INTEL_FAMILY_HEADER +#include BUILD_BUG_HEADER #include #include #include @@ -38,7 +39,6 @@ #include #include #include -#include #define UNUSED(x) (void)(x) From 09aaa2d0642359fddae607b6950b2ca7bd1cf04f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 28 Jun 2024 14:28:06 +0200 Subject: [PATCH 1505/1578] MAINTAINERS: Update IOMMU tree location Update the maintainers entries to the new location of the IOMMU tree. Signed-off-by: Joerg Roedel --- MAINTAINERS | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 2ca8f35dfe0399..932caa703e8324 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1044,7 +1044,7 @@ M: Joerg Roedel R: Suravee Suthikulpanit L: iommu@lists.linux.dev S: Maintained -T: git git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/iommu/linux.git F: drivers/iommu/amd/ F: include/linux/amd-iommu.h @@ -11156,7 +11156,7 @@ M: David Woodhouse M: Lu Baolu L: iommu@lists.linux.dev S: Supported -T: git git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/iommu/linux.git F: drivers/iommu/intel/ INTEL IPU3 CSI-2 CIO2 DRIVER @@ -11529,7 +11529,7 @@ IOMMU DMA-API LAYER M: Robin Murphy L: iommu@lists.linux.dev S: Maintained -T: git git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/iommu/linux.git F: drivers/iommu/dma-iommu.c F: drivers/iommu/dma-iommu.h F: drivers/iommu/iova.c @@ -11541,7 +11541,7 @@ M: Will Deacon R: Robin Murphy L: iommu@lists.linux.dev S: Maintained -T: git git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/iommu/linux.git F: Documentation/devicetree/bindings/iommu/ F: Documentation/userspace-api/iommu.rst F: drivers/iommu/ From 1066fe825987da007669d7c25306b4dbb50bd7dd Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 27 Jun 2024 12:55:52 +0200 Subject: [PATCH 1506/1578] ata: libata-core: Add ATA_HORKAGE_NOLPM for all Crucial BX SSD1 models We got another report that CT1000BX500SSD1 does not work with LPM. If you look in libata-core.c, we have six different Crucial devices that are marked with ATA_HORKAGE_NOLPM. This model would have been the seventh. (This quirk is used on Crucial models starting with both CT* and Crucial_CT*) It is obvious that this vendor does not have a great history of supporting LPM properly, therefore, add the ATA_HORKAGE_NOLPM quirk for all Crucial BX SSD1 models. Fixes: 7627a0edef54 ("ata: ahci: Drop low power policy board type") Cc: stable@vger.kernel.org Reported-by: Alessandro Maggio Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218832 Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20240627105551.4159447-2-cassel@kernel.org Signed-off-by: Niklas Cassel --- drivers/ata/libata-core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index e1bf8a19b3c89e..efb5195da60c56 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4137,8 +4137,7 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { { "PIONEER BD-RW BDR-205", NULL, ATA_HORKAGE_NOLPM }, /* Crucial devices with broken LPM support */ - { "CT500BX100SSD1", NULL, ATA_HORKAGE_NOLPM }, - { "CT240BX500SSD1", NULL, ATA_HORKAGE_NOLPM }, + { "CT*0BX*00SSD1", NULL, ATA_HORKAGE_NOLPM }, /* 512GB MX100 with MU01 firmware has both queued TRIM and LPM issues */ { "Crucial_CT512MX100*", "MU01", ATA_HORKAGE_NO_NCQ_TRIM | From 6a4805b2f51a2e5dc346651e0d0cd8abcc2937c8 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Fri, 31 May 2024 16:07:26 -0700 Subject: [PATCH 1507/1578] string: kunit: add missing MODULE_DESCRIPTION() macros make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in lib/string_kunit.o WARNING: modpost: missing MODULE_DESCRIPTION() in lib/string_helpers_kunit.o Add the missing invocation of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Link: https://lore.kernel.org/r/20240531-md-lib-string-v1-1-2738cf057d94@quicinc.com Signed-off-by: Kees Cook --- lib/string_helpers_kunit.c | 1 + lib/string_kunit.c | 1 + 2 files changed, 2 insertions(+) diff --git a/lib/string_helpers_kunit.c b/lib/string_helpers_kunit.c index f88e39fd68d6e6..c853046183d245 100644 --- a/lib/string_helpers_kunit.c +++ b/lib/string_helpers_kunit.c @@ -625,4 +625,5 @@ static struct kunit_suite string_helpers_test_suite = { kunit_test_suites(&string_helpers_test_suite); +MODULE_DESCRIPTION("Test cases for string helpers module"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/lib/string_kunit.c b/lib/string_kunit.c index 2a812decf14bd3..c919e3293da6aa 100644 --- a/lib/string_kunit.c +++ b/lib/string_kunit.c @@ -633,4 +633,5 @@ static struct kunit_suite string_test_suite = { kunit_test_suites(&string_test_suite); +MODULE_DESCRIPTION("Test cases for string functions"); MODULE_LICENSE("GPL v2"); From 6db1208bf95b4c091897b597c415e11edeab2e2d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 19 Jun 2024 14:47:15 -0700 Subject: [PATCH 1508/1578] randomize_kstack: Remove non-functional per-arch entropy filtering An unintended consequence of commit 9c573cd31343 ("randomize_kstack: Improve entropy diffusion") was that the per-architecture entropy size filtering reduced how many bits were being added to the mix, rather than how many bits were being used during the offsetting. All architectures fell back to the existing default of 0x3FF (10 bits), which will consume at most 1KiB of stack space. It seems that this is working just fine, so let's avoid the confusion and update everything to use the default. The prior intent of the per-architecture limits were: arm64: capped at 0x1FF (9 bits), 5 bits effective powerpc: uncapped (10 bits), 6 or 7 bits effective riscv: uncapped (10 bits), 6 bits effective x86: capped at 0xFF (8 bits), 5 (x86_64) or 6 (ia32) bits effective s390: capped at 0xFF (8 bits), undocumented effective entropy Current discussion has led to just dropping the original per-architecture filters. The additional entropy appears to be safe for arm64, x86, and s390. Quoting Arnd, "There is no point pretending that 15.75KB is somehow safe to use while 15.00KB is not." Co-developed-by: Yuntao Liu Signed-off-by: Yuntao Liu Fixes: 9c573cd31343 ("randomize_kstack: Improve entropy diffusion") Link: https://lore.kernel.org/r/20240617133721.377540-1-liuyuntao12@huawei.com Reviewed-by: Arnd Bergmann Acked-by: Mark Rutland Acked-by: Heiko Carstens # s390 Link: https://lore.kernel.org/r/20240619214711.work.953-kees@kernel.org Signed-off-by: Kees Cook --- arch/arm64/kernel/syscall.c | 16 +++++++--------- arch/s390/include/asm/entry-common.h | 2 +- arch/x86/include/asm/entry-common.h | 15 ++++++--------- 3 files changed, 14 insertions(+), 19 deletions(-) diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index ad198262b9817a..7230f6e20ab8bf 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -53,17 +53,15 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno, syscall_set_return_value(current, regs, 0, ret); /* - * Ultimately, this value will get limited by KSTACK_OFFSET_MAX(), - * but not enough for arm64 stack utilization comfort. To keep - * reasonable stack head room, reduce the maximum offset to 9 bits. + * This value will get limited by KSTACK_OFFSET_MAX(), which is 10 + * bits. The actual entropy will be further reduced by the compiler + * when applying stack alignment constraints: the AAPCS mandates a + * 16-byte aligned SP at function boundaries, which will remove the + * 4 low bits from any entropy chosen here. * - * The actual entropy will be further reduced by the compiler when - * applying stack alignment constraints: the AAPCS mandates a - * 16-byte (i.e. 4-bit) aligned SP at function boundaries. - * - * The resulting 5 bits of entropy is seen in SP[8:4]. + * The resulting 6 bits of entropy is seen in SP[9:4]. */ - choose_random_kstack_offset(get_random_u16() & 0x1FF); + choose_random_kstack_offset(get_random_u16()); } static inline bool has_syscall_work(unsigned long flags) diff --git a/arch/s390/include/asm/entry-common.h b/arch/s390/include/asm/entry-common.h index 7f5004065e8aa2..35555c9446308d 100644 --- a/arch/s390/include/asm/entry-common.h +++ b/arch/s390/include/asm/entry-common.h @@ -54,7 +54,7 @@ static __always_inline void arch_exit_to_user_mode(void) static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, unsigned long ti_work) { - choose_random_kstack_offset(get_tod_clock_fast() & 0xff); + choose_random_kstack_offset(get_tod_clock_fast()); } #define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index 7e523bb3d2d31a..fb2809b20b0ac4 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -73,19 +73,16 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, #endif /* - * Ultimately, this value will get limited by KSTACK_OFFSET_MAX(), - * but not enough for x86 stack utilization comfort. To keep - * reasonable stack head room, reduce the maximum offset to 8 bits. - * - * The actual entropy will be further reduced by the compiler when - * applying stack alignment constraints (see cc_stack_align4/8 in + * This value will get limited by KSTACK_OFFSET_MAX(), which is 10 + * bits. The actual entropy will be further reduced by the compiler + * when applying stack alignment constraints (see cc_stack_align4/8 in * arch/x86/Makefile), which will remove the 3 (x86_64) or 2 (ia32) * low bits from any entropy chosen here. * - * Therefore, final stack offset entropy will be 5 (x86_64) or - * 6 (ia32) bits. + * Therefore, final stack offset entropy will be 7 (x86_64) or + * 8 (ia32) bits. */ - choose_random_kstack_offset(rdtsc() & 0xFF); + choose_random_kstack_offset(rdtsc()); } #define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare From 1c07c9be87dd3dd0634033bf08728b32465f08fb Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 29 May 2024 14:29:42 -0700 Subject: [PATCH 1509/1578] tty: mxser: Remove __counted_by from mxser_board.ports[] Work for __counted_by on generic pointers in structures (not just flexible array members) has started landing in Clang 19 (current tip of tree). During the development of this feature, a restriction was added to __counted_by to prevent the flexible array member's element type from including a flexible array member itself such as: struct foo { int count; char buf[]; }; struct bar { int count; struct foo data[] __counted_by(count); }; because the size of data cannot be calculated with the standard array size formula: sizeof(struct foo) * count This restriction was downgraded to a warning but due to CONFIG_WERROR, it can still break the build. The application of __counted_by on the ports member of 'struct mxser_board' triggers this restriction, resulting in: drivers/tty/mxser.c:291:2: error: 'counted_by' should not be applied to an array with element of unknown size because 'struct mxser_port' is a struct type with a flexible array member. This will be an error in a future compiler version [-Werror,-Wbounds-safety-counted-by-elt-type-unknown-size] 291 | struct mxser_port ports[] __counted_by(nports); | ^~~~~~~~~~~~~~~~~~~~~~~~~ 1 error generated. Remove this use of __counted_by to fix the warning/error. However, rather than remove it altogether, leave it commented, as it may be possible to support this in future compiler releases. Cc: Closes: https://github.com/ClangBuiltLinux/linux/issues/2026 Fixes: f34907ecca71 ("mxser: Annotate struct mxser_board with __counted_by") Signed-off-by: Nathan Chancellor Link: https://lore.kernel.org/r/20240529-drop-counted-by-ports-mxser-board-v1-1-0ab217f4da6d@kernel.org Signed-off-by: Kees Cook --- drivers/tty/mxser.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/mxser.c b/drivers/tty/mxser.c index 458bb1280ebf9d..5b97e420a95f84 100644 --- a/drivers/tty/mxser.c +++ b/drivers/tty/mxser.c @@ -288,7 +288,7 @@ struct mxser_board { enum mxser_must_hwid must_hwid; speed_t max_baud; - struct mxser_port ports[] __counted_by(nports); + struct mxser_port ports[] /* __counted_by(nports) */; }; static DECLARE_BITMAP(mxser_boards, MXSER_BOARDS); From f1e46758e8b2b04c725ac706b5f455c0de0486a4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 28 Jun 2024 15:16:48 +0200 Subject: [PATCH 1510/1578] bcache: work around a __bitwise to bool conversion sparse warning Sparse is a bit dumb about bitwise operation on __bitwise types used in boolean contexts. Add a !! to explicitly propagate to boolean without a warning. Fixes: fcf865e357f8 ("block: convert features and flags to __bitwise types") Reported-by: kernel test robot Signed-off-by: Christoph Hellwig Acked-by: Kent Overstreet Link: https://lore.kernel.org/r/20240628131657.667797-1-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 283b2511c6d21f..b5d6ef430b86fc 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1416,8 +1416,8 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) } if (bdev_io_opt(dc->bdev)) - dc->partial_stripes_expensive = q->limits.features & - BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE; + dc->partial_stripes_expensive = !!(q->limits.features & + BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE); ret = bcache_device_init(&dc->disk, block_size, bdev_nr_sectors(dc->bdev) - dc->sb.data_offset, From c546d6f438338017480d105ab597292da67f6f6a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2024 06:59:34 +0200 Subject: [PATCH 1511/1578] block: only zero non-PI metadata tuples in bio_integrity_prep The PI generation helpers already zero the app tag, so relax the zeroing to non-PI metadata. Signed-off-by: Christoph Hellwig Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240626045950.189758-2-hch@lst.de Signed-off-by: Jens Axboe --- block/bio-integrity.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 356ca0d3d62f5a..4da003b86a1bbf 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -456,11 +456,11 @@ bool bio_integrity_prep(struct bio *bio) /* * Zero the memory allocated to not leak uninitialized kernel - * memory to disk. For PI this only affects the app tag, but - * for non-integrity metadata it affects the entire metadata - * buffer. + * memory to disk for non-integrity metadata where nothing else + * initializes the memory. */ - gfp |= __GFP_ZERO; + if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE) + gfp |= __GFP_ZERO; } /* Allocate kernel buffer for protection data */ From c096df908393b0b3445f4335dd9bbd9d98252951 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2024 06:59:35 +0200 Subject: [PATCH 1512/1578] block: simplify adding the payload in bio_integrity_prep bio_integrity_add_page can add physically contiguous regions of any size, so don't bother chunking up the kmalloced buffer. Signed-off-by: Christoph Hellwig Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240626045950.189758-3-hch@lst.de Signed-off-by: Jens Axboe --- block/bio-integrity.c | 32 ++++++-------------------------- 1 file changed, 6 insertions(+), 26 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 4da003b86a1bbf..259acb60c56a3a 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -428,10 +428,8 @@ bool bio_integrity_prep(struct bio *bio) { struct bio_integrity_payload *bip; struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + unsigned int len; void *buf; - unsigned long start, end; - unsigned int len, nr_pages; - unsigned int bytes, offset, i; gfp_t gfp = GFP_NOIO; if (!bi) @@ -471,12 +469,7 @@ bool bio_integrity_prep(struct bio *bio) goto err_end_io; } - end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - start = ((unsigned long) buf) >> PAGE_SHIFT; - nr_pages = end - start; - - /* Allocate bio integrity payload and integrity vectors */ - bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); + bip = bio_integrity_alloc(bio, GFP_NOIO, 1); if (IS_ERR(bip)) { printk(KERN_ERR "could not allocate data integrity bioset\n"); kfree(buf); @@ -489,23 +482,10 @@ bool bio_integrity_prep(struct bio *bio) if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) bip->bip_flags |= BIP_IP_CHECKSUM; - /* Map it */ - offset = offset_in_page(buf); - for (i = 0; i < nr_pages && len > 0; i++) { - bytes = PAGE_SIZE - offset; - - if (bytes > len) - bytes = len; - - if (bio_integrity_add_page(bio, virt_to_page(buf), - bytes, offset) < bytes) { - printk(KERN_ERR "could not attach integrity payload\n"); - goto err_end_io; - } - - buf += bytes; - len -= bytes; - offset = 0; + if (bio_integrity_add_page(bio, virt_to_page(buf), len, + offset_in_page(buf)) < len) { + printk(KERN_ERR "could not attach integrity payload\n"); + goto err_end_io; } /* Auto-generate integrity metadata if this is a write */ From dac18fabba59149acec42621b9b603654e9459b2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2024 06:59:36 +0200 Subject: [PATCH 1513/1578] block: remove allocation failure warnings in bio_integrity_prep Allocation failures already leave a stack trace, so don't spew another warning. Signed-off-by: Christoph Hellwig Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240626045950.189758-4-hch@lst.de Signed-off-by: Jens Axboe --- block/bio-integrity.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 259acb60c56a3a..2da7fb222a3582 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -465,13 +465,11 @@ bool bio_integrity_prep(struct bio *bio) len = bio_integrity_bytes(bi, bio_sectors(bio)); buf = kmalloc(len, gfp); if (unlikely(buf == NULL)) { - printk(KERN_ERR "could not allocate integrity buffer\n"); goto err_end_io; } bip = bio_integrity_alloc(bio, GFP_NOIO, 1); if (IS_ERR(bip)) { - printk(KERN_ERR "could not allocate data integrity bioset\n"); kfree(buf); goto err_end_io; } From df3c485e0e60e8ad87f168092f1513a3d621fa4b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2024 06:59:37 +0200 Subject: [PATCH 1514/1578] block: switch on bio operation in bio_integrity_prep Use a single switch to perform read and write specific checks and exit early for other operations instead of having two checks using different predicates. Signed-off-by: Christoph Hellwig Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240626045950.189758-5-hch@lst.de Signed-off-by: Jens Axboe --- block/bio-integrity.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 2da7fb222a3582..2f8a4617003473 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -435,9 +435,6 @@ bool bio_integrity_prep(struct bio *bio) if (!bi) return true; - if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE) - return true; - if (!bio_sectors(bio)) return true; @@ -445,10 +442,12 @@ bool bio_integrity_prep(struct bio *bio) if (bio_integrity(bio)) return true; - if (bio_data_dir(bio) == READ) { + switch (bio_op(bio)) { + case REQ_OP_READ: if (bi->flags & BLK_INTEGRITY_NOVERIFY) return true; - } else { + break; + case REQ_OP_WRITE: if (bi->flags & BLK_INTEGRITY_NOGENERATE) return true; @@ -459,6 +458,9 @@ bool bio_integrity_prep(struct bio *bio) */ if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE) gfp |= __GFP_ZERO; + break; + default: + return true; } /* Allocate kernel buffer for protection data */ From d19b46340b3c0ea66bef0f6c58876cc085813ba8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2024 06:59:38 +0200 Subject: [PATCH 1515/1578] block: remove bio_integrity_process Move the bvec interation into the generate/verify helpers to avoid a bit of argument passing churn. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240626045950.189758-6-hch@lst.de Signed-off-by: Jens Axboe --- block/bio-integrity.c | 47 +---------------- block/blk.h | 7 +-- block/t10-pi.c | 97 +++++++++++++++++++++++++++-------- include/linux/blk-integrity.h | 9 ---- 4 files changed, 79 insertions(+), 81 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 2f8a4617003473..ad296849aa2a9a 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -374,44 +374,6 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes, } EXPORT_SYMBOL_GPL(bio_integrity_map_user); -/** - * bio_integrity_process - Process integrity metadata for a bio - * @bio: bio to generate/verify integrity metadata for - * @proc_iter: iterator to process - */ -static blk_status_t bio_integrity_process(struct bio *bio, - struct bvec_iter *proc_iter) -{ - struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); - struct blk_integrity_iter iter; - struct bvec_iter bviter; - struct bio_vec bv; - struct bio_integrity_payload *bip = bio_integrity(bio); - blk_status_t ret = BLK_STS_OK; - - iter.disk_name = bio->bi_bdev->bd_disk->disk_name; - iter.interval = 1 << bi->interval_exp; - iter.seed = proc_iter->bi_sector; - iter.prot_buf = bvec_virt(bip->bip_vec); - - __bio_for_each_segment(bv, bio, bviter, *proc_iter) { - void *kaddr = bvec_kmap_local(&bv); - - iter.data_buf = kaddr; - iter.data_size = bv.bv_len; - if (bio_data_dir(bio) == WRITE) - blk_integrity_generate(&iter, bi); - else - ret = blk_integrity_verify(&iter, bi); - kunmap_local(kaddr); - - if (ret) - break; - - } - return ret; -} - /** * bio_integrity_prep - Prepare bio for integrity I/O * @bio: bio to prepare @@ -490,7 +452,7 @@ bool bio_integrity_prep(struct bio *bio) /* Auto-generate integrity metadata if this is a write */ if (bio_data_dir(bio) == WRITE) - bio_integrity_process(bio, &bio->bi_iter); + blk_integrity_generate(bio); else bip->bio_iter = bio->bi_iter; return true; @@ -516,12 +478,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) container_of(work, struct bio_integrity_payload, bip_work); struct bio *bio = bip->bip_bio; - /* - * At the moment verify is called bio's iterator was advanced - * during split and completion, we need to rewind iterator to - * it's original position. - */ - bio->bi_status = bio_integrity_process(bio, &bip->bio_iter); + blk_integrity_verify(bio); bio_integrity_free(bio); bio_endio(bio); } diff --git a/block/blk.h b/block/blk.h index 95e5a4f81693c4..47dadd2439b1ca 100644 --- a/block/blk.h +++ b/block/blk.h @@ -9,7 +9,6 @@ #include #include "blk-crypto-internal.h" -struct blk_integrity_iter; struct elevator_type; /* Max future timer expiry for timeouts */ @@ -681,10 +680,8 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops, struct file *bdev_file); int bdev_permission(dev_t dev, blk_mode_t mode, void *holder); -void blk_integrity_generate(struct blk_integrity_iter *iter, - struct blk_integrity *bi); -blk_status_t blk_integrity_verify(struct blk_integrity_iter *iter, - struct blk_integrity *bi); +void blk_integrity_generate(struct bio *bio); +void blk_integrity_verify(struct bio *bio); void blk_integrity_prepare(struct request *rq); void blk_integrity_complete(struct request *rq, unsigned int nr_bytes); diff --git a/block/t10-pi.c b/block/t10-pi.c index cd7fa60d63ff21..425e2836f3e1d8 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -13,6 +13,15 @@ #include #include "blk.h" +struct blk_integrity_iter { + void *prot_buf; + void *data_buf; + sector_t seed; + unsigned int data_size; + unsigned short interval; + const char *disk_name; +}; + static __be16 t10_pi_csum(__be16 csum, void *data, unsigned int len, unsigned char csum_type) { @@ -364,33 +373,77 @@ static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes) } } -void blk_integrity_generate(struct blk_integrity_iter *iter, - struct blk_integrity *bi) +void blk_integrity_generate(struct bio *bio) { - switch (bi->csum_type) { - case BLK_INTEGRITY_CSUM_CRC64: - ext_pi_crc64_generate(iter, bi); - break; - case BLK_INTEGRITY_CSUM_CRC: - case BLK_INTEGRITY_CSUM_IP: - t10_pi_generate(iter, bi); - break; - default: - break; + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct bio_integrity_payload *bip = bio_integrity(bio); + struct blk_integrity_iter iter; + struct bvec_iter bviter; + struct bio_vec bv; + + iter.disk_name = bio->bi_bdev->bd_disk->disk_name; + iter.interval = 1 << bi->interval_exp; + iter.seed = bio->bi_iter.bi_sector; + iter.prot_buf = bvec_virt(bip->bip_vec); + bio_for_each_segment(bv, bio, bviter) { + void *kaddr = bvec_kmap_local(&bv); + + iter.data_buf = kaddr; + iter.data_size = bv.bv_len; + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + ext_pi_crc64_generate(&iter, bi); + break; + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + t10_pi_generate(&iter, bi); + break; + default: + break; + } + kunmap_local(kaddr); } } -blk_status_t blk_integrity_verify(struct blk_integrity_iter *iter, - struct blk_integrity *bi) +void blk_integrity_verify(struct bio *bio) { - switch (bi->csum_type) { - case BLK_INTEGRITY_CSUM_CRC64: - return ext_pi_crc64_verify(iter, bi); - case BLK_INTEGRITY_CSUM_CRC: - case BLK_INTEGRITY_CSUM_IP: - return t10_pi_verify(iter, bi); - default: - return BLK_STS_OK; + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct bio_integrity_payload *bip = bio_integrity(bio); + struct blk_integrity_iter iter; + struct bvec_iter bviter; + struct bio_vec bv; + + /* + * At the moment verify is called bi_iter has been advanced during split + * and completion, so use the copy created during submission here. + */ + iter.disk_name = bio->bi_bdev->bd_disk->disk_name; + iter.interval = 1 << bi->interval_exp; + iter.seed = bip->bio_iter.bi_sector; + iter.prot_buf = bvec_virt(bip->bip_vec); + __bio_for_each_segment(bv, bio, bviter, bip->bio_iter) { + void *kaddr = bvec_kmap_local(&bv); + blk_status_t ret = BLK_STS_OK; + + iter.data_buf = kaddr; + iter.data_size = bv.bv_len; + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + ret = ext_pi_crc64_verify(&iter, bi); + break; + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + ret = t10_pi_verify(&iter, bi); + break; + default: + break; + } + kunmap_local(kaddr); + + if (ret) { + bio->bi_status = ret; + return; + } } } diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index c58634924782d0..804f856ed3e571 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -14,15 +14,6 @@ enum blk_integrity_flags { BLK_INTEGRITY_STACKED = 1 << 4, }; -struct blk_integrity_iter { - void *prot_buf; - void *data_buf; - sector_t seed; - unsigned int data_size; - unsigned short interval; - const char *disk_name; -}; - const char *blk_integrity_profile_name(struct blk_integrity *bi); bool queue_limits_stack_integrity(struct queue_limits *t, struct queue_limits *b); From aa6ff4eb7c10d9a6532db3ea9e78124bf14e70ae Mon Sep 17 00:00:00 2001 From: Dongliang Cui Date: Fri, 14 Jun 2024 15:49:36 +0800 Subject: [PATCH 1516/1578] block: Add ioprio to block_rq tracepoint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sometimes we need to track the processing order of requests with ioprio set. So the ioprio of request can be useful information. Example: block_rq_insert: 8,0 RA 16384 () 6500840 + 32 be,0,6 [binder:815_3] block_rq_issue: 8,0 RA 16384 () 6500840 + 32 be,0,6 [binder:815_3] block_rq_complete: 8,0 RA () 6500840 + 32 be,0,6 [0] Signed-off-by: Dongliang Cui Reviewed-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20240614074936.113659-1-dongliang.cui@unisoc.com Signed-off-by: Jens Axboe --- include/trace/events/block.h | 41 ++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 0e128ad5146015..1527d5d45e01a4 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -9,9 +9,17 @@ #include #include #include +#include #define RWBS_LEN 8 +#define IOPRIO_CLASS_STRINGS \ + { IOPRIO_CLASS_NONE, "none" }, \ + { IOPRIO_CLASS_RT, "rt" }, \ + { IOPRIO_CLASS_BE, "be" }, \ + { IOPRIO_CLASS_IDLE, "idle" }, \ + { IOPRIO_CLASS_INVALID, "invalid"} + #ifdef CONFIG_BUFFER_HEAD DECLARE_EVENT_CLASS(block_buffer, @@ -82,6 +90,7 @@ TRACE_EVENT(block_rq_requeue, __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) + __field( unsigned short, ioprio ) __array( char, rwbs, RWBS_LEN ) __dynamic_array( char, cmd, 1 ) ), @@ -90,16 +99,20 @@ TRACE_EVENT(block_rq_requeue, __entry->dev = rq->q->disk ? disk_devt(rq->q->disk) : 0; __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); + __entry->ioprio = rq->ioprio; blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; ), - TP_printk("%d,%d %s (%s) %llu + %u [%d]", + TP_printk("%d,%d %s (%s) %llu + %u %s,%u,%u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __get_str(cmd), - (unsigned long long)__entry->sector, - __entry->nr_sector, 0) + (unsigned long long)__entry->sector, __entry->nr_sector, + __print_symbolic(IOPRIO_PRIO_CLASS(__entry->ioprio), + IOPRIO_CLASS_STRINGS), + IOPRIO_PRIO_HINT(__entry->ioprio), + IOPRIO_PRIO_LEVEL(__entry->ioprio), 0) ); DECLARE_EVENT_CLASS(block_rq_completion, @@ -113,6 +126,7 @@ DECLARE_EVENT_CLASS(block_rq_completion, __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( int , error ) + __field( unsigned short, ioprio ) __array( char, rwbs, RWBS_LEN ) __dynamic_array( char, cmd, 1 ) ), @@ -122,16 +136,20 @@ DECLARE_EVENT_CLASS(block_rq_completion, __entry->sector = blk_rq_pos(rq); __entry->nr_sector = nr_bytes >> 9; __entry->error = blk_status_to_errno(error); + __entry->ioprio = rq->ioprio; blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; ), - TP_printk("%d,%d %s (%s) %llu + %u [%d]", + TP_printk("%d,%d %s (%s) %llu + %u %s,%u,%u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __get_str(cmd), - (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->error) + (unsigned long long)__entry->sector, __entry->nr_sector, + __print_symbolic(IOPRIO_PRIO_CLASS(__entry->ioprio), + IOPRIO_CLASS_STRINGS), + IOPRIO_PRIO_HINT(__entry->ioprio), + IOPRIO_PRIO_LEVEL(__entry->ioprio), __entry->error) ); /** @@ -180,6 +198,7 @@ DECLARE_EVENT_CLASS(block_rq, __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( unsigned int, bytes ) + __field( unsigned short, ioprio ) __array( char, rwbs, RWBS_LEN ) __array( char, comm, TASK_COMM_LEN ) __dynamic_array( char, cmd, 1 ) @@ -190,17 +209,21 @@ DECLARE_EVENT_CLASS(block_rq, __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); __entry->bytes = blk_rq_bytes(rq); + __entry->ioprio = rq->ioprio; blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), - TP_printk("%d,%d %s %u (%s) %llu + %u [%s]", + TP_printk("%d,%d %s %u (%s) %llu + %u %s,%u,%u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __entry->bytes, __get_str(cmd), - (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->comm) + (unsigned long long)__entry->sector, __entry->nr_sector, + __print_symbolic(IOPRIO_PRIO_CLASS(__entry->ioprio), + IOPRIO_CLASS_STRINGS), + IOPRIO_PRIO_HINT(__entry->ioprio), + IOPRIO_PRIO_LEVEL(__entry->ioprio), __entry->comm) ); /** From 0676c434a99be42f3bacca4adfd27df65edbf903 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 20 Jun 2024 11:06:31 +0800 Subject: [PATCH 1517/1578] block: check bio alignment in blk_mq_submit_bio IO logical block size is one fundamental queue limit, and every IO has to be aligned with logical block size because our bio split can't deal with unaligned bio. The check has to be done with queue usage counter grabbed because device reconfiguration may change logical block size, and we can prevent the reconfiguration from happening by holding queue usage counter. logical_block_size stays in the 1st cache line of queue_limits, and this cache line is always fetched in fast path via bio_may_exceed_limits(), so IO perf won't be affected by this check. Cc: Yi Zhang Cc: Christoph Hellwig Cc: Ye Bin Cc: stable@vger.kernel.org Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240620030631.3114026-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index 47fe9d19b8f109..fec2dea5c6e885 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2909,6 +2909,17 @@ static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, INIT_LIST_HEAD(&rq->queuelist); } +static bool bio_unaligned(const struct bio *bio, struct request_queue *q) +{ + unsigned int bs_mask = queue_logical_block_size(q) - 1; + + /* .bi_sector of any zero sized bio need to be initialized */ + if ((bio->bi_iter.bi_size & bs_mask) || + ((bio->bi_iter.bi_sector << SECTOR_SHIFT) & bs_mask)) + return true; + return false; +} + /** * blk_mq_submit_bio - Create and send a request to block device. * @bio: Bio pointer. @@ -2961,6 +2972,15 @@ void blk_mq_submit_bio(struct bio *bio) return; } + /* + * Device reconfiguration may change logical block size, so alignment + * check has to be done with queue usage counter held + */ + if (unlikely(bio_unaligned(bio, q))) { + bio_io_error(bio); + goto queue_exit; + } + if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); if (!bio) From 667ea36378cf7f669044b27871c496e1559c872a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Jun 2024 14:49:11 +0200 Subject: [PATCH 1518/1578] loop: don't set QUEUE_FLAG_NOMERGES QUEUE_FLAG_NOMERGES isn't really a driver interface, but a user tunable. There also isn't any good reason to set it in the loop driver. The original commit adding it (5b5e20f421c0b6d "block: loop: set QUEUE_FLAG_NOMERGES for request queue of loop") claims that "It doesn't make sense to enable merge because the I/O submitted to backing file is handled page by page." which of course isn't true for multi-page bvec now, and it never has been for direct I/O, for which commit 40326d8a33d ("block/loop: allow request merge for directio mode") alredy disabled the nomerges flag. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20240627124926.512662-2-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index cbda971d22b40b..4f0d96876b1f5d 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -211,13 +211,10 @@ static void __loop_update_dio(struct loop_device *lo, bool dio) if (lo->lo_state == Lo_bound) blk_mq_freeze_queue(lo->lo_queue); lo->use_dio = use_dio; - if (use_dio) { - blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, lo->lo_queue); + if (use_dio) lo->lo_flags |= LO_FLAGS_DIRECT_IO; - } else { - blk_queue_flag_set(QUEUE_FLAG_NOMERGES, lo->lo_queue); + else lo->lo_flags &= ~LO_FLAGS_DIRECT_IO; - } if (lo->lo_state == Lo_bound) blk_mq_unfreeze_queue(lo->lo_queue); } @@ -2030,14 +2027,6 @@ static int loop_add(int i) } lo->lo_queue = lo->lo_disk->queue; - /* - * By default, we do buffer IO, so it doesn't make sense to enable - * merge because the I/O submitted to backing file is handled page by - * page. For directio mode, merge does help to dispatch bigger request - * to underlayer disk. We will enable merge once directio is enabled. - */ - blk_queue_flag_set(QUEUE_FLAG_NOMERGES, lo->lo_queue); - /* * Disable partition scanning by default. The in-kernel partition * scanning can be requested individually per-device during its From aa57abe6a7f91fafe53fb98d0f1e74db951bce24 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Jun 2024 14:49:12 +0200 Subject: [PATCH 1519/1578] megaraid_sas: don't set QUEUE_FLAG_NOMERGES Setting QUEUE_FLAG_NOMERGES was added in commit 15dd03811d99dcf ("scsi: megaraid_sas: NVME Interface detection and prop settings") without any explanation. Drivers should second guess the block layer merge decisions, so remove the flag. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20240627124926.512662-3-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/megaraid/megaraid_sas_base.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 88acefbf9aeaba..6c79c350a4d5ba 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -1981,8 +1981,6 @@ megasas_set_nvme_device_properties(struct scsi_device *sdev, lim->max_hw_sectors = max_io_size / 512; lim->virt_boundary_mask = mr_nvme_pg_size - 1; - - blk_queue_flag_set(QUEUE_FLAG_NOMERGES, sdev->request_queue); } /* From 8b77f23fadcbb030a898f168bebe74f465e5d5a2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Jun 2024 14:49:13 +0200 Subject: [PATCH 1520/1578] mpt3sas_scsih: don't set QUEUE_FLAG_NOMERGES Setting QUEUE_FLAG_NOMERGES was added in commit d1b01d14b7ba ("scsi: mpt3sas: Set NVMe device queue depth as 128") without any explanation. Drivers should second guess the block layer merge decisions, so remove the flag. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20240627124926.512662-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/mpt3sas/mpt3sas_scsih.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c index 12d08d8ba5382d..b050aedc9d4334 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -2680,12 +2680,6 @@ scsih_device_configure(struct scsi_device *sdev, struct queue_limits *lim) pcie_device_put(pcie_device); spin_unlock_irqrestore(&ioc->pcie_device_lock, flags); mpt3sas_scsih_change_queue_depth(sdev, qdepth); - /* Enable QUEUE_FLAG_NOMERGES flag, so that IOs won't be - ** merged and can eliminate holes created during merging - ** operation. - **/ - blk_queue_flag_set(QUEUE_FLAG_NOMERGES, - sdev->request_queue); lim->virt_boundary_mask = ioc->page_size - 1; return 0; } From 40988f15907baee227d3b83bd4d8f8fdfeb95dd3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Jun 2024 14:49:14 +0200 Subject: [PATCH 1521/1578] rnbd: don't set QUEUE_FLAG_SAME_COMP QUEUE_FLAG_SAME_COMP is already set by default. Signed-off-by: Christoph Hellwig Acked-by: Jack Wang Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20240627124926.512662-5-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 4918b0f68b46cd..0e3773fe479706 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1397,7 +1397,6 @@ static int rnbd_client_setup_device(struct rnbd_clt_dev *dev, dev->queue = dev->gd->queue; rnbd_init_mq_hw_queues(dev); - blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue); blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue); return rnbd_clt_setup_gen_disk(dev, rsp, idx); } From caffa7cdce47718a0c2e3195c9a1bcf786d655a4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Jun 2024 14:49:15 +0200 Subject: [PATCH 1522/1578] rnbd-cnt: don't set QUEUE_FLAG_SAME_FORCE QUEUE_FLAG_SAME_FORCE has been set by rnbd-cnt since the initial merge. There is no good reason for a driver to force exact core delivery, which is tunable for very specific workloads and not a driver setting. Signed-off-by: Christoph Hellwig Acked-by: Jack Wang Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20240627124926.512662-6-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 0e3773fe479706..c34695d2eea7fe 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1397,7 +1397,6 @@ static int rnbd_client_setup_device(struct rnbd_clt_dev *dev, dev->queue = dev->gd->queue; rnbd_init_mq_hw_queues(dev); - blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue); return rnbd_clt_setup_gen_disk(dev, rsp, idx); } From c422b6a630240f706063e0ecbb894aa8491b1fa1 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 27 Jun 2024 13:14:47 +0200 Subject: [PATCH 1523/1578] i2c: testunit: don't erase registers after STOP STOP fallsthrough to WRITE_REQUESTED but this became problematic when clearing the testunit registers was added to the latter. Actually, there is no reason to clear the testunit state after STOP. Doing it when a new WRITE_REQUESTED arrives is enough. So, no need to fallthrough, at all. Fixes: b39ab96aa894 ("i2c: testunit: add support for block process calls") Signed-off-by: Wolfram Sang Reviewed-by: Andi Shyti --- drivers/i2c/i2c-slave-testunit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/i2c-slave-testunit.c b/drivers/i2c/i2c-slave-testunit.c index a49642bbae4b70..a5dcbc3c2c141a 100644 --- a/drivers/i2c/i2c-slave-testunit.c +++ b/drivers/i2c/i2c-slave-testunit.c @@ -118,7 +118,7 @@ static int i2c_slave_testunit_slave_cb(struct i2c_client *client, queue_delayed_work(system_long_wq, &tu->worker, msecs_to_jiffies(10 * tu->regs[TU_REG_DELAY])); } - fallthrough; + break; case I2C_SLAVE_WRITE_REQUESTED: memset(tu->regs, 0, TU_NUM_REGS); From c116deafd1a5cc1e9739099eb32114e90623209c Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 27 Jun 2024 13:14:48 +0200 Subject: [PATCH 1524/1578] i2c: testunit: discard write requests while old command is running When clearing registers on new write requests was added, the protection for currently running commands was missed leading to concurrent access to the testunit registers. Check the flag beforehand. Fixes: b39ab96aa894 ("i2c: testunit: add support for block process calls") Signed-off-by: Wolfram Sang Reviewed-by: Andi Shyti --- drivers/i2c/i2c-slave-testunit.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/i2c/i2c-slave-testunit.c b/drivers/i2c/i2c-slave-testunit.c index a5dcbc3c2c141a..ca43e98cae1b22 100644 --- a/drivers/i2c/i2c-slave-testunit.c +++ b/drivers/i2c/i2c-slave-testunit.c @@ -121,6 +121,9 @@ static int i2c_slave_testunit_slave_cb(struct i2c_client *client, break; case I2C_SLAVE_WRITE_REQUESTED: + if (test_bit(TU_FLAG_IN_PROCESS, &tu->flags)) + return -EBUSY; + memset(tu->regs, 0, TU_NUM_REGS); tu->reg_idx = 0; break; From 5b026e34120766408e76ba19a0e33a9dc996f9f0 Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Fri, 28 Jun 2024 11:11:52 +0200 Subject: [PATCH 1525/1578] rust: block: fix generated bindings after refactoring of features Block device features and flags were refactored from `enum` to `#define`. This broke Rust binding generation. This patch fixes the binding generation. Fixes: fcf865e357f8 ("block: convert features and flags to __bitwise types") Signed-off-by: Andreas Hindborg Acked-by: Miguel Ojeda Link: https://lore.kernel.org/r/20240628091152.2185241-1-nmi@metaspace.dk Signed-off-by: Jens Axboe --- rust/bindings/bindings_helper.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index 84f601d7068e37..6deee85a29c896 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -28,3 +29,4 @@ const gfp_t RUST_CONST_HELPER_GFP_KERNEL = GFP_KERNEL; const gfp_t RUST_CONST_HELPER_GFP_KERNEL_ACCOUNT = GFP_KERNEL_ACCOUNT; const gfp_t RUST_CONST_HELPER_GFP_NOWAIT = GFP_NOWAIT; const gfp_t RUST_CONST_HELPER___GFP_ZERO = __GFP_ZERO; +const blk_features_t RUST_CONST_HELPER_BLK_FEAT_ROTATIONAL = BLK_FEAT_ROTATIONAL; From 3991657ae7074c3c497bf095093178bed37ea1b4 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Wed, 26 Jun 2024 15:36:52 +0530 Subject: [PATCH 1526/1578] block: set bip_vcnt correctly Set the bip_vcnt correctly in bio_integrity_init_user and bio_integrity_copy_user. If the bio gets split at a later point, this value is required to set the right bip_vcnt in the cloned bio. Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240626100700.3629-3-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index ad296849aa2a9a..eb3d7bbe1fe83a 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -254,6 +254,7 @@ static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec, bip->bip_flags |= BIP_INTEGRITY_USER | BIP_COPY_USER; bip->bip_iter.bi_sector = seed; + bip->bip_vcnt = nr_vecs; return 0; free_bip: bio_integrity_free(bio); @@ -275,6 +276,7 @@ static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec, bip->bip_flags |= BIP_INTEGRITY_USER; bip->bip_iter.bi_sector = seed; bip->bip_iter.bi_size = len; + bip->bip_vcnt = nr_vecs; return 0; } From 1beabab88ecee0698ecee7b54afa9cce7046ef96 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 18 Jun 2024 14:21:08 +0800 Subject: [PATCH 1527/1578] blk-throttle: fix lower control under super low iops limit User will configure allowed iops limit in 1s, and calculate_io_allowed() will calculate allowed iops in the slice by: limit * HZ / throtl_slice However, if limit is quite low, the result can be 0, then allowed IO in the slice is 0, this will cause missing dispatch and control will be lower than limit. For example, set iops_limit to 5 with HD disk, and test will found that iops will be 3. This is usually not a big deal, because user will unlikely to configure such low iops limit, however, this is still a problem in the extreme scene. Fix the problem by making sure the wait time calculated by tg_within_iops_limit() should allow at least one IO to be dispatched. Signed-off-by: Yu Kuai Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20240618062108.3680835-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index c1bf73f8c75d95..dc6140fa3de061 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -704,6 +704,9 @@ static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio /* Calc approx time to dispatch */ jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed; + + /* make sure at least one io can be dispatched after waiting */ + jiffy_wait = max(jiffy_wait, HZ / iops_limit + 1); return jiffy_wait; } From 5476394aa9f27d670dd2bac426fdb6ac12b12cb3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Jun 2024 13:14:01 +0200 Subject: [PATCH 1528/1578] block: simplify queue_logical_block_size queue_logical_block_size is never called with a 0 queue, and the logical_block_size field in queue_limits is always initialized for a live queue. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Link: https://lore.kernel.org/r/20240627111407.476276-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 53c41ef4222c3d..4d0d4b83bc740f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1227,12 +1227,7 @@ static inline unsigned int bdev_max_segments(struct block_device *bdev) static inline unsigned queue_logical_block_size(const struct request_queue *q) { - int retval = 512; - - if (q && q->limits.logical_block_size) - retval = q->limits.logical_block_size; - - return retval; + return q->limits.logical_block_size; } static inline unsigned int bdev_logical_block_size(struct block_device *bdev) From 319e8cfdf3caf41b98f50ef13542a35acd897bb6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Jun 2024 13:14:02 +0200 Subject: [PATCH 1529/1578] block: add helper macros to de-duplicate the queue sysfs attributes A lof the code to implement the queue sysfs attributes is repetitive. Add a few macros to generate the common cases. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Link: https://lore.kernel.org/r/20240627111407.476276-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 255 +++++++++++++++------------------------------- 1 file changed, 82 insertions(+), 173 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 2e6d9b918127fe..a769fb441b58da 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -100,103 +100,65 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count) return ret; } -static ssize_t queue_max_sectors_show(struct request_queue *q, char *page) -{ - int max_sectors_kb = queue_max_sectors(q) >> 1; - - return queue_var_show(max_sectors_kb, page); -} - -static ssize_t queue_max_segments_show(struct request_queue *q, char *page) -{ - return queue_var_show(queue_max_segments(q), page); -} - -static ssize_t queue_max_discard_segments_show(struct request_queue *q, - char *page) -{ - return queue_var_show(queue_max_discard_segments(q), page); -} - -static ssize_t queue_atomic_write_max_bytes_show(struct request_queue *q, - char *page) -{ - return queue_var_show(queue_atomic_write_max_bytes(q), page); -} - -static ssize_t queue_atomic_write_boundary_show(struct request_queue *q, - char *page) -{ - return queue_var_show(queue_atomic_write_boundary_bytes(q), page); -} - -static ssize_t queue_atomic_write_unit_min_show(struct request_queue *q, - char *page) -{ - return queue_var_show(queue_atomic_write_unit_min_bytes(q), page); -} - -static ssize_t queue_atomic_write_unit_max_show(struct request_queue *q, - char *page) -{ - return queue_var_show(queue_atomic_write_unit_max_bytes(q), page); -} - -static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page) -{ - return queue_var_show(q->limits.max_integrity_segments, page); -} - -static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) -{ - return queue_var_show(queue_max_segment_size(q), page); -} - -static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page) -{ - return queue_var_show(queue_logical_block_size(q), page); -} - -static ssize_t queue_physical_block_size_show(struct request_queue *q, char *page) -{ - return queue_var_show(queue_physical_block_size(q), page); -} - -static ssize_t queue_chunk_sectors_show(struct request_queue *q, char *page) -{ - return queue_var_show(q->limits.chunk_sectors, page); -} - -static ssize_t queue_io_min_show(struct request_queue *q, char *page) -{ - return queue_var_show(queue_io_min(q), page); -} - -static ssize_t queue_io_opt_show(struct request_queue *q, char *page) -{ - return queue_var_show(queue_io_opt(q), page); -} - -static ssize_t queue_discard_granularity_show(struct request_queue *q, char *page) -{ - return queue_var_show(q->limits.discard_granularity, page); +#define QUEUE_SYSFS_LIMIT_SHOW(_field) \ +static ssize_t queue_##_field##_show(struct request_queue *q, char *page) \ +{ \ + return queue_var_show(q->limits._field, page); \ +} + +QUEUE_SYSFS_LIMIT_SHOW(max_segments) +QUEUE_SYSFS_LIMIT_SHOW(max_discard_segments) +QUEUE_SYSFS_LIMIT_SHOW(max_integrity_segments) +QUEUE_SYSFS_LIMIT_SHOW(max_segment_size) +QUEUE_SYSFS_LIMIT_SHOW(logical_block_size) +QUEUE_SYSFS_LIMIT_SHOW(physical_block_size) +QUEUE_SYSFS_LIMIT_SHOW(chunk_sectors) +QUEUE_SYSFS_LIMIT_SHOW(io_min) +QUEUE_SYSFS_LIMIT_SHOW(io_opt) +QUEUE_SYSFS_LIMIT_SHOW(discard_granularity) +QUEUE_SYSFS_LIMIT_SHOW(zone_write_granularity) +QUEUE_SYSFS_LIMIT_SHOW(virt_boundary_mask) +QUEUE_SYSFS_LIMIT_SHOW(dma_alignment) +QUEUE_SYSFS_LIMIT_SHOW(max_open_zones) +QUEUE_SYSFS_LIMIT_SHOW(max_active_zones) +QUEUE_SYSFS_LIMIT_SHOW(atomic_write_unit_min) +QUEUE_SYSFS_LIMIT_SHOW(atomic_write_unit_max) + +#define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(_field) \ +static ssize_t queue_##_field##_show(struct request_queue *q, char *page) \ +{ \ + return sprintf(page, "%llu\n", \ + (unsigned long long)q->limits._field << SECTOR_SHIFT); \ +} + +QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_discard_sectors) +QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_discard_sectors) +QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_write_zeroes_sectors) +QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_max_sectors) +QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_boundary_sectors) + +#define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(_field) \ +static ssize_t queue_##_field##_show(struct request_queue *q, char *page) \ +{ \ + return queue_var_show(q->limits._field >> 1, page); \ +} + +QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(max_sectors) +QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(max_hw_sectors) + +#define QUEUE_SYSFS_SHOW_CONST(_name, _val) \ +static ssize_t queue_##_name##_show(struct request_queue *q, char *page) \ +{ \ + return sprintf(page, "%d\n", _val); \ } -static ssize_t queue_discard_max_hw_show(struct request_queue *q, char *page) -{ - - return sprintf(page, "%llu\n", - (unsigned long long)q->limits.max_hw_discard_sectors << 9); -} +/* deprecated fields */ +QUEUE_SYSFS_SHOW_CONST(discard_zeroes_data, 0) +QUEUE_SYSFS_SHOW_CONST(write_same_max, 0) +QUEUE_SYSFS_SHOW_CONST(poll_delay, -1) -static ssize_t queue_discard_max_show(struct request_queue *q, char *page) -{ - return sprintf(page, "%llu\n", - (unsigned long long)q->limits.max_discard_sectors << 9); -} - -static ssize_t queue_discard_max_store(struct request_queue *q, - const char *page, size_t count) +static ssize_t queue_max_discard_sectors_store(struct request_queue *q, + const char *page, size_t count) { unsigned long max_discard_bytes; struct queue_limits lim; @@ -221,28 +183,11 @@ static ssize_t queue_discard_max_store(struct request_queue *q, return ret; } -static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page) -{ - return queue_var_show(0, page); -} - -static ssize_t queue_write_same_max_show(struct request_queue *q, char *page) -{ - return queue_var_show(0, page); -} - -static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page) -{ - return sprintf(page, "%llu\n", - (unsigned long long)q->limits.max_write_zeroes_sectors << 9); -} - -static ssize_t queue_zone_write_granularity_show(struct request_queue *q, - char *page) -{ - return queue_var_show(queue_zone_write_granularity(q), page); -} - +/* + * For zone append queue_max_zone_append_sectors does not just return the + * underlying queue limits, but actually contains a calculation. Because of + * that we can't simply use QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES here. + */ static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page) { unsigned long long max_sectors = queue_max_zone_append_sectors(q); @@ -270,23 +215,6 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) return ret; } -static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) -{ - int max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1; - - return queue_var_show(max_hw_sectors_kb, page); -} - -static ssize_t queue_virt_boundary_mask_show(struct request_queue *q, char *page) -{ - return queue_var_show(q->limits.virt_boundary_mask, page); -} - -static ssize_t queue_dma_alignment_show(struct request_queue *q, char *page) -{ - return queue_var_show(queue_dma_alignment(q), page); -} - static ssize_t queue_feature_store(struct request_queue *q, const char *page, size_t count, blk_features_t feature) { @@ -325,6 +253,16 @@ QUEUE_SYSFS_FEATURE(add_random, BLK_FEAT_ADD_RANDOM) QUEUE_SYSFS_FEATURE(iostats, BLK_FEAT_IO_STAT) QUEUE_SYSFS_FEATURE(stable_writes, BLK_FEAT_STABLE_WRITES); +#define QUEUE_SYSFS_FEATURE_SHOW(_name, _feature) \ +static ssize_t queue_##_name##_show(struct request_queue *q, char *page) \ +{ \ + return sprintf(page, "%u\n", !!(q->limits.features & _feature)); \ +} + +QUEUE_SYSFS_FEATURE_SHOW(poll, BLK_FEAT_POLL); +QUEUE_SYSFS_FEATURE_SHOW(fua, BLK_FEAT_FUA); +QUEUE_SYSFS_FEATURE_SHOW(dax, BLK_FEAT_DAX); + static ssize_t queue_zoned_show(struct request_queue *q, char *page) { if (blk_queue_is_zoned(q)) @@ -337,16 +275,6 @@ static ssize_t queue_nr_zones_show(struct request_queue *q, char *page) return queue_var_show(disk_nr_zones(q->disk), page); } -static ssize_t queue_max_open_zones_show(struct request_queue *q, char *page) -{ - return queue_var_show(bdev_max_open_zones(q->disk->part0), page); -} - -static ssize_t queue_max_active_zones_show(struct request_queue *q, char *page) -{ - return queue_var_show(bdev_max_active_zones(q->disk->part0), page); -} - static ssize_t queue_nomerges_show(struct request_queue *q, char *page) { return queue_var_show((blk_queue_nomerges(q) << 1) | @@ -405,22 +333,12 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) return ret; } -static ssize_t queue_poll_delay_show(struct request_queue *q, char *page) -{ - return sprintf(page, "%d\n", -1); -} - static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page, size_t count) { return count; } -static ssize_t queue_poll_show(struct request_queue *q, char *page) -{ - return queue_var_show(!!(q->limits.features & BLK_FEAT_POLL), page); -} - static ssize_t queue_poll_store(struct request_queue *q, const char *page, size_t count) { @@ -485,16 +403,6 @@ static ssize_t queue_wc_store(struct request_queue *q, const char *page, return count; } -static ssize_t queue_fua_show(struct request_queue *q, char *page) -{ - return sprintf(page, "%u\n", !!(q->limits.features & BLK_FEAT_FUA)); -} - -static ssize_t queue_dax_show(struct request_queue *q, char *page) -{ - return queue_var_show(!!blk_queue_dax(q), page); -} - #define QUEUE_RO_ENTRY(_prefix, _name) \ static struct queue_sysfs_entry _prefix##_entry = { \ .attr = { .name = _name, .mode = 0444 }, \ @@ -525,17 +433,18 @@ QUEUE_RO_ENTRY(queue_io_opt, "optimal_io_size"); QUEUE_RO_ENTRY(queue_max_discard_segments, "max_discard_segments"); QUEUE_RO_ENTRY(queue_discard_granularity, "discard_granularity"); -QUEUE_RO_ENTRY(queue_discard_max_hw, "discard_max_hw_bytes"); -QUEUE_RW_ENTRY(queue_discard_max, "discard_max_bytes"); +QUEUE_RO_ENTRY(queue_max_hw_discard_sectors, "discard_max_hw_bytes"); +QUEUE_RW_ENTRY(queue_max_discard_sectors, "discard_max_bytes"); QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data"); -QUEUE_RO_ENTRY(queue_atomic_write_max_bytes, "atomic_write_max_bytes"); -QUEUE_RO_ENTRY(queue_atomic_write_boundary, "atomic_write_boundary_bytes"); +QUEUE_RO_ENTRY(queue_atomic_write_max_sectors, "atomic_write_max_bytes"); +QUEUE_RO_ENTRY(queue_atomic_write_boundary_sectors, + "atomic_write_boundary_bytes"); QUEUE_RO_ENTRY(queue_atomic_write_unit_max, "atomic_write_unit_max_bytes"); QUEUE_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes"); QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); -QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes"); +QUEUE_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes"); QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes"); QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); @@ -652,15 +561,15 @@ static struct attribute *queue_attrs[] = { &queue_io_min_entry.attr, &queue_io_opt_entry.attr, &queue_discard_granularity_entry.attr, - &queue_discard_max_entry.attr, - &queue_discard_max_hw_entry.attr, + &queue_max_discard_sectors_entry.attr, + &queue_max_hw_discard_sectors_entry.attr, &queue_discard_zeroes_data_entry.attr, - &queue_atomic_write_max_bytes_entry.attr, - &queue_atomic_write_boundary_entry.attr, + &queue_atomic_write_max_sectors_entry.attr, + &queue_atomic_write_boundary_sectors_entry.attr, &queue_atomic_write_unit_min_entry.attr, &queue_atomic_write_unit_max_entry.attr, &queue_write_same_max_entry.attr, - &queue_write_zeroes_max_entry.attr, + &queue_max_write_zeroes_sectors_entry.attr, &queue_zone_append_max_entry.attr, &queue_zone_write_granularity_entry.attr, &queue_rotational_entry.attr, From 62e35f942231e372f8e465d8484de66a60221226 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Jun 2024 13:14:03 +0200 Subject: [PATCH 1530/1578] block: pass a gendisk to the queue_sysfs_entry methods The kobject for the queue entries is embedded into a struct gendisk. Pass it to the sysfs methods instead of the request_queue derived from it. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240627111407.476276-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 180 +++++++++++++++++++++++----------------------- block/elevator.c | 9 +-- block/elevator.h | 4 +- 3 files changed, 96 insertions(+), 97 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index a769fb441b58da..60116d13cb8043 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -22,8 +22,8 @@ struct queue_sysfs_entry { struct attribute attr; - ssize_t (*show)(struct request_queue *, char *); - ssize_t (*store)(struct request_queue *, const char *, size_t); + ssize_t (*show)(struct gendisk *disk, char *page); + ssize_t (*store)(struct gendisk *disk, const char *page, size_t count); }; static ssize_t @@ -47,18 +47,18 @@ queue_var_store(unsigned long *var, const char *page, size_t count) return count; } -static ssize_t queue_requests_show(struct request_queue *q, char *page) +static ssize_t queue_requests_show(struct gendisk *disk, char *page) { - return queue_var_show(q->nr_requests, page); + return queue_var_show(disk->queue->nr_requests, page); } static ssize_t -queue_requests_store(struct request_queue *q, const char *page, size_t count) +queue_requests_store(struct gendisk *disk, const char *page, size_t count) { unsigned long nr; int ret, err; - if (!queue_is_mq(q)) + if (!queue_is_mq(disk->queue)) return -EINVAL; ret = queue_var_store(&nr, page, count); @@ -68,42 +68,35 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; - err = blk_mq_update_nr_requests(q, nr); + err = blk_mq_update_nr_requests(disk->queue, nr); if (err) return err; return ret; } -static ssize_t queue_ra_show(struct request_queue *q, char *page) +static ssize_t queue_ra_show(struct gendisk *disk, char *page) { - unsigned long ra_kb; - - if (!q->disk) - return -EINVAL; - ra_kb = q->disk->bdi->ra_pages << (PAGE_SHIFT - 10); - return queue_var_show(ra_kb, page); + return queue_var_show(disk->bdi->ra_pages << (PAGE_SHIFT - 10), page); } static ssize_t -queue_ra_store(struct request_queue *q, const char *page, size_t count) +queue_ra_store(struct gendisk *disk, const char *page, size_t count) { unsigned long ra_kb; ssize_t ret; - if (!q->disk) - return -EINVAL; ret = queue_var_store(&ra_kb, page, count); if (ret < 0) return ret; - q->disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); + disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); return ret; } #define QUEUE_SYSFS_LIMIT_SHOW(_field) \ -static ssize_t queue_##_field##_show(struct request_queue *q, char *page) \ +static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \ { \ - return queue_var_show(q->limits._field, page); \ + return queue_var_show(disk->queue->limits._field, page); \ } QUEUE_SYSFS_LIMIT_SHOW(max_segments) @@ -125,10 +118,11 @@ QUEUE_SYSFS_LIMIT_SHOW(atomic_write_unit_min) QUEUE_SYSFS_LIMIT_SHOW(atomic_write_unit_max) #define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(_field) \ -static ssize_t queue_##_field##_show(struct request_queue *q, char *page) \ +static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \ { \ return sprintf(page, "%llu\n", \ - (unsigned long long)q->limits._field << SECTOR_SHIFT); \ + (unsigned long long)disk->queue->limits._field << \ + SECTOR_SHIFT); \ } QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_discard_sectors) @@ -138,16 +132,16 @@ QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_max_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_boundary_sectors) #define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(_field) \ -static ssize_t queue_##_field##_show(struct request_queue *q, char *page) \ +static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \ { \ - return queue_var_show(q->limits._field >> 1, page); \ + return queue_var_show(disk->queue->limits._field >> 1, page); \ } QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(max_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(max_hw_sectors) #define QUEUE_SYSFS_SHOW_CONST(_name, _val) \ -static ssize_t queue_##_name##_show(struct request_queue *q, char *page) \ +static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ { \ return sprintf(page, "%d\n", _val); \ } @@ -157,7 +151,7 @@ QUEUE_SYSFS_SHOW_CONST(discard_zeroes_data, 0) QUEUE_SYSFS_SHOW_CONST(write_same_max, 0) QUEUE_SYSFS_SHOW_CONST(poll_delay, -1) -static ssize_t queue_max_discard_sectors_store(struct request_queue *q, +static ssize_t queue_max_discard_sectors_store(struct gendisk *disk, const char *page, size_t count) { unsigned long max_discard_bytes; @@ -169,15 +163,15 @@ static ssize_t queue_max_discard_sectors_store(struct request_queue *q, if (ret < 0) return ret; - if (max_discard_bytes & (q->limits.discard_granularity - 1)) + if (max_discard_bytes & (disk->queue->limits.discard_granularity - 1)) return -EINVAL; if ((max_discard_bytes >> SECTOR_SHIFT) > UINT_MAX) return -EINVAL; - lim = queue_limits_start_update(q); + lim = queue_limits_start_update(disk->queue); lim.max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT; - err = queue_limits_commit_update(q, &lim); + err = queue_limits_commit_update(disk->queue, &lim); if (err) return err; return ret; @@ -188,15 +182,15 @@ static ssize_t queue_max_discard_sectors_store(struct request_queue *q, * underlying queue limits, but actually contains a calculation. Because of * that we can't simply use QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES here. */ -static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page) +static ssize_t queue_zone_append_max_show(struct gendisk *disk, char *page) { - unsigned long long max_sectors = queue_max_zone_append_sectors(q); - - return sprintf(page, "%llu\n", max_sectors << SECTOR_SHIFT); + return sprintf(page, "%llu\n", + (u64)queue_max_zone_append_sectors(disk->queue) << + SECTOR_SHIFT); } static ssize_t -queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) +queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count) { unsigned long max_sectors_kb; struct queue_limits lim; @@ -207,15 +201,15 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) if (ret < 0) return ret; - lim = queue_limits_start_update(q); + lim = queue_limits_start_update(disk->queue); lim.max_user_sectors = max_sectors_kb << 1; - err = queue_limits_commit_update(q, &lim); + err = queue_limits_commit_update(disk->queue, &lim); if (err) return err; return ret; } -static ssize_t queue_feature_store(struct request_queue *q, const char *page, +static ssize_t queue_feature_store(struct gendisk *disk, const char *page, size_t count, blk_features_t feature) { struct queue_limits lim; @@ -226,26 +220,27 @@ static ssize_t queue_feature_store(struct request_queue *q, const char *page, if (ret < 0) return ret; - lim = queue_limits_start_update(q); + lim = queue_limits_start_update(disk->queue); if (val) lim.features |= feature; else lim.features &= ~feature; - ret = queue_limits_commit_update(q, &lim); + ret = queue_limits_commit_update(disk->queue, &lim); if (ret) return ret; return count; } -#define QUEUE_SYSFS_FEATURE(_name, _feature) \ -static ssize_t queue_##_name##_show(struct request_queue *q, char *page) \ -{ \ - return sprintf(page, "%u\n", !!(q->limits.features & _feature)); \ -} \ -static ssize_t queue_##_name##_store(struct request_queue *q, \ - const char *page, size_t count) \ -{ \ - return queue_feature_store(q, page, count, _feature); \ +#define QUEUE_SYSFS_FEATURE(_name, _feature) \ +static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ +{ \ + return sprintf(page, "%u\n", \ + !!(disk->queue->limits.features & _feature)); \ +} \ +static ssize_t queue_##_name##_store(struct gendisk *disk, \ + const char *page, size_t count) \ +{ \ + return queue_feature_store(disk, page, count, _feature); \ } QUEUE_SYSFS_FEATURE(rotational, BLK_FEAT_ROTATIONAL) @@ -253,35 +248,36 @@ QUEUE_SYSFS_FEATURE(add_random, BLK_FEAT_ADD_RANDOM) QUEUE_SYSFS_FEATURE(iostats, BLK_FEAT_IO_STAT) QUEUE_SYSFS_FEATURE(stable_writes, BLK_FEAT_STABLE_WRITES); -#define QUEUE_SYSFS_FEATURE_SHOW(_name, _feature) \ -static ssize_t queue_##_name##_show(struct request_queue *q, char *page) \ -{ \ - return sprintf(page, "%u\n", !!(q->limits.features & _feature)); \ +#define QUEUE_SYSFS_FEATURE_SHOW(_name, _feature) \ +static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ +{ \ + return sprintf(page, "%u\n", \ + !!(disk->queue->limits.features & _feature)); \ } QUEUE_SYSFS_FEATURE_SHOW(poll, BLK_FEAT_POLL); QUEUE_SYSFS_FEATURE_SHOW(fua, BLK_FEAT_FUA); QUEUE_SYSFS_FEATURE_SHOW(dax, BLK_FEAT_DAX); -static ssize_t queue_zoned_show(struct request_queue *q, char *page) +static ssize_t queue_zoned_show(struct gendisk *disk, char *page) { - if (blk_queue_is_zoned(q)) + if (blk_queue_is_zoned(disk->queue)) return sprintf(page, "host-managed\n"); return sprintf(page, "none\n"); } -static ssize_t queue_nr_zones_show(struct request_queue *q, char *page) +static ssize_t queue_nr_zones_show(struct gendisk *disk, char *page) { - return queue_var_show(disk_nr_zones(q->disk), page); + return queue_var_show(disk_nr_zones(disk), page); } -static ssize_t queue_nomerges_show(struct request_queue *q, char *page) +static ssize_t queue_nomerges_show(struct gendisk *disk, char *page) { - return queue_var_show((blk_queue_nomerges(q) << 1) | - blk_queue_noxmerges(q), page); + return queue_var_show((blk_queue_nomerges(disk->queue) << 1) | + blk_queue_noxmerges(disk->queue), page); } -static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, +static ssize_t queue_nomerges_store(struct gendisk *disk, const char *page, size_t count) { unsigned long nm; @@ -290,29 +286,30 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, if (ret < 0) return ret; - blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, q); - blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); + blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, disk->queue); + blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, disk->queue); if (nm == 2) - blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); + blk_queue_flag_set(QUEUE_FLAG_NOMERGES, disk->queue); else if (nm) - blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); + blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, disk->queue); return ret; } -static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page) +static ssize_t queue_rq_affinity_show(struct gendisk *disk, char *page) { - bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags); - bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags); + bool set = test_bit(QUEUE_FLAG_SAME_COMP, &disk->queue->queue_flags); + bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &disk->queue->queue_flags); return queue_var_show(set << force, page); } static ssize_t -queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) +queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count) { ssize_t ret = -EINVAL; #ifdef CONFIG_SMP + struct request_queue *q = disk->queue; unsigned long val; ret = queue_var_store(&val, page, count); @@ -333,28 +330,28 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) return ret; } -static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page, +static ssize_t queue_poll_delay_store(struct gendisk *disk, const char *page, size_t count) { return count; } -static ssize_t queue_poll_store(struct request_queue *q, const char *page, +static ssize_t queue_poll_store(struct gendisk *disk, const char *page, size_t count) { - if (!(q->limits.features & BLK_FEAT_POLL)) + if (!(disk->queue->limits.features & BLK_FEAT_POLL)) return -EINVAL; pr_info_ratelimited("writes to the poll attribute are ignored.\n"); pr_info_ratelimited("please use driver specific parameters instead.\n"); return count; } -static ssize_t queue_io_timeout_show(struct request_queue *q, char *page) +static ssize_t queue_io_timeout_show(struct gendisk *disk, char *page) { - return sprintf(page, "%u\n", jiffies_to_msecs(q->rq_timeout)); + return sprintf(page, "%u\n", jiffies_to_msecs(disk->queue->rq_timeout)); } -static ssize_t queue_io_timeout_store(struct request_queue *q, const char *page, +static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page, size_t count) { unsigned int val; @@ -364,19 +361,19 @@ static ssize_t queue_io_timeout_store(struct request_queue *q, const char *page, if (err || val == 0) return -EINVAL; - blk_queue_rq_timeout(q, msecs_to_jiffies(val)); + blk_queue_rq_timeout(disk->queue, msecs_to_jiffies(val)); return count; } -static ssize_t queue_wc_show(struct request_queue *q, char *page) +static ssize_t queue_wc_show(struct gendisk *disk, char *page) { - if (blk_queue_write_cache(q)) + if (blk_queue_write_cache(disk->queue)) return sprintf(page, "write back\n"); return sprintf(page, "write through\n"); } -static ssize_t queue_wc_store(struct request_queue *q, const char *page, +static ssize_t queue_wc_store(struct gendisk *disk, const char *page, size_t count) { struct queue_limits lim; @@ -392,12 +389,12 @@ static ssize_t queue_wc_store(struct request_queue *q, const char *page, return -EINVAL; } - lim = queue_limits_start_update(q); + lim = queue_limits_start_update(disk->queue); if (disable) lim.flags |= BLK_FLAG_WRITE_CACHE_DISABLED; else lim.flags &= ~BLK_FLAG_WRITE_CACHE_DISABLED; - err = queue_limits_commit_update(q, &lim); + err = queue_limits_commit_update(disk->queue, &lim); if (err) return err; return count; @@ -489,20 +486,22 @@ static ssize_t queue_var_store64(s64 *var, const char *page) return 0; } -static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) +static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page) { - if (!wbt_rq_qos(q)) + if (!wbt_rq_qos(disk->queue)) return -EINVAL; - if (wbt_disabled(q)) + if (wbt_disabled(disk->queue)) return sprintf(page, "0\n"); - return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); + return sprintf(page, "%llu\n", + div_u64(wbt_get_min_lat(disk->queue), 1000)); } -static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, +static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, size_t count) { + struct request_queue *q = disk->queue; struct rq_qos *rqos; ssize_t ret; s64 val; @@ -515,7 +514,7 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, rqos = wbt_rq_qos(q); if (!rqos) { - ret = wbt_init(q->disk); + ret = wbt_init(disk); if (ret) return ret; } @@ -649,14 +648,13 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct queue_sysfs_entry *entry = to_queue(attr); struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); - struct request_queue *q = disk->queue; ssize_t res; if (!entry->show) return -EIO; - mutex_lock(&q->sysfs_lock); - res = entry->show(q, page); - mutex_unlock(&q->sysfs_lock); + mutex_lock(&disk->queue->sysfs_lock); + res = entry->show(disk, page); + mutex_unlock(&disk->queue->sysfs_lock); return res; } @@ -674,7 +672,7 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, blk_mq_freeze_queue(q); mutex_lock(&q->sysfs_lock); - res = entry->store(q, page, length); + res = entry->store(disk, page, length); mutex_unlock(&q->sysfs_lock); blk_mq_unfreeze_queue(q); return res; diff --git a/block/elevator.c b/block/elevator.c index f64ebd726e588a..f13d552a32c8b8 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -709,24 +709,25 @@ static int elevator_change(struct request_queue *q, const char *elevator_name) return ret; } -ssize_t elv_iosched_store(struct request_queue *q, const char *buf, +ssize_t elv_iosched_store(struct gendisk *disk, const char *buf, size_t count) { char elevator_name[ELV_NAME_MAX]; int ret; - if (!elv_support_iosched(q)) + if (!elv_support_iosched(disk->queue)) return count; strscpy(elevator_name, buf, sizeof(elevator_name)); - ret = elevator_change(q, strstrip(elevator_name)); + ret = elevator_change(disk->queue, strstrip(elevator_name)); if (!ret) return count; return ret; } -ssize_t elv_iosched_show(struct request_queue *q, char *name) +ssize_t elv_iosched_show(struct gendisk *disk, char *name) { + struct request_queue *q = disk->queue; struct elevator_queue *eq = q->elevator; struct elevator_type *cur = NULL, *e; int len = 0; diff --git a/block/elevator.h b/block/elevator.h index e9a050a96e5305..3fe18e1a869275 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -147,8 +147,8 @@ extern void elv_unregister(struct elevator_type *); /* * io scheduler sysfs switching */ -extern ssize_t elv_iosched_show(struct request_queue *, char *); -extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t); +ssize_t elv_iosched_show(struct gendisk *disk, char *page); +ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count); extern bool elv_bio_merge_ok(struct request *, struct bio *); extern struct elevator_queue *elevator_alloc(struct request_queue *, From 093d9603b60093a9aaae942db56107f6432a5dca Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 28 Jun 2024 14:27:22 -0700 Subject: [PATCH 1531/1578] x86: stop playing stack games in profile_pc() The 'profile_pc()' function is used for timer-based profiling, which isn't really all that relevant any more to begin with, but it also ends up making assumptions based on the stack layout that aren't necessarily valid. Basically, the code tries to account the time spent in spinlocks to the caller rather than the spinlock, and while I support that as a concept, it's not worth the code complexity or the KASAN warnings when no serious profiling is done using timers anyway these days. And the code really does depend on stack layout that is only true in the simplest of cases. We've lost the comment at some point (I think when the 32-bit and 64-bit code was unified), but it used to say: Assume the lock function has either no stack frame or a copy of eflags from PUSHF. which explains why it just blindly loads a word or two straight off the stack pointer and then takes a minimal look at the values to just check if they might be eflags or the return pc: Eflags always has bits 22 and up cleared unlike kernel addresses but that basic stack layout assumption assumes that there isn't any lock debugging etc going on that would complicate the code and cause a stack frame. It causes KASAN unhappiness reported for years by syzkaller [1] and others [2]. With no real practical reason for this any more, just remove the code. Just for historical interest, here's some background commits relating to this code from 2006: 0cb91a229364 ("i386: Account spinlocks to the caller during profiling for !FP kernels") 31679f38d886 ("Simplify profile_pc on x86-64") and a code unification from 2009: ef4512882dbe ("x86: time_32/64.c unify profile_pc") but the basics of this thing actually goes back to before the git tree. Link: https://syzkaller.appspot.com/bug?extid=84fe685c02cd112a2ac3 [1] Link: https://lore.kernel.org/all/CAK55_s7Xyq=nh97=K=G1sxueOFrJDAvPOJAL4TPTCAYvmxO9_A@mail.gmail.com/ [2] Signed-off-by: Linus Torvalds --- arch/x86/kernel/time.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index e42faa792c0793..52e1f3f0b361ce 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -27,25 +27,7 @@ unsigned long profile_pc(struct pt_regs *regs) { - unsigned long pc = instruction_pointer(regs); - - if (!user_mode(regs) && in_lock_functions(pc)) { -#ifdef CONFIG_FRAME_POINTER - return *(unsigned long *)(regs->bp + sizeof(long)); -#else - unsigned long *sp = (unsigned long *)regs->sp; - /* - * Return address is either directly at stack pointer - * or above a saved flags. Eflags has bits 22-31 zero, - * kernel addresses don't. - */ - if (sp[0] >> 22) - return sp[0]; - if (sp[1] >> 22) - return sp[1]; -#endif - } - return pc; + return instruction_pointer(regs); } EXPORT_SYMBOL(profile_pc); From 769327258a141ba80ac8b96fce35c68631228370 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 25 Jun 2024 17:50:04 -0700 Subject: [PATCH 1532/1578] x86-32: fix cmpxchg8b_emu build error with clang The kernel test robot reported that clang no longer compiles the 32-bit x86 kernel in some configurations due to commit 95ece48165c1 ("locking/atomic/x86: Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions"). The build fails with arch/x86/include/asm/cmpxchg_32.h:149:9: error: inline assembly requires more registers than available and the reason seems to be that not only does the cmpxchg8b instruction need four fixed registers (EDX:EAX and ECX:EBX), with the emulation fallback the inline asm also wants a fifth fixed register for the address (it uses %esi for that, but that's just a software convention with cmpxchg8b_emu). Avoiding using another pointer input to the asm (and just forcing it to use the "0(%esi)" addressing that we end up requiring for the sw fallback) seems to fix the issue. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202406230912.F6XFIyA6-lkp@intel.com/ Fixes: 95ece48165c1 ("locking/atomic/x86: Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions") Link: https://lore.kernel.org/all/202406230912.F6XFIyA6-lkp@intel.com/ Suggested-by: Uros Bizjak Reviewed-and-Tested-by: Uros Bizjak Signed-off-by: Linus Torvalds --- arch/x86/include/asm/cmpxchg_32.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index ed2797f132ce09..62cef2113ca749 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -93,10 +93,9 @@ static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, \ asm volatile(ALTERNATIVE(_lock_loc \ "call cmpxchg8b_emu", \ - _lock "cmpxchg8b %[ptr]", X86_FEATURE_CX8) \ - : [ptr] "+m" (*(_ptr)), \ - "+a" (o.low), "+d" (o.high) \ - : "b" (n.low), "c" (n.high), "S" (_ptr) \ + _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \ + : "+a" (o.low), "+d" (o.high) \ + : "b" (n.low), "c" (n.high), [ptr] "S" (_ptr) \ : "memory"); \ \ o.full; \ @@ -122,12 +121,11 @@ static __always_inline u64 arch_cmpxchg64_local(volatile u64 *ptr, u64 old, u64 \ asm volatile(ALTERNATIVE(_lock_loc \ "call cmpxchg8b_emu", \ - _lock "cmpxchg8b %[ptr]", X86_FEATURE_CX8) \ + _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \ CC_SET(e) \ : CC_OUT(e) (ret), \ - [ptr] "+m" (*(_ptr)), \ "+a" (o.low), "+d" (o.high) \ - : "b" (n.low), "c" (n.high), "S" (_ptr) \ + : "b" (n.low), "c" (n.high), [ptr] "S" (_ptr) \ : "memory"); \ \ if (unlikely(!ret)) \ From 5d92c7c566dc76d96e0e19e481d926bbe6631c1e Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Sat, 29 Jun 2024 14:42:11 +0200 Subject: [PATCH 1533/1578] ata: libata-core: Fix null pointer dereference on error If the ata_port_alloc() call in ata_host_alloc() fails, ata_host_release() will get called. However, the code in ata_host_release() tries to free ata_port struct members unconditionally, which can lead to the following: BUG: unable to handle page fault for address: 0000000000003990 PGD 0 P4D 0 Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 10 PID: 594 Comm: (udev-worker) Not tainted 6.10.0-rc5 #44 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-2.fc40 04/01/2014 RIP: 0010:ata_host_release.cold+0x2f/0x6e [libata] Code: e4 4d 63 f4 44 89 e2 48 c7 c6 90 ad 32 c0 48 c7 c7 d0 70 33 c0 49 83 c6 0e 41 RSP: 0018:ffffc90000ebb968 EFLAGS: 00010246 RAX: 0000000000000041 RBX: ffff88810fb52e78 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff88813b3218c0 RDI: ffff88813b3218c0 RBP: ffff88810fb52e40 R08: 0000000000000000 R09: 6c65725f74736f68 R10: ffffc90000ebb738 R11: 73692033203a746e R12: 0000000000000004 R13: 0000000000000000 R14: 0000000000000011 R15: 0000000000000006 FS: 00007f6cc55b9980(0000) GS:ffff88813b300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000003990 CR3: 00000001122a2000 CR4: 0000000000750ef0 PKRU: 55555554 Call Trace: ? __die_body.cold+0x19/0x27 ? page_fault_oops+0x15a/0x2f0 ? exc_page_fault+0x7e/0x180 ? asm_exc_page_fault+0x26/0x30 ? ata_host_release.cold+0x2f/0x6e [libata] ? ata_host_release.cold+0x2f/0x6e [libata] release_nodes+0x35/0xb0 devres_release_group+0x113/0x140 ata_host_alloc+0xed/0x120 [libata] ata_host_alloc_pinfo+0x14/0xa0 [libata] ahci_init_one+0x6c9/0xd20 [ahci] Do not access ata_port struct members unconditionally. Fixes: 633273a3ed1c ("libata-pmp: hook PMP support and enable it") Cc: stable@vger.kernel.org Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: John Garry Link: https://lore.kernel.org/r/20240629124210.181537-7-cassel@kernel.org Signed-off-by: Niklas Cassel --- drivers/ata/libata-core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index efb5195da60c56..bdccf4ea251ae0 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -5517,6 +5517,9 @@ static void ata_host_release(struct kref *kref) for (i = 0; i < host->n_ports; i++) { struct ata_port *ap = host->ports[i]; + if (!ap) + continue; + kfree(ap->pmp_link); kfree(ap->slave_link); kfree(ap->ncq_sense_buf); From f6549f538fe0b2c389e1a7037f4e21039e25137a Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Sat, 29 Jun 2024 14:42:12 +0200 Subject: [PATCH 1534/1578] ata,scsi: libata-core: Do not leak memory for ata_port struct members libsas is currently not freeing all the struct ata_port struct members, e.g. ncq_sense_buf for a driver supporting Command Duration Limits (CDL). Add a function, ata_port_free(), that is used to free a ata_port, including its struct members. It makes sense to keep the code related to freeing a ata_port in its own function, which will also free all the struct members of struct ata_port. Fixes: 18bd7718b5c4 ("scsi: ata: libata: Handle completion of CDL commands using policy 0xD") Reviewed-by: John Garry Link: https://lore.kernel.org/r/20240629124210.181537-8-cassel@kernel.org Signed-off-by: Niklas Cassel --- drivers/ata/libata-core.c | 24 ++++++++++++++---------- drivers/scsi/libsas/sas_ata.c | 6 +++--- drivers/scsi/libsas/sas_discover.c | 2 +- include/linux/libata.h | 1 + 4 files changed, 19 insertions(+), 14 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index bdccf4ea251ae0..27d22bc43a952a 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -5489,6 +5489,18 @@ struct ata_port *ata_port_alloc(struct ata_host *host) return ap; } +void ata_port_free(struct ata_port *ap) +{ + if (!ap) + return; + + kfree(ap->pmp_link); + kfree(ap->slave_link); + kfree(ap->ncq_sense_buf); + kfree(ap); +} +EXPORT_SYMBOL_GPL(ata_port_free); + static void ata_devres_release(struct device *gendev, void *res) { struct ata_host *host = dev_get_drvdata(gendev); @@ -5515,15 +5527,7 @@ static void ata_host_release(struct kref *kref) int i; for (i = 0; i < host->n_ports; i++) { - struct ata_port *ap = host->ports[i]; - - if (!ap) - continue; - - kfree(ap->pmp_link); - kfree(ap->slave_link); - kfree(ap->ncq_sense_buf); - kfree(ap); + ata_port_free(host->ports[i]); host->ports[i] = NULL; } kfree(host); @@ -5906,7 +5910,7 @@ int ata_host_register(struct ata_host *host, const struct scsi_host_template *sh * allocation time. */ for (i = host->n_ports; host->ports[i]; i++) - kfree(host->ports[i]); + ata_port_free(host->ports[i]); /* give ports names and add SCSI hosts */ for (i = 0; i < host->n_ports; i++) { diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c index 4c69fc63c11966..cbbe43d8ef87bd 100644 --- a/drivers/scsi/libsas/sas_ata.c +++ b/drivers/scsi/libsas/sas_ata.c @@ -610,15 +610,15 @@ int sas_ata_init(struct domain_device *found_dev) rc = ata_sas_tport_add(ata_host->dev, ap); if (rc) - goto destroy_port; + goto free_port; found_dev->sata_dev.ata_host = ata_host; found_dev->sata_dev.ap = ap; return 0; -destroy_port: - kfree(ap); +free_port: + ata_port_free(ap); free_host: ata_host_put(ata_host); return rc; diff --git a/drivers/scsi/libsas/sas_discover.c b/drivers/scsi/libsas/sas_discover.c index 8fb7c41c096245..48d975c6dbf2cd 100644 --- a/drivers/scsi/libsas/sas_discover.c +++ b/drivers/scsi/libsas/sas_discover.c @@ -301,7 +301,7 @@ void sas_free_device(struct kref *kref) if (dev_is_sata(dev) && dev->sata_dev.ap) { ata_sas_tport_delete(dev->sata_dev.ap); - kfree(dev->sata_dev.ap); + ata_port_free(dev->sata_dev.ap); ata_host_put(dev->sata_dev.ata_host); dev->sata_dev.ata_host = NULL; dev->sata_dev.ap = NULL; diff --git a/include/linux/libata.h b/include/linux/libata.h index 13fb41d25da608..7d3bd7c9664ab3 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1249,6 +1249,7 @@ extern int ata_slave_link_init(struct ata_port *ap); extern struct ata_port *ata_sas_port_alloc(struct ata_host *, struct ata_port_info *, struct Scsi_Host *); extern void ata_port_probe(struct ata_port *ap); +extern void ata_port_free(struct ata_port *ap); extern int ata_sas_tport_add(struct device *parent, struct ata_port *ap); extern void ata_sas_tport_delete(struct ata_port *ap); int ata_sas_device_configure(struct scsi_device *sdev, struct queue_limits *lim, From ab9e0c529eb7cafebdd31fe1644524e80a48b05d Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Sat, 29 Jun 2024 14:42:13 +0200 Subject: [PATCH 1535/1578] ata: libata-core: Fix double free on error If e.g. the ata_port_alloc() call in ata_host_alloc() fails, we will jump to the err_out label, which will call devres_release_group(). devres_release_group() will trigger a call to ata_host_release(). ata_host_release() calls kfree(host), so executing the kfree(host) in ata_host_alloc() will lead to a double free: kernel BUG at mm/slub.c:553! Oops: invalid opcode: 0000 [#1] PREEMPT SMP NOPTI CPU: 11 PID: 599 Comm: (udev-worker) Not tainted 6.10.0-rc5 #47 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-2.fc40 04/01/2014 RIP: 0010:kfree+0x2cf/0x2f0 Code: 5d 41 5e 41 5f 5d e9 80 d6 ff ff 4d 89 f1 41 b8 01 00 00 00 48 89 d9 48 89 da RSP: 0018:ffffc90000f377f0 EFLAGS: 00010246 RAX: ffff888112b1f2c0 RBX: ffff888112b1f2c0 RCX: ffff888112b1f320 RDX: 000000000000400b RSI: ffffffffc02c9de5 RDI: ffff888112b1f2c0 RBP: ffffc90000f37830 R08: 0000000000000000 R09: 0000000000000000 R10: ffffc90000f37610 R11: 617461203a736b6e R12: ffffea00044ac780 R13: ffff888100046400 R14: ffffffffc02c9de5 R15: 0000000000000006 FS: 00007f2f1cabe980(0000) GS:ffff88813b380000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f2f1c3acf75 CR3: 0000000111724000 CR4: 0000000000750ef0 PKRU: 55555554 Call Trace: ? __die_body.cold+0x19/0x27 ? die+0x2e/0x50 ? do_trap+0xca/0x110 ? do_error_trap+0x6a/0x90 ? kfree+0x2cf/0x2f0 ? exc_invalid_op+0x50/0x70 ? kfree+0x2cf/0x2f0 ? asm_exc_invalid_op+0x1a/0x20 ? ata_host_alloc+0xf5/0x120 [libata] ? ata_host_alloc+0xf5/0x120 [libata] ? kfree+0x2cf/0x2f0 ata_host_alloc+0xf5/0x120 [libata] ata_host_alloc_pinfo+0x14/0xa0 [libata] ahci_init_one+0x6c9/0xd20 [ahci] Ensure that we will not call kfree(host) twice, by performing the kfree() only if the devres_open_group() call failed. Fixes: dafd6c496381 ("libata: ensure host is free'd on error exit paths") Cc: stable@vger.kernel.org Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240629124210.181537-9-cassel@kernel.org Signed-off-by: Niklas Cassel --- drivers/ata/libata-core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 27d22bc43a952a..74b59b78d278ce 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -5577,8 +5577,10 @@ struct ata_host *ata_host_alloc(struct device *dev, int max_ports) if (!host) return NULL; - if (!devres_open_group(dev, NULL, GFP_KERNEL)) - goto err_free; + if (!devres_open_group(dev, NULL, GFP_KERNEL)) { + kfree(host); + return NULL; + } dr = devres_alloc(ata_devres_release, 0, GFP_KERNEL); if (!dr) @@ -5610,8 +5612,6 @@ struct ata_host *ata_host_alloc(struct device *dev, int max_ports) err_out: devres_release_group(dev, NULL); - err_free: - kfree(host); return NULL; } EXPORT_SYMBOL_GPL(ata_host_alloc); From eeb25a09c5e0805d92e4ebd12c4b0ad0df1b0295 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Sat, 29 Jun 2024 14:42:14 +0200 Subject: [PATCH 1536/1578] ata: ahci: Clean up sysfs file on error .probe() (ahci_init_one()) calls sysfs_add_file_to_group(), however, if probe() fails after this call, we currently never call sysfs_remove_file_from_group(). (The sysfs_remove_file_from_group() call in .remove() (ahci_remove_one()) does not help, as .remove() is not called on .probe() error.) Thus, if probe() fails after the sysfs_add_file_to_group() call, the next time we insmod the module we will get: sysfs: cannot create duplicate filename '/devices/pci0000:00/0000:00:04.0/remapped_nvme' CPU: 11 PID: 954 Comm: modprobe Not tainted 6.10.0-rc5 #43 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-2.fc40 04/01/2014 Call Trace: dump_stack_lvl+0x5d/0x80 sysfs_warn_dup.cold+0x17/0x23 sysfs_add_file_mode_ns+0x11a/0x130 sysfs_add_file_to_group+0x7e/0xc0 ahci_init_one+0x31f/0xd40 [ahci] Fixes: 894fba7f434a ("ata: ahci: Add sysfs attribute to show remapped NVMe device count") Cc: stable@vger.kernel.org Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240629124210.181537-10-cassel@kernel.org Signed-off-by: Niklas Cassel --- drivers/ata/ahci.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 5eb38fbbbecdbe..fc6fd583faf8a7 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -1975,8 +1975,10 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) n_ports = max(ahci_nr_ports(hpriv->cap), fls(hpriv->port_map)); host = ata_host_alloc_pinfo(&pdev->dev, ppi, n_ports); - if (!host) - return -ENOMEM; + if (!host) { + rc = -ENOMEM; + goto err_rm_sysfs_file; + } host->private_data = hpriv; if (ahci_init_msi(pdev, n_ports, hpriv) < 0) { @@ -2031,11 +2033,11 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) /* initialize adapter */ rc = ahci_configure_dma_masks(pdev, hpriv); if (rc) - return rc; + goto err_rm_sysfs_file; rc = ahci_pci_reset_controller(host); if (rc) - return rc; + goto err_rm_sysfs_file; ahci_pci_init_controller(host); ahci_pci_print_info(host); @@ -2044,10 +2046,15 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) rc = ahci_host_activate(host, &ahci_sht); if (rc) - return rc; + goto err_rm_sysfs_file; pm_runtime_put_noidle(&pdev->dev); return 0; + +err_rm_sysfs_file: + sysfs_remove_file_from_group(&pdev->dev.kobj, + &dev_attr_remapped_nvme.attr, NULL); + return rc; } static void ahci_shutdown_one(struct pci_dev *pdev) From 22a40d14b572deb80c0648557f4bd502d7e83826 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 30 Jun 2024 14:40:44 -0700 Subject: [PATCH 1537/1578] Linux 6.10-rc6 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 4d36f943b3b1f4..06aa6402b38503 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 10 SUBLEVEL = 0 -EXTRAVERSION = -rc5 +EXTRAVERSION = -rc6 NAME = Baby Opossum Posse # *DOCUMENTATION* From 4e63aeb5d0101ddada36a2f64f048e2f9d2202fc Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 4 Jun 2024 11:05:22 +0800 Subject: [PATCH 1538/1578] blk-wbt: don't throttle swap writes in direct reclaim Now we avoid throttling swap writes by determining whether the current process is kswapd (aka current_is_kswapd()), but swap writes can come from either kswapd or direct reclaim, so the swap writes from direct reclaim will still be throttled. When a process holds a lock to allocate a free page, and enters direct reclaim because there is no free memory, then it might trigger a hung due to the wbt throttling that causes other processes to fail to get the lock. Both kswapd and direct reclaim set the REQ_SWAP flag, so use REQ_SWAP instead of current_is_kswapd() to avoid throttling swap writes. Also renamed WBT_KSWAPD to WBT_SWAP and WBT_RWQ_KSWAPD to WBT_RWQ_SWAP. Signed-off-by: Baokun Li Reviewed-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240604030522.3686177-1-libaokun@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-wbt.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 1a5e4b049ecd1d..6dfc659d22e2b7 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -37,7 +37,7 @@ enum wbt_flags { WBT_TRACKED = 1, /* write, tracked for throttling */ WBT_READ = 2, /* read */ - WBT_KSWAPD = 4, /* write, from kswapd */ + WBT_SWAP = 4, /* write, from swap_writepage() */ WBT_DISCARD = 8, /* discard */ WBT_NR_BITS = 4, /* number of bits */ @@ -45,7 +45,7 @@ enum wbt_flags { enum { WBT_RWQ_BG = 0, - WBT_RWQ_KSWAPD, + WBT_RWQ_SWAP, WBT_RWQ_DISCARD, WBT_NUM_RWQ, }; @@ -172,8 +172,8 @@ static bool wb_recent_wait(struct rq_wb *rwb) static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, enum wbt_flags wb_acct) { - if (wb_acct & WBT_KSWAPD) - return &rwb->rq_wait[WBT_RWQ_KSWAPD]; + if (wb_acct & WBT_SWAP) + return &rwb->rq_wait[WBT_RWQ_SWAP]; else if (wb_acct & WBT_DISCARD) return &rwb->rq_wait[WBT_RWQ_DISCARD]; @@ -528,7 +528,7 @@ static bool close_io(struct rq_wb *rwb) time_before(now, rwb->last_comp + HZ / 10); } -#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO) +#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO | REQ_SWAP) static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf) { @@ -539,13 +539,13 @@ static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf) /* * At this point we know it's a buffered write. If this is - * kswapd trying to free memory, or REQ_SYNC is set, then + * swap trying to free memory, or REQ_SYNC is set, then * it's WB_SYNC_ALL writeback, and we'll use the max limit for * that. If the write is marked as a background write, then use * the idle limit, or go to normal if we haven't had competing * IO for a bit. */ - if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd()) + if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb)) limit = rwb->rq_depth.max_depth; else if ((opf & REQ_BACKGROUND) || close_io(rwb)) { /* @@ -622,8 +622,8 @@ static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio) if (bio_op(bio) == REQ_OP_READ) { flags = WBT_READ; } else if (wbt_should_throttle(bio)) { - if (current_is_kswapd()) - flags |= WBT_KSWAPD; + if (bio->bi_opf & REQ_SWAP) + flags |= WBT_SWAP; if (bio_op(bio) == REQ_OP_DISCARD) flags |= WBT_DISCARD; flags |= WBT_TRACKED; From f62e8edc0a9fda84fe5bf32d5f5874b489d6c301 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jul 2024 07:17:50 +0200 Subject: [PATCH 1539/1578] block: remove a duplicate io_min check in blk_validate_limits If io_min is larger than the cap, it must by definition be non-zero. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Reviewed-by: Nitesh Shetty Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240701051800.1245240-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 2e559cf97cc834..ff8bbc101fedaa 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -279,8 +279,7 @@ static int blk_validate_limits(struct queue_limits *lim) } else if (lim->io_opt) { lim->max_sectors = min(max_hw_sectors, lim->io_opt >> SECTOR_SHIFT); - } else if (lim->io_min && - lim->io_min > (BLK_DEF_MAX_SECTORS_CAP << SECTOR_SHIFT)) { + } else if (lim->io_min > (BLK_DEF_MAX_SECTORS_CAP << SECTOR_SHIFT)) { lim->max_sectors = min(max_hw_sectors, lim->io_min >> SECTOR_SHIFT); } else { From 37105615f73125cb0466c09796f277a4c46d9295 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jul 2024 07:17:51 +0200 Subject: [PATCH 1540/1578] block: don't reduce max_sectors based on io_opt Don't reduce the max_sectors value below the normal cap when the driver advertsizes a very low io_opt. This restores the behavior we had before the recent changes to the max_sectors calculation. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Reviewed-by: Nitesh Shetty Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240701051800.1245240-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index ff8bbc101fedaa..9fa4eed4df06b0 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -276,7 +276,7 @@ static int blk_validate_limits(struct queue_limits *lim) if (lim->max_user_sectors < PAGE_SIZE / SECTOR_SIZE) return -EINVAL; lim->max_sectors = min(max_hw_sectors, lim->max_user_sectors); - } else if (lim->io_opt) { + } else if (lim->io_opt > (BLK_DEF_MAX_SECTORS_CAP << SECTOR_SHIFT)) { lim->max_sectors = min(max_hw_sectors, lim->io_opt >> SECTOR_SHIFT); } else if (lim->io_min > (BLK_DEF_MAX_SECTORS_CAP << SECTOR_SHIFT)) { From f3bf25d5135539603f24e377c6dec3016fbd9786 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jul 2024 07:17:52 +0200 Subject: [PATCH 1541/1578] nvme: don't set io_opt if NOWS is zero NOWS is one of the annoying "0's based values" in NVMe, where 0 means one and we thus can't detect if it isn't set. Thus a NOWS value of 0 means that the Namespace Optimal Write Size is a single LBA, which is clearly bogus. Ignore the value in that case and don't propagate an io_opt value to the block layer. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Reviewed-by: Nitesh Shetty Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240701051800.1245240-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 89ebfa89613ee6..ef00bfc6b8408a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2024,7 +2024,8 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id, /* NPWG = Namespace Preferred Write Granularity */ phys_bs = bs * (1 + le16_to_cpu(id->npwg)); /* NOWS = Namespace Optimal Write Size */ - io_opt = bs * (1 + le16_to_cpu(id->nows)); + if (id->nows) + io_opt = bs * (1 + le16_to_cpu(id->nows)); } /* From b0727b1243cd084260e47c51c7950020bfddb636 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 Jul 2024 08:40:29 -0600 Subject: [PATCH 1542/1578] io_uring/msg_ring: check for dead submitter task The change for improving the handling of the target CQE posting inadvertently dropped the NULL check for the submitter task on the target ring, reinstate that. Fixes: 0617bb500bfa ("io_uring/msg_ring: improve handling of target CQE posting") Reported-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/msg_ring.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 47a754e83b49a9..c2171495098bfe 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -86,16 +86,21 @@ static void io_msg_tw_complete(struct io_kiocb *req, struct io_tw_state *ts) percpu_ref_put(&ctx->refs); } -static void io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, - int res, u32 cflags, u64 user_data) +static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, + int res, u32 cflags, u64 user_data) { + req->task = READ_ONCE(ctx->submitter_task); + if (!req->task) { + kmem_cache_free(req_cachep, req); + return -EOWNERDEAD; + } req->cqe.user_data = user_data; io_req_set_res(req, res, cflags); percpu_ref_get(&ctx->refs); req->ctx = ctx; - req->task = READ_ONCE(ctx->submitter_task); req->io_task_work.func = io_msg_tw_complete; io_req_task_work_add_remote(req, ctx, IOU_F_TWQ_LAZY_WAKE); + return 0; } static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx) @@ -125,8 +130,8 @@ static int io_msg_data_remote(struct io_kiocb *req) if (msg->flags & IORING_MSG_RING_FLAGS_PASS) flags = msg->cqe_flags; - io_msg_remote_post(target_ctx, target, msg->len, flags, msg->user_data); - return 0; + return io_msg_remote_post(target_ctx, target, msg->len, flags, + msg->user_data); } static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) From be4f5d9c992ba1d89ce63ad9e40a99f120882038 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 Jul 2024 08:46:25 -0600 Subject: [PATCH 1543/1578] io_uring/msg_ring: use kmem_cache_free() to free request The change adding caching around the request allocated and freed for data messages changed a kmem_cache_free() to a kfree(), which isn't correct as the request came from slab in the first place. Fix that up and use the right freeing function if the cache is already at its limit. Note that the current mixing of kmem_cache_alloc and kfree is fine, but consistent alloc/free functions should be used as it's otherwise somewhat confusing. Fixes: 50cf5f3842af ("io_uring/msg_ring: add an alloc cache for io_kiocb entries") Signed-off-by: Jens Axboe --- io_uring/msg_ring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index c2171495098bfe..29fa9285a33d9e 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -82,7 +82,7 @@ static void io_msg_tw_complete(struct io_kiocb *req, struct io_tw_state *ts) spin_unlock(&ctx->msg_lock); } if (req) - kfree(req); + kmem_cache_free(req_cachep, req); percpu_ref_put(&ctx->refs); } From 8515f1661ca1f9ad63850a5e1e86599399420d2e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 Jul 2024 09:23:26 -0600 Subject: [PATCH 1544/1578] MAINTAINERS: change Pavel Begunkov from io_uring reviewer to maintainer This more accurately describes Pavel's role for the project, so let's make the change to reflect that. Signed-off-by: Jens Axboe --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index cf9c9221c38830..ad96b9bd68ac61 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11570,7 +11570,7 @@ F: include/linux/iosys-map.h IO_URING M: Jens Axboe -R: Pavel Begunkov +M: Pavel Begunkov L: io-uring@vger.kernel.org S: Maintained T: git git://git.kernel.dk/linux-block From 6259151c04d4e0085e00d2dcb471ebdd1778e72e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 May 2024 10:01:48 -0700 Subject: [PATCH 1545/1578] block: Call .limit_depth() after .hctx has been set Call .limit_depth() after data->hctx has been set such that data->hctx can be used in .limit_depth() implementations. Cc: Christoph Hellwig Cc: Damien Le Moal Cc: Zhiguo Niu Fixes: 07757588e507 ("block/mq-deadline: Reserve 25% of scheduler tags for synchronous requests") Signed-off-by: Bart Van Assche Tested-by: Zhiguo Niu Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240509170149.7639-2-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-mq.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index fec2dea5c6e885..e3c3c0c21b5536 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -448,6 +448,10 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) if (data->cmd_flags & REQ_NOWAIT) data->flags |= BLK_MQ_REQ_NOWAIT; +retry: + data->ctx = blk_mq_get_ctx(q); + data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); + if (q->elevator) { /* * All requests use scheduler tags when an I/O scheduler is @@ -469,13 +473,9 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) if (ops->limit_depth) ops->limit_depth(data->cmd_flags, data); } - } - -retry: - data->ctx = blk_mq_get_ctx(q); - data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); - if (!(data->rq_flags & RQF_SCHED_TAGS)) + } else { blk_mq_tag_busy(data->hctx); + } if (data->flags & BLK_MQ_REQ_RESERVED) data->rq_flags |= RQF_RESV; From 39823b47bbd40502632ffba90ebb34fff7c8b5e8 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 May 2024 10:01:49 -0700 Subject: [PATCH 1546/1578] block/mq-deadline: Fix the tag reservation code The current tag reservation code is based on a misunderstanding of the meaning of data->shallow_depth. Fix the tag reservation code as follows: * By default, do not reserve any tags for synchronous requests because for certain use cases reserving tags reduces performance. See also Harshit Mogalapalli, [bug-report] Performance regression with fio sequential-write on a multipath setup, 2024-03-07 (https://lore.kernel.org/linux-block/5ce2ae5d-61e2-4ede-ad55-551112602401@oracle.com/) * Reduce min_shallow_depth to one because min_shallow_depth must be less than or equal any shallow_depth value. * Scale dd->async_depth from the range [1, nr_requests] to [1, bits_per_sbitmap_word]. Cc: Christoph Hellwig Cc: Damien Le Moal Cc: Zhiguo Niu Fixes: 07757588e507 ("block/mq-deadline: Reserve 25% of scheduler tags for synchronous requests") Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240509170149.7639-3-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 94eede4fb9ebed..acdc28756d9d77 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -487,6 +487,20 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) return rq; } +/* + * 'depth' is a number in the range 1..INT_MAX representing a number of + * requests. Scale it with a factor (1 << bt->sb.shift) / q->nr_requests since + * 1..(1 << bt->sb.shift) is the range expected by sbitmap_get_shallow(). + * Values larger than q->nr_requests have the same effect as q->nr_requests. + */ +static int dd_to_word_depth(struct blk_mq_hw_ctx *hctx, unsigned int qdepth) +{ + struct sbitmap_queue *bt = &hctx->sched_tags->bitmap_tags; + const unsigned int nrr = hctx->queue->nr_requests; + + return ((qdepth << bt->sb.shift) + nrr - 1) / nrr; +} + /* * Called by __blk_mq_alloc_request(). The shallow_depth value set by this * function is used by __blk_mq_get_tag(). @@ -503,7 +517,7 @@ static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) * Throttle asynchronous requests and writes such that these requests * do not block the allocation of synchronous requests. */ - data->shallow_depth = dd->async_depth; + data->shallow_depth = dd_to_word_depth(data->hctx, dd->async_depth); } /* Called by blk_mq_update_nr_requests(). */ @@ -513,9 +527,9 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) struct deadline_data *dd = q->elevator->elevator_data; struct blk_mq_tags *tags = hctx->sched_tags; - dd->async_depth = max(1UL, 3 * q->nr_requests / 4); + dd->async_depth = q->nr_requests; - sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth); + sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1); } /* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */ From 3b7c16be30e35ec035b2efcc0f7d7b368789c443 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 2 Jul 2024 14:38:11 +0100 Subject: [PATCH 1547/1578] io_uring/msg_ring: fix overflow posting The caller of io_cqring_event_overflow() should be holding the completion_lock, which is violated by io_msg_tw_complete. There is only one caller of io_add_aux_cqe(), so just add locking there for now. WARNING: CPU: 0 PID: 5145 at io_uring/io_uring.c:703 io_cqring_event_overflow+0x442/0x660 io_uring/io_uring.c:703 RIP: 0010:io_cqring_event_overflow+0x442/0x660 io_uring/io_uring.c:703 __io_post_aux_cqe io_uring/io_uring.c:816 [inline] io_add_aux_cqe+0x27c/0x320 io_uring/io_uring.c:837 io_msg_tw_complete+0x9d/0x4d0 io_uring/msg_ring.c:78 io_fallback_req_func+0xce/0x1c0 io_uring/io_uring.c:256 process_one_work kernel/workqueue.c:3224 [inline] process_scheduled_works+0xa2c/0x1830 kernel/workqueue.c:3305 worker_thread+0x86d/0xd40 kernel/workqueue.c:3383 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:144 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Fixes: f33096a3c99c0 ("io_uring: add io_add_aux_cqe() helper") Reported-by: syzbot+f7f9c893345c5c615d34@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c7350d07fefe8cce32b50f57665edbb6355ea8c1.1719927398.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7ed1e009aaecb0..42139bb85fff86 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -834,7 +834,11 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags */ void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { - __io_post_aux_cqe(ctx, user_data, res, cflags); + if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) { + spin_lock(&ctx->completion_lock); + io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); + spin_unlock(&ctx->completion_lock); + } ctx->submit_state.cq_flush = true; } From 1c0b3fca381bf879e2168b362692f83808677f95 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 2 Jul 2024 16:32:34 +0900 Subject: [PATCH 1548/1578] null_blk: Fix description of the fua parameter The description of the fua module parameter is defined using MODULE_PARM_DESC() with the first argument passed being "zoned". That is the wrong name, obviously. Fix that by using the correct "fua" parameter name so that "modinfo null_blk" displays correct information. Fixes: f4f84586c8b9 ("null_blk: Introduce fua attribute") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20240702073234.206458-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 83a4ebe4763ae5..5de9ca4eceb4b4 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -227,7 +227,7 @@ MODULE_PARM_DESC(mbps, "Cache size in MiB for memory-backed device. Default: 0 ( static bool g_fua = true; module_param_named(fua, g_fua, bool, 0444); -MODULE_PARM_DESC(zoned, "Enable/disable FUA support when cache_size is used. Default: true"); +MODULE_PARM_DESC(fua, "Enable/disable FUA support when cache_size is used. Default: true"); static unsigned int g_mbps; module_param_named(mbps, g_mbps, uint, 0444); From 98d34c087249d39838874b83e17671e7d5eb1ca7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 Jun 2024 07:52:38 +0200 Subject: [PATCH 1549/1578] xen-blkfront: fix sector_size propagation to the block layer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ensure that info->sector_size and info->physical_sector_size are set before the call to blkif_set_queue_limits by doing away with the local variables and arguments that propagate them. Thanks to Marek Marczykowski-Górecki and Jürgen Groß for root causing the issue. Fixes: ba3f67c11638 ("xen-blkfront: atomically update queue limits") Reported-by: Rusty Bird Signed-off-by: Christoph Hellwig Reviewed-by: Roger Pau Monné Link: https://lore.kernel.org/r/20240625055238.7934-1-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/xen-blkfront.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index fa3a2ba525458b..59ce113b882a0e 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1070,8 +1070,7 @@ static char *encode_disk_name(char *ptr, unsigned int n) } static int xlvbd_alloc_gendisk(blkif_sector_t capacity, - struct blkfront_info *info, u16 sector_size, - unsigned int physical_sector_size) + struct blkfront_info *info) { struct queue_limits lim = {}; struct gendisk *gd; @@ -1165,8 +1164,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, info->rq = gd->queue; info->gd = gd; - info->sector_size = sector_size; - info->physical_sector_size = physical_sector_size; xlvbd_flush(info); @@ -2320,8 +2317,6 @@ static void blkfront_gather_backend_features(struct blkfront_info *info) static void blkfront_connect(struct blkfront_info *info) { unsigned long long sectors; - unsigned long sector_size; - unsigned int physical_sector_size; int err, i; struct blkfront_ring_info *rinfo; @@ -2360,7 +2355,7 @@ static void blkfront_connect(struct blkfront_info *info) err = xenbus_gather(XBT_NIL, info->xbdev->otherend, "sectors", "%llu", §ors, "info", "%u", &info->vdisk_info, - "sector-size", "%lu", §or_size, + "sector-size", "%lu", &info->sector_size, NULL); if (err) { xenbus_dev_fatal(info->xbdev, err, @@ -2374,9 +2369,9 @@ static void blkfront_connect(struct blkfront_info *info) * provide this. Assume physical sector size to be the same as * sector_size in that case. */ - physical_sector_size = xenbus_read_unsigned(info->xbdev->otherend, + info->physical_sector_size = xenbus_read_unsigned(info->xbdev->otherend, "physical-sector-size", - sector_size); + info->sector_size); blkfront_gather_backend_features(info); for_each_rinfo(info, rinfo, i) { err = blkfront_setup_indirect(rinfo); @@ -2388,8 +2383,7 @@ static void blkfront_connect(struct blkfront_info *info) } } - err = xlvbd_alloc_gendisk(sectors, info, sector_size, - physical_sector_size); + err = xlvbd_alloc_gendisk(sectors, info); if (err) { xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", info->xbdev->otherend); From 93d8032f4143c8d2ac3e10c6504385c26acc511f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 2 Jul 2024 09:16:46 -0600 Subject: [PATCH 1550/1578] io_uring/net: cleanup io_recv_finish() bundle handling Combine the two cases that check for whether or not this is a bundle, rather than having them as separate checks. This is easier to reduce, and it reduces the text associated with it as well. Signed-off-by: Jens Axboe --- io_uring/net.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index db4a4a03ce3ab6..25223e11958f8c 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -827,20 +827,20 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, bool mshot_finished, unsigned issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - unsigned int cflags; - - if (sr->flags & IORING_RECVSEND_BUNDLE) - cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), - issue_flags); - else - cflags = io_put_kbuf(req, issue_flags); + unsigned int cflags = 0; if (kmsg->msg.msg_inq > 0) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - /* bundle with no more immediate buffers, we're done */ - if (sr->flags & IORING_RECVSEND_BUNDLE && req->flags & REQ_F_BL_EMPTY) - goto finish; + if (sr->flags & IORING_RECVSEND_BUNDLE) { + cflags |= io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), + issue_flags); + /* bundle with no more immediate buffers, we're done */ + if (req->flags & REQ_F_BL_EMPTY) + goto finish; + } else { + cflags |= io_put_kbuf(req, issue_flags); + } /* * Fill CQE for this receive and see if we should keep trying to From da042a3655151157c06e71a583e883ab2d86d1ff Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Jul 2024 17:10:19 +0200 Subject: [PATCH 1551/1578] block: split integrity support out of bio.h Split struct bio_integrity_payload and the related prototypes out of bio.h into a separate bio-integrity.h header so that it is only pulled in by the few places that need it. Signed-off-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Reviewed-by: Johannes Thumshirn Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240702151047.1746127-2-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 2 +- block/blk.h | 1 + block/bounce.c | 2 +- drivers/md/dm.c | 1 + drivers/nvme/host/ioctl.c | 1 + drivers/scsi/sd.c | 3 +- include/linux/bio-integrity.h | 153 +++++++++++++++++++++++++++++++++ include/linux/bio.h | 156 ---------------------------------- include/linux/blk-integrity.h | 1 + 9 files changed, 161 insertions(+), 159 deletions(-) create mode 100644 include/linux/bio-integrity.h diff --git a/block/bio.c b/block/bio.c index e9e809a63c5975..4ca3f31ce45fb5 100644 --- a/block/bio.c +++ b/block/bio.c @@ -4,7 +4,7 @@ */ #include #include -#include +#include #include #include #include diff --git a/block/blk.h b/block/blk.h index 47dadd2439b1ca..401e604f35d2cf 100644 --- a/block/blk.h +++ b/block/blk.h @@ -2,6 +2,7 @@ #ifndef BLK_INTERNAL_H #define BLK_INTERNAL_H +#include #include #include /* for max_pfn/max_low_pfn */ #include diff --git a/block/bounce.c b/block/bounce.c index d6a5219f29dd53..0d898cd5ec497f 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7d107ae06e1ae1..92d6eeb0a59327 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -11,6 +11,7 @@ #include "dm-uevent.h" #include "dm-ima.h" +#include #include #include #include diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 8b69427a44762a..fb46f55f8b2894 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -3,6 +3,7 @@ * Copyright (c) 2011-2014, Intel Corporation. * Copyright (c) 2017-2021 Christoph Hellwig. */ +#include #include /* for force_successful_syscall_return */ #include #include diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 979795dad62b11..8bb3a361185152 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -33,11 +33,12 @@ * than the level indicated above to trigger output. */ +#include #include #include #include #include -#include +#include #include #include #include diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h new file mode 100644 index 00000000000000..70ef19a0dc7e8b --- /dev/null +++ b/include/linux/bio-integrity.h @@ -0,0 +1,153 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_BIO_INTEGRITY_H +#define _LINUX_BIO_INTEGRITY_H + +#include + +enum bip_flags { + BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ + BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ + BIP_CTRL_NOCHECK = 1 << 2, /* disable HBA integrity checking */ + BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */ + BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ + BIP_INTEGRITY_USER = 1 << 5, /* Integrity payload is user address */ + BIP_COPY_USER = 1 << 6, /* Kernel bounce buffer in use */ +}; + +struct bio_integrity_payload { + struct bio *bip_bio; /* parent bio */ + + struct bvec_iter bip_iter; + + unsigned short bip_vcnt; /* # of integrity bio_vecs */ + unsigned short bip_max_vcnt; /* integrity bio_vec slots */ + unsigned short bip_flags; /* control flags */ + + struct bvec_iter bio_iter; /* for rewinding parent bio */ + + struct work_struct bip_work; /* I/O completion */ + + struct bio_vec *bip_vec; + struct bio_vec bip_inline_vecs[];/* embedded bvec array */ +}; + +#ifdef CONFIG_BLK_DEV_INTEGRITY + +#define bip_for_each_vec(bvl, bip, iter) \ + for_each_bvec(bvl, (bip)->bip_vec, iter, (bip)->bip_iter) + +#define bio_for_each_integrity_vec(_bvl, _bio, _iter) \ + for_each_bio(_bio) \ + bip_for_each_vec(_bvl, _bio->bi_integrity, _iter) + +static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) +{ + if (bio->bi_opf & REQ_INTEGRITY) + return bio->bi_integrity; + + return NULL; +} + +static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + + if (bip) + return bip->bip_flags & flag; + + return false; +} + +static inline sector_t bip_get_seed(struct bio_integrity_payload *bip) +{ + return bip->bip_iter.bi_sector; +} + +static inline void bip_set_seed(struct bio_integrity_payload *bip, + sector_t seed) +{ + bip->bip_iter.bi_sector = seed; +} + +struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp, + unsigned int nr); +int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, + unsigned int offset); +int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len, u32 seed); +void bio_integrity_unmap_free_user(struct bio *bio); +bool bio_integrity_prep(struct bio *bio); +void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); +void bio_integrity_trim(struct bio *bio); +int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask); +int bioset_integrity_create(struct bio_set *bs, int pool_size); +void bioset_integrity_free(struct bio_set *bs); +void bio_integrity_init(void); + +#else /* CONFIG_BLK_DEV_INTEGRITY */ + +static inline void *bio_integrity(struct bio *bio) +{ + return NULL; +} + +static inline int bioset_integrity_create(struct bio_set *bs, int pool_size) +{ + return 0; +} + +static inline void bioset_integrity_free(struct bio_set *bs) +{ +} + +static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf, + ssize_t len, u32 seed) +{ + return -EINVAL; +} + +static inline void bio_integrity_unmap_free_user(struct bio *bio) +{ +} + +static inline bool bio_integrity_prep(struct bio *bio) +{ + return true; +} + +static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src, + gfp_t gfp_mask) +{ + return 0; +} + +static inline void bio_integrity_advance(struct bio *bio, + unsigned int bytes_done) +{ +} + +static inline void bio_integrity_trim(struct bio *bio) +{ +} + +static inline void bio_integrity_init(void) +{ +} + +static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) +{ + return false; +} + +static inline void *bio_integrity_alloc(struct bio *bio, gfp_t gfp, + unsigned int nr) +{ + return ERR_PTR(-EINVAL); +} + +static inline int bio_integrity_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + return 0; +} +#endif /* CONFIG_BLK_DEV_INTEGRITY */ +#endif /* _LINUX_BIO_INTEGRITY_H */ diff --git a/include/linux/bio.h b/include/linux/bio.h index 818e9361294781..a46e2047bea4d2 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -321,69 +321,6 @@ static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio) #define bio_for_each_folio_all(fi, bio) \ for (bio_first_folio(&fi, bio, 0); fi.folio; bio_next_folio(&fi, bio)) -enum bip_flags { - BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ - BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ - BIP_CTRL_NOCHECK = 1 << 2, /* disable HBA integrity checking */ - BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */ - BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ - BIP_INTEGRITY_USER = 1 << 5, /* Integrity payload is user address */ - BIP_COPY_USER = 1 << 6, /* Kernel bounce buffer in use */ -}; - -/* - * bio integrity payload - */ -struct bio_integrity_payload { - struct bio *bip_bio; /* parent bio */ - - struct bvec_iter bip_iter; - - unsigned short bip_vcnt; /* # of integrity bio_vecs */ - unsigned short bip_max_vcnt; /* integrity bio_vec slots */ - unsigned short bip_flags; /* control flags */ - - struct bvec_iter bio_iter; /* for rewinding parent bio */ - - struct work_struct bip_work; /* I/O completion */ - - struct bio_vec *bip_vec; - struct bio_vec bip_inline_vecs[];/* embedded bvec array */ -}; - -#if defined(CONFIG_BLK_DEV_INTEGRITY) - -static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) -{ - if (bio->bi_opf & REQ_INTEGRITY) - return bio->bi_integrity; - - return NULL; -} - -static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) -{ - struct bio_integrity_payload *bip = bio_integrity(bio); - - if (bip) - return bip->bip_flags & flag; - - return false; -} - -static inline sector_t bip_get_seed(struct bio_integrity_payload *bip) -{ - return bip->bip_iter.bi_sector; -} - -static inline void bip_set_seed(struct bio_integrity_payload *bip, - sector_t seed) -{ - bip->bip_iter.bi_sector = seed; -} - -#endif /* CONFIG_BLK_DEV_INTEGRITY */ - void bio_trim(struct bio *bio, sector_t offset, sector_t size); extern struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs); @@ -721,99 +658,6 @@ static inline bool bioset_initialized(struct bio_set *bs) return bs->bio_slab != NULL; } -#if defined(CONFIG_BLK_DEV_INTEGRITY) - -#define bip_for_each_vec(bvl, bip, iter) \ - for_each_bvec(bvl, (bip)->bip_vec, iter, (bip)->bip_iter) - -#define bio_for_each_integrity_vec(_bvl, _bio, _iter) \ - for_each_bio(_bio) \ - bip_for_each_vec(_bvl, _bio->bi_integrity, _iter) - -int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len, u32 seed); -void bio_integrity_unmap_free_user(struct bio *bio); -extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); -extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int); -extern bool bio_integrity_prep(struct bio *); -extern void bio_integrity_advance(struct bio *, unsigned int); -extern void bio_integrity_trim(struct bio *); -extern int bio_integrity_clone(struct bio *, struct bio *, gfp_t); -extern int bioset_integrity_create(struct bio_set *, int); -extern void bioset_integrity_free(struct bio_set *); -extern void bio_integrity_init(void); - -#else /* CONFIG_BLK_DEV_INTEGRITY */ - -static inline void *bio_integrity(struct bio *bio) -{ - return NULL; -} - -static inline int bioset_integrity_create(struct bio_set *bs, int pool_size) -{ - return 0; -} - -static inline void bioset_integrity_free (struct bio_set *bs) -{ - return; -} - -static inline bool bio_integrity_prep(struct bio *bio) -{ - return true; -} - -static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src, - gfp_t gfp_mask) -{ - return 0; -} - -static inline void bio_integrity_advance(struct bio *bio, - unsigned int bytes_done) -{ - return; -} - -static inline void bio_integrity_trim(struct bio *bio) -{ - return; -} - -static inline void bio_integrity_init(void) -{ - return; -} - -static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) -{ - return false; -} - -static inline void *bio_integrity_alloc(struct bio * bio, gfp_t gfp, - unsigned int nr) -{ - return ERR_PTR(-EINVAL); -} - -static inline int bio_integrity_add_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int offset) -{ - return 0; -} - -static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf, - ssize_t len, u32 seed) -{ - return -EINVAL; -} -static inline void bio_integrity_unmap_free_user(struct bio *bio) -{ -} - -#endif /* CONFIG_BLK_DEV_INTEGRITY */ - /* * Mark a bio as polled. Note that for async polled IO, the caller must * expect -EWOULDBLOCK if we cannot allocate a request (or other resources). diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index 804f856ed3e571..de98049b7ded91 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -3,6 +3,7 @@ #define _LINUX_BLK_INTEGRITY_H #include +#include struct request; From 21671a1ed1ff22e158ebe9d619943f926f03f5cd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Jul 2024 17:10:20 +0200 Subject: [PATCH 1552/1578] block: also return bio_integrity_payload * from stubs struct bio_integrity_payload is defined unconditionally. No need to return void * from bio_integrity() and bio_integrity_alloc(). Signed-off-by: Christoph Hellwig Reviewed-by: Kanchan Joshi Reviewed-by: Anuj Gupta Reviewed-by: Johannes Thumshirn Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240702151047.1746127-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 70ef19a0dc7e8b..cac24dac06fff0 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -85,7 +85,7 @@ void bio_integrity_init(void); #else /* CONFIG_BLK_DEV_INTEGRITY */ -static inline void *bio_integrity(struct bio *bio) +static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) { return NULL; } @@ -138,8 +138,8 @@ static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) return false; } -static inline void *bio_integrity_alloc(struct bio *bio, gfp_t gfp, - unsigned int nr) +static inline struct bio_integrity_payload * +bio_integrity_alloc(struct bio *bio, gfp_t gfp, unsigned int nr) { return ERR_PTR(-EINVAL); } From bf4c89fc8797f5c0964a0c3d561fbe7e8483b62f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Jul 2024 17:10:21 +0200 Subject: [PATCH 1553/1578] block: don't call bio_uninit from bio_endio Commit b222dd2fdd53 ("block: call bio_uninit in bio_endio") added a call to bio_uninit in bio_endio to work around callers that use bio_init but fail to call bio_uninit after they are done to release the resources. While this is an abuse of the bio_init API we still have quite a few of those left. But this early uninit causes a problem for integrity data, as at least some users need the bio_integrity_payload. Right now the only one is the NVMe passthrough which archives this by adding a special case to skip the freeing if the BIP_INTEGRITY_USER flag is set. Sort this out by only putting bi_blkg in bio_endio as that is the cause of the actual leaks - the few users of the crypto context and integrity data all properly call bio_uninit, usually through bio_put for dynamically allocated bios. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240702151047.1746127-4-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/block/bio.c b/block/bio.c index 4ca3f31ce45fb5..68ce75fd9b7c89 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1630,8 +1630,18 @@ void bio_endio(struct bio *bio) goto again; } - /* release cgroup info */ - bio_uninit(bio); +#ifdef CONFIG_BLK_CGROUP + /* + * Release cgroup info. We shouldn't have to do this here, but quite + * a few callers of bio_init fail to call bio_uninit, so we cover up + * for that here at least for now. + */ + if (bio->bi_blkg) { + blkg_put(bio->bi_blkg); + bio->bi_blkg = NULL; + } +#endif + if (bio->bi_end_io) bio->bi_end_io(bio); } From f8924374fd37a8b41d554acd8b7407af7d354c0d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Jul 2024 17:10:22 +0200 Subject: [PATCH 1554/1578] block: call bio_integrity_unmap_free_user from blk_rq_unmap_user blk_rq_unmap_user always unmaps user space pass-through request. If such a request has integrity data attached it must come from a user mapping as well. Call bio_integrity_unmap_free_user from blk_rq_unmap_user and remove the nvme_unmap_bio wrapper in the nvme driver. Signed-off-by: Christoph Hellwig Reviewed-by: Kanchan Joshi Reviewed-by: Anuj Gupta Reviewed-by: Johannes Thumshirn Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240702151047.1746127-5-hch@lst.de Signed-off-by: Jens Axboe --- block/bio-integrity.c | 1 - block/blk-map.c | 3 +++ drivers/nvme/host/ioctl.c | 15 ++++----------- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index c4aed1dfa497a3..c8757d47e0ef62 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -174,7 +174,6 @@ void bio_integrity_unmap_free_user(struct bio *bio) bio->bi_integrity = NULL; bio->bi_opf &= ~REQ_INTEGRITY; } -EXPORT_SYMBOL(bio_integrity_unmap_free_user); /** * bio_integrity_add_page - Attach integrity metadata diff --git a/block/blk-map.c b/block/blk-map.c index bce144091128f6..df5f82d114720f 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -757,6 +757,9 @@ int blk_rq_unmap_user(struct bio *bio) bio_release_pages(bio, bio_data_dir(bio) == READ); } + if (bio_integrity(bio)) + bio_integrity_unmap_free_user(bio); + next_bio = bio; bio = bio->bi_next; blk_mq_map_bio_put(next_bio); diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index fb46f55f8b2894..f1d58e70933f54 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -112,13 +112,6 @@ static struct request *nvme_alloc_user_request(struct request_queue *q, return req; } -static void nvme_unmap_bio(struct bio *bio) -{ - if (bio_integrity(bio)) - bio_integrity_unmap_free_user(bio); - blk_rq_unmap_user(bio); -} - static int nvme_map_user_request(struct request *req, u64 ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, u32 meta_seed, struct io_uring_cmd *ioucmd, unsigned int flags) @@ -165,7 +158,7 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, out_unmap: if (bio) - nvme_unmap_bio(bio); + blk_rq_unmap_user(bio); out: blk_mq_free_request(req); return ret; @@ -203,7 +196,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, if (result) *result = le64_to_cpu(nvme_req(req)->result.u64); if (bio) - nvme_unmap_bio(bio); + blk_rq_unmap_user(bio); blk_mq_free_request(req); if (effects) @@ -414,7 +407,7 @@ static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd, struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); if (pdu->bio) - nvme_unmap_bio(pdu->bio); + blk_rq_unmap_user(pdu->bio); io_uring_cmd_done(ioucmd, pdu->status, pdu->result, issue_flags); } @@ -440,7 +433,7 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, */ if (blk_rq_is_poll(req)) { if (pdu->bio) - nvme_unmap_bio(pdu->bio); + blk_rq_unmap_user(pdu->bio); io_uring_cmd_iopoll_done(ioucmd, pdu->result, pdu->status); } else { io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); From 85253bac4d02b1f95d6109c221aeccd7a262ec4d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Jul 2024 17:10:23 +0200 Subject: [PATCH 1555/1578] block: don't free submitter owned integrity payload on I/O completion Currently __bio_integrity_endio frees the integrity payload unless it is explicitly marked as user-mapped. This means in-kernel callers that allocate their own integrity payload never get to see it on I/O completion. The current two users don't need it as they just pre-mapped PI tuples received over the network, but this limits uses of integrity data lot. Change bio_integrity_endio to call __bio_integrity_endio for block layer generated integrity data only, and leave freeing of submitter allocated integrity data to bio_uninit which also gets called from the final bio_put. This requires that unmapping user mapped or copied integrity data is now always done by the caller, and the special BIP_INTEGRITY_USER flag can go away. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240702151047.1746127-6-hch@lst.de Signed-off-by: Jens Axboe --- block/bio-integrity.c | 57 ++++++++++++++--------------------- block/blk.h | 13 ++++++-- include/linux/bio-integrity.h | 3 +- 3 files changed, 34 insertions(+), 39 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index c8757d47e0ef62..4aa836d603fb23 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -22,9 +22,17 @@ void blk_flush_integrity(void) flush_workqueue(kintegrityd_wq); } -static void __bio_integrity_free(struct bio_set *bs, - struct bio_integrity_payload *bip) +/** + * bio_integrity_free - Free bio integrity payload + * @bio: bio containing bip to be freed + * + * Description: Free the integrity portion of a bio. + */ +void bio_integrity_free(struct bio *bio) { + struct bio_integrity_payload *bip = bio_integrity(bio); + struct bio_set *bs = bio->bi_pool; + if (bs && mempool_initialized(&bs->bio_integrity_pool)) { if (bip->bip_vec) bvec_free(&bs->bvec_integrity_pool, bip->bip_vec, @@ -33,6 +41,8 @@ static void __bio_integrity_free(struct bio_set *bs, } else { kfree(bip); } + bio->bi_integrity = NULL; + bio->bi_opf &= ~REQ_INTEGRITY; } /** @@ -86,7 +96,10 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, return bip; err: - __bio_integrity_free(bs, bip); + if (bs && mempool_initialized(&bs->bio_integrity_pool)) + mempool_free(bip, &bs->bio_integrity_pool); + else + kfree(bip); return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(bio_integrity_alloc); @@ -132,28 +145,6 @@ static void bio_integrity_unmap_user(struct bio_integrity_payload *bip) bio_integrity_unpin_bvec(bip->bip_vec, bip->bip_max_vcnt, dirty); } -/** - * bio_integrity_free - Free bio integrity payload - * @bio: bio containing bip to be freed - * - * Description: Used to free the integrity portion of a bio. Usually - * called from bio_free(). - */ -void bio_integrity_free(struct bio *bio) -{ - struct bio_integrity_payload *bip = bio_integrity(bio); - struct bio_set *bs = bio->bi_pool; - - if (bip->bip_flags & BIP_INTEGRITY_USER) - return; - if (bip->bip_flags & BIP_BLOCK_INTEGRITY) - kfree(bvec_virt(bip->bip_vec)); - - __bio_integrity_free(bs, bip); - bio->bi_integrity = NULL; - bio->bi_opf &= ~REQ_INTEGRITY; -} - /** * bio_integrity_unmap_free_user - Unmap and free bio user integrity payload * @bio: bio containing bip to be unmapped and freed @@ -165,14 +156,9 @@ void bio_integrity_free(struct bio *bio) void bio_integrity_unmap_free_user(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); - struct bio_set *bs = bio->bi_pool; - if (WARN_ON_ONCE(!(bip->bip_flags & BIP_INTEGRITY_USER))) - return; bio_integrity_unmap_user(bip); - __bio_integrity_free(bs, bip); - bio->bi_integrity = NULL; - bio->bi_opf &= ~REQ_INTEGRITY; + bio_integrity_free(bio); } /** @@ -273,7 +259,7 @@ static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec, goto free_bip; } - bip->bip_flags |= BIP_INTEGRITY_USER | BIP_COPY_USER; + bip->bip_flags |= BIP_COPY_USER; bip->bip_iter.bi_sector = seed; bip->bip_vcnt = nr_vecs; return 0; @@ -294,7 +280,6 @@ static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec, return PTR_ERR(bip); memcpy(bip->bip_vec, bvec, nr_vecs * sizeof(*bvec)); - bip->bip_flags |= BIP_INTEGRITY_USER; bip->bip_iter.bi_sector = seed; bip->bip_iter.bi_size = len; bip->bip_vcnt = nr_vecs; @@ -502,6 +487,8 @@ static void bio_integrity_verify_fn(struct work_struct *work) struct bio *bio = bip->bip_bio; blk_integrity_verify(bio); + + kfree(bvec_virt(bip->bip_vec)); bio_integrity_free(bio); bio_endio(bio); } @@ -522,13 +509,13 @@ bool __bio_integrity_endio(struct bio *bio) struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_payload *bip = bio_integrity(bio); - if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && - (bip->bip_flags & BIP_BLOCK_INTEGRITY) && bi->csum_type) { + if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && bi->csum_type) { INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); queue_work(kintegrityd_wq, &bip->bip_work); return false; } + kfree(bvec_virt(bip->bip_vec)); bio_integrity_free(bio); return true; } diff --git a/block/blk.h b/block/blk.h index 401e604f35d2cf..2233dc8d36b82a 100644 --- a/block/blk.h +++ b/block/blk.h @@ -202,11 +202,20 @@ static inline unsigned int blk_queue_get_max_sectors(struct request *rq) #ifdef CONFIG_BLK_DEV_INTEGRITY void blk_flush_integrity(void); -bool __bio_integrity_endio(struct bio *); void bio_integrity_free(struct bio *bio); + +/* + * Integrity payloads can either be owned by the submitter, in which case + * bio_uninit will free them, or owned and generated by the block layer, + * in which case we'll verify them here (for reads) and free them before + * the bio is handed back to the submitted. + */ +bool __bio_integrity_endio(struct bio *bio); static inline bool bio_integrity_endio(struct bio *bio) { - if (bio_integrity(bio)) + struct bio_integrity_payload *bip = bio_integrity(bio); + + if (bip && (bip->bip_flags & BIP_BLOCK_INTEGRITY)) return __bio_integrity_endio(bio); return true; } diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index cac24dac06fff0..3823d9be0d0790 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -10,8 +10,7 @@ enum bip_flags { BIP_CTRL_NOCHECK = 1 << 2, /* disable HBA integrity checking */ BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */ BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ - BIP_INTEGRITY_USER = 1 << 5, /* Integrity payload is user address */ - BIP_COPY_USER = 1 << 6, /* Kernel bounce buffer in use */ + BIP_COPY_USER = 1 << 5, /* Kernel bounce buffer in use */ }; struct bio_integrity_payload { From 74cc150282e41c6c0704cd305c9a4392dc64ef4d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Jul 2024 17:10:24 +0200 Subject: [PATCH 1556/1578] block: don't free the integrity payload in bio_integrity_unmap_free_user Now that the integrity payload is always freed in bio_uninit, don't bother freeing it a little earlier in bio_integrity_unmap_free_user. With that the separate bio_integrity_unmap_free_user can go away by just passing the bio to bio_integrity_unmap_user. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240702151047.1746127-7-hch@lst.de Signed-off-by: Jens Axboe --- block/bio-integrity.c | 31 +++++++++++-------------------- block/blk-map.c | 2 +- include/linux/bio-integrity.h | 4 ++-- 3 files changed, 14 insertions(+), 23 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 4aa836d603fb23..4b5c604585561e 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -131,34 +131,25 @@ static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip) bio_integrity_unpin_bvec(copy, nr_vecs, true); } -static void bio_integrity_unmap_user(struct bio_integrity_payload *bip) +/** + * bio_integrity_unmap_user - Unmap user integrity payload + * @bio: bio containing bip to be unmapped + * + * Unmap the user mapped integrity portion of a bio. + */ +void bio_integrity_unmap_user(struct bio *bio) { - bool dirty = bio_data_dir(bip->bip_bio) == READ; + struct bio_integrity_payload *bip = bio_integrity(bio); if (bip->bip_flags & BIP_COPY_USER) { - if (dirty) + if (bio_data_dir(bio) == READ) bio_integrity_uncopy_user(bip); kfree(bvec_virt(bip->bip_vec)); return; } - bio_integrity_unpin_bvec(bip->bip_vec, bip->bip_max_vcnt, dirty); -} - -/** - * bio_integrity_unmap_free_user - Unmap and free bio user integrity payload - * @bio: bio containing bip to be unmapped and freed - * - * Description: Used to unmap and free the user mapped integrity portion of a - * bio. Submitter attaching the user integrity buffer is responsible for - * unmapping and freeing it during completion. - */ -void bio_integrity_unmap_free_user(struct bio *bio) -{ - struct bio_integrity_payload *bip = bio_integrity(bio); - - bio_integrity_unmap_user(bip); - bio_integrity_free(bio); + bio_integrity_unpin_bvec(bip->bip_vec, bip->bip_max_vcnt, + bio_data_dir(bio) == READ); } /** diff --git a/block/blk-map.c b/block/blk-map.c index df5f82d114720f..0e1167b239342f 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -758,7 +758,7 @@ int blk_rq_unmap_user(struct bio *bio) } if (bio_integrity(bio)) - bio_integrity_unmap_free_user(bio); + bio_integrity_unmap_user(bio); next_bio = bio; bio = bio->bi_next; diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 3823d9be0d0790..dd831c269e9948 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -73,7 +73,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp, int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset); int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len, u32 seed); -void bio_integrity_unmap_free_user(struct bio *bio); +void bio_integrity_unmap_user(struct bio *bio); bool bio_integrity_prep(struct bio *bio); void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); void bio_integrity_trim(struct bio *bio); @@ -104,7 +104,7 @@ static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf, return -EINVAL; } -static inline void bio_integrity_unmap_free_user(struct bio *bio) +static inline void bio_integrity_unmap_user(struct bio *bio) { } From 2314c2e3a70521f055dd011245dccf6fd97c7ee0 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 15 Jun 2024 16:51:43 +0800 Subject: [PATCH 1557/1578] md/raid5: fix spares errors about rcu usage As commit ad8606702f26 ("md/raid5: remove rcu protection to access rdev from conf") explains, rcu protection can be removed, however, there are three places left, there won't be any real problems. drivers/md/raid5.c:8071:24: error: incompatible types in comparison expression (different address spaces): drivers/md/raid5.c:8071:24: struct md_rdev [noderef] __rcu * drivers/md/raid5.c:8071:24: struct md_rdev * drivers/md/raid5.c:7569:25: error: incompatible types in comparison expression (different address spaces): drivers/md/raid5.c:7569:25: struct md_rdev [noderef] __rcu * drivers/md/raid5.c:7569:25: struct md_rdev * drivers/md/raid5.c:7573:25: error: incompatible types in comparison expression (different address spaces): drivers/md/raid5.c:7573:25: struct md_rdev [noderef] __rcu * drivers/md/raid5.c:7573:25: struct md_rdev * Fixes: ad8606702f26 ("md/raid5: remove rcu protection to access rdev from conf") Cc: stable@vger.kernel.org Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240615085143.1648223-1-yukuai1@huaweicloud.com --- drivers/md/raid5.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 10219205160bbf..825fa629db33a3 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -155,7 +155,7 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh, return slot; } -static void print_raid5_conf (struct r5conf *conf); +static void print_raid5_conf(struct r5conf *conf); static int stripe_operations_active(struct stripe_head *sh) { @@ -7568,11 +7568,11 @@ static struct r5conf *setup_conf(struct mddev *mddev) if (test_bit(Replacement, &rdev->flags)) { if (disk->replacement) goto abort; - RCU_INIT_POINTER(disk->replacement, rdev); + disk->replacement = rdev; } else { if (disk->rdev) goto abort; - RCU_INIT_POINTER(disk->rdev, rdev); + disk->rdev = rdev; } if (test_bit(In_sync, &rdev->flags)) { @@ -8054,7 +8054,7 @@ static void raid5_status(struct seq_file *seq, struct mddev *mddev) seq_printf (seq, "]"); } -static void print_raid5_conf (struct r5conf *conf) +static void print_raid5_conf(struct r5conf *conf) { struct md_rdev *rdev; int i; @@ -8068,15 +8068,13 @@ static void print_raid5_conf (struct r5conf *conf) conf->raid_disks, conf->raid_disks - conf->mddev->degraded); - rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { - rdev = rcu_dereference(conf->disks[i].rdev); + rdev = conf->disks[i].rdev; if (rdev) pr_debug(" disk %d, o:%d, dev:%pg\n", i, !test_bit(Faulty, &rdev->flags), rdev->bdev); } - rcu_read_unlock(); } static int raid5_spare_active(struct mddev *mddev) From ae720670b9fc5ef3588efd5b95e6a0f59a36dec0 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Tue, 18 Jun 2024 09:07:59 +0800 Subject: [PATCH 1558/1578] md: Remove unneeded semicolon ./drivers/md/md.c:630:21-22: Unneeded semicolon Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=9344 Signed-off-by: Yang Li Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240618010759.85416-1-yang.lee@linux.alibaba.com --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 0ff26a547f1afc..149be072546a59 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -627,7 +627,7 @@ static void md_submit_flush_data(struct work_struct *ws) * always is 0, make_request() will not be called here. */ if (WARN_ON_ONCE(!mddev->pers->make_request(mddev, bio))) - bio_io_error(bio);; + bio_io_error(bio); } /* The pair is percpu_ref_get() from md_flush_request() */ From 1f4a72ff00cafa74b43b0c8a37573c78f86ed1a8 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 23 Jun 2024 22:17:54 +0200 Subject: [PATCH 1559/1578] md-cluster: Constify struct md_cluster_operations 'struct md_cluster_operations' is not modified in this driver. Constifying this structure moves some data to a read-only section, so increase overall security. On a x86_64, with allmodconfig, as an example: Before: ====== text data bss dec hex filename 51941 1442 80 53463 d0d7 drivers/md/md-cluster.o After: ===== text data bss dec hex filename 52133 1246 80 53459 d0d3 drivers/md/md-cluster.o Signed-off-by: Christophe JAILLET Signed-off-by: Song Liu Link: https://lore.kernel.org/r/3727f3ce9693cae4e62ae6778ea13971df805479.1719173852.git.christophe.jaillet@wanadoo.fr --- drivers/md/md-cluster.c | 2 +- drivers/md/md.c | 4 ++-- drivers/md/md.h | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 8e36a0feec0989..139fe2019c1dd6 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1570,7 +1570,7 @@ static int gather_bitmaps(struct md_rdev *rdev) return err; } -static struct md_cluster_operations cluster_ops = { +static const struct md_cluster_operations cluster_ops = { .join = join, .leave = leave, .slot_number = slot_number, diff --git a/drivers/md/md.c b/drivers/md/md.c index 149be072546a59..b50721ddf5d37f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -85,7 +85,7 @@ static DEFINE_SPINLOCK(pers_lock); static const struct kobj_type md_ktype; -struct md_cluster_operations *md_cluster_ops; +const struct md_cluster_operations *md_cluster_ops; EXPORT_SYMBOL(md_cluster_ops); static struct module *md_cluster_mod; @@ -8543,7 +8543,7 @@ int unregister_md_personality(struct md_personality *p) } EXPORT_SYMBOL(unregister_md_personality); -int register_md_cluster_operations(struct md_cluster_operations *ops, +int register_md_cluster_operations(const struct md_cluster_operations *ops, struct module *module) { int ret = 0; diff --git a/drivers/md/md.h b/drivers/md/md.h index 28cb4b0b6c1740..a0d6827dced9b7 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -849,7 +849,7 @@ static inline void safe_put_page(struct page *p) extern int register_md_personality(struct md_personality *p); extern int unregister_md_personality(struct md_personality *p); -extern int register_md_cluster_operations(struct md_cluster_operations *ops, +extern int register_md_cluster_operations(const struct md_cluster_operations *ops, struct module *module); extern int unregister_md_cluster_operations(void); extern int md_setup_cluster(struct mddev *mddev, int nodes); @@ -932,7 +932,7 @@ static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) } } -extern struct md_cluster_operations *md_cluster_ops; +extern const struct md_cluster_operations *md_cluster_ops; static inline int mddev_is_clustered(struct mddev *mddev) { return mddev->cluster_info && mddev->bitmap_info.nodes > 1; From a1fd37f97808db4fa1bf55da0275790c42521e45 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 27 Jun 2024 19:23:21 +0800 Subject: [PATCH 1560/1578] md: Don't wait for MD_RECOVERY_NEEDED for HOT_REMOVE_DISK ioctl Commit 90f5f7ad4f38 ("md: Wait for md_check_recovery before attempting device removal.") explained in the commit message that failed device must be reomoved from the personality first by md_check_recovery(), before it can be removed from the array. That's the reason the commit add the code to wait for MD_RECOVERY_NEEDED. However, this is not the case now, because remove_and_add_spares() is called directly from hot_remove_disk() from ioctl path, hence failed device(marked faulty) can be removed from the personality by ioctl. On the other hand, the commit introduced a performance problem that if MD_RECOVERY_NEEDED is set and the array is not running, ioctl will wait for 5s before it can return failure to user. Since the waiting is not needed now, fix the problem by removing the waiting. Fixes: 90f5f7ad4f38 ("md: Wait for md_check_recovery before attempting device removal.") Reported-by: Mateusz Kusiak Closes: https://lore.kernel.org/all/814ff6ee-47a2-4ba0-963e-cf256ee4ecfa@linux.intel.com/ Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240627112321.3044744-1-yukuai1@huaweicloud.com --- drivers/md/md.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index b50721ddf5d37f..64693913ed186e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7765,12 +7765,6 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode, return get_bitmap_file(mddev, argp); } - if (cmd == HOT_REMOVE_DISK) - /* need to ensure recovery thread has run */ - wait_event_interruptible_timeout(mddev->sb_wait, - !test_bit(MD_RECOVERY_NEEDED, - &mddev->recovery), - msecs_to_jiffies(5000)); if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { /* Need to flush page cache, and ensure no-one else opens * and writes From 25b3a8237a03ec0b67b965b52d74862e77ef7115 Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Tue, 2 Jul 2024 11:18:02 -0400 Subject: [PATCH 1561/1578] md/raid5: recheck if reshape has finished with device_lock held When handling an IO request, MD checks if a reshape is currently happening, and if so, where the IO sector is in relation to the reshape progress. MD uses conf->reshape_progress for both of these tasks. When the reshape finishes, conf->reshape_progress is set to MaxSector. If this occurs after MD checks if the reshape is currently happening but before it calls ahead_of_reshape(), then ahead_of_reshape() will end up comparing the IO sector against MaxSector. During a backwards reshape, this will make MD think the IO sector is in the area not yet reshaped, causing it to use the previous configuration, and map the IO to the sector where that data was before the reshape. This bug can be triggered by running the lvm2 lvconvert-raid-reshape-linear_to_raid6-single-type.sh test in a loop, although it's very hard to reproduce. Fix this by factoring the code that checks where the IO sector is in relation to the reshape out to a helper called get_reshape_loc(), which reads reshape_progress and reshape_safe while holding the device_lock, and then rechecks if the reshape has finished before calling ahead_of_reshape with the saved values. Also use the helper during the REQ_NOWAIT check to see if the location is inside of the reshape region. Fixes: fef9c61fdfabf ("md/raid5: change reshape-progress measurement to cope with reshaping backwards.") Signed-off-by: Benjamin Marzinski Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240702151802.1632010-1-bmarzins@redhat.com --- drivers/md/raid5.c | 64 +++++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 825fa629db33a3..c14cf2410365dd 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5899,6 +5899,39 @@ static int add_all_stripe_bios(struct r5conf *conf, return ret; } +enum reshape_loc { + LOC_NO_RESHAPE, + LOC_AHEAD_OF_RESHAPE, + LOC_INSIDE_RESHAPE, + LOC_BEHIND_RESHAPE, +}; + +static enum reshape_loc get_reshape_loc(struct mddev *mddev, + struct r5conf *conf, sector_t logical_sector) +{ + sector_t reshape_progress, reshape_safe; + /* + * Spinlock is needed as reshape_progress may be + * 64bit on a 32bit platform, and so it might be + * possible to see a half-updated value + * Of course reshape_progress could change after + * the lock is dropped, so once we get a reference + * to the stripe that we think it is, we will have + * to check again. + */ + spin_lock_irq(&conf->device_lock); + reshape_progress = conf->reshape_progress; + reshape_safe = conf->reshape_safe; + spin_unlock_irq(&conf->device_lock); + if (reshape_progress == MaxSector) + return LOC_NO_RESHAPE; + if (ahead_of_reshape(mddev, logical_sector, reshape_progress)) + return LOC_AHEAD_OF_RESHAPE; + if (ahead_of_reshape(mddev, logical_sector, reshape_safe)) + return LOC_INSIDE_RESHAPE; + return LOC_BEHIND_RESHAPE; +} + static enum stripe_result make_stripe_request(struct mddev *mddev, struct r5conf *conf, struct stripe_request_ctx *ctx, sector_t logical_sector, struct bio *bi) @@ -5913,28 +5946,14 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, seq = read_seqcount_begin(&conf->gen_lock); if (unlikely(conf->reshape_progress != MaxSector)) { - /* - * Spinlock is needed as reshape_progress may be - * 64bit on a 32bit platform, and so it might be - * possible to see a half-updated value - * Of course reshape_progress could change after - * the lock is dropped, so once we get a reference - * to the stripe that we think it is, we will have - * to check again. - */ - spin_lock_irq(&conf->device_lock); - if (ahead_of_reshape(mddev, logical_sector, - conf->reshape_progress)) { - previous = 1; - } else { - if (ahead_of_reshape(mddev, logical_sector, - conf->reshape_safe)) { - spin_unlock_irq(&conf->device_lock); - ret = STRIPE_SCHEDULE_AND_RETRY; - goto out; - } + enum reshape_loc loc = get_reshape_loc(mddev, conf, + logical_sector); + if (loc == LOC_INSIDE_RESHAPE) { + ret = STRIPE_SCHEDULE_AND_RETRY; + goto out; } - spin_unlock_irq(&conf->device_lock); + if (loc == LOC_AHEAD_OF_RESHAPE) + previous = 1; } new_sector = raid5_compute_sector(conf, logical_sector, previous, @@ -6112,8 +6131,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) /* Bail out if conflicts with reshape and REQ_NOWAIT is set */ if ((bi->bi_opf & REQ_NOWAIT) && (conf->reshape_progress != MaxSector) && - !ahead_of_reshape(mddev, logical_sector, conf->reshape_progress) && - ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) { + get_reshape_loc(mddev, conf, logical_sector) == LOC_INSIDE_RESHAPE) { bio_wouldblock_error(bi); if (rw == WRITE) md_write_end(mddev); From 162e06871e6dcde861ef608e0c00a8b6a2d35d43 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 4 Jul 2024 11:45:15 +0530 Subject: [PATCH 1562/1578] block: t10-pi: Return correct ref tag when queue has no integrity profile Commit c6e56cf6b2e7 ("block: move integrity information into queue_limits") changed the ref tag calculation logic. It would break if there is no integrity profile. This in turn causes read/write failures for such cases. Fixes: c6e56cf6b2e7 ("block: move integrity information into queue_limits") Signed-off-by: Anuj Gupta Link: https://lore.kernel.org/r/20240704061515.282343-1-joshi.k@samsung.com Signed-off-by: Jens Axboe --- include/linux/t10-pi.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index 1773610010ebaf..2c59fe3efcd424 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -39,8 +39,11 @@ struct t10_pi_tuple { static inline u32 t10_pi_ref_tag(struct request *rq) { - unsigned int shift = rq->q->limits.integrity.interval_exp; + unsigned int shift = ilog2(queue_logical_block_size(rq->q)); + if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && + rq->q->limits.integrity.interval_exp) + shift = rq->q->limits.integrity.interval_exp; return blk_rq_pos(rq) >> (shift - SECTOR_SHIFT) & 0xffffffff; } @@ -61,8 +64,11 @@ static inline u64 lower_48_bits(u64 n) static inline u64 ext_pi_ref_tag(struct request *rq) { - unsigned int shift = rq->q->limits.integrity.interval_exp; + unsigned int shift = ilog2(queue_logical_block_size(rq->q)); + if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && + rq->q->limits.integrity.interval_exp) + shift = rq->q->limits.integrity.interval_exp; return lower_48_bits(blk_rq_pos(rq) >> (shift - SECTOR_SHIFT)); } From a18df07b7d3dbfa7ae54962cc59569002eaafd6d Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Thu, 4 Jul 2024 03:06:38 +0200 Subject: [PATCH 1563/1578] null_blk: don't initialize static 'g_virt_boundary' to false No functional changes intended. Signed-off-by: Zhu Yanjun Link: https://lore.kernel.org/r/20240704010638.324349-1-yanjun.zhu@linux.dev Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 5de9ca4eceb4b4..cbf4b29b16876d 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -77,7 +77,7 @@ enum { NULL_IRQ_TIMER = 2, }; -static bool g_virt_boundary = false; +static bool g_virt_boundary; module_param_named(virt_boundary, g_virt_boundary, bool, 0444); MODULE_PARM_DESC(virt_boundary, "Require a virtual boundary for the device. Default: False"); From ba942238056584efd3adc278a76592258d500918 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Tue, 2 Jul 2024 15:37:53 +0530 Subject: [PATCH 1564/1578] block: reuse original bio_vec array for integrity during clone Modify bio_integrity_clone to reuse the original bvec array instead of allocating and copying it, similar to how bio data path is cloned. Suggested-by: Christoph Hellwig Signed-off-by: Anuj Gupta Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20240702100753.2168-1-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index eb3d7bbe1fe83a..0dc22985fed606 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -76,7 +76,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, &bip->bip_max_vcnt, gfp_mask); if (!bip->bip_vec) goto err; - } else { + } else if (nr_vecs) { bip->bip_vec = bip->bip_inline_vecs; } @@ -562,14 +562,11 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, BUG_ON(bip_src == NULL); - bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt); + bip = bio_integrity_alloc(bio, gfp_mask, 0); if (IS_ERR(bip)) return PTR_ERR(bip); - memcpy(bip->bip_vec, bip_src->bip_vec, - bip_src->bip_vcnt * sizeof(struct bio_vec)); - - bip->bip_vcnt = bip_src->bip_vcnt; + bip->bip_vec = bip_src->bip_vec; bip->bip_iter = bip_src->bip_iter; bip->bip_flags = bip_src->bip_flags & ~BIP_BLOCK_INTEGRITY; From dd54fd4e1780b349043f2d74784b8b702dbd84e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Jul 2024 07:31:14 +0200 Subject: [PATCH 1565/1578] loop: remove the unused inode variable in loop_configure Remove the inode variable now that the last user is gone. Fixes: a17ece76bcfe ("loop: regularize upgrading the block size for direct I/O") Reported-by: kernel test robot Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240705053114.2042976-1-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 4f0d96876b1f5d..a6f2d63678988b 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -996,7 +996,6 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, const struct loop_config *config) { struct file *file = fget(config->fd); - struct inode *inode; struct address_space *mapping; int error; loff_t size; @@ -1033,7 +1032,6 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, goto out_unlock; mapping = file->f_mapping; - inode = mapping->host; if ((config->info.lo_flags & ~LOOP_CONFIGURE_SETTABLE_FLAGS) != 0) { error = -EINVAL; From f4d5dc33c823ef1d7ccbbd2d1e40b871fad0012b Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 4 Jul 2024 14:28:12 +0900 Subject: [PATCH 1566/1578] null_blk: Introduce the zone_full parameter Allow creating a zoned null_blk device with the initial state of its sequential write required zones to be FULL. This is convenient to avoid having to first write these zones to perform read performance evaluation or test zone management operations such as zone reset (and zone reset all). Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240704052816.623865-2-dlemoal@kernel.org Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 9 ++++++++- drivers/block/null_blk/null_blk.h | 1 + drivers/block/null_blk/zoned.c | 10 ++++++++-- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index cbf4b29b16876d..9d0f6da77601b4 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -262,6 +262,10 @@ module_param_named(zone_append_max_sectors, g_zone_append_max_sectors, int, 0444 MODULE_PARM_DESC(zone_append_max_sectors, "Maximum size of a zone append command (in 512B sectors). Specify 0 for zone append emulation"); +static bool g_zone_full; +module_param_named(zone_full, g_zone_full, bool, S_IRUGO); +MODULE_PARM_DESC(zone_full, "Initialize the sequential write required zones of a zoned device to be full. Default: false"); + static struct nullb_device *null_alloc_dev(void); static void null_free_dev(struct nullb_device *dev); static void null_del_dev(struct nullb *nullb); @@ -458,6 +462,7 @@ NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL); NULLB_DEVICE_ATTR(zone_max_open, uint, NULL); NULLB_DEVICE_ATTR(zone_max_active, uint, NULL); NULLB_DEVICE_ATTR(zone_append_max_sectors, uint, NULL); +NULLB_DEVICE_ATTR(zone_full, bool, NULL); NULLB_DEVICE_ATTR(virt_boundary, bool, NULL); NULLB_DEVICE_ATTR(no_sched, bool, NULL); NULLB_DEVICE_ATTR(shared_tags, bool, NULL); @@ -610,6 +615,7 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_zone_append_max_sectors, &nullb_device_attr_zone_readonly, &nullb_device_attr_zone_offline, + &nullb_device_attr_zone_full, &nullb_device_attr_virt_boundary, &nullb_device_attr_no_sched, &nullb_device_attr_shared_tags, @@ -700,7 +706,7 @@ static ssize_t memb_group_features_show(struct config_item *item, char *page) "shared_tags,size,submit_queues,use_per_node_hctx," "virt_boundary,zoned,zone_capacity,zone_max_active," "zone_max_open,zone_nr_conv,zone_offline,zone_readonly," - "zone_size,zone_append_max_sectors\n"); + "zone_size,zone_append_max_sectors,zone_full\n"); } CONFIGFS_ATTR_RO(memb_group_, features); @@ -781,6 +787,7 @@ static struct nullb_device *null_alloc_dev(void) dev->zone_max_open = g_zone_max_open; dev->zone_max_active = g_zone_max_active; dev->zone_append_max_sectors = g_zone_append_max_sectors; + dev->zone_full = g_zone_full; dev->virt_boundary = g_virt_boundary; dev->no_sched = g_no_sched; dev->shared_tags = g_shared_tags; diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h index 3234e6c85eed8c..a7bb32f73ec36a 100644 --- a/drivers/block/null_blk/null_blk.h +++ b/drivers/block/null_blk/null_blk.h @@ -101,6 +101,7 @@ struct nullb_device { bool memory_backed; /* if data is stored in memory */ bool discard; /* if support discard */ bool zoned; /* if device is zoned */ + bool zone_full; /* Initialize zones to be full */ bool virt_boundary; /* virtual boundary on/off for the device */ bool no_sched; /* no IO scheduler for the device */ bool shared_tags; /* share tag set between devices for blk-mq */ diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index 9f7151ad93cfc4..7996e2e7dce28a 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -145,7 +145,7 @@ int null_init_zoned_dev(struct nullb_device *dev, zone = &dev->zones[i]; null_init_zone_lock(dev, zone); - zone->start = zone->wp = sector; + zone->start = sector; if (zone->start + dev->zone_size_sects > dev_capacity_sects) zone->len = dev_capacity_sects - zone->start; else @@ -153,7 +153,13 @@ int null_init_zoned_dev(struct nullb_device *dev, zone->capacity = min_t(sector_t, zone->len, zone_capacity_sects); zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; - zone->cond = BLK_ZONE_COND_EMPTY; + if (dev->zone_full) { + zone->cond = BLK_ZONE_COND_FULL; + zone->wp = zone->start + zone->capacity; + } else{ + zone->cond = BLK_ZONE_COND_EMPTY; + zone->wp = zone->start; + } sector += dev->zone_size_sects; } From ae7e965b36e3132238d16b4ccd223f65162397b5 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 4 Jul 2024 14:28:13 +0900 Subject: [PATCH 1567/1578] dm: Refactor is_abnormal_io() Use a single switch-case to simplify is_abnormal_io() and make this function more readable and easier to modify. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240704052816.623865-3-dlemoal@kernel.org Signed-off-by: Jens Axboe --- drivers/md/dm.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7d107ae06e1ae1..0d80caccbd9eec 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1598,20 +1598,18 @@ static void __send_abnormal_io(struct clone_info *ci, struct dm_target *ti, static bool is_abnormal_io(struct bio *bio) { - enum req_op op = bio_op(bio); - - if (op != REQ_OP_READ && op != REQ_OP_WRITE && op != REQ_OP_FLUSH) { - switch (op) { - case REQ_OP_DISCARD: - case REQ_OP_SECURE_ERASE: - case REQ_OP_WRITE_ZEROES: - return true; - default: - break; - } + switch (bio_op(bio)) { + case REQ_OP_READ: + case REQ_OP_WRITE: + case REQ_OP_FLUSH: + return false; + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_ZEROES: + return true; + default: + return false; } - - return false; } static blk_status_t __process_abnormal_io(struct clone_info *ci, From 81e7706345f06e1e97a092f59697b7e20a0ee868 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 4 Jul 2024 14:28:14 +0900 Subject: [PATCH 1568/1578] dm: handle REQ_OP_ZONE_RESET_ALL This commit implements processing of the REQ_OP_ZONE_RESET_ALL operation for zoned mapped devices. Given that this operation always has a BIO sector of 0 and a 0 size, processing through the regular BIO __split_and_process_bio() function does not work because this function would always select the first target. Instead, handling of this operation is implemented using the function __send_zone_reset_all(). Similarly to the __send_empty_flush() function, the new __send_zone_reset_all() function manually goes through all targets of a mapped device table doing the following: 1) If the target can natively support REQ_OP_ZONE_RESET_ALL, __send_duplicate_bios() is used to forward the reset all operation to the target. This case is handled with the __send_zone_reset_all_native() function. 2) For other targets, the function __send_zone_reset_all_emulated() is executed to emulate the execution of REQ_OP_ZONE_RESET_ALL using regular REQ_OP_ZONE_RESET operations. Targets that can natively support REQ_OP_ZONE_RESET_ALL are identified using the new target field zone_reset_all_supported. This boolean is set to true in for targets that have reliable zone limits, that is, targets that map all sequential write required zones of their zoned device(s). Setting this field is handled in dm_set_zones_restrictions() and device_get_zone_resource_limits(). For targets with unreliable zone limits, REQ_OP_ZONE_RESET_ALL must be emulated (case 2 above). This is implemented with __send_zone_reset_all_emulated() and is similar to the block layer function blkdev_zone_reset_all_emulated(): first a report zones is done for the zones of the target to identify zones that need reset, that is, any sequential write required zone that is not already empty. This is done using a bitmap and the function dm_zone_get_reset_bitmap() which sets to 1 the bit corresponding to a zone that needs reset. Next, this zone bitmap is inspected and a clone BIO modified to use the REQ_OP_ZONE_RESET operation issued for any zone with its bit set in the zone bitmap. This implementation is more efficient than what the block layer does with blkdev_zone_reset_all_emulated(), which is always used for DM zoned devices currently: as we can natively use REQ_OP_ZONE_RESET_ALL on targets mapping all sequential write required zones, resetting all zones of a zoned mapped device can be much faster compared to always emulating this operation using regular per-zone reset. In the worst case, this implementation is as-efficient as the block layer emulation. This reduction in the time it takes to reset all zones of a zoned mapped device depends directly on the mapped device targets mapping (reliable zone limits or not). Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240704052816.623865-4-dlemoal@kernel.org Signed-off-by: Jens Axboe --- drivers/md/dm-zone.c | 50 ++++++++++++- drivers/md/dm.c | 135 +++++++++++++++++++++++++++++++++- drivers/md/dm.h | 3 + include/linux/device-mapper.h | 7 ++ 4 files changed, 190 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 4d37e53b50ee64..c0d41c36e06ebf 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -292,10 +292,12 @@ static int device_get_zone_resource_limits(struct dm_target *ti, /* * If the target does not map all sequential zones, the limits - * will not be reliable. + * will not be reliable and we cannot use REQ_OP_ZONE_RESET_ALL. */ - if (zc.target_nr_seq_zones < zc.total_nr_seq_zones) + if (zc.target_nr_seq_zones < zc.total_nr_seq_zones) { zlim->reliable_limits = false; + ti->zone_reset_all_supported = false; + } /* * If the target maps less sequential zones than the limit values, then @@ -353,6 +355,14 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, for (unsigned int i = 0; i < t->num_targets; i++) { struct dm_target *ti = dm_table_get_target(t, i); + /* + * Assume that the target can accept REQ_OP_ZONE_RESET_ALL. + * device_get_zone_resource_limits() may adjust this if one of + * the device used by the target does not have all its + * sequential write required zones mapped. + */ + ti->zone_reset_all_supported = true; + if (!ti->type->iterate_devices || ti->type->iterate_devices(ti, device_get_zone_resource_limits, &zlim)) { @@ -420,3 +430,39 @@ void dm_zone_endio(struct dm_io *io, struct bio *clone) return; } + +static int dm_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + /* + * For an all-zones reset, ignore conventional, empty, read-only + * and offline zones. + */ + switch (zone->cond) { + case BLK_ZONE_COND_NOT_WP: + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_READONLY: + case BLK_ZONE_COND_OFFLINE: + return 0; + default: + set_bit(idx, (unsigned long *)data); + return 0; + } +} + +int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t, + sector_t sector, unsigned int nr_zones, + unsigned long *need_reset) +{ + int ret; + + ret = dm_blk_do_report_zones(md, t, sector, nr_zones, + dm_zone_need_reset_cb, need_reset); + if (ret != nr_zones) { + DMERR("Get %s zone reset bitmap failed\n", + md->disk->disk_name); + return -EIO; + } + + return 0; +} diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0d80caccbd9eec..4b1b69e576a550 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1606,6 +1606,7 @@ static bool is_abnormal_io(struct bio *bio) case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: case REQ_OP_WRITE_ZEROES: + case REQ_OP_ZONE_RESET_ALL: return true; default: return false; @@ -1774,6 +1775,119 @@ static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio) { return dm_emulate_zone_append(md) && blk_zone_plug_bio(bio, 0); } + +static blk_status_t __send_zone_reset_all_emulated(struct clone_info *ci, + struct dm_target *ti) +{ + struct bio_list blist = BIO_EMPTY_LIST; + struct mapped_device *md = ci->io->md; + unsigned int zone_sectors = md->disk->queue->limits.chunk_sectors; + unsigned long *need_reset; + unsigned int i, nr_zones, nr_reset; + unsigned int num_bios = 0; + blk_status_t sts = BLK_STS_OK; + sector_t sector = ti->begin; + struct bio *clone; + int ret; + + nr_zones = ti->len >> ilog2(zone_sectors); + need_reset = bitmap_zalloc(nr_zones, GFP_NOIO); + if (!need_reset) + return BLK_STS_RESOURCE; + + ret = dm_zone_get_reset_bitmap(md, ci->map, ti->begin, + nr_zones, need_reset); + if (ret) { + sts = BLK_STS_IOERR; + goto free_bitmap; + } + + /* If we have no zone to reset, we are done. */ + nr_reset = bitmap_weight(need_reset, nr_zones); + if (!nr_reset) + goto free_bitmap; + + atomic_add(nr_zones, &ci->io->io_count); + + for (i = 0; i < nr_zones; i++) { + + if (!test_bit(i, need_reset)) { + sector += zone_sectors; + continue; + } + + if (bio_list_empty(&blist)) { + /* This may take a while, so be nice to others */ + if (num_bios) + cond_resched(); + + /* + * We may need to reset thousands of zones, so let's + * not go crazy with the clone allocation. + */ + alloc_multiple_bios(&blist, ci, ti, min(nr_reset, 32), + NULL, GFP_NOIO); + } + + /* Get a clone and change it to a regular reset operation. */ + clone = bio_list_pop(&blist); + clone->bi_opf &= ~REQ_OP_MASK; + clone->bi_opf |= REQ_OP_ZONE_RESET | REQ_SYNC; + clone->bi_iter.bi_sector = sector; + clone->bi_iter.bi_size = 0; + __map_bio(clone); + + sector += zone_sectors; + num_bios++; + nr_reset--; + } + + WARN_ON_ONCE(!bio_list_empty(&blist)); + atomic_sub(nr_zones - num_bios, &ci->io->io_count); + ci->sector_count = 0; + +free_bitmap: + bitmap_free(need_reset); + + return sts; +} + +static void __send_zone_reset_all_native(struct clone_info *ci, + struct dm_target *ti) +{ + unsigned int bios; + + atomic_add(1, &ci->io->io_count); + bios = __send_duplicate_bios(ci, ti, 1, NULL, GFP_NOIO); + atomic_sub(1 - bios, &ci->io->io_count); + + ci->sector_count = 0; +} + +static blk_status_t __send_zone_reset_all(struct clone_info *ci) +{ + struct dm_table *t = ci->map; + blk_status_t sts = BLK_STS_OK; + + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (ti->zone_reset_all_supported) { + __send_zone_reset_all_native(ci, ti); + continue; + } + + sts = __send_zone_reset_all_emulated(ci, ti); + if (sts != BLK_STS_OK) + break; + } + + /* Release the reference that alloc_io() took for submission. */ + atomic_sub(1, &ci->io->io_count); + + return sts; +} + #else static inline bool dm_zone_bio_needs_split(struct mapped_device *md, struct bio *bio) @@ -1784,6 +1898,10 @@ static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio) { return false; } +static blk_status_t __send_zone_reset_all(struct clone_info *ci) +{ + return BLK_STS_NOTSUPP; +} #endif /* @@ -1797,9 +1915,14 @@ static void dm_split_and_process_bio(struct mapped_device *md, blk_status_t error = BLK_STS_OK; bool is_abnormal, need_split; - need_split = is_abnormal = is_abnormal_io(bio); - if (static_branch_unlikely(&zoned_enabled)) - need_split = is_abnormal || dm_zone_bio_needs_split(md, bio); + is_abnormal = is_abnormal_io(bio); + if (static_branch_unlikely(&zoned_enabled)) { + /* Special case REQ_OP_ZONE_RESET_ALL as it cannot be split. */ + need_split = (bio_op(bio) != REQ_OP_ZONE_RESET_ALL) && + (is_abnormal || dm_zone_bio_needs_split(md, bio)); + } else { + need_split = is_abnormal; + } if (unlikely(need_split)) { /* @@ -1840,6 +1963,12 @@ static void dm_split_and_process_bio(struct mapped_device *md, goto out; } + if (static_branch_unlikely(&zoned_enabled) && + (bio_op(bio) == REQ_OP_ZONE_RESET_ALL)) { + error = __send_zone_reset_all(&ci); + goto out; + } + error = __split_and_process_bio(&ci); if (error || !ci.sector_count) goto out; diff --git a/drivers/md/dm.h b/drivers/md/dm.h index c984ecb64b1e89..cc466ad5cb1df2 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -110,6 +110,9 @@ int dm_blk_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); bool dm_is_zone_write(struct mapped_device *md, struct bio *bio); int dm_zone_map_bio(struct dm_target_io *io); +int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t, + sector_t sector, unsigned int nr_zones, + unsigned long *need_reset); #else #define dm_blk_report_zones NULL static inline bool dm_is_zone_write(struct mapped_device *md, struct bio *bio) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 82b2195efaca78..15d28164bbbdb8 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -357,6 +357,13 @@ struct dm_target { */ bool discards_supported:1; + /* + * Automatically set by dm-core if this target supports + * REQ_OP_ZONE_RESET_ALL. Otherwise, this operation will be emulated + * using REQ_OP_ZONE_RESET. Target drivers must not set this manually. + */ + bool zone_reset_all_supported:1; + /* * Set if this target requires that discards be split on * 'max_discard_sectors' boundaries. From f2a7bea23710fceb99dac6da4ef82c3cc8932f7f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 4 Jul 2024 14:28:15 +0900 Subject: [PATCH 1569/1578] block: Remove REQ_OP_ZONE_RESET_ALL emulation Now that device mapper can handle resetting all zones of a mapped zoned device using REQ_OP_ZONE_RESET_ALL, all zoned block device drivers support this operation. With this, the request queue feature BLK_FEAT_ZONE_RESETALL is not necessary and the emulation code in blk-zone.c can be removed. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240704052816.623865-5-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-core.c | 5 +-- block/blk-zoned.c | 76 ++-------------------------------- drivers/block/null_blk/zoned.c | 2 +- drivers/block/ublk_drv.c | 2 +- drivers/block/virtio_blk.c | 2 +- drivers/nvme/host/zns.c | 2 +- drivers/scsi/sd_zbc.c | 2 +- include/linux/blkdev.h | 5 --- 8 files changed, 9 insertions(+), 87 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 71b7622c523a30..02bceeb36f2c48 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -830,11 +830,8 @@ void submit_bio_noacct(struct bio *bio) case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: case REQ_OP_ZONE_FINISH: - if (!bdev_is_zoned(bio->bi_bdev)) - goto not_supported; - break; case REQ_OP_ZONE_RESET_ALL: - if (!bdev_is_zoned(bio->bi_bdev) || !blk_queue_zone_resetall(q)) + if (!bdev_is_zoned(bio->bi_bdev)) goto not_supported; break; case REQ_OP_DRV_IN: diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 601c21a224c93e..0007ef9cd5cacd 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -157,70 +157,6 @@ static inline unsigned long *blk_alloc_zone_bitmap(int node, GFP_NOIO, node); } -static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx, - void *data) -{ - /* - * For an all-zones reset, ignore conventional, empty, read-only - * and offline zones. - */ - switch (zone->cond) { - case BLK_ZONE_COND_NOT_WP: - case BLK_ZONE_COND_EMPTY: - case BLK_ZONE_COND_READONLY: - case BLK_ZONE_COND_OFFLINE: - return 0; - default: - set_bit(idx, (unsigned long *)data); - return 0; - } -} - -static int blkdev_zone_reset_all_emulated(struct block_device *bdev) -{ - struct gendisk *disk = bdev->bd_disk; - sector_t capacity = bdev_nr_sectors(bdev); - sector_t zone_sectors = bdev_zone_sectors(bdev); - unsigned long *need_reset; - struct bio *bio = NULL; - sector_t sector = 0; - int ret; - - need_reset = blk_alloc_zone_bitmap(disk->queue->node, disk->nr_zones); - if (!need_reset) - return -ENOMEM; - - ret = disk->fops->report_zones(disk, 0, disk->nr_zones, - blk_zone_need_reset_cb, need_reset); - if (ret < 0) - goto out_free_need_reset; - - ret = 0; - while (sector < capacity) { - if (!test_bit(disk_zone_no(disk, sector), need_reset)) { - sector += zone_sectors; - continue; - } - - bio = blk_next_bio(bio, bdev, 0, REQ_OP_ZONE_RESET | REQ_SYNC, - GFP_KERNEL); - bio->bi_iter.bi_sector = sector; - sector += zone_sectors; - - /* This may take a while, so be nice to others */ - cond_resched(); - } - - if (bio) { - ret = submit_bio_wait(bio); - bio_put(bio); - } - -out_free_need_reset: - kfree(need_reset); - return ret; -} - static int blkdev_zone_reset_all(struct block_device *bdev) { struct bio bio; @@ -247,7 +183,6 @@ static int blkdev_zone_reset_all(struct block_device *bdev) int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, sector_t sector, sector_t nr_sectors) { - struct request_queue *q = bdev_get_queue(bdev); sector_t zone_sectors = bdev_zone_sectors(bdev); sector_t capacity = bdev_nr_sectors(bdev); sector_t end_sector = sector + nr_sectors; @@ -275,16 +210,11 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, return -EINVAL; /* - * In the case of a zone reset operation over all zones, - * REQ_OP_ZONE_RESET_ALL can be used with devices supporting this - * command. For other devices, we emulate this command behavior by - * identifying the zones needing a reset. + * In the case of a zone reset operation over all zones, use + * REQ_OP_ZONE_RESET_ALL. */ - if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) { - if (!blk_queue_zone_resetall(q)) - return blkdev_zone_reset_all_emulated(bdev); + if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) return blkdev_zone_reset_all(bdev); - } while (sector < end_sector) { bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL); diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index 7996e2e7dce28a..9bc768b2ca56b7 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -164,7 +164,7 @@ int null_init_zoned_dev(struct nullb_device *dev, sector += dev->zone_size_sects; } - lim->features |= BLK_FEAT_ZONED | BLK_FEAT_ZONE_RESETALL; + lim->features |= BLK_FEAT_ZONED; lim->chunk_sectors = dev->zone_size_sects; lim->max_zone_append_sectors = dev->zone_append_max_sectors; lim->max_open_zones = dev->zone_max_open; diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 4fdff13fc23b8a..d10a2ea072921d 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2194,7 +2194,7 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) return -EOPNOTSUPP; - lim.features |= BLK_FEAT_ZONED | BLK_FEAT_ZONE_RESETALL; + lim.features |= BLK_FEAT_ZONED; lim.max_active_zones = p->max_active_zones; lim.max_open_zones = p->max_open_zones; lim.max_zone_append_sectors = p->max_zone_append_sectors; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 6c64a67ab9c901..84c3efd0c6117f 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -728,7 +728,7 @@ static int virtblk_read_zoned_limits(struct virtio_blk *vblk, dev_dbg(&vdev->dev, "probing host-managed zoned device\n"); - lim->features |= BLK_FEAT_ZONED | BLK_FEAT_ZONE_RESETALL; + lim->features |= BLK_FEAT_ZONED; virtio_cread(vdev, struct virtio_blk_config, zoned.max_open_zones, &v); diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index 99bb89c2495ae3..9a06f9d98cd68c 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -108,7 +108,7 @@ int nvme_query_zone_info(struct nvme_ns *ns, unsigned lbaf, void nvme_update_zone_info(struct nvme_ns *ns, struct queue_limits *lim, struct nvme_zone_info *zi) { - lim->features |= BLK_FEAT_ZONED | BLK_FEAT_ZONE_RESETALL; + lim->features |= BLK_FEAT_ZONED; lim->max_open_zones = zi->max_open_zones; lim->max_active_zones = zi->max_active_zones; lim->max_zone_append_sectors = ns->ctrl->max_zone_append; diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index f7067afac79c14..c8b9654d30f0c3 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -599,7 +599,7 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, struct queue_limits *lim, if (sdkp->device->type != TYPE_ZBC) return 0; - lim->features |= BLK_FEAT_ZONED | BLK_FEAT_ZONE_RESETALL; + lim->features |= BLK_FEAT_ZONED; /* * Per ZBC and ZAC specifications, writes in sequential write required diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4d0d4b83bc740f..dc250d8070d215 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -318,9 +318,6 @@ typedef unsigned int __bitwise blk_features_t; /* is a zoned device */ #define BLK_FEAT_ZONED ((__force blk_features_t)(1u << 10)) -/* supports Zone Reset All */ -#define BLK_FEAT_ZONE_RESETALL ((__force blk_features_t)(1u << 11)) - /* supports PCI(e) p2p requests */ #define BLK_FEAT_PCI_P2PDMA ((__force blk_features_t)(1u << 12)) @@ -618,8 +615,6 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) #define blk_queue_nonrot(q) (!((q)->limits.features & BLK_FEAT_ROTATIONAL)) #define blk_queue_io_stat(q) ((q)->limits.features & BLK_FEAT_IO_STAT) -#define blk_queue_zone_resetall(q) \ - ((q)->limits.features & BLK_FEAT_ZONE_RESETALL) #define blk_queue_dax(q) ((q)->limits.features & BLK_FEAT_DAX) #define blk_queue_pci_p2pdma(q) ((q)->limits.features & BLK_FEAT_PCI_P2PDMA) #ifdef CONFIG_BLK_RQ_ALLOC_TIME From 2f20872ed43185780a5f30581472599342c86d4a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 4 Jul 2024 14:28:16 +0900 Subject: [PATCH 1570/1578] block: Remove blk_alloc_zone_bitmap() Remove the helper function blk_alloc_zone_bitmap() and replace its single call site with a call to bitmap_alloc(). To be consistent with this change, use bitmap_free() to free a disk convnetional zone bitmap. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240704052816.623865-6-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 0007ef9cd5cacd..7ace890136a333 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -150,13 +150,6 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(blkdev_report_zones); -static inline unsigned long *blk_alloc_zone_bitmap(int node, - unsigned int nr_zones) -{ - return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long), - GFP_NOIO, node); -} - static int blkdev_zone_reset_all(struct block_device *bdev) { struct bio bio; @@ -1482,7 +1475,7 @@ void disk_free_zone_resources(struct gendisk *disk) mempool_destroy(disk->zone_wplugs_pool); disk->zone_wplugs_pool = NULL; - kfree(disk->conv_zones_bitmap); + bitmap_free(disk->conv_zones_bitmap); disk->conv_zones_bitmap = NULL; disk->zone_capacity = 0; disk->last_zone_capacity = 0; @@ -1604,7 +1597,6 @@ static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, struct blk_revalidate_zone_args *args) { struct gendisk *disk = args->disk; - struct request_queue *q = disk->queue; if (zone->capacity != zone->len) { pr_warn("%s: Invalid conventional zone capacity\n", @@ -1620,7 +1612,7 @@ static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, if (!args->conv_zones_bitmap) { args->conv_zones_bitmap = - blk_alloc_zone_bitmap(q->node, args->nr_zones); + bitmap_zalloc(args->nr_zones, GFP_NOIO); if (!args->conv_zones_bitmap) return -ENOMEM; } From 73a768d5f95533574bb8ace34eb683a88c40509e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jul 2024 18:51:15 +0200 Subject: [PATCH 1571/1578] block: factor out a blk_write_zeroes_limit helper Contrary to the comment in __blkdev_issue_write_zeroes, nothing here checks for a potential bi_size overflow. Add a helper mirroring the secure erase code for the check. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240701165219.1571322-6-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-lib.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/block/blk-lib.c b/block/blk-lib.c index 442da9dad04213..297bcf6896930f 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -103,24 +103,28 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL(blkdev_issue_discard); +static sector_t bio_write_zeroes_limit(struct block_device *bdev) +{ + sector_t bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + + return min(bdev_write_zeroes_sectors(bdev), + (UINT_MAX >> SECTOR_SHIFT) & ~bs_mask); +} + static int __blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, unsigned flags) { struct bio *bio = *biop; - unsigned int max_sectors; if (bdev_read_only(bdev)) return -EPERM; - - /* Ensure that max_sectors doesn't overflow bi_size */ - max_sectors = bdev_write_zeroes_sectors(bdev); - - if (max_sectors == 0) + if (!bdev_write_zeroes_sectors(bdev)) return -EOPNOTSUPP; while (nr_sects) { - unsigned int len = min_t(sector_t, nr_sects, max_sectors); + unsigned int len = min_t(sector_t, nr_sects, + bio_write_zeroes_limit(bdev)); bio = blk_next_bio(bio, bdev, 0, REQ_OP_WRITE_ZEROES, gfp_mask); bio->bi_iter.bi_sector = sector; From ff760a8f0d09f4ba7574ae2ca8be987854f5246d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jul 2024 18:51:16 +0200 Subject: [PATCH 1572/1578] block: remove the LBA alignment check in __blkdev_issue_zeroout __blkdev_issue_zeroout is a purely kernel internal API and thus can rely on the block layer sector alignment checks. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240701165219.1571322-7-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-lib.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/block/blk-lib.c b/block/blk-lib.c index 297bcf6896930f..601a122af8587c 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -209,11 +209,6 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, unsigned flags) { int ret; - sector_t bs_mask; - - bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; - if ((sector | nr_sects) & bs_mask) - return -EINVAL; ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, biop, flags); From f6eacb26541ad1eabc40d7e9f5cd86bae7dc0b46 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jul 2024 18:51:17 +0200 Subject: [PATCH 1573/1578] block: move read-only and supported checks into (__)blkdev_issue_zeroout Move these checks out of the lower level helpers and into the higher level ones to prepare for refactoring. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240701165219.1571322-8-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-lib.c | 51 ++++++++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 28 deletions(-) diff --git a/block/blk-lib.c b/block/blk-lib.c index 601a122af8587c..02f69c683a9839 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -111,17 +111,12 @@ static sector_t bio_write_zeroes_limit(struct block_device *bdev) (UINT_MAX >> SECTOR_SHIFT) & ~bs_mask); } -static int __blkdev_issue_write_zeroes(struct block_device *bdev, +static void __blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, unsigned flags) { struct bio *bio = *biop; - if (bdev_read_only(bdev)) - return -EPERM; - if (!bdev_write_zeroes_sectors(bdev)) - return -EOPNOTSUPP; - while (nr_sects) { unsigned int len = min_t(sector_t, nr_sects, bio_write_zeroes_limit(bdev)); @@ -138,7 +133,6 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev, } *biop = bio; - return 0; } /* @@ -154,7 +148,7 @@ static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects) return min(pages, (sector_t)BIO_MAX_VECS); } -static int __blkdev_issue_zero_pages(struct block_device *bdev, +static void __blkdev_issue_zero_pages(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop) { @@ -162,9 +156,6 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev, int bi_size = 0; unsigned int sz; - if (bdev_read_only(bdev)) - return -EPERM; - while (nr_sects != 0) { bio = blk_next_bio(bio, bdev, __blkdev_sectors_to_bio_pages(nr_sects), REQ_OP_WRITE, gfp_mask); @@ -182,7 +173,6 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev, } *biop = bio; - return 0; } /** @@ -208,15 +198,19 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, unsigned flags) { - int ret; - - ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, - biop, flags); - if (ret != -EOPNOTSUPP || (flags & BLKDEV_ZERO_NOFALLBACK)) - return ret; + if (bdev_read_only(bdev)) + return -EPERM; - return __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask, - biop); + if (bdev_write_zeroes_sectors(bdev)) { + __blkdev_issue_write_zeroes(bdev, sector, nr_sects, + gfp_mask, biop, flags); + } else { + if (flags & BLKDEV_ZERO_NOFALLBACK) + return -EOPNOTSUPP; + __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask, + biop); + } + return 0; } EXPORT_SYMBOL(__blkdev_issue_zeroout); @@ -245,21 +239,22 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; if ((sector | nr_sects) & bs_mask) return -EINVAL; + if (bdev_read_only(bdev)) + return -EPERM; + if ((flags & BLKDEV_ZERO_NOFALLBACK) && !try_write_zeroes) + return -EOPNOTSUPP; retry: bio = NULL; blk_start_plug(&plug); if (try_write_zeroes) { - ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, - gfp_mask, &bio, flags); - } else if (!(flags & BLKDEV_ZERO_NOFALLBACK)) { - ret = __blkdev_issue_zero_pages(bdev, sector, nr_sects, - gfp_mask, &bio); + __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, + &bio, flags); } else { - /* No zeroing offload support */ - ret = -EOPNOTSUPP; + __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask, + &bio); } - if (ret == 0 && bio) { + if (bio) { ret = submit_bio_wait(bio); bio_put(bio); } From 99800ced26b9d87a918aa9824881bdb90a3c1b03 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jul 2024 18:51:18 +0200 Subject: [PATCH 1574/1578] block: refacto blkdev_issue_zeroout Split out two well-defined helpers for hardware supported Write Zeroes and manually writing zeroes using the Write command. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240701165219.1571322-9-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-lib.c | 94 +++++++++++++++++++++++++++++-------------------- 1 file changed, 55 insertions(+), 39 deletions(-) diff --git a/block/blk-lib.c b/block/blk-lib.c index 02f69c683a9839..9dbab472c63f12 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -135,6 +135,32 @@ static void __blkdev_issue_write_zeroes(struct block_device *bdev, *biop = bio; } +static int blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp, unsigned flags) +{ + struct bio *bio = NULL; + struct blk_plug plug; + int ret = 0; + + blk_start_plug(&plug); + __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp, &bio, flags); + if (bio) { + ret = submit_bio_wait(bio); + bio_put(bio); + } + blk_finish_plug(&plug); + + /* + * For some devices there is no non-destructive way to verify whether + * WRITE ZEROES is actually supported. These will clear the capability + * on an I/O error, in which case we'll turn any error into + * "not supported" here. + */ + if (ret && !bdev_write_zeroes_sectors(bdev)) + return -EOPNOTSUPP; + return ret; +} + /* * Convert a number of 512B sectors to a number of pages. * The result is limited to a number of pages that can fit into a BIO. @@ -175,6 +201,27 @@ static void __blkdev_issue_zero_pages(struct block_device *bdev, *biop = bio; } +static int blkdev_issue_zero_pages(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp, unsigned flags) +{ + struct bio *bio = NULL; + struct blk_plug plug; + int ret = 0; + + if (flags & BLKDEV_ZERO_NOFALLBACK) + return -EOPNOTSUPP; + + blk_start_plug(&plug); + __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp, &bio); + if (bio) { + ret = submit_bio_wait(bio); + bio_put(bio); + } + blk_finish_plug(&plug); + + return ret; +} + /** * __blkdev_issue_zeroout - generate number of zero filed write bios * @bdev: blockdev to issue @@ -230,52 +277,21 @@ EXPORT_SYMBOL(__blkdev_issue_zeroout); int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned flags) { - int ret = 0; - sector_t bs_mask; - struct bio *bio; - struct blk_plug plug; - bool try_write_zeroes = !!bdev_write_zeroes_sectors(bdev); + int ret; - bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; - if ((sector | nr_sects) & bs_mask) + if ((sector | nr_sects) & ((bdev_logical_block_size(bdev) >> 9) - 1)) return -EINVAL; if (bdev_read_only(bdev)) return -EPERM; - if ((flags & BLKDEV_ZERO_NOFALLBACK) && !try_write_zeroes) - return -EOPNOTSUPP; -retry: - bio = NULL; - blk_start_plug(&plug); - if (try_write_zeroes) { - __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, - &bio, flags); - } else { - __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask, - &bio); - } - if (bio) { - ret = submit_bio_wait(bio); - bio_put(bio); - } - blk_finish_plug(&plug); - if (ret && try_write_zeroes) { - if (!(flags & BLKDEV_ZERO_NOFALLBACK)) { - try_write_zeroes = false; - goto retry; - } - if (!bdev_write_zeroes_sectors(bdev)) { - /* - * Zeroing offload support was indicated, but the - * device reported ILLEGAL REQUEST (for some devices - * there is no non-destructive way to verify whether - * WRITE ZEROES is actually supported). - */ - ret = -EOPNOTSUPP; - } + if (bdev_write_zeroes_sectors(bdev)) { + ret = blkdev_issue_write_zeroes(bdev, sector, nr_sects, + gfp_mask, flags); + if (!ret) + return ret; } - return ret; + return blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask, flags); } EXPORT_SYMBOL(blkdev_issue_zeroout); From 39722a2f2bcd82bdecc226711412d88b54fcb05b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jul 2024 18:51:19 +0200 Subject: [PATCH 1575/1578] block: limit the Write Zeroes to manually writing zeroes fallback Only fall back from hardware Write Zeroes failures when blkdev_issue_write_zeroes returns -EOPNOTSUPP; Note that blkdev_issue_write_zeroes turns any failure into -EOPNOTSUPP when the write zeroes queue limit has been cleared to 0, so this still catches all I/O errors where the driver detected missing support for the hardware acceleration. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240701165219.1571322-10-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-lib.c b/block/blk-lib.c index 9dbab472c63f12..2384acd913d21f 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -287,7 +287,7 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, if (bdev_write_zeroes_sectors(bdev)) { ret = blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, flags); - if (!ret) + if (ret != -EOPNOTSUPP) return ret; } From bf86bcdb40123ee99669ee91b67e023669433a1a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jul 2024 18:51:20 +0200 Subject: [PATCH 1576/1578] blk-lib: check for kill signal in ioctl BLKZEROOUT Zeroout can access a significant capacity and take longer than the user expected. A user may change their mind about wanting to run that command and attempt to kill the process and do something else with their device. But since the task is uninterruptable, they have to wait for it to finish, which could be many hours. Add a new BLKDEV_ZERO_KILLABLE flag for blkdev_issue_zeroout that checks for a fatal signal at each iteration so the user doesn't have to wait for their regretted operation to complete naturally. Heavily based on an earlier patch from Keith Busch. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240701165219.1571322-11-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-lib.c | 66 +++++++++++++++++++++++++++--------------- block/ioctl.c | 2 +- include/linux/blkdev.h | 1 + 3 files changed, 45 insertions(+), 24 deletions(-) diff --git a/block/blk-lib.c b/block/blk-lib.c index 2384acd913d21f..9f735efa6c9459 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -115,24 +115,27 @@ static void __blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, unsigned flags) { - struct bio *bio = *biop; - while (nr_sects) { unsigned int len = min_t(sector_t, nr_sects, bio_write_zeroes_limit(bdev)); + struct bio *bio; + + if ((flags & BLKDEV_ZERO_KILLABLE) && + fatal_signal_pending(current)) + break; - bio = blk_next_bio(bio, bdev, 0, REQ_OP_WRITE_ZEROES, gfp_mask); + bio = bio_alloc(bdev, 0, REQ_OP_WRITE_ZEROES, gfp_mask); bio->bi_iter.bi_sector = sector; if (flags & BLKDEV_ZERO_NOUNMAP) bio->bi_opf |= REQ_NOUNMAP; bio->bi_iter.bi_size = len << SECTOR_SHIFT; + *biop = bio_chain_and_submit(*biop, bio); + nr_sects -= len; sector += len; cond_resched(); } - - *biop = bio; } static int blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector, @@ -145,6 +148,12 @@ static int blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector, blk_start_plug(&plug); __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp, &bio, flags); if (bio) { + if ((flags & BLKDEV_ZERO_KILLABLE) && + fatal_signal_pending(current)) { + bio_await_chain(bio); + blk_finish_plug(&plug); + return -EINTR; + } ret = submit_bio_wait(bio); bio_put(bio); } @@ -176,29 +185,34 @@ static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects) static void __blkdev_issue_zero_pages(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, - struct bio **biop) + struct bio **biop, unsigned int flags) { - struct bio *bio = *biop; - int bi_size = 0; - unsigned int sz; + while (nr_sects) { + unsigned int nr_vecs = __blkdev_sectors_to_bio_pages(nr_sects); + struct bio *bio; - while (nr_sects != 0) { - bio = blk_next_bio(bio, bdev, __blkdev_sectors_to_bio_pages(nr_sects), - REQ_OP_WRITE, gfp_mask); + bio = bio_alloc(bdev, nr_vecs, REQ_OP_WRITE, gfp_mask); bio->bi_iter.bi_sector = sector; - while (nr_sects != 0) { - sz = min((sector_t) PAGE_SIZE, nr_sects << 9); - bi_size = bio_add_page(bio, ZERO_PAGE(0), sz, 0); - nr_sects -= bi_size >> 9; - sector += bi_size >> 9; - if (bi_size < sz) + if ((flags & BLKDEV_ZERO_KILLABLE) && + fatal_signal_pending(current)) + break; + + do { + unsigned int len, added; + + len = min_t(sector_t, + PAGE_SIZE, nr_sects << SECTOR_SHIFT); + added = bio_add_page(bio, ZERO_PAGE(0), len, 0); + if (added < len) break; - } + nr_sects -= added >> SECTOR_SHIFT; + sector += added >> SECTOR_SHIFT; + } while (nr_sects); + + *biop = bio_chain_and_submit(*biop, bio); cond_resched(); } - - *biop = bio; } static int blkdev_issue_zero_pages(struct block_device *bdev, sector_t sector, @@ -212,8 +226,14 @@ static int blkdev_issue_zero_pages(struct block_device *bdev, sector_t sector, return -EOPNOTSUPP; blk_start_plug(&plug); - __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp, &bio); + __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp, &bio, flags); if (bio) { + if ((flags & BLKDEV_ZERO_KILLABLE) && + fatal_signal_pending(current)) { + bio_await_chain(bio); + blk_finish_plug(&plug); + return -EINTR; + } ret = submit_bio_wait(bio); bio_put(bio); } @@ -255,7 +275,7 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, if (flags & BLKDEV_ZERO_NOFALLBACK) return -EOPNOTSUPP; __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask, - biop); + biop, flags); } return 0; } diff --git a/block/ioctl.c b/block/ioctl.c index d570e16958961e..e8e4a4190f183a 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -224,7 +224,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode, goto fail; err = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL, - BLKDEV_ZERO_NOUNMAP); + BLKDEV_ZERO_NOUNMAP | BLKDEV_ZERO_KILLABLE); fail: filemap_invalidate_unlock(bdev->bd_mapping); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index dc250d8070d215..02e04df272828e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1095,6 +1095,7 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, #define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */ #define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */ +#define BLKDEV_ZERO_KILLABLE (1 << 2) /* interruptible by fatal signals */ extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, From 25f76c3db2f08428b5acd082a52787164001eb6e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 6 Jul 2024 09:52:17 +0200 Subject: [PATCH 1577/1578] block: add a bvec_phys helper Get callers out of poking into bvec internals a bit more. Not a huge win right now, but with the proposed new DMA mapping API we might end up with a lot more of this otherwise. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20240706075228.2350978-2-hch@lst.de Signed-off-by: Jens Axboe --- arch/m68k/emu/nfblock.c | 2 +- block/bio.c | 2 +- block/blk.h | 4 ++-- include/linux/bvec.h | 14 ++++++++++++++ 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c index 8eea7ef9115146..874fe958877388 100644 --- a/arch/m68k/emu/nfblock.c +++ b/arch/m68k/emu/nfblock.c @@ -71,7 +71,7 @@ static void nfhd_submit_bio(struct bio *bio) len = bvec.bv_len; len >>= 9; nfhd_read_write(dev->id, 0, dir, sec >> shift, len >> shift, - page_to_phys(bvec.bv_page) + bvec.bv_offset); + bvec_phys(&bvec)); sec += len; } bio_endio(bio); diff --git a/block/bio.c b/block/bio.c index e9e809a63c5975..a3b1b2266c50be 100644 --- a/block/bio.c +++ b/block/bio.c @@ -953,7 +953,7 @@ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, bool *same_page) { unsigned long mask = queue_segment_boundary(q); - phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset; + phys_addr_t addr1 = bvec_phys(bv); phys_addr_t addr2 = page_to_phys(page) + offset + len - 1; if ((addr1 | mask) != (addr2 | mask)) diff --git a/block/blk.h b/block/blk.h index 47dadd2439b1ca..8e8936e97307c6 100644 --- a/block/blk.h +++ b/block/blk.h @@ -98,8 +98,8 @@ static inline bool biovec_phys_mergeable(struct request_queue *q, struct bio_vec *vec1, struct bio_vec *vec2) { unsigned long mask = queue_segment_boundary(q); - phys_addr_t addr1 = page_to_phys(vec1->bv_page) + vec1->bv_offset; - phys_addr_t addr2 = page_to_phys(vec2->bv_page) + vec2->bv_offset; + phys_addr_t addr1 = bvec_phys(vec1); + phys_addr_t addr2 = bvec_phys(vec2); /* * Merging adjacent physical pages may not work correctly under KMSAN diff --git a/include/linux/bvec.h b/include/linux/bvec.h index bd1e361b351c5a..f41c7f0ef91ed5 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -280,4 +280,18 @@ static inline void *bvec_virt(struct bio_vec *bvec) return page_address(bvec->bv_page) + bvec->bv_offset; } +/** + * bvec_phys - return the physical address for a bvec + * @bvec: bvec to return the physical address for + */ +static inline phys_addr_t bvec_phys(const struct bio_vec *bvec) +{ + /* + * Note this open codes page_to_phys because page_to_phys is defined in + * , which we don't want to pull in here. If it ever moves to + * a sensible place we should start using it. + */ + return PFN_PHYS(page_to_pfn(bvec->bv_page)) + bvec->bv_offset; +} + #endif /* __LINUX_BVEC_H */ From 09595e0c9d654743483197b2f21dd4ec37c90a27 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 6 Jul 2024 09:52:18 +0200 Subject: [PATCH 1578/1578] block: pass a phys_addr_t to get_max_segment_size Work on a single address to simplify the logic, and prepare the callers from using better helpers. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20240706075228.2350978-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-merge.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index cff20bcc0252a7..e41ea331809936 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -209,23 +209,22 @@ static inline unsigned get_max_io_size(struct bio *bio, /** * get_max_segment_size() - maximum number of bytes to add as a single segment * @lim: Request queue limits. - * @start_page: See below. - * @offset: Offset from @start_page where to add a segment. + * @paddr: address of the range to add + * @max_len: maximum length available to add at @paddr * - * Returns the maximum number of bytes that can be added as a single segment. + * Returns the maximum number of bytes of the range starting at @paddr that can + * be added to a single segment. */ static inline unsigned get_max_segment_size(const struct queue_limits *lim, - struct page *start_page, unsigned long offset) + phys_addr_t paddr, unsigned int len) { - unsigned long mask = lim->seg_boundary_mask; - - offset = mask & (page_to_phys(start_page) + offset); - /* * Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1 * after having calculated the minimum. */ - return min(mask - offset, (unsigned long)lim->max_segment_size - 1) + 1; + return min_t(unsigned long, len, + min(lim->seg_boundary_mask - (lim->seg_boundary_mask & paddr), + (unsigned long)lim->max_segment_size - 1) + 1); } /** @@ -258,9 +257,7 @@ static bool bvec_split_segs(const struct queue_limits *lim, unsigned seg_size = 0; while (len && *nsegs < max_segs) { - seg_size = get_max_segment_size(lim, bv->bv_page, - bv->bv_offset + total_len); - seg_size = min(seg_size, len); + seg_size = get_max_segment_size(lim, bvec_phys(bv) + total_len, len); (*nsegs)++; total_len += seg_size; @@ -494,8 +491,8 @@ static unsigned blk_bvec_map_sg(struct request_queue *q, while (nbytes > 0) { unsigned offset = bvec->bv_offset + total; - unsigned len = min(get_max_segment_size(&q->limits, - bvec->bv_page, offset), nbytes); + unsigned len = get_max_segment_size(&q->limits, bvec_phys(bvec), + nbytes); struct page *page = bvec->bv_page; /*