diff --git a/hw/femu/dma.c b/hw/femu/dma.c
index 63e376391c3..3465d1017ea 100644
--- a/hw/femu/dma.c
+++ b/hw/femu/dma.c
@@ -1,23 +1,26 @@
 #include "./nvme.h"
 
-void nvme_addr_read(FemuCtrl *n, hwaddr addr, void *buf, int size)
+int nvme_addr_read(FemuCtrl *n, hwaddr addr, void *buf, int size)
 {
     if (n->cmbsz && addr >= n->ctrl_mem.addr &&
         addr < (n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size))) {
         memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size);
+        return 0;
     } else {
-        pci_dma_read(&n->parent_obj, addr, buf, size);
+        /* a failed DMA transaction yields a nonzero MemTxResult */
+        return pci_dma_read(&n->parent_obj, addr, buf, size);
     }
 }
 
-void nvme_addr_write(FemuCtrl *n, hwaddr addr, void *buf, int size)
+int nvme_addr_write(FemuCtrl *n, hwaddr addr, void *buf, int size)
 {
     if (n->cmbsz && addr >= n->ctrl_mem.addr &&
         addr < (n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size))) {
         memcpy((void *)&n->cmbuf[addr - n->ctrl_mem.addr], buf, size);
+        return 0;
     } else {
-        pci_dma_write(&n->parent_obj, addr, buf, size);
+        return pci_dma_write(&n->parent_obj, addr, buf, size);
     }
 }
 
 uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
@@ -109,6 +112,209 @@ uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
 
     return NVME_INVALID_FIELD | NVME_DNR;
 }
 
+static inline void nvme_sg_init(FemuCtrl *n, QEMUSGList *sg, bool dma)
+{
+    if (dma) {
+        pci_dma_sglist_init(sg, PCI_DEVICE(n), 0);
+    }
+}
+
+/*
+ * Map 'nsgld' data descriptors from 'segment'. The function will subtract the
+ * number of bytes mapped in len.
+ */
+static uint16_t nvme_map_sgl_data(FemuCtrl *n, QEMUSGList *sg,
+                                  NvmeSglDescriptor *segment, uint64_t nsgld,
+                                  size_t *len, NvmeCmd *cmd)
+{
+    dma_addr_t addr, trans_len;
+    uint32_t dlen;
+
+    for (int i = 0; i < nsgld; i++) {
+        uint8_t type = NVME_SGL_TYPE(segment[i].type);
+
+        switch (type) {
+        case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
+            break;
+        case NVME_SGL_DESCR_TYPE_SEGMENT:
+        case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
+            return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
+        default:
+            return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
+        }
+
+        dlen = le32_to_cpu(segment[i].len);
+
+        if (!dlen) {
+            continue;
+        }
+
+        if (*len == 0) {
+            /*
+             * All data has been mapped, but the SGL contains additional
+             * segments and/or descriptors. The controller might accept
+             * ignoring the rest of the SGL.
+             */
+            uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
+            if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
+                break;
+            }
+
+            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+        }
+
+        trans_len = MIN(*len, dlen);
+
+        addr = le64_to_cpu(segment[i].addr);
+
+        if (UINT64_MAX - addr < dlen) {
+            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+        }
+
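+        /*
+         * Note: the FEMU port adds the range straight to the scatter/gather
+         * list; data blocks that reside in the controller memory buffer
+         * (CMB) get no special treatment here.
+         */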
+        qemu_sglist_add(sg, addr, trans_len);
+
+        *len -= trans_len;
+    }
+
+    return NVME_SUCCESS;
+}
+
+uint16_t nvme_map_sgl(FemuCtrl *n, QEMUSGList *sg, NvmeSglDescriptor sgl,
+                      size_t len, NvmeCmd *cmd)
+{
+    /*
+     * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
+     * dynamically allocating a potentially huge SGL. The spec allows the SGL
+     * to be larger (as in number of bytes required to describe the SGL
+     * descriptors and segment chain) than the command transfer size, so it is
+     * not bounded by MDTS.
+     */
+    const int SEG_CHUNK_SIZE = 256;
+
+    NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
+    uint64_t nsgld;
+    uint32_t seg_len;
+    uint16_t status;
+    hwaddr addr;
+    int ret;
+
+    sgld = &sgl;
+    addr = le64_to_cpu(sgl.addr);
+
+    nvme_sg_init(n, sg, true);
+
+    /*
+     * If the entire transfer can be described with a single data block it can
+     * be mapped directly.
+     */
+    if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
+        status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
+        if (status) {
+            goto unmap;
+        }
+
+        goto out;
+    }
+
+    for (;;) {
+        switch (NVME_SGL_TYPE(sgld->type)) {
+        case NVME_SGL_DESCR_TYPE_SEGMENT:
+        case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
+            break;
+        default:
+            return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
+        }
+
+        seg_len = le32_to_cpu(sgld->len);
+
+        /* check the length of the (Last) Segment descriptor */
+        if (!seg_len || seg_len & 0xf) {
+            return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
+        }
+
+        if (UINT64_MAX - addr < seg_len) {
+            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+        }
+
+        nsgld = seg_len / sizeof(NvmeSglDescriptor);
+
+        while (nsgld > SEG_CHUNK_SIZE) {
+            if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
+                status = NVME_DATA_TRAS_ERROR;
+                goto unmap;
+            }
+
+            status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
+                                       &len, cmd);
+            if (status) {
+                goto unmap;
+            }
+
+            nsgld -= SEG_CHUNK_SIZE;
+            addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
+        }
+
+        ret = nvme_addr_read(n, addr, segment, nsgld *
+                             sizeof(NvmeSglDescriptor));
+        if (ret) {
+            status = NVME_DATA_TRAS_ERROR;
+            goto unmap;
+        }
+
+        last_sgld = &segment[nsgld - 1];
+
+        /*
+         * If the segment ends with a Data Block, then we are done.
+         */
+        if (NVME_SGL_TYPE(last_sgld->type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
+            status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
+            if (status) {
+                goto unmap;
+            }
+
+            goto out;
+        }
+
+        /*
+         * If the last descriptor was not a Data Block, then the current
+         * segment must not be a Last Segment.
+         */
+        if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
+            status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
+            goto unmap;
+        }
+
+        sgld = last_sgld;
+        addr = le64_to_cpu(sgld->addr);
+
+        /*
+         * Do not map the last descriptor; it will be a Segment or Last Segment
+         * descriptor and is handled by the next iteration.
+         */
+        status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
+        if (status) {
+            goto unmap;
+        }
+    }
+
+out:
+    /* if there is any residual left in len, the SGL was too short */
+    if (len) {
+        status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+        goto unmap;
+    }
+
+    return NVME_SUCCESS;
+
+unmap:
+    qemu_sglist_destroy(sg);
+    return status;
+}
+
 uint16_t dma_write_prp(FemuCtrl *n, uint8_t *ptr, uint32_t len, uint64_t prp1,
                        uint64_t prp2)
diff --git a/hw/femu/femu.c b/hw/femu/femu.c
index 292ec827200..16085ae0914 100644
--- a/hw/femu/femu.c
+++ b/hw/femu/femu.c
@@ -429,6 +429,12 @@ static void nvme_init_ctrl(FemuCtrl *n)
     id->psd[0].mp = cpu_to_le16(0x9c4);
     id->psd[0].enlat = cpu_to_le32(0x10);
     id->psd[0].exlat = cpu_to_le32(0x4);
+    /*
+     * Advertise SGL support with no alignment requirement. Bit-bucket
+     * descriptors are rejected by nvme_map_sgl_data(), so that capability
+     * bit is left clear.
+     */
+    id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN);
 
     n->features.arbitration = 0x1f0f0706;
     n->features.power_mgmt = 0;
@@ -722,4 +728,4 @@ static void femu_register_types(void)
     type_register_static(&femu_info);
 }
 
-type_init(femu_register_types)
\ No newline at end of file
+type_init(femu_register_types)
diff --git a/hw/femu/nvme-io.c b/hw/femu/nvme-io.c
index d7ff7ed340d..46eb63a800a 100644
--- a/hw/femu/nvme-io.c
+++ b/hw/femu/nvme-io.c
@@ -234,6 +234,30 @@ void *nvme_poller(void *arg)
     return NULL;
 }
 
+static uint16_t nvme_map_dptr(FemuCtrl *n, size_t len, NvmeRequest *req)
+{
+    uint64_t prp1, prp2;
+
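+    /*
+     * PSDT (PRP or SGL for Data Transfer, command dword 0 bits 15:14)
+     * selects how dptr is interpreted: 00b = PRP1/PRP2, 01b = SGL with
+     * MPTR as a contiguous buffer, 10b = SGL with MPTR pointing to an
+     * SGL segment.
+     */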
+    switch (req->cmd.psdt) {
+    case NVME_PSDT_PRP:
+        prp1 = le64_to_cpu(req->cmd.dptr.prp1);
+        prp2 = le64_to_cpu(req->cmd.dptr.prp2);
+
+        return nvme_map_prp(&req->qsg, &req->iov, prp1, prp2, len, n);
+    case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
+    case NVME_PSDT_SGL_MPTR_SGL:
+        return nvme_map_sgl(n, &req->qsg, req->cmd.dptr.sgl, len, &req->cmd);
+    default:
+        return NVME_INVALID_FIELD;
+    }
+}
+
 uint16_t nvme_rw(FemuCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, NvmeRequest *req)
 {
     NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
@@ -259,13 +283,13 @@ uint16_t nvme_rw(FemuCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, NvmeRequest *req)
     if (err)
         return err;
 
-    if (nvme_map_prp(&req->qsg, &req->iov, prp1, prp2, data_size, n)) {
+    if (nvme_map_dptr(n, data_size, req)) {
         nvme_set_error_page(n, req->sq->sqid, cmd->cid, NVME_INVALID_FIELD,
                             offsetof(NvmeRwCmd, prp1), 0, ns->id);
         return NVME_INVALID_FIELD | NVME_DNR;
     }
 
-    assert((nlb << data_shift) == req->qsg.size);
+    // assert((nlb << data_shift) == req->qsg.size);
 
     req->slba = slba;
     req->status = NVME_SUCCESS;
diff --git a/hw/femu/nvme.h b/hw/femu/nvme.h
index eaf4d616639..858631f40d2 100644
--- a/hw/femu/nvme.h
+++ b/hw/femu/nvme.h
@@ -8,6 +8,7 @@
 #include "qemu/memalign.h"
 #include "hw/pci/msix.h"
 #include "hw/pci/msi.h"
+#include "hw/pci/pci.h"
 #include "hw/virtio/vhost.h"
 #include "qapi/error.h"
 #include "sysemu/kvm.h"
@@ -18,6 +19,26 @@
 #include "nand/nand.h"
 #include "timing-model/timing.h"
 
+/* from include/block/nvme.h */
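+/*
+ * Identify Controller SGLS field: bits 1:0 report whether SGLs are
+ * supported and, if so, the alignment requirement (01b = no requirement);
+ * the higher bits advertise optional features such as keyed data blocks,
+ * bit-bucket descriptors, and tolerance of excess SGL length.
+ */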
+#define NVME_CTRL_SGLS_SUPPORT_MASK        (0x3 << 0)
+#define NVME_CTRL_SGLS_SUPPORT_NO_ALIGN    (0x1 << 0)
+#define NVME_CTRL_SGLS_SUPPORT_DWORD_ALIGN (0x1 << 1)
+#define NVME_CTRL_SGLS_KEYED               (0x1 << 2)
+#define NVME_CTRL_SGLS_BITBUCKET           (0x1 << 16)
+#define NVME_CTRL_SGLS_MPTR_CONTIGUOUS     (0x1 << 17)
+#define NVME_CTRL_SGLS_EXCESS_LENGTH       (0x1 << 18)
+#define NVME_CTRL_SGLS_MPTR_SGL            (0x1 << 19)
+#define NVME_CTRL_SGLS_ADDR_OFFSET         (0x1 << 20)
+
+#define NVME_SGL_TYPE(type)    (((type) >> 4) & 0xf)
+#define NVME_SGL_SUBTYPE(type) ((type) & 0xf)
+
 #define NVME_ID_NS_LBADS(ns) \
     ((ns)->id_ns.lbaf[NVME_ID_NS_FLBAS_INDEX((ns)->id_ns.flbas)].lbads)
 
@@ -73,6 +94,34 @@
     CAP_MPSMAX_MASK = 0xf,
 };
 
+enum NvmeSglDescriptorType {
+    NVME_SGL_DESCR_TYPE_DATA_BLOCK       = 0x0,
+    NVME_SGL_DESCR_TYPE_BIT_BUCKET       = 0x1,
+    NVME_SGL_DESCR_TYPE_SEGMENT          = 0x2,
+    NVME_SGL_DESCR_TYPE_LAST_SEGMENT     = 0x3,
+    NVME_SGL_DESCR_TYPE_KEYED_DATA_BLOCK = 0x4,
+
+    NVME_SGL_DESCR_TYPE_VENDOR_SPECIFIC  = 0xf,
+};
+
+enum NvmeSglDescriptorSubtype {
+    NVME_SGL_DESCR_SUBTYPE_ADDRESS = 0x0,
+};
+
+enum {
+    NVME_SG_ALLOC = 1 << 0,
+    NVME_SG_DMA   = 1 << 1,
+};
+
+typedef struct NvmeSg {
+    int flags;
+
+    union {
+        QEMUSGList   qsg;
+        QEMUIOVector iov;
+    };
+} NvmeSg;
+
 #define NVME_MAX_QS PCI_MSIX_FLAGS_QSIZE
 #define NVME_MAX_QUEUE_ENTRIES  0xffff
 #define NVME_MAX_STRIDE         12
@@ -531,6 +580,12 @@ enum NvmeStatusCodes {
     NVME_CMD_ABORT_MISSING_FUSE = 0x000a,
     NVME_INVALID_NSID           = 0x000b,
     NVME_CMD_SEQ_ERROR          = 0x000c,
+    NVME_INVALID_SGL_SEG_DESCR  = 0x000d,
+    NVME_INVALID_NUM_SGL_DESCRS = 0x000e,
+    NVME_DATA_SGL_LEN_INVALID   = 0x000f,
+    NVME_MD_SGL_LEN_INVALID     = 0x0010,
+    NVME_SGL_DESCR_TYPE_INVALID = 0x0011,
+    NVME_INVALID_USE_OF_CMB     = 0x0012,
     NVME_INVALID_CMD_SET        = 0x002c,
     NVME_LBA_RANGE              = 0x0080,
     NVME_CAP_EXCEEDED           = 0x0081,
@@ -1440,15 +1495,17 @@
 int nvme_setup_virq(FemuCtrl *n, NvmeCQueue *cq);
 int nvme_clear_virq(FemuCtrl *n);
 
 /* Public DMA APIs from dma.c */
-void nvme_addr_read(FemuCtrl *n, hwaddr addr, void *buf, int size);
-void nvme_addr_write(FemuCtrl *n, hwaddr addr, void *buf, int size);
+int nvme_addr_read(FemuCtrl *n, hwaddr addr, void *buf, int size);
+int nvme_addr_write(FemuCtrl *n, hwaddr addr, void *buf, int size);
 uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
                       uint64_t prp2, uint32_t len, FemuCtrl *n);
 uint16_t dma_write_prp(FemuCtrl *n, uint8_t *ptr, uint32_t len, uint64_t prp1,
                        uint64_t prp2);
 uint16_t dma_read_prp(FemuCtrl *n, uint8_t *ptr, uint32_t len, uint64_t prp1,
                       uint64_t prp2);
-
+// uint16_t nvme_map_dptr(FemuCtrl *n, size_t len, NvmeRequest *req);
+uint16_t nvme_map_sgl(FemuCtrl *n, QEMUSGList *sg, NvmeSglDescriptor sgl,
+                      size_t len, NvmeCmd *cmd);
 /* Misc */
 uint64_t *nvme_setup_discontig(FemuCtrl *n, uint64_t prp_addr, uint16_t
diff --git a/hw/femu/ocssd/oc12.c b/hw/femu/ocssd/oc12.c
index 763b334964c..4dc0bc84528 100644
--- a/hw/femu/ocssd/oc12.c
+++ b/hw/femu/ocssd/oc12.c
@@ -421,6 +421,28 @@ static int oc12_advance_status(FemuCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
     return 0;
 }
 
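+/*
+ * Same PSDT dispatch helper as in nvme-io.c; FEMU keeps a static copy in
+ * each backend rather than exporting it from a shared file.
+ */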
+static uint16_t nvme_map_dptr(FemuCtrl *n, size_t len, NvmeRequest *req)
+{
+    uint64_t prp1, prp2;
+
+    switch (req->cmd.psdt) {
+    case NVME_PSDT_PRP:
+        prp1 = le64_to_cpu(req->cmd.dptr.prp1);
+        prp2 = le64_to_cpu(req->cmd.dptr.prp2);
+
+        return nvme_map_prp(&req->qsg, &req->iov, prp1, prp2, len, n);
+    case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
+    case NVME_PSDT_SGL_MPTR_SGL:
+        return nvme_map_sgl(n, &req->qsg, req->cmd.dptr.sgl, len, &req->cmd);
+    default:
+        return NVME_INVALID_FIELD;
+    }
+}
+
 static uint16_t oc12_read(FemuCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
                           NvmeRequest *req)
 {
@@ -474,7 +496,7 @@ static uint16_t oc12_read(FemuCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
     }
 
     /* DMA user data */
-    if (nvme_map_prp(&req->qsg, &req->iov, prp1, prp2, data_size, n)) {
+    if (nvme_map_dptr(n, data_size, req)) {
         femu_err("oc12_read: malformed prp (sz:%lu)\n", data_size);
         err = NVME_INVALID_FIELD | NVME_DNR;
         goto fail_free;
@@ -553,7 +575,7 @@ static uint16_t oc12_write(FemuCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
     }
 
     /* DMA user data */
-    if (nvme_map_prp(&req->qsg, &req->iov, prp1, prp2, data_size, n)) {
+    if (nvme_map_dptr(n, data_size, req)) {
         femu_err("oc12_write: malformed prp (sz:%lu)\n", data_size);
         err = NVME_INVALID_FIELD | NVME_DNR;
         goto fail_free;
diff --git a/hw/femu/zns/zns.c b/hw/femu/zns/zns.c
index 7788a0ebbb0..5fcb9d2f757 100644
--- a/hw/femu/zns/zns.c
+++ b/hw/femu/zns/zns.c
@@ -1175,6 +1175,9 @@ static uint16_t zns_map_dptr(FemuCtrl *n, size_t len, NvmeRequest *req)
         prp2 = le64_to_cpu(req->cmd.dptr.prp2);
 
         return nvme_map_prp(&req->qsg, &req->iov, prp1, prp2, len, n);
+    case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
+    case NVME_PSDT_SGL_MPTR_SGL:
+        return nvme_map_sgl(n, &req->qsg, req->cmd.dptr.sgl, len, &req->cmd);
     default:
         return NVME_INVALID_FIELD;
     }