author | thegeorg <thegeorg@yandex-team.com> | 2024-06-09 11:55:21 +0300
---|---|---
committer | thegeorg <thegeorg@yandex-team.com> | 2024-06-09 12:07:55 +0300
commit | afd4899380eea1c70e2a68714b5da1c9919ccdbd (patch) |
tree | cd5120708784139bc6a0f8881da1ed8389a065b3 /contrib/libs/liburing/src/setup.c |
parent | a83bd2dd3c21e38c6c0807ec5e679497ab567f24 (diff) |
download | ydb-afd4899380eea1c70e2a68714b5da1c9919ccdbd.tar.gz |
Update contrib/libs/liburing to 2.6
3b51a9fb14de805208d11f1c077c78bb5d487e0f
Diffstat (limited to 'contrib/libs/liburing/src/setup.c')
-rw-r--r-- | contrib/libs/liburing/src/setup.c | 380 |
1 file changed, 297 insertions, 83 deletions
diff --git a/contrib/libs/liburing/src/setup.c b/contrib/libs/liburing/src/setup.c
index db2f3dfe15..0d33f65aae 100644
--- a/contrib/libs/liburing/src/setup.c
+++ b/contrib/libs/liburing/src/setup.c
@@ -6,16 +6,96 @@
 #include "syscall.h"
 #include "liburing.h"
 #include "int_flags.h"
+#include "setup.h"
 #include "liburing/compat.h"
 #include "liburing/io_uring.h"
 
+#define KERN_MAX_ENTRIES        32768
+#define KERN_MAX_CQ_ENTRIES     (2 * KERN_MAX_ENTRIES)
+
+static inline int __fls(int x)
+{
+        if (!x)
+                return 0;
+        return 8 * sizeof(x) - __builtin_clz(x);
+}
+
+static unsigned roundup_pow2(unsigned depth)
+{
+        return 1U << __fls(depth - 1);
+}
+
+static int get_sq_cq_entries(unsigned entries, struct io_uring_params *p,
+                             unsigned *sq, unsigned *cq)
+{
+        unsigned cq_entries;
+
+        if (!entries)
+                return -EINVAL;
+        if (entries > KERN_MAX_ENTRIES) {
+                if (!(p->flags & IORING_SETUP_CLAMP))
+                        return -EINVAL;
+                entries = KERN_MAX_ENTRIES;
+        }
+
+        entries = roundup_pow2(entries);
+        if (p->flags & IORING_SETUP_CQSIZE) {
+                if (!p->cq_entries)
+                        return -EINVAL;
+                cq_entries = p->cq_entries;
+                if (cq_entries > KERN_MAX_CQ_ENTRIES) {
+                        if (!(p->flags & IORING_SETUP_CLAMP))
+                                return -EINVAL;
+                        cq_entries = KERN_MAX_CQ_ENTRIES;
+                }
+                cq_entries = roundup_pow2(cq_entries);
+                if (cq_entries < entries)
+                        return -EINVAL;
+        } else {
+                cq_entries = 2 * entries;
+        }
+
+        *sq = entries;
+        *cq = cq_entries;
+        return 0;
+}
+
 static void io_uring_unmap_rings(struct io_uring_sq *sq, struct io_uring_cq *cq)
 {
-        __sys_munmap(sq->ring_ptr, sq->ring_sz);
-        if (cq->ring_ptr && cq->ring_ptr != sq->ring_ptr)
+        if (sq->ring_sz)
+                __sys_munmap(sq->ring_ptr, sq->ring_sz);
+        if (cq->ring_ptr && cq->ring_sz && cq->ring_ptr != sq->ring_ptr)
                 __sys_munmap(cq->ring_ptr, cq->ring_sz);
 }
 
+static void io_uring_setup_ring_pointers(struct io_uring_params *p,
+                                         struct io_uring_sq *sq,
+                                         struct io_uring_cq *cq)
+{
+        sq->khead = sq->ring_ptr + p->sq_off.head;
+        sq->ktail = sq->ring_ptr + p->sq_off.tail;
+        sq->kring_mask = sq->ring_ptr + p->sq_off.ring_mask;
+        sq->kring_entries = sq->ring_ptr + p->sq_off.ring_entries;
+        sq->kflags = sq->ring_ptr + p->sq_off.flags;
+        sq->kdropped = sq->ring_ptr + p->sq_off.dropped;
+        if (!(p->flags & IORING_SETUP_NO_SQARRAY))
+                sq->array = sq->ring_ptr + p->sq_off.array;
+
+        cq->khead = cq->ring_ptr + p->cq_off.head;
+        cq->ktail = cq->ring_ptr + p->cq_off.tail;
+        cq->kring_mask = cq->ring_ptr + p->cq_off.ring_mask;
+        cq->kring_entries = cq->ring_ptr + p->cq_off.ring_entries;
+        cq->koverflow = cq->ring_ptr + p->cq_off.overflow;
+        cq->cqes = cq->ring_ptr + p->cq_off.cqes;
+        if (p->cq_off.flags)
+                cq->kflags = cq->ring_ptr + p->cq_off.flags;
+
+        sq->ring_mask = *sq->kring_mask;
+        sq->ring_entries = *sq->kring_entries;
+        cq->ring_mask = *cq->kring_mask;
+        cq->ring_entries = *cq->kring_entries;
+}
+
 static int io_uring_mmap(int fd, struct io_uring_params *p,
                          struct io_uring_sq *sq, struct io_uring_cq *cq)
 {
@@ -53,14 +133,6 @@ static int io_uring_mmap(int fd, struct io_uring_params *p,
                 }
         }
 
-        sq->khead = sq->ring_ptr + p->sq_off.head;
-        sq->ktail = sq->ring_ptr + p->sq_off.tail;
-        sq->kring_mask = sq->ring_ptr + p->sq_off.ring_mask;
-        sq->kring_entries = sq->ring_ptr + p->sq_off.ring_entries;
-        sq->kflags = sq->ring_ptr + p->sq_off.flags;
-        sq->kdropped = sq->ring_ptr + p->sq_off.dropped;
-        sq->array = sq->ring_ptr + p->sq_off.array;
-
         size = sizeof(struct io_uring_sqe);
         if (p->flags & IORING_SETUP_SQE128)
                 size += 64;
@@ -73,19 +145,7 @@ err:
                 return ret;
         }
 
-        cq->khead = cq->ring_ptr + p->cq_off.head;
-        cq->ktail = cq->ring_ptr + p->cq_off.tail;
-        cq->kring_mask = cq->ring_ptr + p->cq_off.ring_mask;
-        cq->kring_entries = cq->ring_ptr + p->cq_off.ring_entries;
-        cq->koverflow = cq->ring_ptr + p->cq_off.overflow;
-        cq->cqes = cq->ring_ptr + p->cq_off.cqes;
-        if (p->cq_off.flags)
-                cq->kflags = cq->ring_ptr + p->cq_off.flags;
-
-        sq->ring_mask = *sq->kring_mask;
-        sq->ring_entries = *sq->kring_entries;
-        cq->ring_mask = *cq->kring_mask;
-        cq->ring_entries = *cq->kring_entries;
+        io_uring_setup_ring_pointers(p, sq, cq);
         return 0;
 }
 
@@ -98,17 +158,8 @@ err:
 __cold int io_uring_queue_mmap(int fd, struct io_uring_params *p,
                                struct io_uring *ring)
 {
-        int ret;
-
         memset(ring, 0, sizeof(*ring));
-        ret = io_uring_mmap(fd, p, &ring->sq, &ring->cq);
-        if (!ret) {
-                ring->flags = p->flags;
-                ring->ring_fd = ring->enter_ring_fd = fd;
-                ring->int_flags = 0;
-                return 0;
-        }
-        return ret;
+        return io_uring_mmap(fd, p, &ring->sq, &ring->cq);
 }
 
 /*
@@ -146,33 +197,214 @@ __cold int io_uring_ring_dontfork(struct io_uring *ring)
         return 0;
 }
 
-__cold int io_uring_queue_init_params(unsigned entries, struct io_uring *ring,
-                                      struct io_uring_params *p)
+/* FIXME */
+static size_t huge_page_size = 2 * 1024 * 1024;
+
+/*
+ * Returns negative for error, or number of bytes used in the buffer on success
+ */
+static int io_uring_alloc_huge(unsigned entries, struct io_uring_params *p,
+                               struct io_uring_sq *sq, struct io_uring_cq *cq,
+                               void *buf, size_t buf_size)
 {
-        int fd, ret;
+        unsigned long page_size = get_page_size();
+        unsigned sq_entries, cq_entries;
+        size_t ring_mem, sqes_mem;
+        unsigned long mem_used = 0;
+        void *ptr;
+        int ret;
+
+        ret = get_sq_cq_entries(entries, p, &sq_entries, &cq_entries);
+        if (ret)
+                return ret;
+
+        sqes_mem = sq_entries * sizeof(struct io_uring_sqe);
+        sqes_mem = (sqes_mem + page_size - 1) & ~(page_size - 1);
+        ring_mem = cq_entries * sizeof(struct io_uring_cqe);
+        if (p->flags & IORING_SETUP_CQE32)
+                ring_mem *= 2;
+        if (!(p->flags & IORING_SETUP_NO_SQARRAY))
+                ring_mem += sq_entries * sizeof(unsigned);
+        mem_used = sqes_mem + ring_mem;
+        mem_used = (mem_used + page_size - 1) & ~(page_size - 1);
+
+        /*
+         * A maxed-out number of CQ entries with IORING_SETUP_CQE32 fills a 2MB
+         * huge page by itself, so the SQ entries won't fit in the same huge
+         * page. For SQEs, that shouldn't be possible given KERN_MAX_ENTRIES,
+         * but check that too to future-proof (e.g. against different huge page
+         * sizes). Bail out early so we don't overrun.
+         */
+        if (!buf && (sqes_mem > huge_page_size || ring_mem > huge_page_size))
+                return -ENOMEM;
+
+        if (buf) {
+                if (mem_used > buf_size)
+                        return -ENOMEM;
+                ptr = buf;
+        } else {
+                int map_hugetlb = 0;
+                if (sqes_mem <= page_size)
+                        buf_size = page_size;
+                else {
+                        buf_size = huge_page_size;
+                        map_hugetlb = MAP_HUGETLB;
+                }
+                ptr = __sys_mmap(NULL, buf_size, PROT_READ|PROT_WRITE,
+                                 MAP_SHARED|MAP_ANONYMOUS|map_hugetlb,
+                                 -1, 0);
+                if (IS_ERR(ptr))
+                        return PTR_ERR(ptr);
+        }
+
+        sq->sqes = ptr;
+        if (mem_used <= buf_size) {
+                sq->ring_ptr = (void *) sq->sqes + sqes_mem;
+                /* clear ring sizes, we have just one mmap() to undo */
+                cq->ring_sz = 0;
+                sq->ring_sz = 0;
+        } else {
+                int map_hugetlb = 0;
+                if (ring_mem <= page_size)
+                        buf_size = page_size;
+                else {
+                        buf_size = huge_page_size;
+                        map_hugetlb = MAP_HUGETLB;
+                }
+                ptr = __sys_mmap(NULL, buf_size, PROT_READ|PROT_WRITE,
+                                 MAP_SHARED|MAP_ANONYMOUS|map_hugetlb,
+                                 -1, 0);
+                if (IS_ERR(ptr)) {
+                        __sys_munmap(sq->sqes, 1);
+                        return PTR_ERR(ptr);
+                }
+                sq->ring_ptr = ptr;
+                sq->ring_sz = buf_size;
+                cq->ring_sz = 0;
+        }
+
+        cq->ring_ptr = (void *) sq->ring_ptr;
+        p->sq_off.user_addr = (unsigned long) sq->sqes;
+        p->cq_off.user_addr = (unsigned long) sq->ring_ptr;
+        return (int) mem_used;
+}
+
+int __io_uring_queue_init_params(unsigned entries, struct io_uring *ring,
+                                 struct io_uring_params *p, void *buf,
+                                 size_t buf_size)
+{
+        int fd, ret = 0;
         unsigned *sq_array;
         unsigned sq_entries, index;
 
+        memset(ring, 0, sizeof(*ring));
+
+        /*
+         * The kernel does this check already, but checking it here allows us
+         * to avoid handling it below.
+         */
+        if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY
+            && !(p->flags & IORING_SETUP_NO_MMAP))
+                return -EINVAL;
+
+        if (p->flags & IORING_SETUP_NO_MMAP) {
+                ret = io_uring_alloc_huge(entries, p, &ring->sq, &ring->cq,
+                                          buf, buf_size);
+                if (ret < 0)
+                        return ret;
+                if (buf)
+                        ring->int_flags |= INT_FLAG_APP_MEM;
+        }
+
         fd = __sys_io_uring_setup(entries, p);
-        if (fd < 0)
+        if (fd < 0) {
+                if ((p->flags & IORING_SETUP_NO_MMAP) &&
+                    !(ring->int_flags & INT_FLAG_APP_MEM)) {
+                        __sys_munmap(ring->sq.sqes, 1);
+                        io_uring_unmap_rings(&ring->sq, &ring->cq);
+                }
                 return fd;
+        }
 
-        ret = io_uring_queue_mmap(fd, p, ring);
-        if (ret) {
-                __sys_close(fd);
-                return ret;
+        if (!(p->flags & IORING_SETUP_NO_MMAP)) {
+                ret = io_uring_queue_mmap(fd, p, ring);
+                if (ret) {
+                        __sys_close(fd);
+                        return ret;
+                }
+        } else {
+                io_uring_setup_ring_pointers(p, &ring->sq, &ring->cq);
         }
 
         /*
          * Directly map SQ slots to SQEs
          */
-        sq_array = ring->sq.array;
         sq_entries = ring->sq.ring_entries;
-        for (index = 0; index < sq_entries; index++)
-                sq_array[index] = index;
+        if (!(p->flags & IORING_SETUP_NO_SQARRAY)) {
+                sq_array = ring->sq.array;
+                for (index = 0; index < sq_entries; index++)
+                        sq_array[index] = index;
+        }
         ring->features = p->features;
-        return 0;
+        ring->flags = p->flags;
+        ring->enter_ring_fd = fd;
+        if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY) {
+                ring->ring_fd = -1;
+                ring->int_flags |= INT_FLAG_REG_RING | INT_FLAG_REG_REG_RING;
+        } else {
+                ring->ring_fd = fd;
+        }
+
+        return ret;
+}
+
+static int io_uring_queue_init_try_nosqarr(unsigned entries, struct io_uring *ring,
+                                           struct io_uring_params *p, void *buf,
+                                           size_t buf_size)
+{
+        unsigned flags = p->flags;
+        int ret;
+
+        p->flags |= IORING_SETUP_NO_SQARRAY;
+        ret = __io_uring_queue_init_params(entries, ring, p, buf, buf_size);
+
+        /* don't fallback if explicitly asked for NOSQARRAY */
+        if (ret != -EINVAL || (flags & IORING_SETUP_NO_SQARRAY))
+                return ret;
+
+        p->flags = flags;
+        return __io_uring_queue_init_params(entries, ring, p, buf, buf_size);
+}
+
+/*
+ * Like io_uring_queue_init_params(), except it allows the application to pass
+ * in a pre-allocated memory range that is used for the shared data between
+ * the kernel and the application. This includes the sqes array, and the two
+ * rings. The memory must be contiguous, the use case here is that the app
+ * allocates a huge page and passes it in.
+ *
+ * Returns the number of bytes used in the buffer, the app can then reuse
+ * the buffer with the returned offset to put more rings in the same huge
+ * page. Returns -ENOMEM if there's not enough room left in the buffer to
+ * host the ring.
+ */
+int io_uring_queue_init_mem(unsigned entries, struct io_uring *ring,
+                            struct io_uring_params *p,
+                            void *buf, size_t buf_size)
+{
+        /* should already be set... */
+        p->flags |= IORING_SETUP_NO_MMAP;
+        return io_uring_queue_init_try_nosqarr(entries, ring, p, buf, buf_size);
+}
+
+int io_uring_queue_init_params(unsigned entries, struct io_uring *ring,
+                               struct io_uring_params *p)
+{
+        int ret;
+
+        ret = io_uring_queue_init_try_nosqarr(entries, ring, p, NULL, 0);
+        return ret >= 0 ? 0 : ret;
 }
 
 /*
@@ -196,11 +428,20 @@ __cold void io_uring_queue_exit(struct io_uring *ring)
         struct io_uring_cq *cq = &ring->cq;
         size_t sqe_size;
 
-        sqe_size = sizeof(struct io_uring_sqe);
-        if (ring->flags & IORING_SETUP_SQE128)
-                sqe_size += 64;
-        __sys_munmap(sq->sqes, sqe_size * sq->ring_entries);
-        io_uring_unmap_rings(sq, cq);
+        if (!sq->ring_sz) {
+                sqe_size = sizeof(struct io_uring_sqe);
+                if (ring->flags & IORING_SETUP_SQE128)
+                        sqe_size += 64;
+                __sys_munmap(sq->sqes, sqe_size * sq->ring_entries);
+                io_uring_unmap_rings(sq, cq);
+        } else {
+                if (!(ring->int_flags & INT_FLAG_APP_MEM)) {
+                        __sys_munmap(sq->sqes,
+                                *sq->kring_entries * sizeof(struct io_uring_sqe));
+                        io_uring_unmap_rings(sq, cq);
+                }
+        }
+
         /*
          * Not strictly required, but frees up the slot we used now rather
          * than at process exit time.
@@ -251,18 +492,6 @@ __cold void io_uring_free_probe(struct io_uring_probe *probe)
         free(probe);
 }
 
-static inline int __fls(unsigned long x)
-{
-        if (!x)
-                return 0;
-        return 8 * sizeof(x) - __builtin_clzl(x);
-}
-
-static unsigned roundup_pow2(unsigned depth)
-{
-        return 1U << __fls(depth - 1);
-}
-
 static size_t npages(size_t size, long page_size)
 {
         size--;
@@ -293,9 +522,6 @@ static size_t rings_size(struct io_uring_params *p, unsigned entries,
         return pages * page_size;
 }
 
-#define KERN_MAX_ENTRIES        32768
-#define KERN_MAX_CQ_ENTRIES     (2 * KERN_MAX_ENTRIES)
-
 /*
  * Return the required ulimit -l memlock memory required for a given ring
  * setup, in bytes. May return -errno on error. On newer (5.12+) kernels,
@@ -309,9 +535,10 @@ __cold ssize_t io_uring_mlock_size_params(unsigned entries,
 {
         struct io_uring_params lp;
         struct io_uring ring;
-        unsigned cq_entries;
+        unsigned cq_entries, sq;
         long page_size;
         ssize_t ret;
+        int cret;
 
         memset(&lp, 0, sizeof(lp));
 
@@ -341,25 +568,12 @@ __cold ssize_t io_uring_mlock_size_params(unsigned entries,
                 entries = KERN_MAX_ENTRIES;
         }
 
-        entries = roundup_pow2(entries);
-        if (p->flags & IORING_SETUP_CQSIZE) {
-                if (!p->cq_entries)
-                        return -EINVAL;
-                cq_entries = p->cq_entries;
-                if (cq_entries > KERN_MAX_CQ_ENTRIES) {
-                        if (!(p->flags & IORING_SETUP_CLAMP))
-                                return -EINVAL;
-                        cq_entries = KERN_MAX_CQ_ENTRIES;
-                }
-                cq_entries = roundup_pow2(cq_entries);
-                if (cq_entries < entries)
-                        return -EINVAL;
-        } else {
-                cq_entries = 2 * entries;
-        }
+        cret = get_sq_cq_entries(entries, p, &sq, &cq_entries);
+        if (cret)
+                return cret;
 
         page_size = get_page_size();
-        return rings_size(p, entries, cq_entries, page_size);
+        return rings_size(p, sq, cq_entries, page_size);
 }
 
 /*
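
For context, the diff above adds the `io_uring_queue_init_mem()` entry point, which backs the SQ/CQ rings and SQE array with caller-supplied memory (via `IORING_SETUP_NO_MMAP`) instead of kernel-created mappings. Below is a minimal usage sketch, not part of this commit: it assumes liburing >= 2.6, a kernel that supports `IORING_SETUP_NO_MMAP` (6.5+), and that a 2MB huge page is available for `MAP_HUGETLB`; the helper name `setup_ring_in_user_memory` and the queue depth are made up for illustration.

```c
#include <liburing.h>
#include <sys/mman.h>
#include <string.h>
#include <stdio.h>

#define RING_BUF_SIZE (2 * 1024 * 1024) /* one 2MB huge page */

int setup_ring_in_user_memory(struct io_uring *ring)
{
        struct io_uring_params p;
        void *buf;
        int used;

        /* Back the SQ/CQ rings and the SQE array with a single huge page. */
        buf = mmap(NULL, RING_BUF_SIZE, PROT_READ | PROT_WRITE,
                   MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
        if (buf == MAP_FAILED)
                return -1;

        memset(&p, 0, sizeof(p));
        /* io_uring_queue_init_mem() sets IORING_SETUP_NO_MMAP itself. */
        used = io_uring_queue_init_mem(64, ring, &p, buf, RING_BUF_SIZE);
        if (used < 0) {
                munmap(buf, RING_BUF_SIZE);
                fprintf(stderr, "ring init failed: %d\n", used);
                return -1;
        }

        /*
         * 'used' bytes of the buffer now belong to this ring; the remainder
         * starting at (char *)buf + used could host further rings.
         */
        return 0;
}
```

On success the call returns the number of bytes consumed from the buffer, so the remainder of the huge page can be reused for additional rings, which is the use case spelled out in the comment above `io_uring_queue_init_mem()`. Because the ring is marked with `INT_FLAG_APP_MEM`, the updated `io_uring_queue_exit()` leaves the application-owned buffer mapped; the caller remains responsible for unmapping it.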