path: root/contrib/libs/liburing/src/setup.c
author     thegeorg <thegeorg@yandex-team.com>  2024-06-09 11:55:21 +0300
committer  thegeorg <thegeorg@yandex-team.com>  2024-06-09 12:07:55 +0300
commit     afd4899380eea1c70e2a68714b5da1c9919ccdbd (patch)
tree       cd5120708784139bc6a0f8881da1ed8389a065b3 /contrib/libs/liburing/src/setup.c
parent     a83bd2dd3c21e38c6c0807ec5e679497ab567f24 (diff)
download   ydb-afd4899380eea1c70e2a68714b5da1c9919ccdbd.tar.gz
Update contrib/libs/liburing to 2.6
3b51a9fb14de805208d11f1c077c78bb5d487e0f
Diffstat (limited to 'contrib/libs/liburing/src/setup.c')
-rw-r--r--  contrib/libs/liburing/src/setup.c | 380
1 file changed, 297 insertions(+), 83 deletions(-)
diff --git a/contrib/libs/liburing/src/setup.c b/contrib/libs/liburing/src/setup.c
index db2f3dfe15..0d33f65aae 100644
--- a/contrib/libs/liburing/src/setup.c
+++ b/contrib/libs/liburing/src/setup.c
@@ -6,16 +6,96 @@
#include "syscall.h"
#include "liburing.h"
#include "int_flags.h"
+#include "setup.h"
#include "liburing/compat.h"
#include "liburing/io_uring.h"
+#define KERN_MAX_ENTRIES 32768
+#define KERN_MAX_CQ_ENTRIES (2 * KERN_MAX_ENTRIES)
+
+static inline int __fls(int x)
+{
+ if (!x)
+ return 0;
+ return 8 * sizeof(x) - __builtin_clz(x);
+}
+
+static unsigned roundup_pow2(unsigned depth)
+{
+ return 1U << __fls(depth - 1);
+}
+
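
A quick illustration of the two helpers above (a standalone sketch, not part of the patch): __fls(x) is the index of the highest set bit, so roundup_pow2(depth) yields the smallest power of two that is at least depth, and the "depth - 1" keeps exact powers of two unchanged.

/* Illustrative only: mirrors __fls()/roundup_pow2() above. */
#include <assert.h>

static unsigned sketch_roundup_pow2(unsigned depth)
{
	if (depth <= 1)
		return 1;	/* __builtin_clz(0) is undefined, guard as __fls() does */
	return 1U << (8 * sizeof(int) - __builtin_clz(depth - 1));
}

int main(void)
{
	assert(sketch_roundup_pow2(100) == 128);	/* rounds up */
	assert(sketch_roundup_pow2(128) == 128);	/* powers of two unchanged */
	assert(sketch_roundup_pow2(129) == 256);
	return 0;
}
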
+static int get_sq_cq_entries(unsigned entries, struct io_uring_params *p,
+ unsigned *sq, unsigned *cq)
+{
+ unsigned cq_entries;
+
+ if (!entries)
+ return -EINVAL;
+ if (entries > KERN_MAX_ENTRIES) {
+ if (!(p->flags & IORING_SETUP_CLAMP))
+ return -EINVAL;
+ entries = KERN_MAX_ENTRIES;
+ }
+
+ entries = roundup_pow2(entries);
+ if (p->flags & IORING_SETUP_CQSIZE) {
+ if (!p->cq_entries)
+ return -EINVAL;
+ cq_entries = p->cq_entries;
+ if (cq_entries > KERN_MAX_CQ_ENTRIES) {
+ if (!(p->flags & IORING_SETUP_CLAMP))
+ return -EINVAL;
+ cq_entries = KERN_MAX_CQ_ENTRIES;
+ }
+ cq_entries = roundup_pow2(cq_entries);
+ if (cq_entries < entries)
+ return -EINVAL;
+ } else {
+ cq_entries = 2 * entries;
+ }
+
+ *sq = entries;
+ *cq = cq_entries;
+ return 0;
+}
+
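
The sizing rules above are easiest to see with concrete numbers. The fragment below is a hypothetical caller (get_sq_cq_entries() is internal to setup.c, so this assumes direct access to it): both ring sizes are rounded up to powers of two, the CQ ring defaults to twice the SQ ring, and IORING_SETUP_CLAMP turns oversized requests into the kernel maximum instead of an error.

/* Hypothetical caller of the internal helper above; illustrative fragment only. */
struct io_uring_params p = { 0 };
unsigned sq, cq;
int ret;

ret = get_sq_cq_entries(100, &p, &sq, &cq);	/* sq = 128, cq = 256 */

p.flags = IORING_SETUP_CQSIZE;
p.cq_entries = 4096;
ret = get_sq_cq_entries(100, &p, &sq, &cq);	/* sq = 128, cq = 4096 */

p.cq_entries = 64;				/* smaller than the rounded-up SQ ring */
ret = get_sq_cq_entries(100, &p, &sq, &cq);	/* ret = -EINVAL */
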
static void io_uring_unmap_rings(struct io_uring_sq *sq, struct io_uring_cq *cq)
{
- __sys_munmap(sq->ring_ptr, sq->ring_sz);
- if (cq->ring_ptr && cq->ring_ptr != sq->ring_ptr)
+ if (sq->ring_sz)
+ __sys_munmap(sq->ring_ptr, sq->ring_sz);
+ if (cq->ring_ptr && cq->ring_sz && cq->ring_ptr != sq->ring_ptr)
__sys_munmap(cq->ring_ptr, cq->ring_sz);
}
+static void io_uring_setup_ring_pointers(struct io_uring_params *p,
+ struct io_uring_sq *sq,
+ struct io_uring_cq *cq)
+{
+ sq->khead = sq->ring_ptr + p->sq_off.head;
+ sq->ktail = sq->ring_ptr + p->sq_off.tail;
+ sq->kring_mask = sq->ring_ptr + p->sq_off.ring_mask;
+ sq->kring_entries = sq->ring_ptr + p->sq_off.ring_entries;
+ sq->kflags = sq->ring_ptr + p->sq_off.flags;
+ sq->kdropped = sq->ring_ptr + p->sq_off.dropped;
+ if (!(p->flags & IORING_SETUP_NO_SQARRAY))
+ sq->array = sq->ring_ptr + p->sq_off.array;
+
+ cq->khead = cq->ring_ptr + p->cq_off.head;
+ cq->ktail = cq->ring_ptr + p->cq_off.tail;
+ cq->kring_mask = cq->ring_ptr + p->cq_off.ring_mask;
+ cq->kring_entries = cq->ring_ptr + p->cq_off.ring_entries;
+ cq->koverflow = cq->ring_ptr + p->cq_off.overflow;
+ cq->cqes = cq->ring_ptr + p->cq_off.cqes;
+ if (p->cq_off.flags)
+ cq->kflags = cq->ring_ptr + p->cq_off.flags;
+
+ sq->ring_mask = *sq->kring_mask;
+ sq->ring_entries = *sq->kring_entries;
+ cq->ring_mask = *cq->kring_mask;
+ cq->ring_entries = *cq->kring_entries;
+}
+
static int io_uring_mmap(int fd, struct io_uring_params *p,
struct io_uring_sq *sq, struct io_uring_cq *cq)
{
@@ -53,14 +133,6 @@ static int io_uring_mmap(int fd, struct io_uring_params *p,
}
}
- sq->khead = sq->ring_ptr + p->sq_off.head;
- sq->ktail = sq->ring_ptr + p->sq_off.tail;
- sq->kring_mask = sq->ring_ptr + p->sq_off.ring_mask;
- sq->kring_entries = sq->ring_ptr + p->sq_off.ring_entries;
- sq->kflags = sq->ring_ptr + p->sq_off.flags;
- sq->kdropped = sq->ring_ptr + p->sq_off.dropped;
- sq->array = sq->ring_ptr + p->sq_off.array;
-
size = sizeof(struct io_uring_sqe);
if (p->flags & IORING_SETUP_SQE128)
size += 64;
@@ -73,19 +145,7 @@ err:
return ret;
}
- cq->khead = cq->ring_ptr + p->cq_off.head;
- cq->ktail = cq->ring_ptr + p->cq_off.tail;
- cq->kring_mask = cq->ring_ptr + p->cq_off.ring_mask;
- cq->kring_entries = cq->ring_ptr + p->cq_off.ring_entries;
- cq->koverflow = cq->ring_ptr + p->cq_off.overflow;
- cq->cqes = cq->ring_ptr + p->cq_off.cqes;
- if (p->cq_off.flags)
- cq->kflags = cq->ring_ptr + p->cq_off.flags;
-
- sq->ring_mask = *sq->kring_mask;
- sq->ring_entries = *sq->kring_entries;
- cq->ring_mask = *cq->kring_mask;
- cq->ring_entries = *cq->kring_entries;
+ io_uring_setup_ring_pointers(p, sq, cq);
return 0;
}
@@ -98,17 +158,8 @@ err:
__cold int io_uring_queue_mmap(int fd, struct io_uring_params *p,
struct io_uring *ring)
{
- int ret;
-
memset(ring, 0, sizeof(*ring));
- ret = io_uring_mmap(fd, p, &ring->sq, &ring->cq);
- if (!ret) {
- ring->flags = p->flags;
- ring->ring_fd = ring->enter_ring_fd = fd;
- ring->int_flags = 0;
- return 0;
- }
- return ret;
+ return io_uring_mmap(fd, p, &ring->sq, &ring->cq);
}
/*
@@ -146,33 +197,214 @@ __cold int io_uring_ring_dontfork(struct io_uring *ring)
return 0;
}
-__cold int io_uring_queue_init_params(unsigned entries, struct io_uring *ring,
- struct io_uring_params *p)
+/* FIXME */
+static size_t huge_page_size = 2 * 1024 * 1024;
+
+/*
+ * Returns negative for error, or number of bytes used in the buffer on success
+ */
+static int io_uring_alloc_huge(unsigned entries, struct io_uring_params *p,
+ struct io_uring_sq *sq, struct io_uring_cq *cq,
+ void *buf, size_t buf_size)
{
- int fd, ret;
+ unsigned long page_size = get_page_size();
+ unsigned sq_entries, cq_entries;
+ size_t ring_mem, sqes_mem;
+ unsigned long mem_used = 0;
+ void *ptr;
+ int ret;
+
+ ret = get_sq_cq_entries(entries, p, &sq_entries, &cq_entries);
+ if (ret)
+ return ret;
+
+ sqes_mem = sq_entries * sizeof(struct io_uring_sqe);
+ sqes_mem = (sqes_mem + page_size - 1) & ~(page_size - 1);
+ ring_mem = cq_entries * sizeof(struct io_uring_cqe);
+ if (p->flags & IORING_SETUP_CQE32)
+ ring_mem *= 2;
+ if (!(p->flags & IORING_SETUP_NO_SQARRAY))
+ ring_mem += sq_entries * sizeof(unsigned);
+ mem_used = sqes_mem + ring_mem;
+ mem_used = (mem_used + page_size - 1) & ~(page_size - 1);
+
+ /*
+ * A maxed-out number of CQ entries with IORING_SETUP_CQE32 fills a 2MB
+ * huge page by itself, so the SQ entries won't fit in the same huge
+ * page. For SQEs, that shouldn't be possible given KERN_MAX_ENTRIES,
+ * but check that too to future-proof (e.g. against different huge page
+ * sizes). Bail out early so we don't overrun.
+ */
+ if (!buf && (sqes_mem > huge_page_size || ring_mem > huge_page_size))
+ return -ENOMEM;
+
+ if (buf) {
+ if (mem_used > buf_size)
+ return -ENOMEM;
+ ptr = buf;
+ } else {
+ int map_hugetlb = 0;
+ if (sqes_mem <= page_size)
+ buf_size = page_size;
+ else {
+ buf_size = huge_page_size;
+ map_hugetlb = MAP_HUGETLB;
+ }
+ ptr = __sys_mmap(NULL, buf_size, PROT_READ|PROT_WRITE,
+ MAP_SHARED|MAP_ANONYMOUS|map_hugetlb,
+ -1, 0);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+ }
+
+ sq->sqes = ptr;
+ if (mem_used <= buf_size) {
+ sq->ring_ptr = (void *) sq->sqes + sqes_mem;
+ /* clear ring sizes, we have just one mmap() to undo */
+ cq->ring_sz = 0;
+ sq->ring_sz = 0;
+ } else {
+ int map_hugetlb = 0;
+ if (ring_mem <= page_size)
+ buf_size = page_size;
+ else {
+ buf_size = huge_page_size;
+ map_hugetlb = MAP_HUGETLB;
+ }
+ ptr = __sys_mmap(NULL, buf_size, PROT_READ|PROT_WRITE,
+ MAP_SHARED|MAP_ANONYMOUS|map_hugetlb,
+ -1, 0);
+ if (IS_ERR(ptr)) {
+ __sys_munmap(sq->sqes, 1);
+ return PTR_ERR(ptr);
+ }
+ sq->ring_ptr = ptr;
+ sq->ring_sz = buf_size;
+ cq->ring_sz = 0;
+ }
+
+ cq->ring_ptr = (void *) sq->ring_ptr;
+ p->sq_off.user_addr = (unsigned long) sq->sqes;
+ p->cq_off.user_addr = (unsigned long) sq->ring_ptr;
+ return (int) mem_used;
+}
+
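
For a sense of scale (rough numbers assuming 4KiB pages, the 2MiB huge_page_size above, the classic SQ array, and no SQE128/CQE32), the accounting in io_uring_alloc_huge() works out as in the sketch below, so many small rings can share one huge page via the buf/buf_size path.

/* Illustrative only: the same size accounting for a 128-entry SQ / 256-entry CQ. */
#include <liburing.h>
#include <stdio.h>

int main(void)
{
	size_t page_size = 4096;				/* assumed */
	size_t sqes_mem = 128 * sizeof(struct io_uring_sqe);	/* 8192, already page aligned */
	size_t ring_mem = 256 * sizeof(struct io_uring_cqe)	/* 4096 */
			+ 128 * sizeof(unsigned);		/* + 512 for the sq array */
	size_t mem_used = (sqes_mem + ring_mem + page_size - 1) & ~(page_size - 1);

	printf("%zu\n", mem_used);	/* 16384: ~128 such rings fit in one 2MiB huge page */
	return 0;
}
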
+int __io_uring_queue_init_params(unsigned entries, struct io_uring *ring,
+ struct io_uring_params *p, void *buf,
+ size_t buf_size)
+{
+ int fd, ret = 0;
unsigned *sq_array;
unsigned sq_entries, index;
+ memset(ring, 0, sizeof(*ring));
+
+ /*
+ * The kernel does this check already, but checking it here allows us
+ * to avoid handling it below.
+ */
+ if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY
+ && !(p->flags & IORING_SETUP_NO_MMAP))
+ return -EINVAL;
+
+ if (p->flags & IORING_SETUP_NO_MMAP) {
+ ret = io_uring_alloc_huge(entries, p, &ring->sq, &ring->cq,
+ buf, buf_size);
+ if (ret < 0)
+ return ret;
+ if (buf)
+ ring->int_flags |= INT_FLAG_APP_MEM;
+ }
+
fd = __sys_io_uring_setup(entries, p);
- if (fd < 0)
+ if (fd < 0) {
+ if ((p->flags & IORING_SETUP_NO_MMAP) &&
+ !(ring->int_flags & INT_FLAG_APP_MEM)) {
+ __sys_munmap(ring->sq.sqes, 1);
+ io_uring_unmap_rings(&ring->sq, &ring->cq);
+ }
return fd;
+ }
- ret = io_uring_queue_mmap(fd, p, ring);
- if (ret) {
- __sys_close(fd);
- return ret;
+ if (!(p->flags & IORING_SETUP_NO_MMAP)) {
+ ret = io_uring_queue_mmap(fd, p, ring);
+ if (ret) {
+ __sys_close(fd);
+ return ret;
+ }
+ } else {
+ io_uring_setup_ring_pointers(p, &ring->sq, &ring->cq);
}
/*
* Directly map SQ slots to SQEs
*/
- sq_array = ring->sq.array;
sq_entries = ring->sq.ring_entries;
- for (index = 0; index < sq_entries; index++)
- sq_array[index] = index;
+ if (!(p->flags & IORING_SETUP_NO_SQARRAY)) {
+ sq_array = ring->sq.array;
+ for (index = 0; index < sq_entries; index++)
+ sq_array[index] = index;
+ }
ring->features = p->features;
- return 0;
+ ring->flags = p->flags;
+ ring->enter_ring_fd = fd;
+ if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY) {
+ ring->ring_fd = -1;
+ ring->int_flags |= INT_FLAG_REG_RING | INT_FLAG_REG_REG_RING;
+ } else {
+ ring->ring_fd = fd;
+ }
+
+ return ret;
+}
+
+static int io_uring_queue_init_try_nosqarr(unsigned entries, struct io_uring *ring,
+ struct io_uring_params *p, void *buf,
+ size_t buf_size)
+{
+ unsigned flags = p->flags;
+ int ret;
+
+ p->flags |= IORING_SETUP_NO_SQARRAY;
+ ret = __io_uring_queue_init_params(entries, ring, p, buf, buf_size);
+
+ /* don't fallback if explicitly asked for NOSQARRAY */
+ if (ret != -EINVAL || (flags & IORING_SETUP_NO_SQARRAY))
+ return ret;
+
+ p->flags = flags;
+ return __io_uring_queue_init_params(entries, ring, p, buf, buf_size);
+}
+
+/*
+ * Like io_uring_queue_init_params(), except it allows the application to pass
+ * in a pre-allocated memory range that is used for the shared data between
+ * the kernel and the application. This includes the sqes array, and the two
+ * rings. The memory must be contiguous, the use case here is that the app
+ * allocates a huge page and passes it in.
+ *
+ * Returns the number of bytes used in the buffer, the app can then reuse
+ * the buffer with the returned offset to put more rings in the same huge
+ * page. Returns -ENOMEM if there's not enough room left in the buffer to
+ * host the ring.
+ */
+int io_uring_queue_init_mem(unsigned entries, struct io_uring *ring,
+ struct io_uring_params *p,
+ void *buf, size_t buf_size)
+{
+ /* should already be set... */
+ p->flags |= IORING_SETUP_NO_MMAP;
+ return io_uring_queue_init_try_nosqarr(entries, ring, p, buf, buf_size);
+}
+
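
A minimal usage sketch for io_uring_queue_init_mem() as documented above (assuming a kernel new enough for IORING_SETUP_NO_MMAP and a system with 2MiB huge pages available; error handling trimmed):

#include <liburing.h>
#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
	struct io_uring_params p = { 0 };
	struct io_uring ring;
	size_t buf_size = 2 * 1024 * 1024;
	void *buf;
	int ret;

	/* Application-owned, contiguous memory; a huge page is the intended use case. */
	buf = mmap(NULL, buf_size, PROT_READ | PROT_WRITE,
		   MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	ret = io_uring_queue_init_mem(64, &ring, &p, buf, buf_size);
	if (ret < 0) {
		fprintf(stderr, "io_uring_queue_init_mem: %d\n", ret);
		munmap(buf, buf_size);
		return 1;
	}

	/* ret bytes of buf are now in use; the remainder could host further rings. */
	printf("ring used %d bytes of the buffer\n", ret);

	io_uring_queue_exit(&ring);
	munmap(buf, buf_size);
	return 0;
}
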
+int io_uring_queue_init_params(unsigned entries, struct io_uring *ring,
+ struct io_uring_params *p)
+{
+ int ret;
+
+ ret = io_uring_queue_init_try_nosqarr(entries, ring, p, NULL, 0);
+ return ret >= 0 ? 0 : ret;
}
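
And the plain path for contrast, a minimal sketch of the public entry point above: io_uring_queue_init_params() still lets the kernel map the rings, returns 0 on success, and handles the IORING_SETUP_NO_SQARRAY attempt transparently.

/* Minimal sketch of the conventional setup path. */
#include <liburing.h>

int main(void)
{
	struct io_uring_params p = { 0 };
	struct io_uring ring;

	p.flags = IORING_SETUP_CLAMP;	/* clamp oversized entry counts instead of failing */
	if (io_uring_queue_init_params(4096, &ring, &p) < 0)
		return 1;

	io_uring_queue_exit(&ring);
	return 0;
}
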
/*
@@ -196,11 +428,20 @@ __cold void io_uring_queue_exit(struct io_uring *ring)
struct io_uring_cq *cq = &ring->cq;
size_t sqe_size;
- sqe_size = sizeof(struct io_uring_sqe);
- if (ring->flags & IORING_SETUP_SQE128)
- sqe_size += 64;
- __sys_munmap(sq->sqes, sqe_size * sq->ring_entries);
- io_uring_unmap_rings(sq, cq);
+ if (!sq->ring_sz) {
+ sqe_size = sizeof(struct io_uring_sqe);
+ if (ring->flags & IORING_SETUP_SQE128)
+ sqe_size += 64;
+ __sys_munmap(sq->sqes, sqe_size * sq->ring_entries);
+ io_uring_unmap_rings(sq, cq);
+ } else {
+ if (!(ring->int_flags & INT_FLAG_APP_MEM)) {
+ __sys_munmap(sq->sqes,
+ *sq->kring_entries * sizeof(struct io_uring_sqe));
+ io_uring_unmap_rings(sq, cq);
+ }
+ }
+
/*
* Not strictly required, but frees up the slot we used now rather
* than at process exit time.
@@ -251,18 +492,6 @@ __cold void io_uring_free_probe(struct io_uring_probe *probe)
free(probe);
}
-static inline int __fls(unsigned long x)
-{
- if (!x)
- return 0;
- return 8 * sizeof(x) - __builtin_clzl(x);
-}
-
-static unsigned roundup_pow2(unsigned depth)
-{
- return 1U << __fls(depth - 1);
-}
-
static size_t npages(size_t size, long page_size)
{
size--;
@@ -293,9 +522,6 @@ static size_t rings_size(struct io_uring_params *p, unsigned entries,
return pages * page_size;
}
-#define KERN_MAX_ENTRIES 32768
-#define KERN_MAX_CQ_ENTRIES (2 * KERN_MAX_ENTRIES)
-
/*
* Return the required ulimit -l memlock memory required for a given ring
* setup, in bytes. May return -errno on error. On newer (5.12+) kernels,
@@ -309,9 +535,10 @@ __cold ssize_t io_uring_mlock_size_params(unsigned entries,
{
struct io_uring_params lp;
struct io_uring ring;
- unsigned cq_entries;
+ unsigned cq_entries, sq;
long page_size;
ssize_t ret;
+ int cret;
memset(&lp, 0, sizeof(lp));
@@ -341,25 +568,12 @@ __cold ssize_t io_uring_mlock_size_params(unsigned entries,
entries = KERN_MAX_ENTRIES;
}
- entries = roundup_pow2(entries);
- if (p->flags & IORING_SETUP_CQSIZE) {
- if (!p->cq_entries)
- return -EINVAL;
- cq_entries = p->cq_entries;
- if (cq_entries > KERN_MAX_CQ_ENTRIES) {
- if (!(p->flags & IORING_SETUP_CLAMP))
- return -EINVAL;
- cq_entries = KERN_MAX_CQ_ENTRIES;
- }
- cq_entries = roundup_pow2(cq_entries);
- if (cq_entries < entries)
- return -EINVAL;
- } else {
- cq_entries = 2 * entries;
- }
+ cret = get_sq_cq_entries(entries, p, &sq, &cq_entries);
+ if (cret)
+ return cret;
page_size = get_page_size();
- return rings_size(p, entries, cq_entries, page_size);
+ return rings_size(p, sq, cq_entries, page_size);
}
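
A small sketch of how the function above might be used (a hypothetical pre-flight check; on newer kernels the reported requirement can simply be 0, since ring memory is no longer charged against RLIMIT_MEMLOCK):

#include <liburing.h>
#include <sys/resource.h>
#include <stdio.h>

int main(void)
{
	struct io_uring_params p = { 0 };
	struct rlimit rl;
	ssize_t need;

	need = io_uring_mlock_size_params(4096, &p);
	if (need < 0) {
		fprintf(stderr, "io_uring_mlock_size_params: %zd\n", need);
		return 1;
	}
	if (getrlimit(RLIMIT_MEMLOCK, &rl) == 0 && (size_t)need > rl.rlim_cur)
		fprintf(stderr, "need %zd bytes of memlock, have %llu\n",
			need, (unsigned long long)rl.rlim_cur);
	return 0;
}
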
/*