author      thegeorg <thegeorg@yandex-team.com>    2024-06-09 11:55:21 +0300
committer   thegeorg <thegeorg@yandex-team.com>    2024-06-09 12:07:55 +0300
commit      afd4899380eea1c70e2a68714b5da1c9919ccdbd (patch)
tree        cd5120708784139bc6a0f8881da1ed8389a065b3 /contrib/libs/liburing/src
parent      a83bd2dd3c21e38c6c0807ec5e679497ab567f24 (diff)
download    ydb-afd4899380eea1c70e2a68714b5da1c9919ccdbd.tar.gz
Update contrib/libs/liburing to 2.6
3b51a9fb14de805208d11f1c077c78bb5d487e0f
Diffstat (limited to 'contrib/libs/liburing/src')
-rw-r--r--   contrib/libs/liburing/src/arch/aarch64/lib.h                      1
-rw-r--r--   contrib/libs/liburing/src/include/liburing.h                    146
-rw-r--r--   contrib/libs/liburing/src/include/liburing/io_uring.h            76
-rw-r--r--   contrib/libs/liburing/src/include/liburing/io_uring_version.h     2
-rw-r--r--   contrib/libs/liburing/src/int_flags.h                             1
-rw-r--r--   contrib/libs/liburing/src/lib.h                                   2
-rw-r--r--   contrib/libs/liburing/src/queue.c                                20
-rw-r--r--   contrib/libs/liburing/src/register.c                             34
-rw-r--r--   contrib/libs/liburing/src/setup.c                               380
-rw-r--r--   contrib/libs/liburing/src/setup.h                                 9
-rw-r--r--   contrib/libs/liburing/src/syscall.h                               2
-rw-r--r--   contrib/libs/liburing/src/version.c                               2
12 files changed, 553 insertions, 122 deletions
diff --git a/contrib/libs/liburing/src/arch/aarch64/lib.h b/contrib/libs/liburing/src/arch/aarch64/lib.h index 3b701b1fb7..41bcfc957d 100644 --- a/contrib/libs/liburing/src/arch/aarch64/lib.h +++ b/contrib/libs/liburing/src/arch/aarch64/lib.h @@ -4,7 +4,6 @@ #define LIBURING_ARCH_AARCH64_LIB_H #include <elf.h> -#include <sys/auxv.h> #include "../../syscall.h" static inline long __get_page_size(void) diff --git a/contrib/libs/liburing/src/include/liburing.h b/contrib/libs/liburing/src/include/liburing.h index e6dc9a792f..7d04aec13a 100644 --- a/contrib/libs/liburing/src/include/liburing.h +++ b/contrib/libs/liburing/src/include/liburing.h @@ -2,14 +2,6 @@ #ifndef LIB_URING_H #define LIB_URING_H -#ifndef _XOPEN_SOURCE -#define _XOPEN_SOURCE 500 /* Required for glibc to expose sigset_t */ -#endif - -#ifndef _GNU_SOURCE -#define _GNU_SOURCE /* Required for musl to expose cpu_set_t */ -#endif - #include <sys/socket.h> #include <sys/stat.h> #include <sys/uio.h> @@ -21,6 +13,7 @@ #include <fcntl.h> #include <sched.h> #include <linux/swab.h> +#include <sys/wait.h> #include "liburing/compat.h" #include "liburing/io_uring.h" #include "liburing/io_uring_version.h" @@ -164,6 +157,9 @@ IOURINGINLINE int io_uring_opcode_supported(const struct io_uring_probe *p, return (p->ops[op].flags & IO_URING_OP_SUPPORTED) != 0; } +int io_uring_queue_init_mem(unsigned entries, struct io_uring *ring, + struct io_uring_params *p, + void *buf, size_t buf_size); int io_uring_queue_init_params(unsigned entries, struct io_uring *ring, struct io_uring_params *p); int io_uring_queue_init(unsigned entries, struct io_uring *ring, @@ -235,12 +231,16 @@ int io_uring_close_ring_fd(struct io_uring *ring); int io_uring_register_buf_ring(struct io_uring *ring, struct io_uring_buf_reg *reg, unsigned int flags); int io_uring_unregister_buf_ring(struct io_uring *ring, int bgid); +int io_uring_buf_ring_head(struct io_uring *ring, int buf_group, uint16_t *head); int io_uring_register_sync_cancel(struct io_uring *ring, struct io_uring_sync_cancel_reg *reg); int io_uring_register_file_alloc_range(struct io_uring *ring, unsigned off, unsigned len); +int io_uring_register_napi(struct io_uring *ring, struct io_uring_napi *napi); +int io_uring_unregister_napi(struct io_uring *ring, struct io_uring_napi *napi); + int io_uring_get_events(struct io_uring *ring); int io_uring_submit_and_get_events(struct io_uring *ring); @@ -375,17 +375,10 @@ IOURINGINLINE void __io_uring_set_target_fixed_file(struct io_uring_sqe *sqe, sqe->file_index = file_index + 1; } -IOURINGINLINE void io_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd, - const void *addr, unsigned len, - __u64 offset) +IOURINGINLINE void io_uring_initialize_sqe(struct io_uring_sqe *sqe) { - sqe->opcode = (__u8) op; sqe->flags = 0; sqe->ioprio = 0; - sqe->fd = fd; - sqe->off = offset; - sqe->addr = (unsigned long) addr; - sqe->len = len; sqe->rw_flags = 0; sqe->buf_index = 0; sqe->personality = 0; @@ -394,6 +387,17 @@ IOURINGINLINE void io_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd, sqe->__pad2[0] = 0; } +IOURINGINLINE void io_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd, + const void *addr, unsigned len, + __u64 offset) +{ + sqe->opcode = (__u8) op; + sqe->fd = fd; + sqe->off = offset; + sqe->addr = (unsigned long) addr; + sqe->len = len; +} + /* * io_uring_prep_splice() - Either @fd_in or @fd_out must be a pipe. 
* @@ -720,6 +724,15 @@ IOURINGINLINE void io_uring_prep_read(struct io_uring_sqe *sqe, int fd, io_uring_prep_rw(IORING_OP_READ, sqe, fd, buf, nbytes, offset); } +IOURINGINLINE void io_uring_prep_read_multishot(struct io_uring_sqe *sqe, + int fd, unsigned nbytes, + __u64 offset, int buf_group) +{ + io_uring_prep_rw(IORING_OP_READ_MULTISHOT, sqe, fd, NULL, nbytes, + offset); + sqe->buf_group = buf_group; +} + IOURINGINLINE void io_uring_prep_write(struct io_uring_sqe *sqe, int fd, const void *buf, unsigned nbytes, __u64 offset) @@ -1126,12 +1139,88 @@ IOURINGINLINE void io_uring_prep_socket_direct_alloc(struct io_uring_sqe *sqe, } /* + * Prepare commands for sockets + */ +IOURINGINLINE void io_uring_prep_cmd_sock(struct io_uring_sqe *sqe, + int cmd_op, + int fd, + int level, + int optname, + void *optval, + int optlen) +{ + io_uring_prep_rw(IORING_OP_URING_CMD, sqe, fd, NULL, 0, 0); + sqe->optval = (unsigned long) (uintptr_t) optval; + sqe->optname = optname; + sqe->optlen = optlen; + sqe->cmd_op = cmd_op; + sqe->level = level; +} + +IOURINGINLINE void io_uring_prep_waitid(struct io_uring_sqe *sqe, + idtype_t idtype, + id_t id, + siginfo_t *infop, + int options, unsigned int flags) +{ + io_uring_prep_rw(IORING_OP_WAITID, sqe, id, NULL, (unsigned) idtype, 0); + sqe->waitid_flags = flags; + sqe->file_index = options; + sqe->addr2 = (unsigned long) infop; +} + +IOURINGINLINE void io_uring_prep_futex_wake(struct io_uring_sqe *sqe, + uint32_t *futex, uint64_t val, + uint64_t mask, uint32_t futex_flags, + unsigned int flags) +{ + io_uring_prep_rw(IORING_OP_FUTEX_WAKE, sqe, futex_flags, futex, 0, val); + sqe->futex_flags = flags; + sqe->addr3 = mask; +} + +IOURINGINLINE void io_uring_prep_futex_wait(struct io_uring_sqe *sqe, + uint32_t *futex, uint64_t val, + uint64_t mask, uint32_t futex_flags, + unsigned int flags) +{ + io_uring_prep_rw(IORING_OP_FUTEX_WAIT, sqe, futex_flags, futex, 0, val); + sqe->futex_flags = flags; + sqe->addr3 = mask; +} + +struct futex_waitv; +IOURINGINLINE void io_uring_prep_futex_waitv(struct io_uring_sqe *sqe, + struct futex_waitv *futex, + uint32_t nr_futex, + unsigned int flags) +{ + io_uring_prep_rw(IORING_OP_FUTEX_WAITV, sqe, 0, futex, nr_futex, 0); + sqe->futex_flags = flags; +} + +IOURINGINLINE void io_uring_prep_fixed_fd_install(struct io_uring_sqe *sqe, + int fd, + unsigned int flags) +{ + io_uring_prep_rw(IORING_OP_FIXED_FD_INSTALL, sqe, fd, NULL, 0, 0); + sqe->flags = IOSQE_FIXED_FILE; + sqe->install_fd_flags = flags; +} + +IOURINGINLINE void io_uring_prep_ftruncate(struct io_uring_sqe *sqe, + int fd, loff_t len) +{ + io_uring_prep_rw(IORING_OP_FTRUNCATE, sqe, fd, 0, 0, len); +} + +/* * Returns number of unconsumed (if SQPOLL) or unsubmitted entries exist in * the SQ ring */ IOURINGINLINE unsigned io_uring_sq_ready(const struct io_uring *ring) { - unsigned khead = *ring->sq.khead; + unsigned khead; /* * Without a barrier, we could miss an update and think the SQ wasn't @@ -1140,6 +1229,8 @@ IOURINGINLINE unsigned io_uring_sq_ready(const struct io_uring *ring) */ if (ring->flags & IORING_SETUP_SQPOLL) khead = io_uring_smp_load_acquire(ring->sq.khead); + else + khead = *ring->sq.khead; /* always use real head, to avoid losing sync for short submit */ return ring->sq.sqe_tail - khead; @@ -1326,7 +1417,7 @@ IOURINGINLINE struct io_uring_sqe *_io_uring_get_sqe(struct io_uring *ring) if (ring->flags & IORING_SETUP_SQE128) shift = 1; if (!(ring->flags & IORING_SETUP_SQPOLL)) - head = IO_URING_READ_ONCE(*sq->khead); + head = *sq->khead; else head = 
io_uring_smp_load_acquire(sq->khead); @@ -1335,6 +1426,7 @@ IOURINGINLINE struct io_uring_sqe *_io_uring_get_sqe(struct io_uring *ring) sqe = &sq->sqes[(sq->sqe_tail & sq->ring_mask) << shift]; sq->sqe_tail = next; + io_uring_initialize_sqe(sqe); return sqe; } @@ -1386,7 +1478,7 @@ IOURINGINLINE void __io_uring_buf_ring_cq_advance(struct io_uring *ring, struct io_uring_buf_ring *br, int cq_count, int buf_count) { - br->tail += buf_count; + io_uring_buf_ring_advance(br, buf_count); io_uring_cq_advance(ring, cq_count); } @@ -1404,6 +1496,20 @@ IOURINGINLINE void io_uring_buf_ring_cq_advance(struct io_uring *ring, __io_uring_buf_ring_cq_advance(ring, br, count, count); } +IOURINGINLINE int io_uring_buf_ring_available(struct io_uring *ring, + struct io_uring_buf_ring *br, + unsigned short bgid) +{ + uint16_t head; + int ret; + + ret = io_uring_buf_ring_head(ring, bgid, &head); + if (ret) + return ret; + + return (uint16_t) (br->tail - head); +} + #ifndef LIBURING_INTERNAL IOURINGINLINE struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring) { @@ -1432,7 +1538,7 @@ bool io_uring_check_version(int major, int minor); #define IO_URING_CHECK_VERSION(major,minor) \ (major > IO_URING_VERSION_MAJOR || \ (major == IO_URING_VERSION_MAJOR && \ - minor >= IO_URING_VERSION_MINOR)) + minor > IO_URING_VERSION_MINOR)) #ifdef __cplusplus } diff --git a/contrib/libs/liburing/src/include/liburing/io_uring.h b/contrib/libs/liburing/src/include/liburing/io_uring.h index 84c33b5e84..bde11991bf 100644 --- a/contrib/libs/liburing/src/include/liburing/io_uring.h +++ b/contrib/libs/liburing/src/include/liburing/io_uring.h @@ -43,6 +43,10 @@ struct io_uring_sqe { union { __u64 addr; /* pointer to buffer or iovecs */ __u64 splice_off_in; + struct { + __u32 level; + __u32 optname; + }; }; __u32 len; /* buffer size or number of iovecs */ union { @@ -65,6 +69,9 @@ struct io_uring_sqe { __u32 xattr_flags; __u32 msg_ring_flags; __u32 uring_cmd_flags; + __u32 waitid_flags; + __u32 futex_flags; + __u32 install_fd_flags; }; __u64 user_data; /* data to be passed back at completion time */ /* pack this to avoid bogus arm OABI complaints */ @@ -79,6 +86,7 @@ struct io_uring_sqe { union { __s32 splice_fd_in; __u32 file_index; + __u32 optlen; struct { __u16 addr_len; __u16 __pad3[1]; @@ -89,6 +97,7 @@ struct io_uring_sqe { __u64 addr3; __u64 __pad2[1]; }; + __u64 optval; /* * If the ring is initialized with IORING_SETUP_SQE128, then * this field is used for 80 bytes of arbitrary command data @@ -173,6 +182,23 @@ enum { */ #define IORING_SETUP_DEFER_TASKRUN (1U << 13) +/* + * Application provides ring memory + */ +#define IORING_SETUP_NO_MMAP (1U << 14) + +/* + * Register the ring fd in itself for use with + * IORING_REGISTER_USE_REGISTERED_RING; return a registered fd index rather + * than an fd. + */ +#define IORING_SETUP_REGISTERED_FD_ONLY (1U << 15) + +/* + * Removes indirection through the SQ index array. 
+ */ +#define IORING_SETUP_NO_SQARRAY (1U << 16) + enum io_uring_op { IORING_OP_NOP, IORING_OP_READV, @@ -223,6 +249,13 @@ enum io_uring_op { IORING_OP_URING_CMD, IORING_OP_SEND_ZC, IORING_OP_SENDMSG_ZC, + IORING_OP_READ_MULTISHOT, + IORING_OP_WAITID, + IORING_OP_FUTEX_WAIT, + IORING_OP_FUTEX_WAKE, + IORING_OP_FUTEX_WAITV, + IORING_OP_FIXED_FD_INSTALL, + IORING_OP_FTRUNCATE, /* this goes last, obviously */ IORING_OP_LAST, @@ -352,6 +385,13 @@ enum { #define IORING_MSG_RING_FLAGS_PASS (1U << 1) /* + * IORING_OP_FIXED_FD_INSTALL flags (sqe->install_fd_flags) + * + * IORING_FIXED_FD_NO_CLOEXEC Don't mark the fd as O_CLOEXEC + */ +#define IORING_FIXED_FD_NO_CLOEXEC (1U << 0) + +/* * IO completion data structure (Completion Queue Entry) */ struct io_uring_cqe { @@ -406,7 +446,7 @@ struct io_sqring_offsets { __u32 dropped; __u32 array; __u32 resv1; - __u64 resv2; + __u64 user_addr; }; /* @@ -425,7 +465,7 @@ struct io_cqring_offsets { __u32 cqes; __u32 flags; __u32 resv1; - __u64 resv2; + __u64 user_addr; }; /* @@ -523,6 +563,13 @@ enum { /* register a range of fixed file slots for automatic slot allocation */ IORING_REGISTER_FILE_ALLOC_RANGE = 25, + /* return status information for a buffer group */ + IORING_REGISTER_PBUF_STATUS = 26, + + /* set/clear busy poll settings */ + IORING_REGISTER_NAPI = 27, + IORING_UNREGISTER_NAPI = 28, + /* this goes last */ IORING_REGISTER_LAST, @@ -649,6 +696,21 @@ struct io_uring_buf_reg { __u64 resv[3]; }; +/* argument for IORING_REGISTER_PBUF_STATUS */ +struct io_uring_buf_status { + __u32 buf_group; /* input */ + __u32 head; /* output */ + __u32 resv[8]; +}; + +/* argument for IORING_(UN)REGISTER_NAPI */ +struct io_uring_napi { + __u32 busy_poll_to; + __u8 prefer_busy_poll; + __u8 pad[3]; + __u64 resv; +}; + /* * io_uring_restriction->opcode values */ @@ -703,6 +765,16 @@ struct io_uring_recvmsg_out { __u32 flags; }; +/* + * Argument for IORING_OP_URING_CMD when file is a socket + */ +enum { + SOCKET_URING_OP_SIOCINQ = 0, + SOCKET_URING_OP_SIOCOUTQ, + SOCKET_URING_OP_GETSOCKOPT, + SOCKET_URING_OP_SETSOCKOPT, +}; + #ifdef __cplusplus } #endif diff --git a/contrib/libs/liburing/src/include/liburing/io_uring_version.h b/contrib/libs/liburing/src/include/liburing/io_uring_version.h index 8029e041f9..49d8c7ed72 100644 --- a/contrib/libs/liburing/src/include/liburing/io_uring_version.h +++ b/contrib/libs/liburing/src/include/liburing/io_uring_version.h @@ -3,6 +3,6 @@ #define LIBURING_VERSION_H #define IO_URING_VERSION_MAJOR 2 -#define IO_URING_VERSION_MINOR 4 +#define IO_URING_VERSION_MINOR 6 #endif diff --git a/contrib/libs/liburing/src/int_flags.h b/contrib/libs/liburing/src/int_flags.h index 71774fbca5..548dd1094c 100644 --- a/contrib/libs/liburing/src/int_flags.h +++ b/contrib/libs/liburing/src/int_flags.h @@ -5,6 +5,7 @@ enum { INT_FLAG_REG_RING = 1, INT_FLAG_REG_REG_RING = 2, + INT_FLAG_APP_MEM = 4, }; #endif diff --git a/contrib/libs/liburing/src/lib.h b/contrib/libs/liburing/src/lib.h index 635a30ece5..0f1674fa26 100644 --- a/contrib/libs/liburing/src/lib.h +++ b/contrib/libs/liburing/src/lib.h @@ -10,6 +10,8 @@ #include "arch/x86/lib.h" #elif defined(__aarch64__) #include "arch/aarch64/lib.h" +#elif defined(__riscv) && __riscv_xlen == 64 +#error #include "arch/riscv64/lib.h" #else /* * We don't have nolibc support for this arch. Must use libc! 
diff --git a/contrib/libs/liburing/src/queue.c b/contrib/libs/liburing/src/queue.c index 9fca31fe34..79457c35ae 100644 --- a/contrib/libs/liburing/src/queue.c +++ b/contrib/libs/liburing/src/queue.c @@ -213,22 +213,18 @@ static unsigned __io_uring_flush_sq(struct io_uring *ring) * Ensure kernel sees the SQE updates before the tail update. */ if (!(ring->flags & IORING_SETUP_SQPOLL)) - IO_URING_WRITE_ONCE(*sq->ktail, tail); + *sq->ktail = tail; else io_uring_smp_store_release(sq->ktail, tail); } /* - * This _may_ look problematic, as we're not supposed to be reading - * SQ->head without acquire semantics. When we're in SQPOLL mode, the - * kernel submitter could be updating this right now. For non-SQPOLL, - * task itself does it, and there's no potential race. But even for - * SQPOLL, the load is going to be potentially out-of-date the very - * instant it's done, regardless or whether or not it's done - * atomically. Worst case, we're going to be over-estimating what - * we can submit. The point is, we need to be able to deal with this - * situation regardless of any perceived atomicity. - */ - return tail - *sq->khead; + * This load needs to be atomic, since sq->khead is written concurrently + * by the kernel, but it doesn't need to be load_acquire, since the + * kernel doesn't store to the submission queue; it advances khead just + * to indicate that it's finished reading the submission queue entries + * so they're available for us to write to. + */ + return tail - IO_URING_READ_ONCE(*sq->khead); } /* diff --git a/contrib/libs/liburing/src/register.c b/contrib/libs/liburing/src/register.c index 5563db2c4a..7a97303a0e 100644 --- a/contrib/libs/liburing/src/register.c +++ b/contrib/libs/liburing/src/register.c @@ -12,10 +12,16 @@ static inline int do_register(struct io_uring *ring, unsigned int opcode, const void *arg, unsigned int nr_args) { - if (ring->int_flags & INT_FLAG_REG_REG_RING) + int fd; + + if (ring->int_flags & INT_FLAG_REG_REG_RING) { opcode |= IORING_REGISTER_USE_REGISTERED_RING; + fd = ring->enter_ring_fd; + } else { + fd = ring->ring_fd; + } - return __sys_io_uring_register(ring->enter_ring_fd, opcode, arg, nr_args); + return __sys_io_uring_register(fd, opcode, arg, nr_args); } int io_uring_register_buffers_update_tag(struct io_uring *ring, unsigned off, @@ -321,6 +327,20 @@ int io_uring_unregister_buf_ring(struct io_uring *ring, int bgid) return do_register(ring, IORING_UNREGISTER_PBUF_RING, ®, 1); } +int io_uring_buf_ring_head(struct io_uring *ring, int buf_group, uint16_t *head) +{ + struct io_uring_buf_status buf_status = { + .buf_group = buf_group, + }; + int ret; + + ret = do_register(ring, IORING_REGISTER_PBUF_STATUS, &buf_status, 1); + if (ret) + return ret; + *head = buf_status.head; + return 0; +} + int io_uring_register_sync_cancel(struct io_uring *ring, struct io_uring_sync_cancel_reg *reg) { @@ -337,3 +357,13 @@ int io_uring_register_file_alloc_range(struct io_uring *ring, return do_register(ring, IORING_REGISTER_FILE_ALLOC_RANGE, &range, 0); } + +int io_uring_register_napi(struct io_uring *ring, struct io_uring_napi *napi) +{ + return do_register(ring, IORING_REGISTER_NAPI, napi, 1); +} + +int io_uring_unregister_napi(struct io_uring *ring, struct io_uring_napi *napi) +{ + return do_register(ring, IORING_UNREGISTER_NAPI, napi, 1); +} diff --git a/contrib/libs/liburing/src/setup.c b/contrib/libs/liburing/src/setup.c index db2f3dfe15..0d33f65aae 100644 --- a/contrib/libs/liburing/src/setup.c +++ b/contrib/libs/liburing/src/setup.c @@ -6,16 +6,96 @@ #include 
"syscall.h" #include "liburing.h" #include "int_flags.h" +#include "setup.h" #include "liburing/compat.h" #include "liburing/io_uring.h" +#define KERN_MAX_ENTRIES 32768 +#define KERN_MAX_CQ_ENTRIES (2 * KERN_MAX_ENTRIES) + +static inline int __fls(int x) +{ + if (!x) + return 0; + return 8 * sizeof(x) - __builtin_clz(x); +} + +static unsigned roundup_pow2(unsigned depth) +{ + return 1U << __fls(depth - 1); +} + +static int get_sq_cq_entries(unsigned entries, struct io_uring_params *p, + unsigned *sq, unsigned *cq) +{ + unsigned cq_entries; + + if (!entries) + return -EINVAL; + if (entries > KERN_MAX_ENTRIES) { + if (!(p->flags & IORING_SETUP_CLAMP)) + return -EINVAL; + entries = KERN_MAX_ENTRIES; + } + + entries = roundup_pow2(entries); + if (p->flags & IORING_SETUP_CQSIZE) { + if (!p->cq_entries) + return -EINVAL; + cq_entries = p->cq_entries; + if (cq_entries > KERN_MAX_CQ_ENTRIES) { + if (!(p->flags & IORING_SETUP_CLAMP)) + return -EINVAL; + cq_entries = KERN_MAX_CQ_ENTRIES; + } + cq_entries = roundup_pow2(cq_entries); + if (cq_entries < entries) + return -EINVAL; + } else { + cq_entries = 2 * entries; + } + + *sq = entries; + *cq = cq_entries; + return 0; +} + static void io_uring_unmap_rings(struct io_uring_sq *sq, struct io_uring_cq *cq) { - __sys_munmap(sq->ring_ptr, sq->ring_sz); - if (cq->ring_ptr && cq->ring_ptr != sq->ring_ptr) + if (sq->ring_sz) + __sys_munmap(sq->ring_ptr, sq->ring_sz); + if (cq->ring_ptr && cq->ring_sz && cq->ring_ptr != sq->ring_ptr) __sys_munmap(cq->ring_ptr, cq->ring_sz); } +static void io_uring_setup_ring_pointers(struct io_uring_params *p, + struct io_uring_sq *sq, + struct io_uring_cq *cq) +{ + sq->khead = sq->ring_ptr + p->sq_off.head; + sq->ktail = sq->ring_ptr + p->sq_off.tail; + sq->kring_mask = sq->ring_ptr + p->sq_off.ring_mask; + sq->kring_entries = sq->ring_ptr + p->sq_off.ring_entries; + sq->kflags = sq->ring_ptr + p->sq_off.flags; + sq->kdropped = sq->ring_ptr + p->sq_off.dropped; + if (!(p->flags & IORING_SETUP_NO_SQARRAY)) + sq->array = sq->ring_ptr + p->sq_off.array; + + cq->khead = cq->ring_ptr + p->cq_off.head; + cq->ktail = cq->ring_ptr + p->cq_off.tail; + cq->kring_mask = cq->ring_ptr + p->cq_off.ring_mask; + cq->kring_entries = cq->ring_ptr + p->cq_off.ring_entries; + cq->koverflow = cq->ring_ptr + p->cq_off.overflow; + cq->cqes = cq->ring_ptr + p->cq_off.cqes; + if (p->cq_off.flags) + cq->kflags = cq->ring_ptr + p->cq_off.flags; + + sq->ring_mask = *sq->kring_mask; + sq->ring_entries = *sq->kring_entries; + cq->ring_mask = *cq->kring_mask; + cq->ring_entries = *cq->kring_entries; +} + static int io_uring_mmap(int fd, struct io_uring_params *p, struct io_uring_sq *sq, struct io_uring_cq *cq) { @@ -53,14 +133,6 @@ static int io_uring_mmap(int fd, struct io_uring_params *p, } } - sq->khead = sq->ring_ptr + p->sq_off.head; - sq->ktail = sq->ring_ptr + p->sq_off.tail; - sq->kring_mask = sq->ring_ptr + p->sq_off.ring_mask; - sq->kring_entries = sq->ring_ptr + p->sq_off.ring_entries; - sq->kflags = sq->ring_ptr + p->sq_off.flags; - sq->kdropped = sq->ring_ptr + p->sq_off.dropped; - sq->array = sq->ring_ptr + p->sq_off.array; - size = sizeof(struct io_uring_sqe); if (p->flags & IORING_SETUP_SQE128) size += 64; @@ -73,19 +145,7 @@ err: return ret; } - cq->khead = cq->ring_ptr + p->cq_off.head; - cq->ktail = cq->ring_ptr + p->cq_off.tail; - cq->kring_mask = cq->ring_ptr + p->cq_off.ring_mask; - cq->kring_entries = cq->ring_ptr + p->cq_off.ring_entries; - cq->koverflow = cq->ring_ptr + p->cq_off.overflow; - cq->cqes = cq->ring_ptr + 
p->cq_off.cqes; - if (p->cq_off.flags) - cq->kflags = cq->ring_ptr + p->cq_off.flags; - - sq->ring_mask = *sq->kring_mask; - sq->ring_entries = *sq->kring_entries; - cq->ring_mask = *cq->kring_mask; - cq->ring_entries = *cq->kring_entries; + io_uring_setup_ring_pointers(p, sq, cq); return 0; } @@ -98,17 +158,8 @@ err: __cold int io_uring_queue_mmap(int fd, struct io_uring_params *p, struct io_uring *ring) { - int ret; - memset(ring, 0, sizeof(*ring)); - ret = io_uring_mmap(fd, p, &ring->sq, &ring->cq); - if (!ret) { - ring->flags = p->flags; - ring->ring_fd = ring->enter_ring_fd = fd; - ring->int_flags = 0; - return 0; - } - return ret; + return io_uring_mmap(fd, p, &ring->sq, &ring->cq); } /* @@ -146,33 +197,214 @@ __cold int io_uring_ring_dontfork(struct io_uring *ring) return 0; } -__cold int io_uring_queue_init_params(unsigned entries, struct io_uring *ring, - struct io_uring_params *p) +/* FIXME */ +static size_t huge_page_size = 2 * 1024 * 1024; + +/* + * Returns negative for error, or number of bytes used in the buffer on success + */ +static int io_uring_alloc_huge(unsigned entries, struct io_uring_params *p, + struct io_uring_sq *sq, struct io_uring_cq *cq, + void *buf, size_t buf_size) { - int fd, ret; + unsigned long page_size = get_page_size(); + unsigned sq_entries, cq_entries; + size_t ring_mem, sqes_mem; + unsigned long mem_used = 0; + void *ptr; + int ret; + + ret = get_sq_cq_entries(entries, p, &sq_entries, &cq_entries); + if (ret) + return ret; + + sqes_mem = sq_entries * sizeof(struct io_uring_sqe); + sqes_mem = (sqes_mem + page_size - 1) & ~(page_size - 1); + ring_mem = cq_entries * sizeof(struct io_uring_cqe); + if (p->flags & IORING_SETUP_CQE32) + ring_mem *= 2; + if (!(p->flags & IORING_SETUP_NO_SQARRAY)) + ring_mem += sq_entries * sizeof(unsigned); + mem_used = sqes_mem + ring_mem; + mem_used = (mem_used + page_size - 1) & ~(page_size - 1); + + /* + * A maxed-out number of CQ entries with IORING_SETUP_CQE32 fills a 2MB + * huge page by itself, so the SQ entries won't fit in the same huge + * page. For SQEs, that shouldn't be possible given KERN_MAX_ENTRIES, + * but check that too to future-proof (e.g. against different huge page + * sizes). Bail out early so we don't overrun. 
+ */ + if (!buf && (sqes_mem > huge_page_size || ring_mem > huge_page_size)) + return -ENOMEM; + + if (buf) { + if (mem_used > buf_size) + return -ENOMEM; + ptr = buf; + } else { + int map_hugetlb = 0; + if (sqes_mem <= page_size) + buf_size = page_size; + else { + buf_size = huge_page_size; + map_hugetlb = MAP_HUGETLB; + } + ptr = __sys_mmap(NULL, buf_size, PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_ANONYMOUS|map_hugetlb, + -1, 0); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + } + + sq->sqes = ptr; + if (mem_used <= buf_size) { + sq->ring_ptr = (void *) sq->sqes + sqes_mem; + /* clear ring sizes, we have just one mmap() to undo */ + cq->ring_sz = 0; + sq->ring_sz = 0; + } else { + int map_hugetlb = 0; + if (ring_mem <= page_size) + buf_size = page_size; + else { + buf_size = huge_page_size; + map_hugetlb = MAP_HUGETLB; + } + ptr = __sys_mmap(NULL, buf_size, PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_ANONYMOUS|map_hugetlb, + -1, 0); + if (IS_ERR(ptr)) { + __sys_munmap(sq->sqes, 1); + return PTR_ERR(ptr); + } + sq->ring_ptr = ptr; + sq->ring_sz = buf_size; + cq->ring_sz = 0; + } + + cq->ring_ptr = (void *) sq->ring_ptr; + p->sq_off.user_addr = (unsigned long) sq->sqes; + p->cq_off.user_addr = (unsigned long) sq->ring_ptr; + return (int) mem_used; +} + +int __io_uring_queue_init_params(unsigned entries, struct io_uring *ring, + struct io_uring_params *p, void *buf, + size_t buf_size) +{ + int fd, ret = 0; unsigned *sq_array; unsigned sq_entries, index; + memset(ring, 0, sizeof(*ring)); + + /* + * The kernel does this check already, but checking it here allows us + * to avoid handling it below. + */ + if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY + && !(p->flags & IORING_SETUP_NO_MMAP)) + return -EINVAL; + + if (p->flags & IORING_SETUP_NO_MMAP) { + ret = io_uring_alloc_huge(entries, p, &ring->sq, &ring->cq, + buf, buf_size); + if (ret < 0) + return ret; + if (buf) + ring->int_flags |= INT_FLAG_APP_MEM; + } + fd = __sys_io_uring_setup(entries, p); - if (fd < 0) + if (fd < 0) { + if ((p->flags & IORING_SETUP_NO_MMAP) && + !(ring->int_flags & INT_FLAG_APP_MEM)) { + __sys_munmap(ring->sq.sqes, 1); + io_uring_unmap_rings(&ring->sq, &ring->cq); + } return fd; + } - ret = io_uring_queue_mmap(fd, p, ring); - if (ret) { - __sys_close(fd); - return ret; + if (!(p->flags & IORING_SETUP_NO_MMAP)) { + ret = io_uring_queue_mmap(fd, p, ring); + if (ret) { + __sys_close(fd); + return ret; + } + } else { + io_uring_setup_ring_pointers(p, &ring->sq, &ring->cq); } /* * Directly map SQ slots to SQEs */ - sq_array = ring->sq.array; sq_entries = ring->sq.ring_entries; - for (index = 0; index < sq_entries; index++) - sq_array[index] = index; + if (!(p->flags & IORING_SETUP_NO_SQARRAY)) { + sq_array = ring->sq.array; + for (index = 0; index < sq_entries; index++) + sq_array[index] = index; + } ring->features = p->features; - return 0; + ring->flags = p->flags; + ring->enter_ring_fd = fd; + if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY) { + ring->ring_fd = -1; + ring->int_flags |= INT_FLAG_REG_RING | INT_FLAG_REG_REG_RING; + } else { + ring->ring_fd = fd; + } + + return ret; +} + +static int io_uring_queue_init_try_nosqarr(unsigned entries, struct io_uring *ring, + struct io_uring_params *p, void *buf, + size_t buf_size) +{ + unsigned flags = p->flags; + int ret; + + p->flags |= IORING_SETUP_NO_SQARRAY; + ret = __io_uring_queue_init_params(entries, ring, p, buf, buf_size); + + /* don't fallback if explicitly asked for NOSQARRAY */ + if (ret != -EINVAL || (flags & IORING_SETUP_NO_SQARRAY)) + return ret; + + p->flags = flags; 
+ return __io_uring_queue_init_params(entries, ring, p, buf, buf_size); +} + +/* + * Like io_uring_queue_init_params(), except it allows the application to pass + * in a pre-allocated memory range that is used for the shared data between + * the kernel and the application. This includes the sqes array, and the two + * rings. The memory must be contiguous, the use case here is that the app + * allocates a huge page and passes it in. + * + * Returns the number of bytes used in the buffer, the app can then reuse + * the buffer with the returned offset to put more rings in the same huge + * page. Returns -ENOMEM if there's not enough room left in the buffer to + * host the ring. + */ +int io_uring_queue_init_mem(unsigned entries, struct io_uring *ring, + struct io_uring_params *p, + void *buf, size_t buf_size) +{ + /* should already be set... */ + p->flags |= IORING_SETUP_NO_MMAP; + return io_uring_queue_init_try_nosqarr(entries, ring, p, buf, buf_size); +} + +int io_uring_queue_init_params(unsigned entries, struct io_uring *ring, + struct io_uring_params *p) +{ + int ret; + + ret = io_uring_queue_init_try_nosqarr(entries, ring, p, NULL, 0); + return ret >= 0 ? 0 : ret; } /* @@ -196,11 +428,20 @@ __cold void io_uring_queue_exit(struct io_uring *ring) struct io_uring_cq *cq = &ring->cq; size_t sqe_size; - sqe_size = sizeof(struct io_uring_sqe); - if (ring->flags & IORING_SETUP_SQE128) - sqe_size += 64; - __sys_munmap(sq->sqes, sqe_size * sq->ring_entries); - io_uring_unmap_rings(sq, cq); + if (!sq->ring_sz) { + sqe_size = sizeof(struct io_uring_sqe); + if (ring->flags & IORING_SETUP_SQE128) + sqe_size += 64; + __sys_munmap(sq->sqes, sqe_size * sq->ring_entries); + io_uring_unmap_rings(sq, cq); + } else { + if (!(ring->int_flags & INT_FLAG_APP_MEM)) { + __sys_munmap(sq->sqes, + *sq->kring_entries * sizeof(struct io_uring_sqe)); + io_uring_unmap_rings(sq, cq); + } + } + /* * Not strictly required, but frees up the slot we used now rather * than at process exit time. @@ -251,18 +492,6 @@ __cold void io_uring_free_probe(struct io_uring_probe *probe) free(probe); } -static inline int __fls(unsigned long x) -{ - if (!x) - return 0; - return 8 * sizeof(x) - __builtin_clzl(x); -} - -static unsigned roundup_pow2(unsigned depth) -{ - return 1U << __fls(depth - 1); -} - static size_t npages(size_t size, long page_size) { size--; @@ -293,9 +522,6 @@ static size_t rings_size(struct io_uring_params *p, unsigned entries, return pages * page_size; } -#define KERN_MAX_ENTRIES 32768 -#define KERN_MAX_CQ_ENTRIES (2 * KERN_MAX_ENTRIES) - /* * Return the required ulimit -l memlock memory required for a given ring * setup, in bytes. May return -errno on error. 
On newer (5.12+) kernels, @@ -309,9 +535,10 @@ __cold ssize_t io_uring_mlock_size_params(unsigned entries, { struct io_uring_params lp; struct io_uring ring; - unsigned cq_entries; + unsigned cq_entries, sq; long page_size; ssize_t ret; + int cret; memset(&lp, 0, sizeof(lp)); @@ -341,25 +568,12 @@ __cold ssize_t io_uring_mlock_size_params(unsigned entries, entries = KERN_MAX_ENTRIES; } - entries = roundup_pow2(entries); - if (p->flags & IORING_SETUP_CQSIZE) { - if (!p->cq_entries) - return -EINVAL; - cq_entries = p->cq_entries; - if (cq_entries > KERN_MAX_CQ_ENTRIES) { - if (!(p->flags & IORING_SETUP_CLAMP)) - return -EINVAL; - cq_entries = KERN_MAX_CQ_ENTRIES; - } - cq_entries = roundup_pow2(cq_entries); - if (cq_entries < entries) - return -EINVAL; - } else { - cq_entries = 2 * entries; - } + cret = get_sq_cq_entries(entries, p, &sq, &cq_entries); + if (cret) + return cret; page_size = get_page_size(); - return rings_size(p, entries, cq_entries, page_size); + return rings_size(p, sq, cq_entries, page_size); } /* diff --git a/contrib/libs/liburing/src/setup.h b/contrib/libs/liburing/src/setup.h new file mode 100644 index 0000000000..ae44314cbe --- /dev/null +++ b/contrib/libs/liburing/src/setup.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: MIT */ +#ifndef LIBURING_SETUP_H +#define LIBURING_SETUP_H + +int __io_uring_queue_init_params(unsigned entries, struct io_uring *ring, + struct io_uring_params *p, void *buf, + size_t buf_size); + +#endif diff --git a/contrib/libs/liburing/src/syscall.h b/contrib/libs/liburing/src/syscall.h index 4fa77e60ed..908659fff9 100644 --- a/contrib/libs/liburing/src/syscall.h +++ b/contrib/libs/liburing/src/syscall.h @@ -37,6 +37,8 @@ static inline bool IS_ERR(const void *ptr) #include "arch/x86/syscall.h" #elif defined(__aarch64__) #include "arch/aarch64/syscall.h" +#elif defined(__riscv) && __riscv_xlen == 64 +#error #include "arch/riscv64/syscall.h" #else /* * We don't have native syscall wrappers diff --git a/contrib/libs/liburing/src/version.c b/contrib/libs/liburing/src/version.c index e1a01229be..c020cf81df 100644 --- a/contrib/libs/liburing/src/version.c +++ b/contrib/libs/liburing/src/version.c @@ -18,5 +18,5 @@ bool io_uring_check_version(int major, int minor) { return major > io_uring_major_version() || (major == io_uring_major_version() && - minor >= io_uring_minor_version()); + minor > io_uring_minor_version()); } |
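
The main API addition above is io_uring_queue_init_mem(), which lets the application hand liburing a pre-allocated contiguous buffer (typically a huge page) for the SQ/CQ rings and the SQE array instead of letting the library mmap kernel memory, and which returns the number of bytes of the buffer actually consumed. A minimal usage sketch, assuming a 2 MiB huge page is available and the kernel supports IORING_SETUP_NO_MMAP; the entry count and sizes are illustrative:

#include <liburing.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	struct io_uring_params p;
	struct io_uring ring;
	size_t buf_size = 2 * 1024 * 1024;	/* one 2 MiB huge page (assumption) */
	void *buf;
	int used;

	/* Application-owned ring memory; MAP_HUGETLB needs reserved huge pages. */
	buf = mmap(NULL, buf_size, PROT_READ | PROT_WRITE,
		   MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	memset(&p, 0, sizeof(p));
	/* io_uring_queue_init_mem() sets IORING_SETUP_NO_MMAP itself. */
	used = io_uring_queue_init_mem(64, &ring, &p, buf, buf_size);
	if (used < 0) {
		fprintf(stderr, "io_uring_queue_init_mem: %d\n", used);
		return 1;
	}
	/* 'used' bytes of 'buf' now back this ring; the remainder could host
	 * another ring created the same way. */
	printf("ring consumed %d bytes of the provided buffer\n", used);

	io_uring_queue_exit(&ring);
	/* With application-provided memory the buffer stays ours to unmap. */
	munmap(buf, buf_size);
	return 0;
}

Returning the consumed size is what allows packing several small rings into one huge page, as the comment added above io_uring_queue_init_mem() in setup.c describes.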
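IORING_OP_READ_MULTISHOT pairs with a provided-buffer ring: one armed SQE keeps posting completions, each consuming a buffer from the group, and the new io_uring_buf_ring_available() (built on IORING_REGISTER_PBUF_STATUS via io_uring_buf_ring_head()) reports how many buffers the kernel has not consumed yet. A sketch under illustrative assumptions (buffer group id 7, eight 4 KiB buffers, fd being a pipe or socket read end); multishot read also needs IOSQE_BUFFER_SELECT and a kernel that supports the opcode:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <liburing.h>

#define BGID		7	/* arbitrary buffer group id (assumption) */
#define NR_BUFS		8
#define BUF_SIZE	4096

/* Arm a multishot read on fd, fed from a provided-buffer ring. The buffer
 * ring and its backing memory stay alive for the lifetime of the ring. */
static int arm_multishot_read(struct io_uring *ring, int fd)
{
	struct io_uring_buf_ring *br;
	struct io_uring_sqe *sqe;
	char *bufs;
	int i, ret;

	br = io_uring_setup_buf_ring(ring, NR_BUFS, BGID, 0, &ret);
	if (!br)
		return ret;
	bufs = malloc(NR_BUFS * BUF_SIZE);
	if (!bufs)
		return -ENOMEM;
	for (i = 0; i < NR_BUFS; i++)
		io_uring_buf_ring_add(br, bufs + i * BUF_SIZE, BUF_SIZE, i,
				      io_uring_buf_ring_mask(NR_BUFS), i);
	io_uring_buf_ring_advance(br, NR_BUFS);

	/* len == 0: each completion uses a whole provided buffer; the CQE
	 * carries the buffer id in its flags and keeps IORING_CQE_F_MORE set
	 * while the request stays armed. */
	sqe = io_uring_get_sqe(ring);
	io_uring_prep_read_multishot(sqe, fd, 0, 0, BGID);
	sqe->flags |= IOSQE_BUFFER_SELECT;

	ret = io_uring_submit(ring);
	if (ret < 0)
		return ret;

	/* New in this update: query how many provided buffers remain. */
	ret = io_uring_buf_ring_available(ring, br, BGID);
	if (ret >= 0)
		printf("%d provided buffers still available\n", ret);
	return 0;
}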
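io_uring_prep_cmd_sock() together with the SOCKET_URING_OP_* values routes socket queries through the ring. A sketch of SOCKET_URING_OP_SIOCINQ, which reports the bytes queued for reading on a socket in cqe->res; sockfd is assumed to be an existing connected socket:

#include <stddef.h>
#include <liburing.h>

/* Ask the kernel, via the ring, how many bytes are waiting to be read on
 * sockfd (the io_uring counterpart of ioctl(sockfd, FIONREAD, ...)).
 * Returns the byte count, or -errno. */
static int queued_bytes(struct io_uring *ring, int sockfd)
{
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int ret;

	sqe = io_uring_get_sqe(ring);
	/* level/optname/optval/optlen are unused for SIOCINQ. */
	io_uring_prep_cmd_sock(sqe, SOCKET_URING_OP_SIOCINQ, sockfd,
			       0, 0, NULL, 0);

	ret = io_uring_submit(ring);
	if (ret < 0)
		return ret;
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;

	ret = cqe->res;
	io_uring_cqe_seen(ring, cqe);
	return ret;
}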
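io_uring_register_napi()/io_uring_unregister_napi() with struct io_uring_napi turn on NAPI busy polling for a ring. A sketch with an arbitrary 50-microsecond busy-poll timeout; it requires a kernel exposing IORING_REGISTER_NAPI and only helps sockets backed by NAPI-capable network devices:

#include <string.h>
#include <liburing.h>

/* Enable NAPI busy polling for this ring, then turn it off again. */
static int toggle_napi(struct io_uring *ring)
{
	struct io_uring_napi napi;
	int ret;

	memset(&napi, 0, sizeof(napi));
	napi.busy_poll_to = 50;		/* busy-poll timeout in microseconds */
	napi.prefer_busy_poll = 1;

	ret = io_uring_register_napi(ring, &napi);
	if (ret)
		return ret;

	/* ... submit network I/O that benefits from busy polling ... */

	/* Unregister disables busy polling for the ring again. */
	return io_uring_unregister_napi(ring, &napi);
}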
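Finally, IORING_OP_FUTEX_WAIT/WAKE (and WAITV) let futex operations complete as CQEs via io_uring_prep_futex_wait()/io_uring_prep_futex_wake(). The sketch below assumes <linux/futex.h> provides FUTEX2_SIZE_U32 and FUTEX_BITSET_MATCH_ANY (present in 6.7+ uapi headers); the fallback defines mirror those uapi values and are an assumption, as is the wait/wake pairing within a single submission:

#include <stdint.h>
#include <linux/futex.h>
#include <liburing.h>

#ifndef FUTEX2_SIZE_U32
#define FUTEX2_SIZE_U32		0x02	/* uapi value (assumption for older headers) */
#endif
#ifndef FUTEX_BITSET_MATCH_ANY
#define FUTEX_BITSET_MATCH_ANY	0xffffffff
#endif

/* Queue a wait on *futex while it still holds 'expected', plus a wake of a
 * single waiter; in a real program the wake would usually come from another
 * thread after it changes the futex value. */
static void queue_futex_ops(struct io_uring *ring, uint32_t *futex,
			    uint64_t expected)
{
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_futex_wait(sqe, futex, expected, FUTEX_BITSET_MATCH_ANY,
				 FUTEX2_SIZE_U32, 0);
	sqe->user_data = 1;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_futex_wake(sqe, futex, 1 /* wake at most one waiter */,
				 FUTEX_BITSET_MATCH_ANY, FUTEX2_SIZE_U32, 0);
	sqe->user_data = 2;

	io_uring_submit(ring);
}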