author     thegeorg <thegeorg@yandex-team.com>  2024-06-09 11:55:21 +0300
committer  thegeorg <thegeorg@yandex-team.com>  2024-06-09 12:07:55 +0300
commit     afd4899380eea1c70e2a68714b5da1c9919ccdbd (patch)
tree       cd5120708784139bc6a0f8881da1ed8389a065b3 /contrib/libs/liburing/src
parent     a83bd2dd3c21e38c6c0807ec5e679497ab567f24 (diff)
Update contrib/libs/liburing to 2.6
3b51a9fb14de805208d11f1c077c78bb5d487e0f
Diffstat (limited to 'contrib/libs/liburing/src')
-rw-r--r--  contrib/libs/liburing/src/arch/aarch64/lib.h                    |   1
-rw-r--r--  contrib/libs/liburing/src/include/liburing.h                    | 146
-rw-r--r--  contrib/libs/liburing/src/include/liburing/io_uring.h           |  76
-rw-r--r--  contrib/libs/liburing/src/include/liburing/io_uring_version.h   |   2
-rw-r--r--  contrib/libs/liburing/src/int_flags.h                           |   1
-rw-r--r--  contrib/libs/liburing/src/lib.h                                 |   2
-rw-r--r--  contrib/libs/liburing/src/queue.c                               |  20
-rw-r--r--  contrib/libs/liburing/src/register.c                            |  34
-rw-r--r--  contrib/libs/liburing/src/setup.c                               | 380
-rw-r--r--  contrib/libs/liburing/src/setup.h                               |   9
-rw-r--r--  contrib/libs/liburing/src/syscall.h                             |   2
-rw-r--r--  contrib/libs/liburing/src/version.c                             |   2
12 files changed, 553 insertions, 122 deletions
diff --git a/contrib/libs/liburing/src/arch/aarch64/lib.h b/contrib/libs/liburing/src/arch/aarch64/lib.h
index 3b701b1fb7..41bcfc957d 100644
--- a/contrib/libs/liburing/src/arch/aarch64/lib.h
+++ b/contrib/libs/liburing/src/arch/aarch64/lib.h
@@ -4,7 +4,6 @@
#define LIBURING_ARCH_AARCH64_LIB_H
#include <elf.h>
-#include <sys/auxv.h>
#include "../../syscall.h"
static inline long __get_page_size(void)
diff --git a/contrib/libs/liburing/src/include/liburing.h b/contrib/libs/liburing/src/include/liburing.h
index e6dc9a792f..7d04aec13a 100644
--- a/contrib/libs/liburing/src/include/liburing.h
+++ b/contrib/libs/liburing/src/include/liburing.h
@@ -2,14 +2,6 @@
#ifndef LIB_URING_H
#define LIB_URING_H
-#ifndef _XOPEN_SOURCE
-#define _XOPEN_SOURCE 500 /* Required for glibc to expose sigset_t */
-#endif
-
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE /* Required for musl to expose cpu_set_t */
-#endif
-
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/uio.h>
@@ -21,6 +13,7 @@
#include <fcntl.h>
#include <sched.h>
#include <linux/swab.h>
+#include <sys/wait.h>
#include "liburing/compat.h"
#include "liburing/io_uring.h"
#include "liburing/io_uring_version.h"
@@ -164,6 +157,9 @@ IOURINGINLINE int io_uring_opcode_supported(const struct io_uring_probe *p,
return (p->ops[op].flags & IO_URING_OP_SUPPORTED) != 0;
}
+int io_uring_queue_init_mem(unsigned entries, struct io_uring *ring,
+ struct io_uring_params *p,
+ void *buf, size_t buf_size);
int io_uring_queue_init_params(unsigned entries, struct io_uring *ring,
struct io_uring_params *p);
int io_uring_queue_init(unsigned entries, struct io_uring *ring,
@@ -235,12 +231,16 @@ int io_uring_close_ring_fd(struct io_uring *ring);
int io_uring_register_buf_ring(struct io_uring *ring,
struct io_uring_buf_reg *reg, unsigned int flags);
int io_uring_unregister_buf_ring(struct io_uring *ring, int bgid);
+int io_uring_buf_ring_head(struct io_uring *ring, int buf_group, uint16_t *head);
int io_uring_register_sync_cancel(struct io_uring *ring,
struct io_uring_sync_cancel_reg *reg);
int io_uring_register_file_alloc_range(struct io_uring *ring,
unsigned off, unsigned len);
+int io_uring_register_napi(struct io_uring *ring, struct io_uring_napi *napi);
+int io_uring_unregister_napi(struct io_uring *ring, struct io_uring_napi *napi);
+
int io_uring_get_events(struct io_uring *ring);
int io_uring_submit_and_get_events(struct io_uring *ring);
@@ -375,17 +375,10 @@ IOURINGINLINE void __io_uring_set_target_fixed_file(struct io_uring_sqe *sqe,
sqe->file_index = file_index + 1;
}
-IOURINGINLINE void io_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd,
- const void *addr, unsigned len,
- __u64 offset)
+IOURINGINLINE void io_uring_initialize_sqe(struct io_uring_sqe *sqe)
{
- sqe->opcode = (__u8) op;
sqe->flags = 0;
sqe->ioprio = 0;
- sqe->fd = fd;
- sqe->off = offset;
- sqe->addr = (unsigned long) addr;
- sqe->len = len;
sqe->rw_flags = 0;
sqe->buf_index = 0;
sqe->personality = 0;
@@ -394,6 +387,17 @@ IOURINGINLINE void io_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd,
sqe->__pad2[0] = 0;
}
+IOURINGINLINE void io_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd,
+ const void *addr, unsigned len,
+ __u64 offset)
+{
+ sqe->opcode = (__u8) op;
+ sqe->fd = fd;
+ sqe->off = offset;
+ sqe->addr = (unsigned long) addr;
+ sqe->len = len;
+}
+
/*
* io_uring_prep_splice() - Either @fd_in or @fd_out must be a pipe.
*
@@ -720,6 +724,15 @@ IOURINGINLINE void io_uring_prep_read(struct io_uring_sqe *sqe, int fd,
io_uring_prep_rw(IORING_OP_READ, sqe, fd, buf, nbytes, offset);
}
+IOURINGINLINE void io_uring_prep_read_multishot(struct io_uring_sqe *sqe,
+ int fd, unsigned nbytes,
+ __u64 offset, int buf_group)
+{
+ io_uring_prep_rw(IORING_OP_READ_MULTISHOT, sqe, fd, NULL, nbytes,
+ offset);
+ sqe->buf_group = buf_group;
+}
+
IOURINGINLINE void io_uring_prep_write(struct io_uring_sqe *sqe, int fd,
const void *buf, unsigned nbytes,
__u64 offset)
@@ -1126,12 +1139,88 @@ IOURINGINLINE void io_uring_prep_socket_direct_alloc(struct io_uring_sqe *sqe,
}
/*
+ * Prepare commands for sockets
+ */
+IOURINGINLINE void io_uring_prep_cmd_sock(struct io_uring_sqe *sqe,
+ int cmd_op,
+ int fd,
+ int level,
+ int optname,
+ void *optval,
+ int optlen)
+{
+ io_uring_prep_rw(IORING_OP_URING_CMD, sqe, fd, NULL, 0, 0);
+ sqe->optval = (unsigned long) (uintptr_t) optval;
+ sqe->optname = optname;
+ sqe->optlen = optlen;
+ sqe->cmd_op = cmd_op;
+ sqe->level = level;
+}
+
+IOURINGINLINE void io_uring_prep_waitid(struct io_uring_sqe *sqe,
+ idtype_t idtype,
+ id_t id,
+ siginfo_t *infop,
+ int options, unsigned int flags)
+{
+ io_uring_prep_rw(IORING_OP_WAITID, sqe, id, NULL, (unsigned) idtype, 0);
+ sqe->waitid_flags = flags;
+ sqe->file_index = options;
+ sqe->addr2 = (unsigned long) infop;
+}
+
+IOURINGINLINE void io_uring_prep_futex_wake(struct io_uring_sqe *sqe,
+ uint32_t *futex, uint64_t val,
+ uint64_t mask, uint32_t futex_flags,
+ unsigned int flags)
+{
+ io_uring_prep_rw(IORING_OP_FUTEX_WAKE, sqe, futex_flags, futex, 0, val);
+ sqe->futex_flags = flags;
+ sqe->addr3 = mask;
+}
+
+IOURINGINLINE void io_uring_prep_futex_wait(struct io_uring_sqe *sqe,
+ uint32_t *futex, uint64_t val,
+ uint64_t mask, uint32_t futex_flags,
+ unsigned int flags)
+{
+ io_uring_prep_rw(IORING_OP_FUTEX_WAIT, sqe, futex_flags, futex, 0, val);
+ sqe->futex_flags = flags;
+ sqe->addr3 = mask;
+}
+
+struct futex_waitv;
+IOURINGINLINE void io_uring_prep_futex_waitv(struct io_uring_sqe *sqe,
+ struct futex_waitv *futex,
+ uint32_t nr_futex,
+ unsigned int flags)
+{
+ io_uring_prep_rw(IORING_OP_FUTEX_WAITV, sqe, 0, futex, nr_futex, 0);
+ sqe->futex_flags = flags;
+}
+
+IOURINGINLINE void io_uring_prep_fixed_fd_install(struct io_uring_sqe *sqe,
+ int fd,
+ unsigned int flags)
+{
+ io_uring_prep_rw(IORING_OP_FIXED_FD_INSTALL, sqe, fd, NULL, 0, 0);
+ sqe->flags = IOSQE_FIXED_FILE;
+ sqe->install_fd_flags = flags;
+}
+
+IOURINGINLINE void io_uring_prep_ftruncate(struct io_uring_sqe *sqe,
+ int fd, loff_t len)
+{
+ io_uring_prep_rw(IORING_OP_FTRUNCATE, sqe, fd, 0, 0, len);
+}
+
+/*
* Returns number of unconsumed (if SQPOLL) or unsubmitted entries exist in
* the SQ ring
*/
IOURINGINLINE unsigned io_uring_sq_ready(const struct io_uring *ring)
{
- unsigned khead = *ring->sq.khead;
+ unsigned khead;
/*
* Without a barrier, we could miss an update and think the SQ wasn't
@@ -1140,6 +1229,8 @@ IOURINGINLINE unsigned io_uring_sq_ready(const struct io_uring *ring)
*/
if (ring->flags & IORING_SETUP_SQPOLL)
khead = io_uring_smp_load_acquire(ring->sq.khead);
+ else
+ khead = *ring->sq.khead;
/* always use real head, to avoid losing sync for short submit */
return ring->sq.sqe_tail - khead;
@@ -1326,7 +1417,7 @@ IOURINGINLINE struct io_uring_sqe *_io_uring_get_sqe(struct io_uring *ring)
if (ring->flags & IORING_SETUP_SQE128)
shift = 1;
if (!(ring->flags & IORING_SETUP_SQPOLL))
- head = IO_URING_READ_ONCE(*sq->khead);
+ head = *sq->khead;
else
head = io_uring_smp_load_acquire(sq->khead);
@@ -1335,6 +1426,7 @@ IOURINGINLINE struct io_uring_sqe *_io_uring_get_sqe(struct io_uring *ring)
sqe = &sq->sqes[(sq->sqe_tail & sq->ring_mask) << shift];
sq->sqe_tail = next;
+ io_uring_initialize_sqe(sqe);
return sqe;
}
@@ -1386,7 +1478,7 @@ IOURINGINLINE void __io_uring_buf_ring_cq_advance(struct io_uring *ring,
struct io_uring_buf_ring *br,
int cq_count, int buf_count)
{
- br->tail += buf_count;
+ io_uring_buf_ring_advance(br, buf_count);
io_uring_cq_advance(ring, cq_count);
}
@@ -1404,6 +1496,20 @@ IOURINGINLINE void io_uring_buf_ring_cq_advance(struct io_uring *ring,
__io_uring_buf_ring_cq_advance(ring, br, count, count);
}
+IOURINGINLINE int io_uring_buf_ring_available(struct io_uring *ring,
+ struct io_uring_buf_ring *br,
+ unsigned short bgid)
+{
+ uint16_t head;
+ int ret;
+
+ ret = io_uring_buf_ring_head(ring, bgid, &head);
+ if (ret)
+ return ret;
+
+ return (uint16_t) (br->tail - head);
+}
+
#ifndef LIBURING_INTERNAL
IOURINGINLINE struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring)
{
@@ -1432,7 +1538,7 @@ bool io_uring_check_version(int major, int minor);
#define IO_URING_CHECK_VERSION(major,minor) \
(major > IO_URING_VERSION_MAJOR || \
(major == IO_URING_VERSION_MAJOR && \
- minor >= IO_URING_VERSION_MINOR))
+ minor > IO_URING_VERSION_MINOR))
#ifdef __cplusplus
}
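
Among the prep helpers added above, io_uring_prep_waitid() is why liburing.h now pulls in <sys/wait.h>. A minimal usage sketch follows; it is an editor's illustration rather than code from this change, and assumes a kernel with IORING_OP_WAITID support (6.7+):

    #include <liburing.h>
    #include <sys/wait.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        siginfo_t si;
        pid_t pid;

        if (io_uring_queue_init(8, &ring, 0) < 0)
            return 1;

        pid = fork();
        if (pid == 0)
            _exit(42);            /* child exits immediately */

        /* equivalent to waitid(P_PID, pid, &si, WEXITED) */
        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_waitid(sqe, P_PID, pid, &si, WEXITED, 0);
        io_uring_submit(&ring);

        if (io_uring_wait_cqe(&ring, &cqe) == 0) {
            printf("cqe->res=%d, child exit status=%d\n", cqe->res, si.si_status);
            io_uring_cqe_seen(&ring, cqe);
        }
        io_uring_queue_exit(&ring);
        return 0;
    }
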
diff --git a/contrib/libs/liburing/src/include/liburing/io_uring.h b/contrib/libs/liburing/src/include/liburing/io_uring.h
index 84c33b5e84..bde11991bf 100644
--- a/contrib/libs/liburing/src/include/liburing/io_uring.h
+++ b/contrib/libs/liburing/src/include/liburing/io_uring.h
@@ -43,6 +43,10 @@ struct io_uring_sqe {
union {
__u64 addr; /* pointer to buffer or iovecs */
__u64 splice_off_in;
+ struct {
+ __u32 level;
+ __u32 optname;
+ };
};
__u32 len; /* buffer size or number of iovecs */
union {
@@ -65,6 +69,9 @@ struct io_uring_sqe {
__u32 xattr_flags;
__u32 msg_ring_flags;
__u32 uring_cmd_flags;
+ __u32 waitid_flags;
+ __u32 futex_flags;
+ __u32 install_fd_flags;
};
__u64 user_data; /* data to be passed back at completion time */
/* pack this to avoid bogus arm OABI complaints */
@@ -79,6 +86,7 @@ struct io_uring_sqe {
union {
__s32 splice_fd_in;
__u32 file_index;
+ __u32 optlen;
struct {
__u16 addr_len;
__u16 __pad3[1];
@@ -89,6 +97,7 @@ struct io_uring_sqe {
__u64 addr3;
__u64 __pad2[1];
};
+ __u64 optval;
/*
* If the ring is initialized with IORING_SETUP_SQE128, then
* this field is used for 80 bytes of arbitrary command data
@@ -173,6 +182,23 @@ enum {
*/
#define IORING_SETUP_DEFER_TASKRUN (1U << 13)
+/*
+ * Application provides ring memory
+ */
+#define IORING_SETUP_NO_MMAP (1U << 14)
+
+/*
+ * Register the ring fd in itself for use with
+ * IORING_REGISTER_USE_REGISTERED_RING; return a registered fd index rather
+ * than an fd.
+ */
+#define IORING_SETUP_REGISTERED_FD_ONLY (1U << 15)
+
+/*
+ * Removes indirection through the SQ index array.
+ */
+#define IORING_SETUP_NO_SQARRAY (1U << 16)
+
enum io_uring_op {
IORING_OP_NOP,
IORING_OP_READV,
@@ -223,6 +249,13 @@ enum io_uring_op {
IORING_OP_URING_CMD,
IORING_OP_SEND_ZC,
IORING_OP_SENDMSG_ZC,
+ IORING_OP_READ_MULTISHOT,
+ IORING_OP_WAITID,
+ IORING_OP_FUTEX_WAIT,
+ IORING_OP_FUTEX_WAKE,
+ IORING_OP_FUTEX_WAITV,
+ IORING_OP_FIXED_FD_INSTALL,
+ IORING_OP_FTRUNCATE,
/* this goes last, obviously */
IORING_OP_LAST,
@@ -352,6 +385,13 @@ enum {
#define IORING_MSG_RING_FLAGS_PASS (1U << 1)
/*
+ * IORING_OP_FIXED_FD_INSTALL flags (sqe->install_fd_flags)
+ *
+ * IORING_FIXED_FD_NO_CLOEXEC Don't mark the fd as O_CLOEXEC
+ */
+#define IORING_FIXED_FD_NO_CLOEXEC (1U << 0)
+
+/*
* IO completion data structure (Completion Queue Entry)
*/
struct io_uring_cqe {
@@ -406,7 +446,7 @@ struct io_sqring_offsets {
__u32 dropped;
__u32 array;
__u32 resv1;
- __u64 resv2;
+ __u64 user_addr;
};
/*
@@ -425,7 +465,7 @@ struct io_cqring_offsets {
__u32 cqes;
__u32 flags;
__u32 resv1;
- __u64 resv2;
+ __u64 user_addr;
};
/*
@@ -523,6 +563,13 @@ enum {
/* register a range of fixed file slots for automatic slot allocation */
IORING_REGISTER_FILE_ALLOC_RANGE = 25,
+ /* return status information for a buffer group */
+ IORING_REGISTER_PBUF_STATUS = 26,
+
+ /* set/clear busy poll settings */
+ IORING_REGISTER_NAPI = 27,
+ IORING_UNREGISTER_NAPI = 28,
+
/* this goes last */
IORING_REGISTER_LAST,
@@ -649,6 +696,21 @@ struct io_uring_buf_reg {
__u64 resv[3];
};
+/* argument for IORING_REGISTER_PBUF_STATUS */
+struct io_uring_buf_status {
+ __u32 buf_group; /* input */
+ __u32 head; /* output */
+ __u32 resv[8];
+};
+
+/* argument for IORING_(UN)REGISTER_NAPI */
+struct io_uring_napi {
+ __u32 busy_poll_to;
+ __u8 prefer_busy_poll;
+ __u8 pad[3];
+ __u64 resv;
+};
+
/*
* io_uring_restriction->opcode values
*/
@@ -703,6 +765,16 @@ struct io_uring_recvmsg_out {
__u32 flags;
};
+/*
+ * Argument for IORING_OP_URING_CMD when file is a socket
+ */
+enum {
+ SOCKET_URING_OP_SIOCINQ = 0,
+ SOCKET_URING_OP_SIOCOUTQ,
+ SOCKET_URING_OP_GETSOCKOPT,
+ SOCKET_URING_OP_SETSOCKOPT,
+};
+
#ifdef __cplusplus
}
#endif
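
The SOCKET_URING_OP_* values above pair with io_uring_prep_cmd_sock() from liburing.h. A hedged sketch of querying the number of pending input bytes on a socket (roughly ioctl(FIONREAD)); the helper name is an editor's illustration, not from the diff, and a 6.7+ kernel is assumed:

    #include <liburing.h>
    #include <errno.h>
    #include <stddef.h>

    /* returns queued byte count on success, -errno on failure */
    static int sock_bytes_readable(struct io_uring *ring, int sockfd)
    {
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
        struct io_uring_cqe *cqe;
        int ret;

        if (!sqe)
            return -EBUSY;
        /* level/optname/optval/optlen are unused for SIOCINQ */
        io_uring_prep_cmd_sock(sqe, SOCKET_URING_OP_SIOCINQ, sockfd, 0, 0, NULL, 0);
        io_uring_submit(ring);

        ret = io_uring_wait_cqe(ring, &cqe);
        if (ret < 0)
            return ret;
        ret = cqe->res;
        io_uring_cqe_seen(ring, cqe);
        return ret;
    }
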
diff --git a/contrib/libs/liburing/src/include/liburing/io_uring_version.h b/contrib/libs/liburing/src/include/liburing/io_uring_version.h
index 8029e041f9..49d8c7ed72 100644
--- a/contrib/libs/liburing/src/include/liburing/io_uring_version.h
+++ b/contrib/libs/liburing/src/include/liburing/io_uring_version.h
@@ -3,6 +3,6 @@
#define LIBURING_VERSION_H
#define IO_URING_VERSION_MAJOR 2
-#define IO_URING_VERSION_MINOR 4
+#define IO_URING_VERSION_MINOR 6
#endif
diff --git a/contrib/libs/liburing/src/int_flags.h b/contrib/libs/liburing/src/int_flags.h
index 71774fbca5..548dd1094c 100644
--- a/contrib/libs/liburing/src/int_flags.h
+++ b/contrib/libs/liburing/src/int_flags.h
@@ -5,6 +5,7 @@
enum {
INT_FLAG_REG_RING = 1,
INT_FLAG_REG_REG_RING = 2,
+ INT_FLAG_APP_MEM = 4,
};
#endif
diff --git a/contrib/libs/liburing/src/lib.h b/contrib/libs/liburing/src/lib.h
index 635a30ece5..0f1674fa26 100644
--- a/contrib/libs/liburing/src/lib.h
+++ b/contrib/libs/liburing/src/lib.h
@@ -10,6 +10,8 @@
#include "arch/x86/lib.h"
#elif defined(__aarch64__)
#include "arch/aarch64/lib.h"
+#elif defined(__riscv) && __riscv_xlen == 64
+#error #include "arch/riscv64/lib.h"
#else
/*
* We don't have nolibc support for this arch. Must use libc!
diff --git a/contrib/libs/liburing/src/queue.c b/contrib/libs/liburing/src/queue.c
index 9fca31fe34..79457c35ae 100644
--- a/contrib/libs/liburing/src/queue.c
+++ b/contrib/libs/liburing/src/queue.c
@@ -213,22 +213,18 @@ static unsigned __io_uring_flush_sq(struct io_uring *ring)
* Ensure kernel sees the SQE updates before the tail update.
*/
if (!(ring->flags & IORING_SETUP_SQPOLL))
- IO_URING_WRITE_ONCE(*sq->ktail, tail);
+ *sq->ktail = tail;
else
io_uring_smp_store_release(sq->ktail, tail);
}
/*
- * This _may_ look problematic, as we're not supposed to be reading
- * SQ->head without acquire semantics. When we're in SQPOLL mode, the
- * kernel submitter could be updating this right now. For non-SQPOLL,
- * task itself does it, and there's no potential race. But even for
- * SQPOLL, the load is going to be potentially out-of-date the very
- * instant it's done, regardless or whether or not it's done
- * atomically. Worst case, we're going to be over-estimating what
- * we can submit. The point is, we need to be able to deal with this
- * situation regardless of any perceived atomicity.
- */
- return tail - *sq->khead;
+ * This load needs to be atomic, since sq->khead is written concurrently
+ * by the kernel, but it doesn't need to be load_acquire, since the
+ * kernel doesn't store to the submission queue; it advances khead just
+ * to indicate that it's finished reading the submission queue entries
+ * so they're available for us to write to.
+ */
+ return tail - IO_URING_READ_ONCE(*sq->khead);
}
/*
diff --git a/contrib/libs/liburing/src/register.c b/contrib/libs/liburing/src/register.c
index 5563db2c4a..7a97303a0e 100644
--- a/contrib/libs/liburing/src/register.c
+++ b/contrib/libs/liburing/src/register.c
@@ -12,10 +12,16 @@
static inline int do_register(struct io_uring *ring, unsigned int opcode,
const void *arg, unsigned int nr_args)
{
- if (ring->int_flags & INT_FLAG_REG_REG_RING)
+ int fd;
+
+ if (ring->int_flags & INT_FLAG_REG_REG_RING) {
opcode |= IORING_REGISTER_USE_REGISTERED_RING;
+ fd = ring->enter_ring_fd;
+ } else {
+ fd = ring->ring_fd;
+ }
- return __sys_io_uring_register(ring->enter_ring_fd, opcode, arg, nr_args);
+ return __sys_io_uring_register(fd, opcode, arg, nr_args);
}
int io_uring_register_buffers_update_tag(struct io_uring *ring, unsigned off,
@@ -321,6 +327,20 @@ int io_uring_unregister_buf_ring(struct io_uring *ring, int bgid)
return do_register(ring, IORING_UNREGISTER_PBUF_RING, &reg, 1);
}
+int io_uring_buf_ring_head(struct io_uring *ring, int buf_group, uint16_t *head)
+{
+ struct io_uring_buf_status buf_status = {
+ .buf_group = buf_group,
+ };
+ int ret;
+
+ ret = do_register(ring, IORING_REGISTER_PBUF_STATUS, &buf_status, 1);
+ if (ret)
+ return ret;
+ *head = buf_status.head;
+ return 0;
+}
+
int io_uring_register_sync_cancel(struct io_uring *ring,
struct io_uring_sync_cancel_reg *reg)
{
@@ -337,3 +357,13 @@ int io_uring_register_file_alloc_range(struct io_uring *ring,
return do_register(ring, IORING_REGISTER_FILE_ALLOC_RANGE, &range, 0);
}
+
+int io_uring_register_napi(struct io_uring *ring, struct io_uring_napi *napi)
+{
+ return do_register(ring, IORING_REGISTER_NAPI, napi, 1);
+}
+
+int io_uring_unregister_napi(struct io_uring *ring, struct io_uring_napi *napi)
+{
+ return do_register(ring, IORING_UNREGISTER_NAPI, napi, 1);
+}
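
io_uring_register_napi()/io_uring_unregister_napi() above wrap the new IORING_(UN)REGISTER_NAPI opcodes with the io_uring_napi argument from io_uring.h. A minimal sketch of enabling busy polling on a ring, as an editor's illustration (needs a 6.8+ kernel and a NAPI-capable network device):

    #include <liburing.h>
    #include <string.h>

    static int enable_busy_poll(struct io_uring *ring, unsigned busy_poll_usec)
    {
        struct io_uring_napi napi;

        memset(&napi, 0, sizeof(napi));
        napi.busy_poll_to = busy_poll_usec;   /* busy-poll timeout in microseconds */
        napi.prefer_busy_poll = 1;

        /* 0 on success, -errno (e.g. -EINVAL on older kernels) on failure */
        return io_uring_register_napi(ring, &napi);
    }
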
diff --git a/contrib/libs/liburing/src/setup.c b/contrib/libs/liburing/src/setup.c
index db2f3dfe15..0d33f65aae 100644
--- a/contrib/libs/liburing/src/setup.c
+++ b/contrib/libs/liburing/src/setup.c
@@ -6,16 +6,96 @@
#include "syscall.h"
#include "liburing.h"
#include "int_flags.h"
+#include "setup.h"
#include "liburing/compat.h"
#include "liburing/io_uring.h"
+#define KERN_MAX_ENTRIES 32768
+#define KERN_MAX_CQ_ENTRIES (2 * KERN_MAX_ENTRIES)
+
+static inline int __fls(int x)
+{
+ if (!x)
+ return 0;
+ return 8 * sizeof(x) - __builtin_clz(x);
+}
+
+static unsigned roundup_pow2(unsigned depth)
+{
+ return 1U << __fls(depth - 1);
+}
+
+static int get_sq_cq_entries(unsigned entries, struct io_uring_params *p,
+ unsigned *sq, unsigned *cq)
+{
+ unsigned cq_entries;
+
+ if (!entries)
+ return -EINVAL;
+ if (entries > KERN_MAX_ENTRIES) {
+ if (!(p->flags & IORING_SETUP_CLAMP))
+ return -EINVAL;
+ entries = KERN_MAX_ENTRIES;
+ }
+
+ entries = roundup_pow2(entries);
+ if (p->flags & IORING_SETUP_CQSIZE) {
+ if (!p->cq_entries)
+ return -EINVAL;
+ cq_entries = p->cq_entries;
+ if (cq_entries > KERN_MAX_CQ_ENTRIES) {
+ if (!(p->flags & IORING_SETUP_CLAMP))
+ return -EINVAL;
+ cq_entries = KERN_MAX_CQ_ENTRIES;
+ }
+ cq_entries = roundup_pow2(cq_entries);
+ if (cq_entries < entries)
+ return -EINVAL;
+ } else {
+ cq_entries = 2 * entries;
+ }
+
+ *sq = entries;
+ *cq = cq_entries;
+ return 0;
+}
+
static void io_uring_unmap_rings(struct io_uring_sq *sq, struct io_uring_cq *cq)
{
- __sys_munmap(sq->ring_ptr, sq->ring_sz);
- if (cq->ring_ptr && cq->ring_ptr != sq->ring_ptr)
+ if (sq->ring_sz)
+ __sys_munmap(sq->ring_ptr, sq->ring_sz);
+ if (cq->ring_ptr && cq->ring_sz && cq->ring_ptr != sq->ring_ptr)
__sys_munmap(cq->ring_ptr, cq->ring_sz);
}
+static void io_uring_setup_ring_pointers(struct io_uring_params *p,
+ struct io_uring_sq *sq,
+ struct io_uring_cq *cq)
+{
+ sq->khead = sq->ring_ptr + p->sq_off.head;
+ sq->ktail = sq->ring_ptr + p->sq_off.tail;
+ sq->kring_mask = sq->ring_ptr + p->sq_off.ring_mask;
+ sq->kring_entries = sq->ring_ptr + p->sq_off.ring_entries;
+ sq->kflags = sq->ring_ptr + p->sq_off.flags;
+ sq->kdropped = sq->ring_ptr + p->sq_off.dropped;
+ if (!(p->flags & IORING_SETUP_NO_SQARRAY))
+ sq->array = sq->ring_ptr + p->sq_off.array;
+
+ cq->khead = cq->ring_ptr + p->cq_off.head;
+ cq->ktail = cq->ring_ptr + p->cq_off.tail;
+ cq->kring_mask = cq->ring_ptr + p->cq_off.ring_mask;
+ cq->kring_entries = cq->ring_ptr + p->cq_off.ring_entries;
+ cq->koverflow = cq->ring_ptr + p->cq_off.overflow;
+ cq->cqes = cq->ring_ptr + p->cq_off.cqes;
+ if (p->cq_off.flags)
+ cq->kflags = cq->ring_ptr + p->cq_off.flags;
+
+ sq->ring_mask = *sq->kring_mask;
+ sq->ring_entries = *sq->kring_entries;
+ cq->ring_mask = *cq->kring_mask;
+ cq->ring_entries = *cq->kring_entries;
+}
+
static int io_uring_mmap(int fd, struct io_uring_params *p,
struct io_uring_sq *sq, struct io_uring_cq *cq)
{
@@ -53,14 +133,6 @@ static int io_uring_mmap(int fd, struct io_uring_params *p,
}
}
- sq->khead = sq->ring_ptr + p->sq_off.head;
- sq->ktail = sq->ring_ptr + p->sq_off.tail;
- sq->kring_mask = sq->ring_ptr + p->sq_off.ring_mask;
- sq->kring_entries = sq->ring_ptr + p->sq_off.ring_entries;
- sq->kflags = sq->ring_ptr + p->sq_off.flags;
- sq->kdropped = sq->ring_ptr + p->sq_off.dropped;
- sq->array = sq->ring_ptr + p->sq_off.array;
-
size = sizeof(struct io_uring_sqe);
if (p->flags & IORING_SETUP_SQE128)
size += 64;
@@ -73,19 +145,7 @@ err:
return ret;
}
- cq->khead = cq->ring_ptr + p->cq_off.head;
- cq->ktail = cq->ring_ptr + p->cq_off.tail;
- cq->kring_mask = cq->ring_ptr + p->cq_off.ring_mask;
- cq->kring_entries = cq->ring_ptr + p->cq_off.ring_entries;
- cq->koverflow = cq->ring_ptr + p->cq_off.overflow;
- cq->cqes = cq->ring_ptr + p->cq_off.cqes;
- if (p->cq_off.flags)
- cq->kflags = cq->ring_ptr + p->cq_off.flags;
-
- sq->ring_mask = *sq->kring_mask;
- sq->ring_entries = *sq->kring_entries;
- cq->ring_mask = *cq->kring_mask;
- cq->ring_entries = *cq->kring_entries;
+ io_uring_setup_ring_pointers(p, sq, cq);
return 0;
}
@@ -98,17 +158,8 @@ err:
__cold int io_uring_queue_mmap(int fd, struct io_uring_params *p,
struct io_uring *ring)
{
- int ret;
-
memset(ring, 0, sizeof(*ring));
- ret = io_uring_mmap(fd, p, &ring->sq, &ring->cq);
- if (!ret) {
- ring->flags = p->flags;
- ring->ring_fd = ring->enter_ring_fd = fd;
- ring->int_flags = 0;
- return 0;
- }
- return ret;
+ return io_uring_mmap(fd, p, &ring->sq, &ring->cq);
}
/*
@@ -146,33 +197,214 @@ __cold int io_uring_ring_dontfork(struct io_uring *ring)
return 0;
}
-__cold int io_uring_queue_init_params(unsigned entries, struct io_uring *ring,
- struct io_uring_params *p)
+/* FIXME */
+static size_t huge_page_size = 2 * 1024 * 1024;
+
+/*
+ * Returns negative for error, or number of bytes used in the buffer on success
+ */
+static int io_uring_alloc_huge(unsigned entries, struct io_uring_params *p,
+ struct io_uring_sq *sq, struct io_uring_cq *cq,
+ void *buf, size_t buf_size)
{
- int fd, ret;
+ unsigned long page_size = get_page_size();
+ unsigned sq_entries, cq_entries;
+ size_t ring_mem, sqes_mem;
+ unsigned long mem_used = 0;
+ void *ptr;
+ int ret;
+
+ ret = get_sq_cq_entries(entries, p, &sq_entries, &cq_entries);
+ if (ret)
+ return ret;
+
+ sqes_mem = sq_entries * sizeof(struct io_uring_sqe);
+ sqes_mem = (sqes_mem + page_size - 1) & ~(page_size - 1);
+ ring_mem = cq_entries * sizeof(struct io_uring_cqe);
+ if (p->flags & IORING_SETUP_CQE32)
+ ring_mem *= 2;
+ if (!(p->flags & IORING_SETUP_NO_SQARRAY))
+ ring_mem += sq_entries * sizeof(unsigned);
+ mem_used = sqes_mem + ring_mem;
+ mem_used = (mem_used + page_size - 1) & ~(page_size - 1);
+
+ /*
+ * A maxed-out number of CQ entries with IORING_SETUP_CQE32 fills a 2MB
+ * huge page by itself, so the SQ entries won't fit in the same huge
+ * page. For SQEs, that shouldn't be possible given KERN_MAX_ENTRIES,
+ * but check that too to future-proof (e.g. against different huge page
+ * sizes). Bail out early so we don't overrun.
+ */
+ if (!buf && (sqes_mem > huge_page_size || ring_mem > huge_page_size))
+ return -ENOMEM;
+
+ if (buf) {
+ if (mem_used > buf_size)
+ return -ENOMEM;
+ ptr = buf;
+ } else {
+ int map_hugetlb = 0;
+ if (sqes_mem <= page_size)
+ buf_size = page_size;
+ else {
+ buf_size = huge_page_size;
+ map_hugetlb = MAP_HUGETLB;
+ }
+ ptr = __sys_mmap(NULL, buf_size, PROT_READ|PROT_WRITE,
+ MAP_SHARED|MAP_ANONYMOUS|map_hugetlb,
+ -1, 0);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+ }
+
+ sq->sqes = ptr;
+ if (mem_used <= buf_size) {
+ sq->ring_ptr = (void *) sq->sqes + sqes_mem;
+ /* clear ring sizes, we have just one mmap() to undo */
+ cq->ring_sz = 0;
+ sq->ring_sz = 0;
+ } else {
+ int map_hugetlb = 0;
+ if (ring_mem <= page_size)
+ buf_size = page_size;
+ else {
+ buf_size = huge_page_size;
+ map_hugetlb = MAP_HUGETLB;
+ }
+ ptr = __sys_mmap(NULL, buf_size, PROT_READ|PROT_WRITE,
+ MAP_SHARED|MAP_ANONYMOUS|map_hugetlb,
+ -1, 0);
+ if (IS_ERR(ptr)) {
+ __sys_munmap(sq->sqes, 1);
+ return PTR_ERR(ptr);
+ }
+ sq->ring_ptr = ptr;
+ sq->ring_sz = buf_size;
+ cq->ring_sz = 0;
+ }
+
+ cq->ring_ptr = (void *) sq->ring_ptr;
+ p->sq_off.user_addr = (unsigned long) sq->sqes;
+ p->cq_off.user_addr = (unsigned long) sq->ring_ptr;
+ return (int) mem_used;
+}
+
+int __io_uring_queue_init_params(unsigned entries, struct io_uring *ring,
+ struct io_uring_params *p, void *buf,
+ size_t buf_size)
+{
+ int fd, ret = 0;
unsigned *sq_array;
unsigned sq_entries, index;
+ memset(ring, 0, sizeof(*ring));
+
+ /*
+ * The kernel does this check already, but checking it here allows us
+ * to avoid handling it below.
+ */
+ if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY
+ && !(p->flags & IORING_SETUP_NO_MMAP))
+ return -EINVAL;
+
+ if (p->flags & IORING_SETUP_NO_MMAP) {
+ ret = io_uring_alloc_huge(entries, p, &ring->sq, &ring->cq,
+ buf, buf_size);
+ if (ret < 0)
+ return ret;
+ if (buf)
+ ring->int_flags |= INT_FLAG_APP_MEM;
+ }
+
fd = __sys_io_uring_setup(entries, p);
- if (fd < 0)
+ if (fd < 0) {
+ if ((p->flags & IORING_SETUP_NO_MMAP) &&
+ !(ring->int_flags & INT_FLAG_APP_MEM)) {
+ __sys_munmap(ring->sq.sqes, 1);
+ io_uring_unmap_rings(&ring->sq, &ring->cq);
+ }
return fd;
+ }
- ret = io_uring_queue_mmap(fd, p, ring);
- if (ret) {
- __sys_close(fd);
- return ret;
+ if (!(p->flags & IORING_SETUP_NO_MMAP)) {
+ ret = io_uring_queue_mmap(fd, p, ring);
+ if (ret) {
+ __sys_close(fd);
+ return ret;
+ }
+ } else {
+ io_uring_setup_ring_pointers(p, &ring->sq, &ring->cq);
}
/*
* Directly map SQ slots to SQEs
*/
- sq_array = ring->sq.array;
sq_entries = ring->sq.ring_entries;
- for (index = 0; index < sq_entries; index++)
- sq_array[index] = index;
+ if (!(p->flags & IORING_SETUP_NO_SQARRAY)) {
+ sq_array = ring->sq.array;
+ for (index = 0; index < sq_entries; index++)
+ sq_array[index] = index;
+ }
ring->features = p->features;
- return 0;
+ ring->flags = p->flags;
+ ring->enter_ring_fd = fd;
+ if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY) {
+ ring->ring_fd = -1;
+ ring->int_flags |= INT_FLAG_REG_RING | INT_FLAG_REG_REG_RING;
+ } else {
+ ring->ring_fd = fd;
+ }
+
+ return ret;
+}
+
+static int io_uring_queue_init_try_nosqarr(unsigned entries, struct io_uring *ring,
+ struct io_uring_params *p, void *buf,
+ size_t buf_size)
+{
+ unsigned flags = p->flags;
+ int ret;
+
+ p->flags |= IORING_SETUP_NO_SQARRAY;
+ ret = __io_uring_queue_init_params(entries, ring, p, buf, buf_size);
+
+ /* don't fallback if explicitly asked for NOSQARRAY */
+ if (ret != -EINVAL || (flags & IORING_SETUP_NO_SQARRAY))
+ return ret;
+
+ p->flags = flags;
+ return __io_uring_queue_init_params(entries, ring, p, buf, buf_size);
+}
+
+/*
+ * Like io_uring_queue_init_params(), except it allows the application to pass
+ * in a pre-allocated memory range that is used for the shared data between
+ * the kernel and the application. This includes the sqes array, and the two
+ * rings. The memory must be contiguous, the use case here is that the app
+ * allocates a huge page and passes it in.
+ *
+ * Returns the number of bytes used in the buffer, the app can then reuse
+ * the buffer with the returned offset to put more rings in the same huge
+ * page. Returns -ENOMEM if there's not enough room left in the buffer to
+ * host the ring.
+ */
+int io_uring_queue_init_mem(unsigned entries, struct io_uring *ring,
+ struct io_uring_params *p,
+ void *buf, size_t buf_size)
+{
+ /* should already be set... */
+ p->flags |= IORING_SETUP_NO_MMAP;
+ return io_uring_queue_init_try_nosqarr(entries, ring, p, buf, buf_size);
+}
+
+int io_uring_queue_init_params(unsigned entries, struct io_uring *ring,
+ struct io_uring_params *p)
+{
+ int ret;
+
+ ret = io_uring_queue_init_try_nosqarr(entries, ring, p, NULL, 0);
+ return ret >= 0 ? 0 : ret;
}
/*
@@ -196,11 +428,20 @@ __cold void io_uring_queue_exit(struct io_uring *ring)
struct io_uring_cq *cq = &ring->cq;
size_t sqe_size;
- sqe_size = sizeof(struct io_uring_sqe);
- if (ring->flags & IORING_SETUP_SQE128)
- sqe_size += 64;
- __sys_munmap(sq->sqes, sqe_size * sq->ring_entries);
- io_uring_unmap_rings(sq, cq);
+ if (!sq->ring_sz) {
+ sqe_size = sizeof(struct io_uring_sqe);
+ if (ring->flags & IORING_SETUP_SQE128)
+ sqe_size += 64;
+ __sys_munmap(sq->sqes, sqe_size * sq->ring_entries);
+ io_uring_unmap_rings(sq, cq);
+ } else {
+ if (!(ring->int_flags & INT_FLAG_APP_MEM)) {
+ __sys_munmap(sq->sqes,
+ *sq->kring_entries * sizeof(struct io_uring_sqe));
+ io_uring_unmap_rings(sq, cq);
+ }
+ }
+
/*
* Not strictly required, but frees up the slot we used now rather
* than at process exit time.
@@ -251,18 +492,6 @@ __cold void io_uring_free_probe(struct io_uring_probe *probe)
free(probe);
}
-static inline int __fls(unsigned long x)
-{
- if (!x)
- return 0;
- return 8 * sizeof(x) - __builtin_clzl(x);
-}
-
-static unsigned roundup_pow2(unsigned depth)
-{
- return 1U << __fls(depth - 1);
-}
-
static size_t npages(size_t size, long page_size)
{
size--;
@@ -293,9 +522,6 @@ static size_t rings_size(struct io_uring_params *p, unsigned entries,
return pages * page_size;
}
-#define KERN_MAX_ENTRIES 32768
-#define KERN_MAX_CQ_ENTRIES (2 * KERN_MAX_ENTRIES)
-
/*
* Return the required ulimit -l memlock memory required for a given ring
* setup, in bytes. May return -errno on error. On newer (5.12+) kernels,
@@ -309,9 +535,10 @@ __cold ssize_t io_uring_mlock_size_params(unsigned entries,
{
struct io_uring_params lp;
struct io_uring ring;
- unsigned cq_entries;
+ unsigned cq_entries, sq;
long page_size;
ssize_t ret;
+ int cret;
memset(&lp, 0, sizeof(lp));
@@ -341,25 +568,12 @@ __cold ssize_t io_uring_mlock_size_params(unsigned entries,
entries = KERN_MAX_ENTRIES;
}
- entries = roundup_pow2(entries);
- if (p->flags & IORING_SETUP_CQSIZE) {
- if (!p->cq_entries)
- return -EINVAL;
- cq_entries = p->cq_entries;
- if (cq_entries > KERN_MAX_CQ_ENTRIES) {
- if (!(p->flags & IORING_SETUP_CLAMP))
- return -EINVAL;
- cq_entries = KERN_MAX_CQ_ENTRIES;
- }
- cq_entries = roundup_pow2(cq_entries);
- if (cq_entries < entries)
- return -EINVAL;
- } else {
- cq_entries = 2 * entries;
- }
+ cret = get_sq_cq_entries(entries, p, &sq, &cq_entries);
+ if (cret)
+ return cret;
page_size = get_page_size();
- return rings_size(p, entries, cq_entries, page_size);
+ return rings_size(p, sq, cq_entries, page_size);
}
/*
diff --git a/contrib/libs/liburing/src/setup.h b/contrib/libs/liburing/src/setup.h
new file mode 100644
index 0000000000..ae44314cbe
--- /dev/null
+++ b/contrib/libs/liburing/src/setup.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: MIT */
+#ifndef LIBURING_SETUP_H
+#define LIBURING_SETUP_H
+
+int __io_uring_queue_init_params(unsigned entries, struct io_uring *ring,
+ struct io_uring_params *p, void *buf,
+ size_t buf_size);
+
+#endif
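
The io_uring_queue_init_mem() path added in setup.c above (declared in liburing.h and built on __io_uring_queue_init_params()) lets the application hand the kernel a caller-owned buffer, typically one 2 MB huge page, for the rings and SQE array via IORING_SETUP_NO_MMAP. A hedged usage sketch, editor's illustration rather than code from this change; it assumes a 6.5+ kernel and that huge pages are available:

    #include <liburing.h>
    #include <errno.h>
    #include <string.h>
    #include <sys/mman.h>

    #define HUGE_PAGE_SIZE  (2 * 1024 * 1024)

    /* returns bytes of buf consumed on success, -errno on failure */
    static int ring_init_in_huge_page(struct io_uring *ring, unsigned entries)
    {
        struct io_uring_params p;
        void *buf;
        int ret;

        buf = mmap(NULL, HUGE_PAGE_SIZE, PROT_READ | PROT_WRITE,
                   MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
        if (buf == MAP_FAILED)
            return -errno;

        memset(&p, 0, sizeof(p));
        /* io_uring_queue_init_mem() sets IORING_SETUP_NO_MMAP itself */
        ret = io_uring_queue_init_mem(entries, ring, &p, buf, HUGE_PAGE_SIZE);
        /* the remainder of the huge page (past ret bytes) could host further rings */
        if (ret < 0)
            munmap(buf, HUGE_PAGE_SIZE);
        return ret;
    }
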
diff --git a/contrib/libs/liburing/src/syscall.h b/contrib/libs/liburing/src/syscall.h
index 4fa77e60ed..908659fff9 100644
--- a/contrib/libs/liburing/src/syscall.h
+++ b/contrib/libs/liburing/src/syscall.h
@@ -37,6 +37,8 @@ static inline bool IS_ERR(const void *ptr)
#include "arch/x86/syscall.h"
#elif defined(__aarch64__)
#include "arch/aarch64/syscall.h"
+#elif defined(__riscv) && __riscv_xlen == 64
+#error #include "arch/riscv64/syscall.h"
#else
/*
* We don't have native syscall wrappers
diff --git a/contrib/libs/liburing/src/version.c b/contrib/libs/liburing/src/version.c
index e1a01229be..c020cf81df 100644
--- a/contrib/libs/liburing/src/version.c
+++ b/contrib/libs/liburing/src/version.c
@@ -18,5 +18,5 @@ bool io_uring_check_version(int major, int minor)
{
return major > io_uring_major_version() ||
(major == io_uring_major_version() &&
- minor >= io_uring_minor_version());
+ minor > io_uring_minor_version());
}
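
As I read the comparison change in io_uring_check_version() and the matching IO_URING_CHECK_VERSION() macro in liburing.h above, the check is now true only when the requested version is strictly newer than the library, i.e. "library too old". A small compile-time guard as an editor's sketch of that usage:

    #include <liburing.h>

    #if IO_URING_CHECK_VERSION(2, 6)
    #error "liburing >= 2.6 is required (e.g. for io_uring_queue_init_mem)"
    #endif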