/*-------------------------------------------------------------------------
*
* socket.c
* Microsoft Windows Win32 Socket Functions
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/backend/port/win32/socket.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
/*
 * Indicate if pgwin32_recv() and pgwin32_send() should operate
 * in non-blocking mode.
 *
 * Since the socket emulation layer always sets the actual socket to
 * non-blocking mode in order to be able to deliver signals, we must
 * specify this in a separate flag if we actually need non-blocking
 * operation.
 *
 * While the flag is nonzero, pgwin32_recv() and pgwin32_send() fail with
 * errno = EWOULDBLOCK instead of waiting when the underlying Winsock call
 * cannot complete immediately.
 *
 * This flag changes the behaviour *globally* for all socket operations,
 * so it should only be set for very short periods of time.
 */
int pgwin32_noblock = 0;
/* Undef the macros defined in win32.h, so we can access system functions */
#undef socket
#undef bind
#undef listen
#undef accept
#undef connect
#undef select
#undef recv
#undef send
/*
* Blocking socket functions implemented so they listen on both
* the socket and the signal event, required for signal handling.
*/
/*
 * Set errno from the most recent Winsock error (WSAGetLastError()).
 *
 * Note: where a WSAxxx error code corresponds directly to a Berkeley error
 * symbol, the translation is actually a no-op, because in win32.h we
 * redefine the network-related Berkeley error symbols to have the values of
 * their WSAxxx counterparts.  The point of the table is mostly to fold
 * near-miss error codes into something that's sensible in the Berkeley
 * universe.
 *
 * A code that does not appear in the table is reported with a NOTICE and
 * mapped to EINVAL.
 */
static void
TranslateSocketError(void)
{
	static const struct
	{
		int			wsa_code;		/* value returned by WSAGetLastError() */
		int			berkeley_errno; /* value to store into errno */
	}			error_map[] =
	{
		{WSAEINVAL, EINVAL},
		{WSANOTINITIALISED, EINVAL},
		{WSAEINVALIDPROVIDER, EINVAL},
		{WSAEINVALIDPROCTABLE, EINVAL},
		{WSAEDESTADDRREQ, EINVAL},
		{WSAEINPROGRESS, EINPROGRESS},
		{WSAEFAULT, EFAULT},
		{WSAEISCONN, EISCONN},
		{WSAEMSGSIZE, EMSGSIZE},
		{WSAEAFNOSUPPORT, EAFNOSUPPORT},
		{WSAEMFILE, EMFILE},
		{WSAENOBUFS, ENOBUFS},
		{WSAEPROTONOSUPPORT, EPROTONOSUPPORT},
		{WSAEPROTOTYPE, EPROTONOSUPPORT},
		{WSAESOCKTNOSUPPORT, EPROTONOSUPPORT},
		{WSAECONNABORTED, ECONNABORTED},
		{WSAECONNREFUSED, ECONNREFUSED},
		{WSAECONNRESET, ECONNRESET},
		{WSAEINTR, EINTR},
		{WSAENOTSOCK, ENOTSOCK},
		{WSAEOPNOTSUPP, EOPNOTSUPP},
		{WSAEWOULDBLOCK, EWOULDBLOCK},
		{WSAEACCES, EACCES},
		{WSAEADDRINUSE, EADDRINUSE},
		{WSAEADDRNOTAVAIL, EADDRNOTAVAIL},
		{WSAEHOSTDOWN, EHOSTDOWN},
		{WSAEHOSTUNREACH, EHOSTUNREACH},
		{WSAHOST_NOT_FOUND, EHOSTUNREACH},
		{WSAENETDOWN, ENETDOWN},
		{WSAENETUNREACH, ENETUNREACH},
		{WSAENETRESET, ENETRESET},
		{WSAENOTCONN, ENOTCONN},
		{WSAESHUTDOWN, ENOTCONN},
		{WSAEDISCON, ENOTCONN}
	};
	const int	map_len = (int) (sizeof(error_map) / sizeof(error_map[0]));
	int			wsa_code = WSAGetLastError();
	int			idx;

	for (idx = 0; idx < map_len; idx++)
	{
		if (error_map[idx].wsa_code == wsa_code)
		{
			errno = error_map[idx].berkeley_errno;
			return;
		}
	}

	/* Nothing matched: complain, then fall back to a generic error */
	ereport(NOTICE,
			(errmsg_internal("unrecognized win32 socket error code: %d", wsa_code)));
	errno = EINVAL;
}
/*
 * Run pgwin32_dispatch_queued_signals() if UNBLOCKED_SIGNAL_QUEUE() says
 * there is something to deliver.
 *
 * Returns 1 with errno set to EINTR when the dispatcher was invoked, and 0
 * (errno untouched) otherwise.
 */
static int
pgwin32_poll_signals(void)
{
	if (!UNBLOCKED_SIGNAL_QUEUE())
		return 0;

	pgwin32_dispatch_queued_signals();
	errno = EINTR;
	return 1;
}
/*
 * Report whether socket s is of type SOCK_DGRAM.
 *
 * Returns 1 for a datagram socket and 0 for any other type.  If the SO_TYPE
 * query itself fails, 1 is returned as well.
 */
static int
isDataGram(SOCKET s)
{
	int			socktype;
	int			optlen = sizeof(socktype);

	if (getsockopt(s, SOL_SOCKET, SO_TYPE, (char *) &socktype, &optlen) != 0)
		return 1;

	if (socktype == SOCK_DGRAM)
		return 1;
	return 0;
}
/*
 * Wait until one of the network events in "what" is signaled for socket s,
 * the signal event fires, or the timeout elapses.
 *
 *	s		- socket to wait on
 *	what	- network event mask (FD_READ, FD_WRITE, ...) handed to
 *			  WSAEventSelect()
 *	timeout - wait time passed to WaitForMultipleObjectsEx().  Note that the
 *			  UDP-write path below polls in 100ms steps and does not consult
 *			  this value.
 *
 * Returns 1 if the socket event was signaled (or, on the UDP-write path, a
 * zero-length WSASend() completed).  Returns 0 otherwise, with errno set:
 * EINTR when the signal event fired or an APC ran (queued signals are
 * dispatched first), EWOULDBLOCK on timeout, or the translated socket error
 * when WSAEventSelect()/WSASend() failed.
 *
 * Raises ERROR if the shared event object cannot be created or reset, or if
 * the wait returns an unexpected value.
 */
int
pgwin32_waitforsinglesocket(SOCKET s, int what, int timeout)
{
	static HANDLE waitevent = INVALID_HANDLE_VALUE;
	static SOCKET current_socket = INVALID_SOCKET;
	static int	isUDP = 0;
	HANDLE		events[2];
	int			r;

	/* Create an event object just once and use it on all future calls */
	if (waitevent == INVALID_HANDLE_VALUE)
	{
		/*
		 * CreateEvent() reports failure by returning NULL, not
		 * INVALID_HANDLE_VALUE.  Store the handle into the static only once
		 * we know it is usable, so that a failure is reported here and the
		 * next call retries instead of working with a NULL event.
		 */
		HANDLE		newevent = CreateEvent(NULL, TRUE, FALSE, NULL);

		if (newevent == NULL || newevent == INVALID_HANDLE_VALUE)
			ereport(ERROR,
					(errmsg_internal("could not create socket waiting event: error code %lu", GetLastError())));
		waitevent = newevent;
	}
	else if (!ResetEvent(waitevent))
		ereport(ERROR,
				(errmsg_internal("could not reset socket waiting event: error code %lu", GetLastError())));

	/*
	 * Track whether socket is UDP or not.  (NB: most likely, this is both
	 * useless and wrong; there is no reason to think that the behavior of
	 * WSAEventSelect is different for TCP and UDP.)
	 */
	if (current_socket != s)
		isUDP = isDataGram(s);
	current_socket = s;

	/*
	 * Attach event to socket.  NOTE: we must detach it again before
	 * returning, since other bits of code may try to attach other events to
	 * the socket.
	 */
	if (WSAEventSelect(s, waitevent, what) != 0)
	{
		TranslateSocketError();
		return 0;
	}

	events[0] = pgwin32_signal_event;
	events[1] = waitevent;

	/*
	 * Just a workaround of unknown locking problem with writing in UDP socket
	 * under high load: Client's pgsql backend sleeps infinitely in
	 * WaitForMultipleObjectsEx, pgstat process sleeps in pgwin32_select().
	 * So, we will wait with small timeout(0.1 sec) and if socket is still
	 * blocked, try WSASend (see comments in pgwin32_select) and wait again.
	 */
	if ((what & FD_WRITE) && isUDP)
	{
		for (;;)
		{
			r = WaitForMultipleObjectsEx(2, events, FALSE, 100, TRUE);

			if (r == WAIT_TIMEOUT)
			{
				char		c;
				WSABUF		buf;
				DWORD		sent;

				/* zero-length dummy send, used only to probe writability */
				buf.buf = &c;
				buf.len = 0;

				r = WSASend(s, &buf, 1, &sent, 0, NULL, NULL);
				if (r == 0)		/* Completed - means things are fine! */
				{
					WSAEventSelect(s, NULL, 0);
					return 1;
				}
				else if (WSAGetLastError() != WSAEWOULDBLOCK)
				{
					/* translate before detaching, while the error is fresh */
					TranslateSocketError();
					WSAEventSelect(s, NULL, 0);
					return 0;
				}
				/* still would block: go around and wait again */
			}
			else
				break;
		}
	}
	else
		r = WaitForMultipleObjectsEx(2, events, FALSE, timeout, TRUE);

	/* Detach the event from the socket again (see NOTE above) */
	WSAEventSelect(s, NULL, 0);

	/* events[0] is the signal event; an APC is treated the same way */
	if (r == WAIT_OBJECT_0 || r == WAIT_IO_COMPLETION)
	{
		pgwin32_dispatch_queued_signals();
		errno = EINTR;
		return 0;
	}
	/* events[1] is the socket's event */
	if (r == WAIT_OBJECT_0 + 1)
		return 1;
	if (r == WAIT_TIMEOUT)
	{
		errno = EWOULDBLOCK;
		return 0;
	}
	ereport(ERROR,
			(errmsg_internal("unrecognized return value from WaitForMultipleObjects: %d (error code %lu)", r, GetLastError())));
	return 0;
}
/*
 * Create a socket, setting it to overlapped and non-blocking
 *
 * Arguments are passed straight through to WSASocket().  On success the new
 * socket is returned and errno is reset to 0.  On failure INVALID_SOCKET is
 * returned with errno set from the Winsock error; if the socket had already
 * been created when the failure occurred, it is closed again so that it is
 * not leaked.
 */
SOCKET
pgwin32_socket(int af, int type, int protocol)
{
	SOCKET		s;
	unsigned long on = 1;

	s = WSASocket(af, type, protocol, NULL, 0, WSA_FLAG_OVERLAPPED);
	if (s == INVALID_SOCKET)
	{
		TranslateSocketError();
		return INVALID_SOCKET;
	}

	/* Switch the socket to non-blocking mode */
	if (ioctlsocket(s, FIONBIO, &on))
	{
		int			save_errno;

		/* Translate first, before closesocket() can replace the error */
		TranslateSocketError();
		save_errno = errno;

		/* Caller only sees INVALID_SOCKET, so nobody else can close s */
		closesocket(s);

		errno = save_errno;
		return INVALID_SOCKET;
	}

	errno = 0;
	return s;
}
/*
 * bind() wrapper: returns bind()'s result unchanged, and additionally sets
 * errno from the Winsock error when that result is negative.
 */
int
pgwin32_bind(SOCKET s, struct sockaddr *addr, int addrlen)
{
	int			rc = bind(s, addr, addrlen);

	if (rc >= 0)
		return rc;

	TranslateSocketError();
	return rc;
}
/*
 * listen() wrapper: returns listen()'s result unchanged, and additionally
 * sets errno from the Winsock error when that result is negative.
 */
int
pgwin32_listen(SOCKET s, int backlog)
{
	int			rc = listen(s, backlog);

	if (rc >= 0)
		return rc;

	TranslateSocketError();
	return rc;
}
/*
 * WSAAccept() wrapper.
 *
 * Returns the accepted socket, or INVALID_SOCKET with errno set from the
 * Winsock error.
 */
SOCKET
pgwin32_accept(SOCKET s, struct sockaddr *addr, int *addrlen)
{
	SOCKET		accepted;

	/*
	 * Poll for signals, but don't return with EINTR, since we don't handle
	 * that in pqcomm.c
	 */
	(void) pgwin32_poll_signals();

	accepted = WSAAccept(s, addr, addrlen, NULL, 0);
	if (accepted != INVALID_SOCKET)
		return accepted;

	TranslateSocketError();
	return INVALID_SOCKET;
}
/*
 * WSAConnect() wrapper that waits for the connection attempt to finish.
 *
 * No signal delivery during connect: the wait is simply repeated for as
 * long as pgwin32_waitforsinglesocket() returns 0, so an interrupted wait is
 * never reported to the caller.
 *
 * Returns 0 if WSAConnect() completed immediately or once the FD_CONNECT
 * wait reports success; returns -1 with errno set if WSAConnect() failed
 * with anything other than WSAEWOULDBLOCK.
 */
int
pgwin32_connect(SOCKET s, const struct sockaddr *addr, int addrlen)
{
	if (WSAConnect(s, addr, addrlen, NULL, NULL, NULL, NULL) == 0)
		return 0;

	if (WSAGetLastError() != WSAEWOULDBLOCK)
	{
		TranslateSocketError();
		return -1;
	}

	/* Loop endlessly as long as we are just delivering signals */
	for (;;)
	{
		if (pgwin32_waitforsinglesocket(s, FD_CONNECT, INFINITE) != 0)
			break;
	}

	return 0;
}
/*
 * recv() replacement built on WSARecv() that lets signals be delivered
 * while waiting.
 *
 * Returns the number of bytes received, or -1 with errno set:
 *	EINTR		- signals were dispatched before or during the wait
 *	EWOULDBLOCK - no data available and pgwin32_noblock is set, or the
 *				  socket kept reporting "would block" after all retries
 *	other		- translated Winsock error
 */
int
pgwin32_recv(SOCKET s, char *buf, int len, int f)
{
	WSABUF		wbuf;
	DWORD		nread;
	DWORD		flags = f;
	int			attempt;

	if (pgwin32_poll_signals())
		return -1;

	wbuf.buf = buf;
	wbuf.len = len;

	/* First try: maybe the data is already there */
	if (WSARecv(s, &wbuf, 1, &nread, &flags, NULL, NULL) != SOCKET_ERROR)
		return nread;			/* success */

	if (WSAGetLastError() != WSAEWOULDBLOCK)
	{
		TranslateSocketError();
		return -1;
	}

	if (pgwin32_noblock)
	{
		/*
		 * No data received, and we are in "emulated non-blocking mode", so
		 * return indicating that we'd block if we were to continue.
		 */
		errno = EWOULDBLOCK;
		return -1;
	}

	/* We're in blocking mode, so wait for data */
	attempt = 0;
	while (attempt < 5)
	{
		if (pgwin32_waitforsinglesocket(s, FD_READ | FD_CLOSE | FD_ACCEPT,
										INFINITE) == 0)
			return -1;			/* errno already set */

		if (WSARecv(s, &wbuf, 1, &nread, &flags, NULL, NULL) != SOCKET_ERROR)
			return nread;		/* success */

		if (WSAGetLastError() != WSAEWOULDBLOCK)
		{
			TranslateSocketError();
			return -1;
		}

		/*
		 * There seem to be cases on win2k (at least) where WSARecv can return
		 * WSAEWOULDBLOCK even when pgwin32_waitforsinglesocket claims the
		 * socket is readable.  In this case, just sleep for a moment and try
		 * again.  We try up to 5 times - if it fails more than that it's not
		 * likely to ever come back.
		 */
		pg_usleep(10000);
		attempt++;
	}

	ereport(NOTICE,
			(errmsg_internal("could not read from ready socket (after retries)")));
	errno = EWOULDBLOCK;
	return -1;
}
/*
 * send() replacement built on WSASend() that lets signals be delivered
 * while waiting.
 *
 * The second argument to send() is defined by SUS to be a "const void *"
 * and so we use the same signature here to keep compilers happy when
 * handling callers.
 *
 * But the buf member of a WSABUF struct is defined as "char *", so we cast
 * the second argument to that here when assigning it, also to keep compilers
 * happy.
 *
 * Returns the number of bytes sent, or -1 with errno set (EINTR if signals
 * were dispatched, EWOULDBLOCK if nothing could be sent and pgwin32_noblock
 * is set, otherwise whatever the failing call left in errno).
 */
int
pgwin32_send(SOCKET s, const void *buf, int len, int flags)
{
	WSABUF		wbuf;
	DWORD		nsent;

	if (pgwin32_poll_signals())
		return -1;

	wbuf.buf = (char *) buf;
	wbuf.len = len;

	/*
	 * Readiness of socket to send data to UDP socket may be not true: socket
	 * can become busy again! So loop until send or error occurs.
	 */
	for (;;)
	{
		if (WSASend(s, &wbuf, 1, &nsent, flags, NULL, NULL) == SOCKET_ERROR)
		{
			if (WSAGetLastError() != WSAEWOULDBLOCK)
			{
				TranslateSocketError();
				return -1;
			}
		}
		else if (nsent > 0)
		{
			/* Write succeeded right away */
			return nsent;
		}

		if (pgwin32_noblock)
		{
			/*
			 * No data sent, and we are in "emulated non-blocking mode", so
			 * return indicating that we'd block if we were to continue.
			 */
			errno = EWOULDBLOCK;
			return -1;
		}

		/* No error, zero bytes (win2000+) or error+WSAEWOULDBLOCK (<=nt4) */
		if (pgwin32_waitforsinglesocket(s, FD_WRITE | FD_CLOSE, INFINITE) == 0)
			return -1;
	}

	/* not reached: the loop above only exits via return */
	return -1;
}
/*
 * Wait for activity on one or more sockets.
 * While waiting, allow signals to run
 *
 * NOTE! Currently does not implement exceptfds check,
 * since it is not used in postgresql!
 *
 *	nfds		- not used by this implementation
 *	readfds		- sockets to check for read/accept/close activity, or NULL
 *	writefds	- sockets to check for write/close activity, or NULL
 *	exceptfds	- must be NULL (asserted)
 *	timeout		- maximum wait, or NULL to wait indefinitely
 *
 * Return value:
 *	> 0 - number of ready conditions found; readfds/writefds are overwritten
 *		  with the ready sockets.  If any socket in writefds is found to be
 *		  write-ready by the initial dummy send, that count is returned
 *		  immediately and readfds is cleared without being checked.
 *	  0 - timeout; both sets are cleared.
 *	 -1 - errno is EINTR if signals were dispatched (sets are cleared when
 *		  this is detected during the wait), or the translated Winsock error
 *		  if WSAEventSelect() failed.
 *
 * Raises ERROR (via elog) if WSAEnumNetworkEvents() fails.
 */
int
pgwin32_select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timeval *timeout)
{
	WSAEVENT	events[FD_SETSIZE * 2 + 1]; /* worst case is readfds totally
											 * different from writefds, so
											 * 2*FD_SETSIZE sockets, plus one
											 * slot for pgwin32_signal_event
											 * which is appended below */
	SOCKET		sockets[FD_SETSIZE * 2];
	int			numevents = 0;
	int			i;
	int			r;
	DWORD		timeoutval = WSA_INFINITE;
	FD_SET		outreadfds;
	FD_SET		outwritefds;
	int			nummatches = 0;

	Assert(exceptfds == NULL);

	if (pgwin32_poll_signals())
		return -1;

	FD_ZERO(&outreadfds);
	FD_ZERO(&outwritefds);

	/*
	 * Windows does not guarantee to log an FD_WRITE network event indicating
	 * that more data can be sent unless the previous send() failed with
	 * WSAEWOULDBLOCK.  While our caller might well have made such a call, we
	 * cannot assume that here.  Therefore, if waiting for write-ready, force
	 * the issue by doing a dummy send().  If the dummy send() succeeds,
	 * assume that the socket is in fact write-ready, and return immediately.
	 * Also, if it fails with something other than WSAEWOULDBLOCK, return a
	 * write-ready indication to let our caller deal with the error condition.
	 */
	if (writefds != NULL)
	{
		for (i = 0; i < writefds->fd_count; i++)
		{
			char		c;
			WSABUF		buf;
			DWORD		sent;

			buf.buf = &c;
			buf.len = 0;

			r = WSASend(writefds->fd_array[i], &buf, 1, &sent, 0, NULL, NULL);
			if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK)
				FD_SET(writefds->fd_array[i], &outwritefds);
		}

		/* If we found any write-ready sockets, just return them immediately */
		if (outwritefds.fd_count > 0)
		{
			memcpy(writefds, &outwritefds, sizeof(fd_set));
			if (readfds)
				FD_ZERO(readfds);
			return outwritefds.fd_count;
		}
	}

	/* Now set up for an actual select */

	if (timeout != NULL)
	{
		/* timeoutval is in milliseconds */
		timeoutval = timeout->tv_sec * 1000 + timeout->tv_usec / 1000;
	}

	/*
	 * Build the list of distinct sockets, one event object per socket.
	 * NOTE(review): the result of WSACreateEvent() is not checked here.
	 */
	if (readfds != NULL)
	{
		for (i = 0; i < readfds->fd_count; i++)
		{
			events[numevents] = WSACreateEvent();
			sockets[numevents] = readfds->fd_array[i];
			numevents++;
		}
	}
	if (writefds != NULL)
	{
		for (i = 0; i < writefds->fd_count; i++)
		{
			if (!readfds ||
				!FD_ISSET(writefds->fd_array[i], readfds))
			{
				/* If the socket is not in the read list */
				events[numevents] = WSACreateEvent();
				sockets[numevents] = writefds->fd_array[i];
				numevents++;
			}
		}
	}

	/* Attach each event to its socket with the appropriate event mask */
	for (i = 0; i < numevents; i++)
	{
		int			flags = 0;

		if (readfds && FD_ISSET(sockets[i], readfds))
			flags |= FD_READ | FD_ACCEPT | FD_CLOSE;

		if (writefds && FD_ISSET(sockets[i], writefds))
			flags |= FD_WRITE | FD_CLOSE;

		if (WSAEventSelect(sockets[i], events[i], flags) != 0)
		{
			TranslateSocketError();
			/* release already-assigned event objects */
			while (--i >= 0)
				WSAEventSelect(sockets[i], NULL, 0);
			for (i = 0; i < numevents; i++)
				WSACloseEvent(events[i]);
			return -1;
		}
	}

	/* The signal event goes last; this is why events[] has one extra slot */
	events[numevents] = pgwin32_signal_event;
	r = WaitForMultipleObjectsEx(numevents + 1, events, FALSE, timeoutval, TRUE);
	if (r != WAIT_TIMEOUT && r != WAIT_IO_COMPLETION && r != (WAIT_OBJECT_0 + numevents))
	{
		/*
		 * We scan all events, even those not signaled, in case more than one
		 * event has been tagged but Wait.. can only return one.
		 */
		WSANETWORKEVENTS resEvents;

		for (i = 0; i < numevents; i++)
		{
			ZeroMemory(&resEvents, sizeof(resEvents));
			if (WSAEnumNetworkEvents(sockets[i], events[i], &resEvents) != 0)
				elog(ERROR, "failed to enumerate network events: error code %d",
					 WSAGetLastError());
			/* Read activity? */
			if (readfds && FD_ISSET(sockets[i], readfds))
			{
				if ((resEvents.lNetworkEvents & FD_READ) ||
					(resEvents.lNetworkEvents & FD_ACCEPT) ||
					(resEvents.lNetworkEvents & FD_CLOSE))
				{
					FD_SET(sockets[i], &outreadfds);

					nummatches++;
				}
			}
			/* Write activity? */
			if (writefds && FD_ISSET(sockets[i], writefds))
			{
				if ((resEvents.lNetworkEvents & FD_WRITE) ||
					(resEvents.lNetworkEvents & FD_CLOSE))
				{
					FD_SET(sockets[i], &outwritefds);

					nummatches++;
				}
			}
		}
	}

	/* Clean up all the event objects */
	for (i = 0; i < numevents; i++)
	{
		WSAEventSelect(sockets[i], NULL, 0);
		WSACloseEvent(events[i]);
	}

	if (r == WSA_WAIT_TIMEOUT)
	{
		if (readfds)
			FD_ZERO(readfds);
		if (writefds)
			FD_ZERO(writefds);
		return 0;
	}

	/* Signal-like events. */
	if (r == WAIT_OBJECT_0 + numevents || r == WAIT_IO_COMPLETION)
	{
		pgwin32_dispatch_queued_signals();
		errno = EINTR;
		if (readfds)
			FD_ZERO(readfds);
		if (writefds)
			FD_ZERO(writefds);
		return -1;
	}

	/* Overwrite socket sets with our resulting values */
	if (readfds)
		memcpy(readfds, &outreadfds, sizeof(fd_set));
	if (writefds)
		memcpy(writefds, &outwritefds, sizeof(fd_set));
	return nummatches;
}