diff options
author | uzhas <uzhas@ydb.tech> | 2022-08-03 16:49:05 +0300 |
---|---|---|
committer | uzhas <uzhas@ydb.tech> | 2022-08-03 16:49:05 +0300 |
commit | 9f5a9f6bb8e71744dba93d589498c3400ef37cf6 (patch) | |
tree | deeec1c662834706dd83e8a65f90b7c619aa6956 /contrib/libs/postgresql | |
parent | 99ca7f704a079771da487ee672539da698d0d3b8 (diff) | |
download | ydb-9f5a9f6bb8e71744dba93d589498c3400ef37cf6.tar.gz |
revert back: link ydbd with pg wrapper to support pg types
Diffstat (limited to 'contrib/libs/postgresql')
-rw-r--r-- | contrib/libs/postgresql/src/backend/port/posix_sema.c | 388 | ||||
-rw-r--r-- | contrib/libs/postgresql/src/backend/port/sysv_shmem.c | 946 |
2 files changed, 1334 insertions, 0 deletions
diff --git a/contrib/libs/postgresql/src/backend/port/posix_sema.c b/contrib/libs/postgresql/src/backend/port/posix_sema.c new file mode 100644 index 0000000000..114da3b30c --- /dev/null +++ b/contrib/libs/postgresql/src/backend/port/posix_sema.c @@ -0,0 +1,388 @@ +/*------------------------------------------------------------------------- + * + * posix_sema.c + * Implement PGSemaphores using POSIX semaphore facilities + * + * We prefer the unnamed style of POSIX semaphore (the kind made with + * sem_init). We can cope with the kind made with sem_open, however. + * + * In either implementation, typedef PGSemaphore is equivalent to "sem_t *". + * With unnamed semaphores, the sem_t structs live in an array in shared + * memory. With named semaphores, that's not true because we cannot persuade + * sem_open to do its allocation there. Therefore, the named-semaphore code + * *does not cope with EXEC_BACKEND*. The sem_t structs will just be in the + * postmaster's private memory, where they are successfully inherited by + * forked backends, but they could not be accessed by exec'd backends. 
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/port/posix_sema.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <fcntl.h> +#include <semaphore.h> +#include <signal.h> +#include <unistd.h> +#include <sys/stat.h> + +#include "miscadmin.h" +#include "storage/ipc.h" +#include "storage/pg_sema.h" +#include "storage/shmem.h" + + +/* see file header comment */ +#if defined(USE_NAMED_POSIX_SEMAPHORES) && defined(EXEC_BACKEND) +#error cannot use named POSIX semaphores with EXEC_BACKEND +#endif + +typedef union SemTPadded +{ + sem_t pgsem; + char pad[PG_CACHE_LINE_SIZE]; +} SemTPadded; + +/* typedef PGSemaphore is equivalent to pointer to sem_t */ +typedef struct PGSemaphoreData +{ + SemTPadded sem_padded; +} PGSemaphoreData; + +#define PG_SEM_REF(x) (&(x)->sem_padded.pgsem) + +#define IPCProtection (0600) /* access/modify by user only */ + +#ifdef USE_NAMED_POSIX_SEMAPHORES +static sem_t **mySemPointers; /* keep track of created semaphores */ +#else +static PGSemaphore sharedSemas; /* array of PGSemaphoreData in shared memory */ +#endif +static int numSems; /* number of semas acquired so far */ +static int maxSems; /* allocated size of above arrays */ +static int nextSemKey; /* next name to try */ + + +static void ReleaseSemaphores(int status, Datum arg); + + +#ifdef USE_NAMED_POSIX_SEMAPHORES + +/* + * PosixSemaphoreCreate + * + * Attempt to create a new named semaphore. + * + * If we fail with a failure code other than collision-with-existing-sema, + * print out an error and abort. Other types of errors suggest nonrecoverable + * problems. 
+ */ +static sem_t * +PosixSemaphoreCreate(void) +{ + int semKey; + char semname[64]; + sem_t *mySem; + + for (;;) + { + semKey = nextSemKey++; + + snprintf(semname, sizeof(semname), "/pgsql-%d", semKey); + + mySem = sem_open(semname, O_CREAT | O_EXCL, + (mode_t) IPCProtection, (unsigned) 1); + +#ifdef SEM_FAILED + if (mySem != (sem_t *) SEM_FAILED) + break; +#else + if (mySem != (sem_t *) (-1)) + break; +#endif + + /* Loop if error indicates a collision */ + if (errno == EEXIST || errno == EACCES || errno == EINTR) + continue; + + /* + * Else complain and abort + */ + elog(FATAL, "sem_open(\"%s\") failed: %m", semname); + } + + /* + * Unlink the semaphore immediately, so it can't be accessed externally. + * This also ensures that it will go away if we crash. + */ + sem_unlink(semname); + + return mySem; +} +#else /* !USE_NAMED_POSIX_SEMAPHORES */ + +/* + * PosixSemaphoreCreate + * + * Attempt to create a new unnamed semaphore. + */ +static void +PosixSemaphoreCreate(sem_t *sem) +{ + if (sem_init(sem, 1, 1) < 0) + elog(FATAL, "sem_init failed: %m"); +} +#endif /* USE_NAMED_POSIX_SEMAPHORES */ + + +/* + * PosixSemaphoreKill - removes a semaphore + */ +static void +PosixSemaphoreKill(sem_t *sem) +{ +#ifdef USE_NAMED_POSIX_SEMAPHORES + /* Got to use sem_close for named semaphores */ + if (sem_close(sem) < 0) + elog(LOG, "sem_close failed: %m"); +#else + /* Got to use sem_destroy for unnamed semaphores */ + if (sem_destroy(sem) < 0) + elog(LOG, "sem_destroy failed: %m"); +#endif +} + + +/* + * Report amount of shared memory needed for semaphores + */ +Size +PGSemaphoreShmemSize(int maxSemas) +{ +#ifdef USE_NAMED_POSIX_SEMAPHORES + /* No shared memory needed in this case */ + return 0; +#else + /* Need a PGSemaphoreData per semaphore */ + return mul_size(maxSemas, sizeof(PGSemaphoreData)); +#endif +} + +/* + * PGReserveSemaphores --- initialize semaphore support + * + * This is called during postmaster start or shared memory reinitialization. 
+ * It should do whatever is needed to be able to support up to maxSemas + * subsequent PGSemaphoreCreate calls. Also, if any system resources + * are acquired here or in PGSemaphoreCreate, register an on_shmem_exit + * callback to release them. + * + * In the Posix implementation, we acquire semaphores on-demand; the + * maxSemas parameter is just used to size the arrays. For unnamed + * semaphores, there is an array of PGSemaphoreData structs in shared memory. + * For named semaphores, we keep a postmaster-local array of sem_t pointers, + * which we use for releasing the semaphores when done. + * (This design minimizes the dependency of postmaster shutdown on the + * contents of shared memory, which a failed backend might have clobbered. + * We can't do much about the possibility of sem_destroy() crashing, but + * we don't have to expose the counters to other processes.) + */ +void +PGReserveSemaphores(int maxSemas) +{ + struct stat statbuf; + + /* + * We use the data directory's inode number to seed the search for free + * semaphore keys. This minimizes the odds of collision with other + * postmasters, while maximizing the odds that we will detect and clean up + * semaphores left over from a crashed postmaster in our own directory. + */ + if (stat(DataDir, &statbuf) < 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not stat data directory \"%s\": %m", + DataDir))); + +#ifdef USE_NAMED_POSIX_SEMAPHORES + mySemPointers = (sem_t **) malloc(maxSemas * sizeof(sem_t *)); + if (mySemPointers == NULL) + elog(PANIC, "out of memory"); +#else + + /* + * We must use ShmemAllocUnlocked(), since the spinlock protecting + * ShmemAlloc() won't be ready yet. (This ordering is necessary when we + * are emulating spinlocks with semaphores.) 
+ */ + sharedSemas = (PGSemaphore) + ShmemAllocUnlocked(PGSemaphoreShmemSize(maxSemas)); +#endif + + numSems = 0; + maxSems = maxSemas; + nextSemKey = statbuf.st_ino; + + on_shmem_exit(ReleaseSemaphores, 0); +} + +/* + * Release semaphores at shutdown or shmem reinitialization + * + * (called as an on_shmem_exit callback, hence funny argument list) + */ +static void +ReleaseSemaphores(int status, Datum arg) +{ + int i; + +#ifdef USE_NAMED_POSIX_SEMAPHORES + for (i = 0; i < numSems; i++) + PosixSemaphoreKill(mySemPointers[i]); + free(mySemPointers); +#endif + +#ifdef USE_UNNAMED_POSIX_SEMAPHORES + for (i = 0; i < numSems; i++) + PosixSemaphoreKill(PG_SEM_REF(sharedSemas + i)); +#endif +} + +/* + * PGSemaphoreCreate + * + * Allocate a PGSemaphore structure with initial count 1 + */ +PGSemaphore +PGSemaphoreCreate(void) +{ + PGSemaphore sema; + sem_t *newsem; + + /* Can't do this in a backend, because static state is postmaster's */ + Assert(!IsUnderPostmaster); + + if (numSems >= maxSems) + elog(PANIC, "too many semaphores created"); + +#ifdef USE_NAMED_POSIX_SEMAPHORES + newsem = PosixSemaphoreCreate(); + /* Remember new sema for ReleaseSemaphores */ + mySemPointers[numSems] = newsem; + sema = (PGSemaphore) newsem; +#else + sema = &sharedSemas[numSems]; + newsem = PG_SEM_REF(sema); + PosixSemaphoreCreate(newsem); +#endif + + numSems++; + + return sema; +} + +/* + * PGSemaphoreReset + * + * Reset a previously-initialized PGSemaphore to have count 0 + */ +void +PGSemaphoreReset(PGSemaphore sema) +{ + /* + * There's no direct API for this in POSIX, so we have to ratchet the + * semaphore down to 0 with repeated trywait's. + */ + for (;;) + { + if (sem_trywait(PG_SEM_REF(sema)) < 0) + { + if (errno == EAGAIN || errno == EDEADLK) + break; /* got it down to 0 */ + if (errno == EINTR) + continue; /* can this happen? 
*/ + elog(FATAL, "sem_trywait failed: %m"); + } + } +} + +/* + * PGSemaphoreLock + * + * Lock a semaphore (decrement count), blocking if count would be < 0 + */ +void +PGSemaphoreLock(PGSemaphore sema) +{ + int errStatus; + + /* See notes in sysv_sema.c's implementation of PGSemaphoreLock. */ + do + { + errStatus = sem_wait(PG_SEM_REF(sema)); + } while (errStatus < 0 && errno == EINTR); + + if (errStatus < 0) + elog(FATAL, "sem_wait failed: %m"); +} + +/* + * PGSemaphoreUnlock + * + * Unlock a semaphore (increment count) + */ +void +PGSemaphoreUnlock(PGSemaphore sema) +{ + int errStatus; + + /* + * Note: if errStatus is -1 and errno == EINTR then it means we returned + * from the operation prematurely because we were sent a signal. So we + * try and unlock the semaphore again. Not clear this can really happen, + * but might as well cope. + */ + do + { + errStatus = sem_post(PG_SEM_REF(sema)); + } while (errStatus < 0 && errno == EINTR); + + if (errStatus < 0) + elog(FATAL, "sem_post failed: %m"); +} + +/* + * PGSemaphoreTryLock + * + * Lock a semaphore only if able to do so without blocking + */ +bool +PGSemaphoreTryLock(PGSemaphore sema) +{ + int errStatus; + + /* + * Note: if errStatus is -1 and errno == EINTR then it means we returned + * from the operation prematurely because we were sent a signal. So we + * try and lock the semaphore again. 
+ */ + do + { + errStatus = sem_trywait(PG_SEM_REF(sema)); + } while (errStatus < 0 && errno == EINTR); + + if (errStatus < 0) + { + if (errno == EAGAIN || errno == EDEADLK) + return false; /* failed to lock it */ + /* Otherwise we got trouble */ + elog(FATAL, "sem_trywait failed: %m"); + } + + return true; +} diff --git a/contrib/libs/postgresql/src/backend/port/sysv_shmem.c b/contrib/libs/postgresql/src/backend/port/sysv_shmem.c new file mode 100644 index 0000000000..35cce89e9c --- /dev/null +++ b/contrib/libs/postgresql/src/backend/port/sysv_shmem.c @@ -0,0 +1,946 @@ +/*------------------------------------------------------------------------- + * + * sysv_shmem.c + * Implement shared memory using SysV facilities + * + * These routines used to be a fairly thin layer on top of SysV shared + * memory functionality. With the addition of anonymous-shmem logic, + * they're a bit fatter now. We still require a SysV shmem block to + * exist, though, because mmap'd shmem provides no way to find out how + * many processes are attached, which we need for interlocking purposes. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/port/sysv_shmem.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <signal.h> +#include <unistd.h> +#include <sys/file.h> +#include <sys/mman.h> +#include <sys/stat.h> +#ifdef HAVE_SYS_IPC_H +#include <sys/ipc.h> +#endif +#ifdef HAVE_SYS_SHM_H +#include <sys/shm.h> +#endif + +#include "miscadmin.h" +#include "port/pg_bitutils.h" +#include "portability/mem.h" +#include "storage/dsm.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/pg_shmem.h" +#include "utils/guc.h" +#include "utils/pidfile.h" + + +/* + * As of PostgreSQL 9.3, we normally allocate only a very small amount of + * System V shared memory, and only for the purposes of providing an + * interlock to protect the data directory. The real shared memory block + * is allocated using mmap(). This works around the problem that many + * systems have very low limits on the amount of System V shared memory + * that can be allocated. Even a limit of a few megabytes will be enough + * to run many copies of PostgreSQL without needing to adjust system settings. + * + * We assume that no one will attempt to run PostgreSQL 9.3 or later on + * systems that are ancient enough that anonymous shared memory is not + * supported, such as pre-2.4 versions of Linux. If that turns out to be + * false, we might need to add compile and/or run-time tests here and do this + * only if the running kernel supports it. + * + * However, we must always disable this logic in the EXEC_BACKEND case, and + * fall back to the old method of allocating the entire segment using System V + * shared memory, because there's no way to attach an anonymous mmap'd segment + * to a process after exec(). 
Since EXEC_BACKEND is intended only for + * developer use, this shouldn't be a big problem. Because of this, we do + * not worry about supporting anonymous shmem in the EXEC_BACKEND cases below. + * + * As of PostgreSQL 12, we regained the ability to use a large System V shared + * memory region even in non-EXEC_BACKEND builds, if shared_memory_type is set + * to sysv (though this is not the default). + */ + + +typedef key_t IpcMemoryKey; /* shared memory key passed to shmget(2) */ +typedef int IpcMemoryId; /* shared memory ID returned by shmget(2) */ + +/* + * How does a given IpcMemoryId relate to this PostgreSQL process? + * + * One could recycle unattached segments of different data directories if we + * distinguished that case from other SHMSTATE_FOREIGN cases. Doing so would + * cause us to visit less of the key space, making us less likely to detect a + * SHMSTATE_ATTACHED key. It would also complicate the concurrency analysis, + * in that postmasters of different data directories could simultaneously + * attempt to recycle a given key. We'll waste keys longer in some cases, but + * avoiding the problems of the alternative justifies that loss. 
+ */ +typedef enum +{ + SHMSTATE_ANALYSIS_FAILURE, /* unexpected failure to analyze the ID */ + SHMSTATE_ATTACHED, /* pertinent to DataDir, has attached PIDs */ + SHMSTATE_ENOENT, /* no segment of that ID */ + SHMSTATE_FOREIGN, /* exists, but not pertinent to DataDir */ + SHMSTATE_UNATTACHED /* pertinent to DataDir, no attached PIDs */ +} IpcMemoryState; + + +unsigned long UsedShmemSegID = 0; +void *UsedShmemSegAddr = NULL; + +static Size AnonymousShmemSize; +static void *AnonymousShmem = NULL; + +static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size); +static void IpcMemoryDetach(int status, Datum shmaddr); +static void IpcMemoryDelete(int status, Datum shmId); +static IpcMemoryState PGSharedMemoryAttach(IpcMemoryId shmId, + void *attachAt, + PGShmemHeader **addr); + + +/* + * InternalIpcMemoryCreate(memKey, size) + * + * Attempt to create a new shared memory segment with the specified key. + * Will fail (return NULL) if such a segment already exists. If successful, + * attach the segment to the current process and return its attached address. + * On success, callbacks are registered with on_shmem_exit to detach and + * delete the segment when on_shmem_exit is called. + * + * If we fail with a failure code other than collision-with-existing-segment, + * print out an error and abort. Other types of errors are not recoverable. + */ +static void * +InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size) +{ + IpcMemoryId shmid; + void *requestedAddress = NULL; + void *memAddress; + + /* + * Normally we just pass requestedAddress = NULL to shmat(), allowing the + * system to choose where the segment gets mapped. But in an EXEC_BACKEND + * build, it's possible for whatever is chosen in the postmaster to not + * work for backends, due to variations in address space layout. As a + * rather klugy workaround, allow the user to specify the address to use + * via setting the environment variable PG_SHMEM_ADDR. 
(If this were of + * interest for anything except debugging, we'd probably create a cleaner + * and better-documented way to set it, such as a GUC.) + */ +#ifdef EXEC_BACKEND + { + char *pg_shmem_addr = getenv("PG_SHMEM_ADDR"); + + if (pg_shmem_addr) + requestedAddress = (void *) strtoul(pg_shmem_addr, NULL, 0); + else + { +#if defined(__darwin__) && SIZEOF_VOID_P == 8 + /* + * Provide a default value that is believed to avoid problems with + * ASLR on the current macOS release. + */ + requestedAddress = (void *) 0x80000000000; +#endif + } + } +#endif + + shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection); + + if (shmid < 0) + { + int shmget_errno = errno; + + /* + * Fail quietly if error indicates a collision with existing segment. + * One would expect EEXIST, given that we said IPC_EXCL, but perhaps + * we could get a permission violation instead? Also, EIDRM might + * occur if an old seg is slated for destruction but not gone yet. + */ + if (shmget_errno == EEXIST || shmget_errno == EACCES +#ifdef EIDRM + || shmget_errno == EIDRM +#endif + ) + return NULL; + + /* + * Some BSD-derived kernels are known to return EINVAL, not EEXIST, if + * there is an existing segment but it's smaller than "size" (this is + * a result of poorly-thought-out ordering of error tests). To + * distinguish between collision and invalid size in such cases, we + * make a second try with size = 0. These kernels do not test size + * against SHMMIN in the preexisting-segment case, so we will not get + * EINVAL a second time if there is such a segment. 
+ */ + if (shmget_errno == EINVAL) + { + shmid = shmget(memKey, 0, IPC_CREAT | IPC_EXCL | IPCProtection); + + if (shmid < 0) + { + /* As above, fail quietly if we verify a collision */ + if (errno == EEXIST || errno == EACCES +#ifdef EIDRM + || errno == EIDRM +#endif + ) + return NULL; + /* Otherwise, fall through to report the original error */ + } + else + { + /* + * On most platforms we cannot get here because SHMMIN is + * greater than zero. However, if we do succeed in creating a + * zero-size segment, free it and then fall through to report + * the original error. + */ + if (shmctl(shmid, IPC_RMID, NULL) < 0) + elog(LOG, "shmctl(%d, %d, 0) failed: %m", + (int) shmid, IPC_RMID); + } + } + + /* + * Else complain and abort. + * + * Note: at this point EINVAL should mean that either SHMMIN or SHMMAX + * is violated. SHMALL violation might be reported as either ENOMEM + * (BSDen) or ENOSPC (Linux); the Single Unix Spec fails to say which + * it should be. SHMMNI violation is ENOSPC, per spec. Just plain + * not-enough-RAM is ENOMEM. + */ + errno = shmget_errno; + ereport(FATAL, + (errmsg("could not create shared memory segment: %m"), + errdetail("Failed system call was shmget(key=%lu, size=%zu, 0%o).", + (unsigned long) memKey, size, + IPC_CREAT | IPC_EXCL | IPCProtection), + (shmget_errno == EINVAL) ? + errhint("This error usually means that PostgreSQL's request for a shared memory " + "segment exceeded your kernel's SHMMAX parameter, or possibly that " + "it is less than " + "your kernel's SHMMIN parameter.\n" + "The PostgreSQL documentation contains more information about shared " + "memory configuration.") : 0, + (shmget_errno == ENOMEM) ? + errhint("This error usually means that PostgreSQL's request for a shared " + "memory segment exceeded your kernel's SHMALL parameter. 
You might need " + "to reconfigure the kernel with larger SHMALL.\n" + "The PostgreSQL documentation contains more information about shared " + "memory configuration.") : 0, + (shmget_errno == ENOSPC) ? + errhint("This error does *not* mean that you have run out of disk space. " + "It occurs either if all available shared memory IDs have been taken, " + "in which case you need to raise the SHMMNI parameter in your kernel, " + "or because the system's overall limit for shared memory has been " + "reached.\n" + "The PostgreSQL documentation contains more information about shared " + "memory configuration.") : 0)); + } + + /* Register on-exit routine to delete the new segment */ + on_shmem_exit(IpcMemoryDelete, Int32GetDatum(shmid)); + + /* OK, should be able to attach to the segment */ + memAddress = shmat(shmid, requestedAddress, PG_SHMAT_FLAGS); + + if (memAddress == (void *) -1) + elog(FATAL, "shmat(id=%d, addr=%p, flags=0x%x) failed: %m", + shmid, requestedAddress, PG_SHMAT_FLAGS); + + /* Register on-exit routine to detach new segment before deleting */ + on_shmem_exit(IpcMemoryDetach, PointerGetDatum(memAddress)); + + /* + * Store shmem key and ID in data directory lockfile. Format to try to + * keep it the same length always (trailing junk in the lockfile won't + * hurt, but might confuse humans). + */ + { + char line[64]; + + sprintf(line, "%9lu %9lu", + (unsigned long) memKey, (unsigned long) shmid); + AddToDataDirLockFile(LOCK_FILE_LINE_SHMEM_KEY, line); + } + + return memAddress; +} + +/****************************************************************************/ +/* IpcMemoryDetach(status, shmaddr) removes a shared memory segment */ +/* from process' address space */ +/* (called as an on_shmem_exit callback, hence funny argument list) */ +/****************************************************************************/ +static void +IpcMemoryDetach(int status, Datum shmaddr) +{ + /* Detach System V shared memory block. 
*/ + if (shmdt(DatumGetPointer(shmaddr)) < 0) + elog(LOG, "shmdt(%p) failed: %m", DatumGetPointer(shmaddr)); +} + +/****************************************************************************/ +/* IpcMemoryDelete(status, shmId) deletes a shared memory segment */ +/* (called as an on_shmem_exit callback, hence funny argument list) */ +/****************************************************************************/ +static void +IpcMemoryDelete(int status, Datum shmId) +{ + if (shmctl(DatumGetInt32(shmId), IPC_RMID, NULL) < 0) + elog(LOG, "shmctl(%d, %d, 0) failed: %m", + DatumGetInt32(shmId), IPC_RMID); +} + +/* + * PGSharedMemoryIsInUse + * + * Is a previously-existing shmem segment still existing and in use? + * + * The point of this exercise is to detect the case where a prior postmaster + * crashed, but it left child backends that are still running. Therefore + * we only care about shmem segments that are associated with the intended + * DataDir. This is an important consideration since accidental matches of + * shmem segment IDs are reasonably common. + */ +bool +PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2) +{ + PGShmemHeader *memAddress; + IpcMemoryState state; + + state = PGSharedMemoryAttach((IpcMemoryId) id2, NULL, &memAddress); + if (memAddress && shmdt(memAddress) < 0) + elog(LOG, "shmdt(%p) failed: %m", memAddress); + switch (state) + { + case SHMSTATE_ENOENT: + case SHMSTATE_FOREIGN: + case SHMSTATE_UNATTACHED: + return false; + case SHMSTATE_ANALYSIS_FAILURE: + case SHMSTATE_ATTACHED: + return true; + } + return true; +} + +/* + * Test for a segment with id shmId; see comment at IpcMemoryState. + * + * If the segment exists, we'll attempt to attach to it, using attachAt + * if that's not NULL (but it's best to pass NULL if possible). + * + * *addr is set to the segment memory address if we attached to it, else NULL. 
+ */ +static IpcMemoryState +PGSharedMemoryAttach(IpcMemoryId shmId, + void *attachAt, + PGShmemHeader **addr) +{ + struct shmid_ds shmStat; + struct stat statbuf; + PGShmemHeader *hdr; + + *addr = NULL; + + /* + * First, try to stat the shm segment ID, to see if it exists at all. + */ + if (shmctl(shmId, IPC_STAT, &shmStat) < 0) + { + /* + * EINVAL actually has multiple possible causes documented in the + * shmctl man page, but we assume it must mean the segment no longer + * exists. + */ + if (errno == EINVAL) + return SHMSTATE_ENOENT; + + /* + * EACCES implies we have no read permission, which means it is not a + * Postgres shmem segment (or at least, not one that is relevant to + * our data directory). + */ + if (errno == EACCES) + return SHMSTATE_FOREIGN; + + /* + * Some Linux kernel versions (in fact, all of them as of July 2007) + * sometimes return EIDRM when EINVAL is correct. The Linux kernel + * actually does not have any internal state that would justify + * returning EIDRM, so we can get away with assuming that EIDRM is + * equivalent to EINVAL on that platform. + */ +#ifdef HAVE_LINUX_EIDRM_BUG + if (errno == EIDRM) + return SHMSTATE_ENOENT; +#endif + + /* + * Otherwise, we had better assume that the segment is in use. The + * only likely case is (non-Linux, assumed spec-compliant) EIDRM, + * which implies that the segment has been IPC_RMID'd but there are + * still processes attached to it. + */ + return SHMSTATE_ANALYSIS_FAILURE; + } + + /* + * Try to attach to the segment and see if it matches our data directory. + * This avoids any risk of duplicate-shmem-key conflicts on machines that + * are running several postmasters under the same userid. + * + * (When we're called from PGSharedMemoryCreate, this stat call is + * duplicative; but since this isn't a high-traffic case it's not worth + * trying to optimize.) 
+ */ + if (stat(DataDir, &statbuf) < 0) + return SHMSTATE_ANALYSIS_FAILURE; /* can't stat; be conservative */ + + hdr = (PGShmemHeader *) shmat(shmId, attachAt, PG_SHMAT_FLAGS); + if (hdr == (PGShmemHeader *) -1) + { + /* + * Attachment failed. The cases we're interested in are the same as + * for the shmctl() call above. In particular, note that the owning + * postmaster could have terminated and removed the segment between + * shmctl() and shmat(). + * + * If attachAt isn't NULL, it's possible that EINVAL reflects a + * problem with that address not a vanished segment, so it's best to + * pass NULL when probing for conflicting segments. + */ + if (errno == EINVAL) + return SHMSTATE_ENOENT; /* segment disappeared */ + if (errno == EACCES) + return SHMSTATE_FOREIGN; /* must be non-Postgres */ +#ifdef HAVE_LINUX_EIDRM_BUG + if (errno == EIDRM) + return SHMSTATE_ENOENT; /* segment disappeared */ +#endif + /* Otherwise, be conservative. */ + return SHMSTATE_ANALYSIS_FAILURE; + } + *addr = hdr; + + if (hdr->magic != PGShmemMagic || + hdr->device != statbuf.st_dev || + hdr->inode != statbuf.st_ino) + { + /* + * It's either not a Postgres segment, or not one for my data + * directory. + */ + return SHMSTATE_FOREIGN; + } + + /* + * It does match our data directory, so now test whether any processes are + * still attached to it. (We are, now, but the shm_nattch result is from + * before we attached to it.) + */ + return shmStat.shm_nattch == 0 ? SHMSTATE_UNATTACHED : SHMSTATE_ATTACHED; +} + +#ifdef MAP_HUGETLB + +/* + * Identify the huge page size to use, and compute the related mmap flags. + * + * Some Linux kernel versions have a bug causing mmap() to fail on requests + * that are not a multiple of the hugepage size. Versions without that bug + * instead silently round the request up to the next hugepage multiple --- + * and then munmap() fails when we give it a size different from that. 
+ * So we have to round our request up to a multiple of the actual hugepage + * size to avoid trouble. + * + * Doing the round-up ourselves also lets us make use of the extra memory, + * rather than just wasting it. Currently, we just increase the available + * space recorded in the shmem header, which will make the extra usable for + * purposes such as additional locktable entries. Someday, for very large + * hugepage sizes, we might want to think about more invasive strategies, + * such as increasing shared_buffers to absorb the extra space. + * + * Returns the (real, assumed or config provided) page size into *hugepagesize, + * and the hugepage-related mmap flags to use into *mmap_flags. + */ +static void +GetHugePageSize(Size *hugepagesize, int *mmap_flags) +{ + Size default_hugepagesize = 0; + + /* + * System-dependent code to find out the default huge page size. + * + * On Linux, read /proc/meminfo looking for a line like "Hugepagesize: + * nnnn kB". Ignore any failures, falling back to the preset default. + */ +#ifdef __linux__ + + { + FILE *fp = AllocateFile("/proc/meminfo", "r"); + char buf[128]; + unsigned int sz; + char ch; + + if (fp) + { + while (fgets(buf, sizeof(buf), fp)) + { + if (sscanf(buf, "Hugepagesize: %u %c", &sz, &ch) == 2) + { + if (ch == 'k') + { + default_hugepagesize = sz * (Size) 1024; + break; + } + /* We could accept other units besides kB, if needed */ + } + } + FreeFile(fp); + } + } +#endif /* __linux__ */ + + if (huge_page_size != 0) + { + /* If huge page size is requested explicitly, use that. */ + *hugepagesize = (Size) huge_page_size * 1024; + } + else if (default_hugepagesize != 0) + { + /* Otherwise use the system default, if we have it. */ + *hugepagesize = default_hugepagesize; + } + else + { + /* + * If we fail to find out the system's default huge page size, or no + * huge page size is requested explicitly, assume it is 2MB. This will + * work fine when the actual size is less. 
If it's more, we might get + * mmap() or munmap() failures due to unaligned requests; but at this + * writing, there are no reports of any non-Linux systems being picky + * about that. + */ + *hugepagesize = 2 * 1024 * 1024; + } + + *mmap_flags = MAP_HUGETLB; + + /* + * On recent enough Linux, also include the explicit page size, if + * necessary. + */ +#if defined(MAP_HUGE_MASK) && defined(MAP_HUGE_SHIFT) + if (*hugepagesize != default_hugepagesize) + { + int shift = pg_ceil_log2_64(*hugepagesize); + + *mmap_flags |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT; + } +#endif +} + +#endif /* MAP_HUGETLB */ + +/* + * Creates an anonymous mmap()ed shared memory segment. + * + * Pass the requested size in *size. This function will modify *size to the + * actual size of the allocation, if it ends up allocating a segment that is + * larger than requested. + * + * When huge_pages is on or try (and MAP_HUGETLB is available), a huge-page + * mapping is attempted first, with the size rounded up to a multiple of the + * huge page size. Unless huge_pages is on, a failed or skipped attempt is + * followed by a regular mapping of the originally requested size. If no + * mapping succeeds, a FATAL error is reported. + */ +static void * +CreateAnonymousSegment(Size *size) +{ + Size allocsize = *size; + void *ptr = MAP_FAILED; + int mmap_errno = 0; + +#ifndef MAP_HUGETLB + /* PGSharedMemoryCreate should have dealt with this case */ + Assert(huge_pages != HUGE_PAGES_ON); +#else + if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY) + { + /* + * Round up the request size to a suitable large value. + */ + Size hugepagesize; + int mmap_flags; + + GetHugePageSize(&hugepagesize, &mmap_flags); + + if (allocsize % hugepagesize != 0) + allocsize += hugepagesize - (allocsize % hugepagesize); + + ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE, + PG_MMAP_FLAGS | mmap_flags, -1, 0); + mmap_errno = errno; + if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED) + elog(DEBUG1, "mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m", + allocsize); + } +#endif + + if (ptr == MAP_FAILED && huge_pages != HUGE_PAGES_ON) + { + /* + * Use the original size, not the rounded-up value, when falling back + * to non-huge pages.
+ */ + allocsize = *size; + ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE, + PG_MMAP_FLAGS, -1, 0); + mmap_errno = errno; + } + + if (ptr == MAP_FAILED) + { + /* Put back the errno saved right after mmap() before reporting with %m */ + errno = mmap_errno; + ereport(FATAL, + (errmsg("could not map anonymous shared memory: %m"), + (mmap_errno == ENOMEM) ? + errhint("This error usually means that PostgreSQL's request " + "for a shared memory segment exceeded available memory, " + "swap space, or huge pages. To reduce the request size " + "(currently %zu bytes), reduce PostgreSQL's shared " + "memory usage, perhaps by reducing shared_buffers or " + "max_connections.", + allocsize) : 0)); + } + + *size = allocsize; + return ptr; +} + +/* + * AnonymousShmemDetach --- detach from an anonymous mmap'd block + * (called as an on_shmem_exit callback, hence funny argument list) + * + * Neither argument is used. AnonymousShmem is reset to NULL after the + * munmap() attempt, so a second call does nothing; a munmap() failure is + * only logged. + */ +static void +AnonymousShmemDetach(int status, Datum arg) +{ + /* Release anonymous shared memory block, if any. */ + if (AnonymousShmem != NULL) + { + if (munmap(AnonymousShmem, AnonymousShmemSize) < 0) + elog(LOG, "munmap(%p, %zu) failed: %m", + AnonymousShmem, AnonymousShmemSize); + AnonymousShmem = NULL; + } +} + +/* + * PGSharedMemoryCreate + * + * Create a shared memory segment of the given size and initialize its + * standard header. Also, register an on_shmem_exit callback to release + * the storage. + * + * Dead Postgres segments pertinent to this DataDir are recycled if found, but + * we do not fail upon collision with foreign shmem segments. The idea here + * is to detect and re-use keys that may have been assigned by a crashed + * postmaster or backend. + * + * *shim is set to the header of the System V segment. The return value is + * that same header when no anonymous segment is in use; otherwise the header + * is copied to the start of the anonymous mmap'd block and that block is + * returned. The recorded totalsize is the size after any rounding-up done by + * CreateAnonymousSegment(). + */ +PGShmemHeader * +PGSharedMemoryCreate(Size size, + PGShmemHeader **shim) +{ + IpcMemoryKey NextShmemSegID; + void *memAddress; + PGShmemHeader *hdr; + struct stat statbuf; + Size sysvsize; + + /* + * We use the data directory's ID info (inode and device numbers) to + * positively identify shmem segments associated with this data dir, and + * also as seeds for searching for a free shmem key.
+ */ + if (stat(DataDir, &statbuf) < 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not stat data directory \"%s\": %m", + DataDir))); + + /* Complain if hugepages demanded but we can't possibly support them */ +#if !defined(MAP_HUGETLB) + if (huge_pages == HUGE_PAGES_ON) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("huge pages not supported on this platform"))); +#endif + + /* For now, we don't support huge pages in SysV memory */ + if (huge_pages == HUGE_PAGES_ON && shared_memory_type != SHMEM_TYPE_MMAP) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("huge pages not supported with the current shared_memory_type setting"))); + + /* Room for a header? */ + Assert(size > MAXALIGN(sizeof(PGShmemHeader))); + + if (shared_memory_type == SHMEM_TYPE_MMAP) + { + AnonymousShmem = CreateAnonymousSegment(&size); + AnonymousShmemSize = size; + + /* Register on-exit routine to unmap the anonymous segment */ + on_shmem_exit(AnonymousShmemDetach, (Datum) 0); + + /* Now we need only allocate a minimal-sized SysV shmem block. */ + sysvsize = sizeof(PGShmemHeader); + } + else + sysvsize = size; + + /* + * Loop till we find a free IPC key. Trust CreateDataDirLockFile() to + * ensure no more than one postmaster per data directory can enter this + * loop simultaneously. (CreateDataDirLockFile() does not entirely ensure + * that, but prefer fixing it over coping here.) + */ + NextShmemSegID = statbuf.st_ino; + + for (;;) + { + IpcMemoryId shmid; + PGShmemHeader *oldhdr; + IpcMemoryState state; + + /* Try to create new segment */ + memAddress = InternalIpcMemoryCreate(NextShmemSegID, sysvsize); + if (memAddress) + break; /* successful create and attach */ + + /* Check shared memory and possibly remove and recreate */ + + /* + * shmget() failure is typically EACCES, hence SHMSTATE_FOREIGN. + * ENOENT, a narrow possibility, implies SHMSTATE_ENOENT, but one can + * safely treat SHMSTATE_ENOENT like SHMSTATE_FOREIGN.
+ */ + shmid = shmget(NextShmemSegID, sizeof(PGShmemHeader), 0); + if (shmid < 0) + { + oldhdr = NULL; + state = SHMSTATE_FOREIGN; + } + else + state = PGSharedMemoryAttach(shmid, NULL, &oldhdr); + + switch (state) + { + case SHMSTATE_ANALYSIS_FAILURE: + case SHMSTATE_ATTACHED: + ereport(FATAL, + (errcode(ERRCODE_LOCK_FILE_EXISTS), + errmsg("pre-existing shared memory block (key %lu, ID %lu) is still in use", + (unsigned long) NextShmemSegID, + (unsigned long) shmid), + errhint("Terminate any old server processes associated with data directory \"%s\".", + DataDir))); + break; + case SHMSTATE_ENOENT: + + /* + * To our surprise, some other process deleted since our last + * InternalIpcMemoryCreate(). Moments earlier, we would have + * seen SHMSTATE_FOREIGN. Try that same ID again. + */ + elog(LOG, + "shared memory block (key %lu, ID %lu) deleted during startup", + (unsigned long) NextShmemSegID, + (unsigned long) shmid); + break; + case SHMSTATE_FOREIGN: + NextShmemSegID++; + break; + case SHMSTATE_UNATTACHED: + + /* + * The segment pertains to DataDir, and every process that had + * used it has died or detached. Zap it, if possible, and any + * associated dynamic shared memory segments, as well. This + * shouldn't fail, but if it does, assume the segment belongs + * to someone else after all, and try the next candidate. + * Otherwise, try again to create the segment. That may fail + * if some other process creates the same shmem key before we + * do, in which case we'll try the next key. + */ + if (oldhdr->dsm_control != 0) + dsm_cleanup_using_control_segment(oldhdr->dsm_control); + if (shmctl(shmid, IPC_RMID, NULL) < 0) + NextShmemSegID++; + break; + } + + if (oldhdr && shmdt(oldhdr) < 0) + elog(LOG, "shmdt(%p) failed: %m", oldhdr); + } + + /* Initialize new segment.
*/ + hdr = (PGShmemHeader *) memAddress; + hdr->creatorPID = getpid(); + hdr->magic = PGShmemMagic; + hdr->dsm_control = 0; + + /* Fill in the data directory ID info, too */ + hdr->device = statbuf.st_dev; + hdr->inode = statbuf.st_ino; + + /* + * Initialize space allocation status for segment. + */ + hdr->totalsize = size; + hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader)); + *shim = hdr; + + /* Save info for possible future use */ + UsedShmemSegAddr = memAddress; + UsedShmemSegID = (unsigned long) NextShmemSegID; + + /* + * If AnonymousShmem is NULL here, then we're not using anonymous shared + * memory, and should return a pointer to the System V shared memory + * block. Otherwise, the System V shared memory block is only a shim, and + * we must return a pointer to the real block. + */ + if (AnonymousShmem == NULL) + return hdr; + memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader)); + return (PGShmemHeader *) AnonymousShmem; +} + +#ifdef EXEC_BACKEND + +/* + * PGSharedMemoryReAttach + * + * This is called during startup of a postmaster child process to re-attach to + * an already existing shared memory segment. This is needed only in the + * EXEC_BACKEND case; otherwise postmaster children inherit the shared memory + * segment attachment via fork(). + * + * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this + * routine. The caller must have already restored them to the postmaster's + * values. + * + * Any outcome other than a successful attach at exactly the saved address is + * reported as FATAL. On success the segment's dsm_control value is handed to + * dsm_set_control_handle(). + */ +void +PGSharedMemoryReAttach(void) +{ + IpcMemoryId shmid; + PGShmemHeader *hdr; + IpcMemoryState state; + void *origUsedShmemSegAddr = UsedShmemSegAddr; + + Assert(UsedShmemSegAddr != NULL); + Assert(IsUnderPostmaster); + +#ifdef __CYGWIN__ + /* cygipc (currently) appears to not detach on exec.
*/ + PGSharedMemoryDetach(); + UsedShmemSegAddr = origUsedShmemSegAddr; +#endif + + elog(DEBUG3, "attaching to %p", UsedShmemSegAddr); + shmid = shmget(UsedShmemSegID, sizeof(PGShmemHeader), 0); + if (shmid < 0) + state = SHMSTATE_FOREIGN; + else + state = PGSharedMemoryAttach(shmid, UsedShmemSegAddr, &hdr); + if (state != SHMSTATE_ATTACHED) + elog(FATAL, "could not reattach to shared memory (key=%d, addr=%p): %m", + (int) UsedShmemSegID, UsedShmemSegAddr); + if (hdr != origUsedShmemSegAddr) + elog(FATAL, "reattaching to shared memory returned unexpected address (got %p, expected %p)", + hdr, origUsedShmemSegAddr); + dsm_set_control_handle(hdr->dsm_control); + + UsedShmemSegAddr = hdr; /* probably redundant */ +} + +/* + * PGSharedMemoryNoReAttach + * + * This is called during startup of a postmaster child process when we choose + * *not* to re-attach to the existing shared memory segment. We must clean up + * to leave things in the appropriate state. This is not used in the non + * EXEC_BACKEND case, either. + * + * The child process startup logic might or might not call PGSharedMemoryDetach + * after this; make sure that it will be a no-op if called. + * + * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this + * routine. The caller must have already restored them to the postmaster's + * values. + */ +void +PGSharedMemoryNoReAttach(void) +{ + Assert(UsedShmemSegAddr != NULL); + Assert(IsUnderPostmaster); + +#ifdef __CYGWIN__ + /* cygipc (currently) appears to not detach on exec. */ + PGSharedMemoryDetach(); +#endif + + /* For cleanliness, reset UsedShmemSegAddr to show we're not attached. */ + UsedShmemSegAddr = NULL; + /* And the same for UsedShmemSegID. */ + UsedShmemSegID = 0; +} + +#endif /* EXEC_BACKEND */ + +/* + * PGSharedMemoryDetach + * + * Detach from the shared memory segment, if still attached.
This is not + * intended to be called explicitly by the process that originally created the + * segment (it will have on_shmem_exit callback(s) registered to do that). + * Rather, this is for subprocesses that have inherited an attachment and want + * to get rid of it. + * + * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this + * routine, also AnonymousShmem and AnonymousShmemSize. + * + * UsedShmemSegAddr and AnonymousShmem are each reset to NULL once handled, + * so calling this again does nothing; shmdt() or munmap() failures are only + * logged. + */ +void +PGSharedMemoryDetach(void) +{ + if (UsedShmemSegAddr != NULL) + { + if ((shmdt(UsedShmemSegAddr) < 0) +#if defined(EXEC_BACKEND) && defined(__CYGWIN__) + /* Work-around for cygipc exec bug */ + && shmdt(NULL) < 0 +#endif + ) + elog(LOG, "shmdt(%p) failed: %m", UsedShmemSegAddr); + UsedShmemSegAddr = NULL; + } + + if (AnonymousShmem != NULL) + { + if (munmap(AnonymousShmem, AnonymousShmemSize) < 0) + elog(LOG, "munmap(%p, %zu) failed: %m", + AnonymousShmem, AnonymousShmemSize); + AnonymousShmem = NULL; + } +} |