diff options
| author | vvvv <[email protected]> | 2024-11-07 12:29:36 +0300 |
|---|---|---|
| committer | vvvv <[email protected]> | 2024-11-07 13:49:47 +0300 |
| commit | d4c258e9431675bab6745c8638df6e3dfd4dca6b (patch) | |
| tree | b5efcfa11351152a4c872fccaea35749141c0b11 /yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical | |
| parent | 13a4f274caef5cfdaf0263b24e4d6bdd5521472b (diff) | |
Moved other yql/essentials libs YQL-19206
init
commit_hash:7d4c435602078407bbf20dd3c32f9c90d2bbcbc0
Diffstat (limited to 'yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical')
13 files changed, 24640 insertions, 0 deletions
diff --git a/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/applyparallelworker.c b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/applyparallelworker.c new file mode 100644 index 00000000000..889e5f1edf3 --- /dev/null +++ b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/applyparallelworker.c @@ -0,0 +1,1642 @@ +/*------------------------------------------------------------------------- + * applyparallelworker.c + * Support routines for applying xact by parallel apply worker + * + * Copyright (c) 2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/applyparallelworker.c + * + * This file contains the code to launch, set up, and teardown a parallel apply + * worker which receives the changes from the leader worker and invokes routines + * to apply those on the subscriber database. Additionally, this file contains + * routines that are intended to support setting up, using, and tearing down a + * ParallelApplyWorkerInfo which is required so the leader worker and parallel + * apply workers can communicate with each other. + * + * The parallel apply workers are assigned (if available) as soon as xact's + * first stream is received for subscriptions that have set their 'streaming' + * option as parallel. The leader apply worker will send changes to this new + * worker via shared memory. We keep this worker assigned till the transaction + * commit is received and also wait for the worker to finish at commit. This + * preserves commit ordering and avoid file I/O in most cases, although we + * still need to spill to a file if there is no worker available. See comments + * atop logical/worker to know more about streamed xacts whose changes are + * spilled to disk. It is important to maintain commit order to avoid failures + * due to: (a) transaction dependencies - say if we insert a row in the first + * transaction and update it in the second transaction on publisher then + * allowing the subscriber to apply both in parallel can lead to failure in the + * update; (b) deadlocks - allowing transactions that update the same set of + * rows/tables in the opposite order to be applied in parallel can lead to + * deadlocks. + * + * A worker pool is used to avoid restarting workers for each streaming + * transaction. We maintain each worker's information (ParallelApplyWorkerInfo) + * in the ParallelApplyWorkerPool. After successfully launching a new worker, + * its information is added to the ParallelApplyWorkerPool. Once the worker + * finishes applying the transaction, it is marked as available for re-use. + * Now, before starting a new worker to apply the streaming transaction, we + * check the list for any available worker. Note that we retain a maximum of + * half the max_parallel_apply_workers_per_subscription workers in the pool and + * after that, we simply exit the worker after applying the transaction. + * + * XXX This worker pool threshold is arbitrary and we can provide a GUC + * variable for this in the future if required. + * + * The leader apply worker will create a separate dynamic shared memory segment + * when each parallel apply worker starts. The reason for this design is that + * we cannot predict how many workers will be needed. 
It may be possible to + * allocate enough shared memory in one segment based on the maximum number of + * parallel apply workers (max_parallel_apply_workers_per_subscription), but + * this would waste memory if no process is actually started. + * + * The dynamic shared memory segment contains: (a) a shm_mq that is used to + * send changes in the transaction from leader apply worker to parallel apply + * worker; (b) another shm_mq that is used to send errors (and other messages + * reported via elog/ereport) from the parallel apply worker to leader apply + * worker; (c) necessary information to be shared among parallel apply workers + * and the leader apply worker (i.e. members of ParallelApplyWorkerShared). + * + * Locking Considerations + * ---------------------- + * We have a risk of deadlock due to concurrently applying the transactions in + * parallel mode that were independent on the publisher side but became + * dependent on the subscriber side due to the different database structures + * (like schema of subscription tables, constraints, etc.) on each side. This + * can happen even without parallel mode when there are concurrent operations + * on the subscriber. In order to detect the deadlocks among leader (LA) and + * parallel apply (PA) workers, we used lmgr locks when the PA waits for the + * next stream (set of changes) and LA waits for PA to finish the transaction. + * An alternative approach could be to not allow parallelism when the schema of + * tables is different between the publisher and subscriber but that would be + * too restrictive and would require the publisher to send much more + * information than it is currently sending. + * + * Consider a case where the subscribed table does not have a unique key on the + * publisher and has a unique key on the subscriber. The deadlock can happen in + * the following ways: + * + * 1) Deadlock between the leader apply worker and a parallel apply worker + * + * Consider that the parallel apply worker (PA) is executing TX-1 and the + * leader apply worker (LA) is executing TX-2 concurrently on the subscriber. + * Now, LA is waiting for PA because of the unique key constraint of the + * subscribed table while PA is waiting for LA to send the next stream of + * changes or transaction finish command message. + * + * In order for lmgr to detect this, we have LA acquire a session lock on the + * remote transaction (by pa_lock_stream()) and have PA wait on the lock before + * trying to receive the next stream of changes. Specifically, LA will acquire + * the lock in AccessExclusive mode before sending the STREAM_STOP and will + * release it if already acquired after sending the STREAM_START, STREAM_ABORT + * (for toplevel transaction), STREAM_PREPARE, and STREAM_COMMIT. The PA will + * acquire the lock in AccessShare mode after processing STREAM_STOP and + * STREAM_ABORT (for subtransaction) and then release the lock immediately + * after acquiring it. + * + * The lock graph for the above example will look as follows: + * LA (waiting to acquire the lock on the unique index) -> PA (waiting to + * acquire the stream lock) -> LA + * + * This way, when PA is waiting for LA for the next stream of changes, we can + * have a wait-edge from PA to LA in lmgr, which will make us detect the + * deadlock between LA and PA. + * + * 2) Deadlock between the leader apply worker and parallel apply workers + * + * This scenario is similar to the first case but TX-1 and TX-2 are executed by + * two parallel apply workers (PA-1 and PA-2 respectively). 
In this scenario, + * PA-2 is waiting for PA-1 to complete its transaction while PA-1 is waiting + * for subsequent input from LA. Also, LA is waiting for PA-2 to complete its + * transaction in order to preserve the commit order. There is a deadlock among + * the three processes. + * + * In order for lmgr to detect this, we have PA acquire a session lock (this is + * a different lock than referred in the previous case, see + * pa_lock_transaction()) on the transaction being applied and have LA wait on + * the lock before proceeding in the transaction finish commands. Specifically, + * PA will acquire this lock in AccessExclusive mode before executing the first + * message of the transaction and release it at the xact end. LA will acquire + * this lock in AccessShare mode at transaction finish commands and release it + * immediately. + * + * The lock graph for the above example will look as follows: + * LA (waiting to acquire the transaction lock) -> PA-2 (waiting to acquire the + * lock due to unique index constraint) -> PA-1 (waiting to acquire the stream + * lock) -> LA + * + * This way when LA is waiting to finish the transaction end command to preserve + * the commit order, we will be able to detect deadlock, if any. + * + * One might think we can use XactLockTableWait(), but XactLockTableWait() + * considers PREPARED TRANSACTION as still in progress which means the lock + * won't be released even after the parallel apply worker has prepared the + * transaction. + * + * 3) Deadlock when the shm_mq buffer is full + * + * In the previous scenario (ie. PA-1 and PA-2 are executing transactions + * concurrently), if the shm_mq buffer between LA and PA-2 is full, LA has to + * wait to send messages, and this wait doesn't appear in lmgr. + * + * To avoid this wait, we use a non-blocking write and wait with a timeout. If + * the timeout is exceeded, the LA will serialize all the pending messages to + * a file and indicate PA-2 that it needs to read that file for the remaining + * messages. Then LA will start waiting for commit as in the previous case + * which will detect deadlock if any. See pa_send_data() and + * enum TransApplyAction. + * + * Lock types + * ---------- + * Both the stream lock and the transaction lock mentioned above are + * session-level locks because both locks could be acquired outside the + * transaction, and the stream lock in the leader needs to persist across + * transaction boundaries i.e. until the end of the streaming transaction. + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "libpq/pqformat.h" +#include "libpq/pqmq.h" +#include "pgstat.h" +#include "postmaster/interrupt.h" +#include "replication/logicallauncher.h" +#include "replication/logicalworker.h" +#include "replication/origin.h" +#include "replication/worker_internal.h" +#include "storage/ipc.h" +#include "storage/lmgr.h" +#include "tcop/tcopprot.h" +#include "utils/inval.h" +#include "utils/memutils.h" +#include "utils/syscache.h" + +#define PG_LOGICAL_APPLY_SHM_MAGIC 0x787ca067 + +/* + * DSM keys for parallel apply worker. Unlike other parallel execution code, + * since we don't need to worry about DSM keys conflicting with plan_node_id we + * can use small integers. + */ +#define PARALLEL_APPLY_KEY_SHARED 1 +#define PARALLEL_APPLY_KEY_MQ 2 +#define PARALLEL_APPLY_KEY_ERROR_QUEUE 3 + +/* Queue size of DSM, 16 MB for now. */ +#define DSM_QUEUE_SIZE (16 * 1024 * 1024) + +/* + * Error queue size of DSM. 
It is desirable to make it large enough that a + * typical ErrorResponse can be sent without blocking. That way, a worker that + * errors out can write the whole message into the queue and terminate without + * waiting for the user backend. + */ +#define DSM_ERROR_QUEUE_SIZE (16 * 1024) + +/* + * There are three fields in each message received by the parallel apply + * worker: start_lsn, end_lsn and send_time. Because we have updated these + * statistics in the leader apply worker, we can ignore these fields in the + * parallel apply worker (see function LogicalRepApplyLoop). + */ +#define SIZE_STATS_MESSAGE (2 * sizeof(XLogRecPtr) + sizeof(TimestampTz)) + +/* + * The type of session-level lock on a transaction being applied on a logical + * replication subscriber. + */ +#define PARALLEL_APPLY_LOCK_STREAM 0 +#define PARALLEL_APPLY_LOCK_XACT 1 + +/* + * Hash table entry to map xid to the parallel apply worker state. + */ +typedef struct ParallelApplyWorkerEntry +{ + TransactionId xid; /* Hash key -- must be first */ + ParallelApplyWorkerInfo *winfo; +} ParallelApplyWorkerEntry; + +/* + * A hash table used to cache the state of streaming transactions being applied + * by the parallel apply workers. + */ +static __thread HTAB *ParallelApplyTxnHash = NULL; + +/* +* A list (pool) of active parallel apply workers. The information for +* the new worker is added to the list after successfully launching it. The +* list entry is removed if there are already enough workers in the worker +* pool at the end of the transaction. For more information about the worker +* pool, see comments atop this file. + */ +static __thread List *ParallelApplyWorkerPool = NIL; + +/* + * Information shared between leader apply worker and parallel apply worker. + */ +__thread ParallelApplyWorkerShared *MyParallelShared = NULL; + +/* + * Is there a message sent by a parallel apply worker that the leader apply + * worker needs to receive? + */ +__thread volatile sig_atomic_t ParallelApplyMessagePending = false; + +/* + * Cache the parallel apply worker information required for applying the + * current streaming transaction. It is used to save the cost of searching the + * hash table when applying the changes between STREAM_START and STREAM_STOP. + */ +static __thread ParallelApplyWorkerInfo *stream_apply_worker = NULL; + +/* A list to maintain subtransactions, if any. */ +static __thread List *subxactlist = NIL; + +static void pa_free_worker_info(ParallelApplyWorkerInfo *winfo); +static ParallelTransState pa_get_xact_state(ParallelApplyWorkerShared *wshared); +static PartialFileSetState pa_get_fileset_state(void); + +/* + * Returns true if it is OK to start a parallel apply worker, false otherwise. + */ +static bool +pa_can_start(void) +{ + /* Only leader apply workers can start parallel apply workers. */ + if (!am_leader_apply_worker()) + return false; + + /* + * It is good to check for any change in the subscription parameter to + * avoid the case where for a very long time the change doesn't get + * reflected. This can happen when there is a constant flow of streaming + * transactions that are handled by parallel apply workers. + * + * It is better to do it before the below checks so that the latest values + * of subscription can be used for the checks. + */ + maybe_reread_subscription(); + + /* + * Don't start a new parallel apply worker if the subscription is not + * using parallel streaming mode, or if the publisher does not support + * parallel apply. 
+ */ + if (!MyLogicalRepWorker->parallel_apply) + return false; + + /* + * Don't start a new parallel worker if user has set skiplsn as it's + * possible that they want to skip the streaming transaction. For + * streaming transactions, we need to serialize the transaction to a file + * so that we can get the last LSN of the transaction to judge whether to + * skip before starting to apply the change. + * + * One might think that we could allow parallelism if the first lsn of the + * transaction is greater than skiplsn, but we don't send it with the + * STREAM START message, and it doesn't seem worth sending the extra eight + * bytes with the STREAM START to enable parallelism for this case. + */ + if (!XLogRecPtrIsInvalid(MySubscription->skiplsn)) + return false; + + /* + * For streaming transactions that are being applied using a parallel + * apply worker, we cannot decide whether to apply the change for a + * relation that is not in the READY state (see + * should_apply_changes_for_rel) as we won't know remote_final_lsn by that + * time. So, we don't start the new parallel apply worker in this case. + */ + if (!AllTablesyncsReady()) + return false; + + return true; +} + +/* + * Set up a dynamic shared memory segment. + * + * We set up a control region that contains a fixed-size worker info + * (ParallelApplyWorkerShared), a message queue, and an error queue. + * + * Returns true on success, false on failure. + */ +static bool +pa_setup_dsm(ParallelApplyWorkerInfo *winfo) +{ + shm_toc_estimator e; + Size segsize; + dsm_segment *seg; + shm_toc *toc; + ParallelApplyWorkerShared *shared; + shm_mq *mq; + Size queue_size = DSM_QUEUE_SIZE; + Size error_queue_size = DSM_ERROR_QUEUE_SIZE; + + /* + * Estimate how much shared memory we need. + * + * Because the TOC machinery may choose to insert padding of oddly-sized + * requests, we must estimate each chunk separately. + * + * We need one key to register the location of the header, and two other + * keys to track the locations of the message queue and the error message + * queue. + */ + shm_toc_initialize_estimator(&e); + shm_toc_estimate_chunk(&e, sizeof(ParallelApplyWorkerShared)); + shm_toc_estimate_chunk(&e, queue_size); + shm_toc_estimate_chunk(&e, error_queue_size); + + shm_toc_estimate_keys(&e, 3); + segsize = shm_toc_estimate(&e); + + /* Create the shared memory segment and establish a table of contents. */ + seg = dsm_create(shm_toc_estimate(&e), 0); + if (!seg) + return false; + + toc = shm_toc_create(PG_LOGICAL_APPLY_SHM_MAGIC, dsm_segment_address(seg), + segsize); + + /* Set up the header region. */ + shared = shm_toc_allocate(toc, sizeof(ParallelApplyWorkerShared)); + SpinLockInit(&shared->mutex); + + shared->xact_state = PARALLEL_TRANS_UNKNOWN; + pg_atomic_init_u32(&(shared->pending_stream_count), 0); + shared->last_commit_end = InvalidXLogRecPtr; + shared->fileset_state = FS_EMPTY; + + shm_toc_insert(toc, PARALLEL_APPLY_KEY_SHARED, shared); + + /* Set up message queue for the worker. */ + mq = shm_mq_create(shm_toc_allocate(toc, queue_size), queue_size); + shm_toc_insert(toc, PARALLEL_APPLY_KEY_MQ, mq); + shm_mq_set_sender(mq, MyProc); + + /* Attach the queue. */ + winfo->mq_handle = shm_mq_attach(mq, seg, NULL); + + /* Set up error queue for the worker. */ + mq = shm_mq_create(shm_toc_allocate(toc, error_queue_size), + error_queue_size); + shm_toc_insert(toc, PARALLEL_APPLY_KEY_ERROR_QUEUE, mq); + shm_mq_set_receiver(mq, MyProc); + + /* Attach the queue. 
*/ + winfo->error_mq_handle = shm_mq_attach(mq, seg, NULL); + + /* Return results to caller. */ + winfo->dsm_seg = seg; + winfo->shared = shared; + + return true; +} + +/* + * Try to get a parallel apply worker from the pool. If none is available then + * start a new one. + */ +static ParallelApplyWorkerInfo * +pa_launch_parallel_worker(void) +{ + MemoryContext oldcontext; + bool launched; + ParallelApplyWorkerInfo *winfo; + ListCell *lc; + + /* Try to get an available parallel apply worker from the worker pool. */ + foreach(lc, ParallelApplyWorkerPool) + { + winfo = (ParallelApplyWorkerInfo *) lfirst(lc); + + if (!winfo->in_use) + return winfo; + } + + /* + * Start a new parallel apply worker. + * + * The worker info can be used for the lifetime of the worker process, so + * create it in a permanent context. + */ + oldcontext = MemoryContextSwitchTo(ApplyContext); + + winfo = (ParallelApplyWorkerInfo *) palloc0(sizeof(ParallelApplyWorkerInfo)); + + /* Setup shared memory. */ + if (!pa_setup_dsm(winfo)) + { + MemoryContextSwitchTo(oldcontext); + pfree(winfo); + return NULL; + } + + launched = logicalrep_worker_launch(MyLogicalRepWorker->dbid, + MySubscription->oid, + MySubscription->name, + MyLogicalRepWorker->userid, + InvalidOid, + dsm_segment_handle(winfo->dsm_seg)); + + if (launched) + { + ParallelApplyWorkerPool = lappend(ParallelApplyWorkerPool, winfo); + } + else + { + pa_free_worker_info(winfo); + winfo = NULL; + } + + MemoryContextSwitchTo(oldcontext); + + return winfo; +} + +/* + * Allocate a parallel apply worker that will be used for the specified xid. + * + * We first try to get an available worker from the pool, if any and then try + * to launch a new worker. On successful allocation, remember the worker + * information in the hash table so that we can get it later for processing the + * streaming changes. + */ +void +pa_allocate_worker(TransactionId xid) +{ + bool found; + ParallelApplyWorkerInfo *winfo = NULL; + ParallelApplyWorkerEntry *entry; + + if (!pa_can_start()) + return; + + winfo = pa_launch_parallel_worker(); + if (!winfo) + return; + + /* First time through, initialize parallel apply worker state hashtable. */ + if (!ParallelApplyTxnHash) + { + HASHCTL ctl; + + MemSet(&ctl, 0, sizeof(ctl)); + ctl.keysize = sizeof(TransactionId); + ctl.entrysize = sizeof(ParallelApplyWorkerEntry); + ctl.hcxt = ApplyContext; + + ParallelApplyTxnHash = hash_create("logical replication parallel apply workers hash", + 16, &ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + } + + /* Create an entry for the requested transaction. */ + entry = hash_search(ParallelApplyTxnHash, &xid, HASH_ENTER, &found); + if (found) + elog(ERROR, "hash table corrupted"); + + /* Update the transaction information in shared memory. */ + SpinLockAcquire(&winfo->shared->mutex); + winfo->shared->xact_state = PARALLEL_TRANS_UNKNOWN; + winfo->shared->xid = xid; + SpinLockRelease(&winfo->shared->mutex); + + winfo->in_use = true; + winfo->serialize_changes = false; + entry->winfo = winfo; + entry->xid = xid; +} + +/* + * Find the assigned worker for the given transaction, if any. + */ +ParallelApplyWorkerInfo * +pa_find_worker(TransactionId xid) +{ + bool found; + ParallelApplyWorkerEntry *entry; + + if (!TransactionIdIsValid(xid)) + return NULL; + + if (!ParallelApplyTxnHash) + return NULL; + + /* Return the cached parallel apply worker if valid. */ + if (stream_apply_worker) + return stream_apply_worker; + + /* Find an entry for the requested transaction. 
*/ + entry = hash_search(ParallelApplyTxnHash, &xid, HASH_FIND, &found); + if (found) + { + /* The worker must not have exited. */ + Assert(entry->winfo->in_use); + return entry->winfo; + } + + return NULL; +} + +/* + * Makes the worker available for reuse. + * + * This removes the parallel apply worker entry from the hash table so that it + * can't be used. If there are enough workers in the pool, it stops the worker + * and frees the corresponding info. Otherwise it just marks the worker as + * available for reuse. + * + * For more information about the worker pool, see comments atop this file. + */ +static void +pa_free_worker(ParallelApplyWorkerInfo *winfo) +{ + Assert(!am_parallel_apply_worker()); + Assert(winfo->in_use); + Assert(pa_get_xact_state(winfo->shared) == PARALLEL_TRANS_FINISHED); + + if (!hash_search(ParallelApplyTxnHash, &winfo->shared->xid, HASH_REMOVE, NULL)) + elog(ERROR, "hash table corrupted"); + + /* + * Stop the worker if there are enough workers in the pool. + * + * XXX Additionally, we also stop the worker if the leader apply worker + * serialize part of the transaction data due to a send timeout. This is + * because the message could be partially written to the queue and there + * is no way to clean the queue other than resending the message until it + * succeeds. Instead of trying to send the data which anyway would have + * been serialized and then letting the parallel apply worker deal with + * the spurious message, we stop the worker. + */ + if (winfo->serialize_changes || + list_length(ParallelApplyWorkerPool) > + (max_parallel_apply_workers_per_subscription / 2)) + { + logicalrep_pa_worker_stop(winfo); + pa_free_worker_info(winfo); + + return; + } + + winfo->in_use = false; + winfo->serialize_changes = false; +} + +/* + * Free the parallel apply worker information and unlink the files with + * serialized changes if any. + */ +static void +pa_free_worker_info(ParallelApplyWorkerInfo *winfo) +{ + Assert(winfo); + + if (winfo->mq_handle) + shm_mq_detach(winfo->mq_handle); + + if (winfo->error_mq_handle) + shm_mq_detach(winfo->error_mq_handle); + + /* Unlink the files with serialized changes. */ + if (winfo->serialize_changes) + stream_cleanup_files(MyLogicalRepWorker->subid, winfo->shared->xid); + + if (winfo->dsm_seg) + dsm_detach(winfo->dsm_seg); + + /* Remove from the worker pool. */ + ParallelApplyWorkerPool = list_delete_ptr(ParallelApplyWorkerPool, winfo); + + pfree(winfo); +} + +/* + * Detach the error queue for all parallel apply workers. + */ +void +pa_detach_all_error_mq(void) +{ + ListCell *lc; + + foreach(lc, ParallelApplyWorkerPool) + { + ParallelApplyWorkerInfo *winfo = (ParallelApplyWorkerInfo *) lfirst(lc); + + if (winfo->error_mq_handle) + { + shm_mq_detach(winfo->error_mq_handle); + winfo->error_mq_handle = NULL; + } + } +} + +/* + * Check if there are any pending spooled messages. + */ +static bool +pa_has_spooled_message_pending() +{ + PartialFileSetState fileset_state; + + fileset_state = pa_get_fileset_state(); + + return (fileset_state != FS_EMPTY); +} + +/* + * Replay the spooled messages once the leader apply worker has finished + * serializing changes to the file. + * + * Returns false if there aren't any pending spooled messages, true otherwise. 
+ */ +static bool +pa_process_spooled_messages_if_required(void) +{ + PartialFileSetState fileset_state; + + fileset_state = pa_get_fileset_state(); + + if (fileset_state == FS_EMPTY) + return false; + + /* + * If the leader apply worker is busy serializing the partial changes then + * acquire the stream lock now and wait for the leader worker to finish + * serializing the changes. Otherwise, the parallel apply worker won't get + * a chance to receive a STREAM_STOP (and acquire the stream lock) until + * the leader had serialized all changes which can lead to undetected + * deadlock. + * + * Note that the fileset state can be FS_SERIALIZE_DONE once the leader + * worker has finished serializing the changes. + */ + if (fileset_state == FS_SERIALIZE_IN_PROGRESS) + { + pa_lock_stream(MyParallelShared->xid, AccessShareLock); + pa_unlock_stream(MyParallelShared->xid, AccessShareLock); + + fileset_state = pa_get_fileset_state(); + } + + /* + * We cannot read the file immediately after the leader has serialized all + * changes to the file because there may still be messages in the memory + * queue. We will apply all spooled messages the next time we call this + * function and that will ensure there are no messages left in the memory + * queue. + */ + if (fileset_state == FS_SERIALIZE_DONE) + { + pa_set_fileset_state(MyParallelShared, FS_READY); + } + else if (fileset_state == FS_READY) + { + apply_spooled_messages(&MyParallelShared->fileset, + MyParallelShared->xid, + InvalidXLogRecPtr); + pa_set_fileset_state(MyParallelShared, FS_EMPTY); + } + + return true; +} + +/* + * Interrupt handler for main loop of parallel apply worker. + */ +static void +ProcessParallelApplyInterrupts(void) +{ + CHECK_FOR_INTERRUPTS(); + + if (ShutdownRequestPending) + { + ereport(LOG, + (errmsg("logical replication parallel apply worker for subscription \"%s\" has finished", + MySubscription->name))); + + proc_exit(0); + } + + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } +} + +/* Parallel apply worker main loop. */ +static void +LogicalParallelApplyLoop(shm_mq_handle *mqh) +{ + shm_mq_result shmq_res; + ErrorContextCallback errcallback; + MemoryContext oldcxt = CurrentMemoryContext; + + /* + * Init the ApplyMessageContext which we clean up after each replication + * protocol message. + */ + ApplyMessageContext = AllocSetContextCreate(ApplyContext, + "ApplyMessageContext", + ALLOCSET_DEFAULT_SIZES); + + /* + * Push apply error context callback. Fields will be filled while applying + * a change. + */ + errcallback.callback = apply_error_callback; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + for (;;) + { + void *data; + Size len; + + ProcessParallelApplyInterrupts(); + + /* Ensure we are reading the data into our memory context. */ + MemoryContextSwitchTo(ApplyMessageContext); + + shmq_res = shm_mq_receive(mqh, &len, &data, true); + + if (shmq_res == SHM_MQ_SUCCESS) + { + StringInfoData s; + int c; + + if (len == 0) + elog(ERROR, "invalid message length"); + + s.cursor = 0; + s.maxlen = -1; + s.data = (char *) data; + s.len = len; + + /* + * The first byte of messages sent from leader apply worker to + * parallel apply workers can only be 'w'. + */ + c = pq_getmsgbyte(&s); + if (c != 'w') + elog(ERROR, "unexpected message \"%c\"", c); + + /* + * Ignore statistics fields that have been updated by the leader + * apply worker. 
+ * + * XXX We can avoid sending the statistics fields from the leader + * apply worker but for that, it needs to rebuild the entire + * message by removing these fields which could be more work than + * simply ignoring these fields in the parallel apply worker. + */ + s.cursor += SIZE_STATS_MESSAGE; + + apply_dispatch(&s); + } + else if (shmq_res == SHM_MQ_WOULD_BLOCK) + { + /* Replay the changes from the file, if any. */ + if (!pa_process_spooled_messages_if_required()) + { + int rc; + + /* Wait for more work. */ + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 1000L, + WAIT_EVENT_LOGICAL_PARALLEL_APPLY_MAIN); + + if (rc & WL_LATCH_SET) + ResetLatch(MyLatch); + } + } + else + { + Assert(shmq_res == SHM_MQ_DETACHED); + + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("lost connection to the logical replication apply worker"))); + } + + MemoryContextReset(ApplyMessageContext); + MemoryContextSwitchTo(oldcxt); + } + + /* Pop the error context stack. */ + error_context_stack = errcallback.previous; + + MemoryContextSwitchTo(oldcxt); +} + +/* + * Make sure the leader apply worker tries to read from our error queue one more + * time. This guards against the case where we exit uncleanly without sending + * an ErrorResponse, for example because some code calls proc_exit directly. + * + * Also explicitly detach from dsm segment to invoke on_dsm_detach callbacks, + * if any. See ParallelWorkerShutdown for details. + */ +static void +pa_shutdown(int code, Datum arg) +{ + SendProcSignal(MyLogicalRepWorker->leader_pid, + PROCSIG_PARALLEL_APPLY_MESSAGE, + InvalidBackendId); + + dsm_detach((dsm_segment *) DatumGetPointer(arg)); +} + +/* + * Parallel apply worker entry point. + */ +void +ParallelApplyWorkerMain(Datum main_arg) +{ + ParallelApplyWorkerShared *shared; + dsm_handle handle; + dsm_segment *seg; + shm_toc *toc; + shm_mq *mq; + shm_mq_handle *mqh; + shm_mq_handle *error_mqh; + RepOriginId originid; + int worker_slot = DatumGetInt32(main_arg); + char originname[NAMEDATALEN]; + + InitializingApplyWorker = true; + + /* Setup signal handling. */ + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGINT, SignalHandlerForShutdownRequest); + pqsignal(SIGTERM, die); + BackgroundWorkerUnblockSignals(); + + /* + * Attach to the dynamic shared memory segment for the parallel apply, and + * find its table of contents. + * + * Like parallel query, we don't need resource owner by this time. See + * ParallelWorkerMain. + */ + memcpy(&handle, MyBgworkerEntry->bgw_extra, sizeof(dsm_handle)); + seg = dsm_attach(handle); + if (!seg) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not map dynamic shared memory segment"))); + + toc = shm_toc_attach(PG_LOGICAL_APPLY_SHM_MAGIC, dsm_segment_address(seg)); + if (!toc) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid magic number in dynamic shared memory segment"))); + + /* Look up the shared information. */ + shared = shm_toc_lookup(toc, PARALLEL_APPLY_KEY_SHARED, false); + MyParallelShared = shared; + + /* + * Attach to the message queue. + */ + mq = shm_toc_lookup(toc, PARALLEL_APPLY_KEY_MQ, false); + shm_mq_set_receiver(mq, MyProc); + mqh = shm_mq_attach(mq, seg, NULL); + + /* + * Primary initialization is complete. Now, we can attach to our slot. + * This is to ensure that the leader apply worker does not write data to + * the uninitialized memory queue. 
+ */ + logicalrep_worker_attach(worker_slot); + + /* + * Register the shutdown callback after we are attached to the worker + * slot. This is to ensure that MyLogicalRepWorker remains valid when this + * callback is invoked. + */ + before_shmem_exit(pa_shutdown, PointerGetDatum(seg)); + + SpinLockAcquire(&MyParallelShared->mutex); + MyParallelShared->logicalrep_worker_generation = MyLogicalRepWorker->generation; + MyParallelShared->logicalrep_worker_slot_no = worker_slot; + SpinLockRelease(&MyParallelShared->mutex); + + /* + * Attach to the error queue. + */ + mq = shm_toc_lookup(toc, PARALLEL_APPLY_KEY_ERROR_QUEUE, false); + shm_mq_set_sender(mq, MyProc); + error_mqh = shm_mq_attach(mq, seg, NULL); + + pq_redirect_to_shm_mq(seg, error_mqh); + pq_set_parallel_leader(MyLogicalRepWorker->leader_pid, + InvalidBackendId); + + MyLogicalRepWorker->last_send_time = MyLogicalRepWorker->last_recv_time = + MyLogicalRepWorker->reply_time = 0; + + InitializeApplyWorker(); + + InitializingApplyWorker = false; + + /* Setup replication origin tracking. */ + StartTransactionCommand(); + ReplicationOriginNameForLogicalRep(MySubscription->oid, InvalidOid, + originname, sizeof(originname)); + originid = replorigin_by_name(originname, false); + + /* + * The parallel apply worker doesn't need to monopolize this replication + * origin which was already acquired by its leader process. + */ + replorigin_session_setup(originid, MyLogicalRepWorker->leader_pid); + replorigin_session_origin = originid; + CommitTransactionCommand(); + + /* + * Setup callback for syscache so that we know when something changes in + * the subscription relation state. + */ + CacheRegisterSyscacheCallback(SUBSCRIPTIONRELMAP, + invalidate_syncing_table_states, + (Datum) 0); + + set_apply_error_context_origin(originname); + + LogicalParallelApplyLoop(mqh); + + /* + * The parallel apply worker must not get here because the parallel apply + * worker will only stop when it receives a SIGTERM or SIGINT from the + * leader, or when there is an error. None of these cases will allow the + * code to reach here. + */ + Assert(false); +} + +/* + * Handle receipt of an interrupt indicating a parallel apply worker message. + * + * Note: this is called within a signal handler! All we can do is set a flag + * that will cause the next CHECK_FOR_INTERRUPTS() to invoke + * HandleParallelApplyMessages(). + */ +void +HandleParallelApplyMessageInterrupt(void) +{ + InterruptPending = true; + ParallelApplyMessagePending = true; + SetLatch(MyLatch); +} + +/* + * Handle a single protocol message received from a single parallel apply + * worker. + */ +static void +HandleParallelApplyMessage(StringInfo msg) +{ + char msgtype; + + msgtype = pq_getmsgbyte(msg); + + switch (msgtype) + { + case 'E': /* ErrorResponse */ + { + ErrorData edata; + + /* Parse ErrorResponse. */ + pq_parse_errornotice(msg, &edata); + + /* + * If desired, add a context line to show that this is a + * message propagated from a parallel apply worker. Otherwise, + * it can sometimes be confusing to understand what actually + * happened. + */ + if (edata.context) + edata.context = psprintf("%s\n%s", edata.context, + _("logical replication parallel apply worker")); + else + edata.context = pstrdup(_("logical replication parallel apply worker")); + + /* + * Context beyond that should use the error context callbacks + * that were in effect in LogicalRepApplyLoop(). 
+ */ + error_context_stack = apply_error_context_stack; + + /* + * The actual error must have been reported by the parallel + * apply worker. + */ + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication parallel apply worker exited due to error"), + errcontext("%s", edata.context))); + } + + /* + * Don't need to do anything about NoticeResponse and + * NotifyResponse as the logical replication worker doesn't need + * to send messages to the client. + */ + case 'N': + case 'A': + break; + + default: + elog(ERROR, "unrecognized message type received from logical replication parallel apply worker: %c (message length %d bytes)", + msgtype, msg->len); + } +} + +/* + * Handle any queued protocol messages received from parallel apply workers. + */ +void +HandleParallelApplyMessages(void) +{ + ListCell *lc; + MemoryContext oldcontext; + + static __thread MemoryContext hpam_context = NULL; + + /* + * This is invoked from ProcessInterrupts(), and since some of the + * functions it calls contain CHECK_FOR_INTERRUPTS(), there is a potential + * for recursive calls if more signals are received while this runs. It's + * unclear that recursive entry would be safe, and it doesn't seem useful + * even if it is safe, so let's block interrupts until done. + */ + HOLD_INTERRUPTS(); + + /* + * Moreover, CurrentMemoryContext might be pointing almost anywhere. We + * don't want to risk leaking data into long-lived contexts, so let's do + * our work here in a private context that we can reset on each use. + */ + if (!hpam_context) /* first time through? */ + hpam_context = AllocSetContextCreate(TopMemoryContext, + "HandleParallelApplyMessages", + ALLOCSET_DEFAULT_SIZES); + else + MemoryContextReset(hpam_context); + + oldcontext = MemoryContextSwitchTo(hpam_context); + + ParallelApplyMessagePending = false; + + foreach(lc, ParallelApplyWorkerPool) + { + shm_mq_result res; + Size nbytes; + void *data; + ParallelApplyWorkerInfo *winfo = (ParallelApplyWorkerInfo *) lfirst(lc); + + /* + * The leader will detach from the error queue and set it to NULL + * before preparing to stop all parallel apply workers, so we don't + * need to handle error messages anymore. See + * logicalrep_worker_detach. + */ + if (!winfo->error_mq_handle) + continue; + + res = shm_mq_receive(winfo->error_mq_handle, &nbytes, &data, true); + + if (res == SHM_MQ_WOULD_BLOCK) + continue; + else if (res == SHM_MQ_SUCCESS) + { + StringInfoData msg; + + initStringInfo(&msg); + appendBinaryStringInfo(&msg, data, nbytes); + HandleParallelApplyMessage(&msg); + pfree(msg.data); + } + else + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("lost connection to the logical replication parallel apply worker"))); + } + + MemoryContextSwitchTo(oldcontext); + + /* Might as well clear the context on our way out */ + MemoryContextReset(hpam_context); + + RESUME_INTERRUPTS(); +} + +/* + * Send the data to the specified parallel apply worker via shared-memory + * queue. + * + * Returns false if the attempt to send data via shared memory times out, true + * otherwise. + */ +bool +pa_send_data(ParallelApplyWorkerInfo *winfo, Size nbytes, const void *data) +{ + int rc; + shm_mq_result result; + TimestampTz startTime = 0; + + Assert(!IsTransactionState()); + Assert(!winfo->serialize_changes); + + /* + * We don't try to send data to parallel worker for 'immediate' mode. This + * is primarily used for testing purposes. 
+ */ + if (unlikely(debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_IMMEDIATE)) + return false; + +/* + * This timeout is a bit arbitrary but testing revealed that it is sufficient + * to send the message unless the parallel apply worker is waiting on some + * lock or there is a serious resource crunch. See the comments atop this file + * to know why we are using a non-blocking way to send the message. + */ +#define SHM_SEND_RETRY_INTERVAL_MS 1000 +#define SHM_SEND_TIMEOUT_MS (10000 - SHM_SEND_RETRY_INTERVAL_MS) + + for (;;) + { + result = shm_mq_send(winfo->mq_handle, nbytes, data, true, true); + + if (result == SHM_MQ_SUCCESS) + return true; + else if (result == SHM_MQ_DETACHED) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not send data to shared-memory queue"))); + + Assert(result == SHM_MQ_WOULD_BLOCK); + + /* Wait before retrying. */ + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + SHM_SEND_RETRY_INTERVAL_MS, + WAIT_EVENT_LOGICAL_APPLY_SEND_DATA); + + if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + CHECK_FOR_INTERRUPTS(); + } + + if (startTime == 0) + startTime = GetCurrentTimestamp(); + else if (TimestampDifferenceExceeds(startTime, GetCurrentTimestamp(), + SHM_SEND_TIMEOUT_MS)) + return false; + } +} + +/* + * Switch to PARTIAL_SERIALIZE mode for the current transaction -- this means + * that the current data and any subsequent data for this transaction will be + * serialized to a file. This is done to prevent possible deadlocks with + * another parallel apply worker (refer to the comments atop this file). + */ +void +pa_switch_to_partial_serialize(ParallelApplyWorkerInfo *winfo, + bool stream_locked) +{ + ereport(LOG, + (errmsg("logical replication apply worker will serialize the remaining changes of remote transaction %u to a file", + winfo->shared->xid))); + + /* + * The parallel apply worker could be stuck for some reason (say waiting + * on some lock by other backend), so stop trying to send data directly to + * it and start serializing data to the file instead. + */ + winfo->serialize_changes = true; + + /* Initialize the stream fileset. */ + stream_start_internal(winfo->shared->xid, true); + + /* + * Acquires the stream lock if not already to make sure that the parallel + * apply worker will wait for the leader to release the stream lock until + * the end of the transaction. + */ + if (!stream_locked) + pa_lock_stream(winfo->shared->xid, AccessExclusiveLock); + + pa_set_fileset_state(winfo->shared, FS_SERIALIZE_IN_PROGRESS); +} + +/* + * Wait until the parallel apply worker's transaction state has reached or + * exceeded the given xact_state. + */ +static void +pa_wait_for_xact_state(ParallelApplyWorkerInfo *winfo, + ParallelTransState xact_state) +{ + for (;;) + { + /* + * Stop if the transaction state has reached or exceeded the given + * xact_state. + */ + if (pa_get_xact_state(winfo->shared) >= xact_state) + break; + + /* Wait to be signalled. */ + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 10L, + WAIT_EVENT_LOGICAL_PARALLEL_APPLY_STATE_CHANGE); + + /* Reset the latch so we don't spin. */ + ResetLatch(MyLatch); + + /* An interrupt may have occurred while we were waiting. */ + CHECK_FOR_INTERRUPTS(); + } +} + +/* + * Wait until the parallel apply worker's transaction finishes. 
+ */ +static void +pa_wait_for_xact_finish(ParallelApplyWorkerInfo *winfo) +{ + /* + * Wait until the parallel apply worker set the state to + * PARALLEL_TRANS_STARTED which means it has acquired the transaction + * lock. This is to prevent leader apply worker from acquiring the + * transaction lock earlier than the parallel apply worker. + */ + pa_wait_for_xact_state(winfo, PARALLEL_TRANS_STARTED); + + /* + * Wait for the transaction lock to be released. This is required to + * detect deadlock among leader and parallel apply workers. Refer to the + * comments atop this file. + */ + pa_lock_transaction(winfo->shared->xid, AccessShareLock); + pa_unlock_transaction(winfo->shared->xid, AccessShareLock); + + /* + * Check if the state becomes PARALLEL_TRANS_FINISHED in case the parallel + * apply worker failed while applying changes causing the lock to be + * released. + */ + if (pa_get_xact_state(winfo->shared) != PARALLEL_TRANS_FINISHED) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("lost connection to the logical replication parallel apply worker"))); +} + +/* + * Set the transaction state for a given parallel apply worker. + */ +void +pa_set_xact_state(ParallelApplyWorkerShared *wshared, + ParallelTransState xact_state) +{ + SpinLockAcquire(&wshared->mutex); + wshared->xact_state = xact_state; + SpinLockRelease(&wshared->mutex); +} + +/* + * Get the transaction state for a given parallel apply worker. + */ +static ParallelTransState +pa_get_xact_state(ParallelApplyWorkerShared *wshared) +{ + ParallelTransState xact_state; + + SpinLockAcquire(&wshared->mutex); + xact_state = wshared->xact_state; + SpinLockRelease(&wshared->mutex); + + return xact_state; +} + +/* + * Cache the parallel apply worker information. + */ +void +pa_set_stream_apply_worker(ParallelApplyWorkerInfo *winfo) +{ + stream_apply_worker = winfo; +} + +/* + * Form a unique savepoint name for the streaming transaction. + * + * Note that different subscriptions for publications on different nodes can + * receive same remote xid, so we need to use subscription id along with it. + * + * Returns the name in the supplied buffer. + */ +static void +pa_savepoint_name(Oid suboid, TransactionId xid, char *spname, Size szsp) +{ + snprintf(spname, szsp, "pg_sp_%u_%u", suboid, xid); +} + +/* + * Define a savepoint for a subxact in parallel apply worker if needed. + * + * The parallel apply worker can figure out if a new subtransaction was + * started by checking if the new change arrived with a different xid. In that + * case define a named savepoint, so that we are able to rollback to it + * if required. + */ +void +pa_start_subtrans(TransactionId current_xid, TransactionId top_xid) +{ + if (current_xid != top_xid && + !list_member_xid(subxactlist, current_xid)) + { + MemoryContext oldctx; + char spname[NAMEDATALEN]; + + pa_savepoint_name(MySubscription->oid, current_xid, + spname, sizeof(spname)); + + elog(DEBUG1, "defining savepoint %s in logical replication parallel apply worker", spname); + + /* We must be in transaction block to define the SAVEPOINT. */ + if (!IsTransactionBlock()) + { + if (!IsTransactionState()) + StartTransactionCommand(); + + BeginTransactionBlock(); + CommitTransactionCommand(); + } + + DefineSavepoint(spname); + + /* + * CommitTransactionCommand is needed to start a subtransaction after + * issuing a SAVEPOINT inside a transaction block (see + * StartSubTransaction()). 
+ */ + CommitTransactionCommand(); + + oldctx = MemoryContextSwitchTo(TopTransactionContext); + subxactlist = lappend_xid(subxactlist, current_xid); + MemoryContextSwitchTo(oldctx); + } +} + +/* Reset the list that maintains subtransactions. */ +void +pa_reset_subtrans(void) +{ + /* + * We don't need to free this explicitly as the allocated memory will be + * freed at the transaction end. + */ + subxactlist = NIL; +} + +/* + * Handle STREAM ABORT message when the transaction was applied in a parallel + * apply worker. + */ +void +pa_stream_abort(LogicalRepStreamAbortData *abort_data) +{ + TransactionId xid = abort_data->xid; + TransactionId subxid = abort_data->subxid; + + /* + * Update origin state so we can restart streaming from correct position + * in case of crash. + */ + replorigin_session_origin_lsn = abort_data->abort_lsn; + replorigin_session_origin_timestamp = abort_data->abort_time; + + /* + * If the two XIDs are the same, it's in fact abort of toplevel xact, so + * just free the subxactlist. + */ + if (subxid == xid) + { + pa_set_xact_state(MyParallelShared, PARALLEL_TRANS_FINISHED); + + /* + * Release the lock as we might be processing an empty streaming + * transaction in which case the lock won't be released during + * transaction rollback. + * + * Note that it's ok to release the transaction lock before aborting + * the transaction because even if the parallel apply worker dies due + * to crash or some other reason, such a transaction would still be + * considered aborted. + */ + pa_unlock_transaction(xid, AccessExclusiveLock); + + AbortCurrentTransaction(); + + if (IsTransactionBlock()) + { + EndTransactionBlock(false); + CommitTransactionCommand(); + } + + pa_reset_subtrans(); + + pgstat_report_activity(STATE_IDLE, NULL); + } + else + { + /* OK, so it's a subxact. Rollback to the savepoint. */ + int i; + char spname[NAMEDATALEN]; + + pa_savepoint_name(MySubscription->oid, subxid, spname, sizeof(spname)); + + elog(DEBUG1, "rolling back to savepoint %s in logical replication parallel apply worker", spname); + + /* + * Search the subxactlist, determine the offset tracked for the + * subxact, and truncate the list. + * + * Note that for an empty sub-transaction we won't find the subxid + * here. + */ + for (i = list_length(subxactlist) - 1; i >= 0; i--) + { + TransactionId xid_tmp = lfirst_xid(list_nth_cell(subxactlist, i)); + + if (xid_tmp == subxid) + { + RollbackToSavepoint(spname); + CommitTransactionCommand(); + subxactlist = list_truncate(subxactlist, i); + break; + } + } + } +} + +/* + * Set the fileset state for a particular parallel apply worker. The fileset + * will be set once the leader worker serialized all changes to the file + * so that it can be used by parallel apply worker. + */ +void +pa_set_fileset_state(ParallelApplyWorkerShared *wshared, + PartialFileSetState fileset_state) +{ + SpinLockAcquire(&wshared->mutex); + wshared->fileset_state = fileset_state; + + if (fileset_state == FS_SERIALIZE_DONE) + { + Assert(am_leader_apply_worker()); + Assert(MyLogicalRepWorker->stream_fileset); + wshared->fileset = *MyLogicalRepWorker->stream_fileset; + } + + SpinLockRelease(&wshared->mutex); +} + +/* + * Get the fileset state for the current parallel apply worker. 
+ */ +static PartialFileSetState +pa_get_fileset_state(void) +{ + PartialFileSetState fileset_state; + + Assert(am_parallel_apply_worker()); + + SpinLockAcquire(&MyParallelShared->mutex); + fileset_state = MyParallelShared->fileset_state; + SpinLockRelease(&MyParallelShared->mutex); + + return fileset_state; +} + +/* + * Helper functions to acquire and release a lock for each stream block. + * + * Set locktag_field4 to PARALLEL_APPLY_LOCK_STREAM to indicate that it's a + * stream lock. + * + * Refer to the comments atop this file to see how the stream lock is used. + */ +void +pa_lock_stream(TransactionId xid, LOCKMODE lockmode) +{ + LockApplyTransactionForSession(MyLogicalRepWorker->subid, xid, + PARALLEL_APPLY_LOCK_STREAM, lockmode); +} + +void +pa_unlock_stream(TransactionId xid, LOCKMODE lockmode) +{ + UnlockApplyTransactionForSession(MyLogicalRepWorker->subid, xid, + PARALLEL_APPLY_LOCK_STREAM, lockmode); +} + +/* + * Helper functions to acquire and release a lock for each local transaction + * apply. + * + * Set locktag_field4 to PARALLEL_APPLY_LOCK_XACT to indicate that it's a + * transaction lock. + * + * Note that all the callers must pass a remote transaction ID instead of a + * local transaction ID as xid. This is because the local transaction ID will + * only be assigned while applying the first change in the parallel apply but + * it's possible that the first change in the parallel apply worker is blocked + * by a concurrently executing transaction in another parallel apply worker. We + * can only communicate the local transaction id to the leader after applying + * the first change so it won't be able to wait after sending the xact finish + * command using this lock. + * + * Refer to the comments atop this file to see how the transaction lock is + * used. + */ +void +pa_lock_transaction(TransactionId xid, LOCKMODE lockmode) +{ + LockApplyTransactionForSession(MyLogicalRepWorker->subid, xid, + PARALLEL_APPLY_LOCK_XACT, lockmode); +} + +void +pa_unlock_transaction(TransactionId xid, LOCKMODE lockmode) +{ + UnlockApplyTransactionForSession(MyLogicalRepWorker->subid, xid, + PARALLEL_APPLY_LOCK_XACT, lockmode); +} + +/* + * Decrement the number of pending streaming blocks and wait on the stream lock + * if there is no pending block available. + */ +void +pa_decr_and_wait_stream_block(void) +{ + Assert(am_parallel_apply_worker()); + + /* + * It is only possible to not have any pending stream chunks when we are + * applying spooled messages. + */ + if (pg_atomic_read_u32(&MyParallelShared->pending_stream_count) == 0) + { + if (pa_has_spooled_message_pending()) + return; + + elog(ERROR, "invalid pending streaming chunk 0"); + } + + if (pg_atomic_sub_fetch_u32(&MyParallelShared->pending_stream_count, 1) == 0) + { + pa_lock_stream(MyParallelShared->xid, AccessShareLock); + pa_unlock_stream(MyParallelShared->xid, AccessShareLock); + } +} + +/* + * Finish processing the streaming transaction in the leader apply worker. + */ +void +pa_xact_finish(ParallelApplyWorkerInfo *winfo, XLogRecPtr remote_lsn) +{ + Assert(am_leader_apply_worker()); + + /* + * Unlock the shared object lock so that parallel apply worker can + * continue to receive and apply changes. + */ + pa_unlock_stream(winfo->shared->xid, AccessExclusiveLock); + + /* + * Wait for that worker to finish. This is necessary to maintain commit + * order which avoids failures due to transaction dependencies and + * deadlocks. 
+ */ + pa_wait_for_xact_finish(winfo); + + if (!XLogRecPtrIsInvalid(remote_lsn)) + store_flush_position(remote_lsn, winfo->shared->last_commit_end); + + pa_free_worker(winfo); +} diff --git a/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/decode.c b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/decode.c new file mode 100644 index 00000000000..d91055a4409 --- /dev/null +++ b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/decode.c @@ -0,0 +1,1292 @@ +/* ------------------------------------------------------------------------- + * + * decode.c + * This module decodes WAL records read using xlogreader.h's APIs for the + * purpose of logical decoding by passing information to the + * reorderbuffer module (containing the actual changes) and to the + * snapbuild module to build a fitting catalog snapshot (to be able to + * properly decode the changes in the reorderbuffer). + * + * NOTE: + * This basically tries to handle all low level xlog stuff for + * reorderbuffer.c and snapbuild.c. There's some minor leakage where a + * specific record's struct is used to pass data along, but those just + * happen to contain the right amount of data in a convenient + * format. There isn't and shouldn't be much intelligence about the + * contents of records in here except turning them into a more usable + * format. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/replication/logical/decode.c + * + * ------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heapam.h" +#include "access/heapam_xlog.h" +#include "access/transam.h" +#include "access/xact.h" +#include "access/xlog_internal.h" +#include "access/xlogreader.h" +#include "access/xlogrecord.h" +#include "access/xlogutils.h" +#include "catalog/pg_control.h" +#include "replication/decode.h" +#include "replication/logical.h" +#include "replication/message.h" +#include "replication/origin.h" +#include "replication/reorderbuffer.h" +#include "replication/snapbuild.h" +#include "storage/standby.h" + +/* individual record(group)'s handlers */ +static void DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeTruncate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeSpecConfirm(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); + +static void DecodeCommit(LogicalDecodingContext *ctx, XLogRecordBuffer *buf, + xl_xact_parsed_commit *parsed, TransactionId xid, + bool two_phase); +static void DecodeAbort(LogicalDecodingContext *ctx, XLogRecordBuffer *buf, + xl_xact_parsed_abort *parsed, TransactionId xid, + bool two_phase); +static void DecodePrepare(LogicalDecodingContext *ctx, XLogRecordBuffer *buf, + xl_xact_parsed_prepare *parsed); + + +/* common function to decode tuples */ +static void DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tuple); + +/* helper functions for decoding transactions */ +static inline bool FilterPrepare(LogicalDecodingContext *ctx, + TransactionId xid, const char *gid); +static bool DecodeTXNNeedSkip(LogicalDecodingContext *ctx, + 
XLogRecordBuffer *buf, Oid txn_dbid, + RepOriginId origin_id); + +/* + * Take every XLogReadRecord()ed record and perform the actions required to + * decode it using the output plugin already setup in the logical decoding + * context. + * + * NB: Note that every record's xid needs to be processed by reorderbuffer + * (xids contained in the content of records are not relevant for this rule). + * That means that for records which'd otherwise not go through the + * reorderbuffer ReorderBufferProcessXid() has to be called. We don't want to + * call ReorderBufferProcessXid for each record type by default, because + * e.g. empty xacts can be handled more efficiently if there's no previous + * state for them. + * + * We also support the ability to fast forward thru records, skipping some + * record types completely - see individual record types for details. + */ +void +LogicalDecodingProcessRecord(LogicalDecodingContext *ctx, XLogReaderState *record) +{ + XLogRecordBuffer buf; + TransactionId txid; + RmgrData rmgr; + + buf.origptr = ctx->reader->ReadRecPtr; + buf.endptr = ctx->reader->EndRecPtr; + buf.record = record; + + txid = XLogRecGetTopXid(record); + + /* + * If the top-level xid is valid, we need to assign the subxact to the + * top-level xact. We need to do this for all records, hence we do it + * before the switch. + */ + if (TransactionIdIsValid(txid)) + { + ReorderBufferAssignChild(ctx->reorder, + txid, + XLogRecGetXid(record), + buf.origptr); + } + + rmgr = GetRmgr(XLogRecGetRmid(record)); + + if (rmgr.rm_decode != NULL) + rmgr.rm_decode(ctx, &buf); + else + { + /* just deal with xid, and done */ + ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(record), + buf.origptr); + } +} + +/* + * Handle rmgr XLOG_ID records for LogicalDecodingProcessRecord(). + */ +void +xlog_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + SnapBuild *builder = ctx->snapshot_builder; + uint8 info = XLogRecGetInfo(buf->record) & ~XLR_INFO_MASK; + + ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(buf->record), + buf->origptr); + + switch (info) + { + /* this is also used in END_OF_RECOVERY checkpoints */ + case XLOG_CHECKPOINT_SHUTDOWN: + case XLOG_END_OF_RECOVERY: + SnapBuildSerializationPoint(builder, buf->origptr); + + break; + case XLOG_CHECKPOINT_ONLINE: + + /* + * a RUNNING_XACTS record will have been logged near to this, we + * can restart from there. + */ + break; + case XLOG_PARAMETER_CHANGE: + { + xl_parameter_change *xlrec = + (xl_parameter_change *) XLogRecGetData(buf->record); + + /* + * If wal_level on the primary is reduced to less than + * logical, we want to prevent existing logical slots from + * being used. Existing logical slots on the standby get + * invalidated when this WAL record is replayed; and further, + * slot creation fails when wal_level is not sufficient; but + * all these operations are not synchronized, so a logical + * slot may creep in while the wal_level is being reduced. + * Hence this extra check. + */ + if (xlrec->wal_level < WAL_LEVEL_LOGICAL) + { + /* + * This can occur only on a standby, as a primary would + * not allow to restart after changing wal_level < logical + * if there is pre-existing logical slot. 
+ */ + Assert(RecoveryInProgress()); + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical decoding on standby requires wal_level >= logical on the primary"))); + } + break; + } + case XLOG_NOOP: + case XLOG_NEXTOID: + case XLOG_SWITCH: + case XLOG_BACKUP_END: + case XLOG_RESTORE_POINT: + case XLOG_FPW_CHANGE: + case XLOG_FPI_FOR_HINT: + case XLOG_FPI: + case XLOG_OVERWRITE_CONTRECORD: + break; + default: + elog(ERROR, "unexpected RM_XLOG_ID record type: %u", info); + } +} + +/* + * Handle rmgr XACT_ID records for LogicalDecodingProcessRecord(). + */ +void +xact_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + SnapBuild *builder = ctx->snapshot_builder; + ReorderBuffer *reorder = ctx->reorder; + XLogReaderState *r = buf->record; + uint8 info = XLogRecGetInfo(r) & XLOG_XACT_OPMASK; + + /* + * If the snapshot isn't yet fully built, we cannot decode anything, so + * bail out. + */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT) + return; + + switch (info) + { + case XLOG_XACT_COMMIT: + case XLOG_XACT_COMMIT_PREPARED: + { + xl_xact_commit *xlrec; + xl_xact_parsed_commit parsed; + TransactionId xid; + bool two_phase = false; + + xlrec = (xl_xact_commit *) XLogRecGetData(r); + ParseCommitRecord(XLogRecGetInfo(buf->record), xlrec, &parsed); + + if (!TransactionIdIsValid(parsed.twophase_xid)) + xid = XLogRecGetXid(r); + else + xid = parsed.twophase_xid; + + /* + * We would like to process the transaction in a two-phase + * manner iff output plugin supports two-phase commits and + * doesn't filter the transaction at prepare time. + */ + if (info == XLOG_XACT_COMMIT_PREPARED) + two_phase = !(FilterPrepare(ctx, xid, + parsed.twophase_gid)); + + DecodeCommit(ctx, buf, &parsed, xid, two_phase); + break; + } + case XLOG_XACT_ABORT: + case XLOG_XACT_ABORT_PREPARED: + { + xl_xact_abort *xlrec; + xl_xact_parsed_abort parsed; + TransactionId xid; + bool two_phase = false; + + xlrec = (xl_xact_abort *) XLogRecGetData(r); + ParseAbortRecord(XLogRecGetInfo(buf->record), xlrec, &parsed); + + if (!TransactionIdIsValid(parsed.twophase_xid)) + xid = XLogRecGetXid(r); + else + xid = parsed.twophase_xid; + + /* + * We would like to process the transaction in a two-phase + * manner iff output plugin supports two-phase commits and + * doesn't filter the transaction at prepare time. + */ + if (info == XLOG_XACT_ABORT_PREPARED) + two_phase = !(FilterPrepare(ctx, xid, + parsed.twophase_gid)); + + DecodeAbort(ctx, buf, &parsed, xid, two_phase); + break; + } + case XLOG_XACT_ASSIGNMENT: + + /* + * We assign subxact to the toplevel xact while processing each + * record if required. So, we don't need to do anything here. See + * LogicalDecodingProcessRecord. + */ + break; + case XLOG_XACT_INVALIDATIONS: + { + TransactionId xid; + xl_xact_invals *invals; + + xid = XLogRecGetXid(r); + invals = (xl_xact_invals *) XLogRecGetData(r); + + /* + * Execute the invalidations for xid-less transactions, + * otherwise, accumulate them so that they can be processed at + * the commit time. 
+ */ + if (TransactionIdIsValid(xid)) + { + if (!ctx->fast_forward) + ReorderBufferAddInvalidations(reorder, xid, + buf->origptr, + invals->nmsgs, + invals->msgs); + ReorderBufferXidSetCatalogChanges(ctx->reorder, xid, + buf->origptr); + } + else if ((!ctx->fast_forward)) + ReorderBufferImmediateInvalidation(ctx->reorder, + invals->nmsgs, + invals->msgs); + } + break; + case XLOG_XACT_PREPARE: + { + xl_xact_parsed_prepare parsed; + xl_xact_prepare *xlrec; + + /* ok, parse it */ + xlrec = (xl_xact_prepare *) XLogRecGetData(r); + ParsePrepareRecord(XLogRecGetInfo(buf->record), + xlrec, &parsed); + + /* + * We would like to process the transaction in a two-phase + * manner iff output plugin supports two-phase commits and + * doesn't filter the transaction at prepare time. + */ + if (FilterPrepare(ctx, parsed.twophase_xid, + parsed.twophase_gid)) + { + ReorderBufferProcessXid(reorder, parsed.twophase_xid, + buf->origptr); + break; + } + + /* + * Note that if the prepared transaction has locked [user] + * catalog tables exclusively then decoding prepare can block + * till the main transaction is committed because it needs to + * lock the catalog tables. + * + * XXX Now, this can even lead to a deadlock if the prepare + * transaction is waiting to get it logically replicated for + * distributed 2PC. This can be avoided by disallowing + * preparing transactions that have locked [user] catalog + * tables exclusively but as of now, we ask users not to do + * such an operation. + */ + DecodePrepare(ctx, buf, &parsed); + break; + } + default: + elog(ERROR, "unexpected RM_XACT_ID record type: %u", info); + } +} + +/* + * Handle rmgr STANDBY_ID records for LogicalDecodingProcessRecord(). + */ +void +standby_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + SnapBuild *builder = ctx->snapshot_builder; + XLogReaderState *r = buf->record; + uint8 info = XLogRecGetInfo(r) & ~XLR_INFO_MASK; + + ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(r), buf->origptr); + + switch (info) + { + case XLOG_RUNNING_XACTS: + { + xl_running_xacts *running = (xl_running_xacts *) XLogRecGetData(r); + + SnapBuildProcessRunningXacts(builder, buf->origptr, running); + + /* + * Abort all transactions that we keep track of, that are + * older than the record's oldestRunningXid. This is the most + * convenient spot for doing so since, in contrast to shutdown + * or end-of-recovery checkpoints, we have information about + * all running transactions which includes prepared ones, + * while shutdown checkpoints just know that no non-prepared + * transactions are in progress. + */ + ReorderBufferAbortOld(ctx->reorder, running->oldestRunningXid); + } + break; + case XLOG_STANDBY_LOCK: + break; + case XLOG_INVALIDATIONS: + + /* + * We are processing the invalidations at the command level via + * XLOG_XACT_INVALIDATIONS. So we don't need to do anything here. + */ + break; + default: + elog(ERROR, "unexpected RM_STANDBY_ID record type: %u", info); + } +} + +/* + * Handle rmgr HEAP2_ID records for LogicalDecodingProcessRecord(). + */ +void +heap2_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + uint8 info = XLogRecGetInfo(buf->record) & XLOG_HEAP_OPMASK; + TransactionId xid = XLogRecGetXid(buf->record); + SnapBuild *builder = ctx->snapshot_builder; + + ReorderBufferProcessXid(ctx->reorder, xid, buf->origptr); + + /* + * If we don't have snapshot or we are just fast-forwarding, there is no + * point in decoding changes. 
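+ * + * For reference, the SnapBuildState values in snapbuild.h form a ladder: + * + *   SNAPBUILD_START < SNAPBUILD_BUILDING_SNAPSHOT < + *   SNAPBUILD_FULL_SNAPSHOT < SNAPBUILD_CONSISTENT + * + * so the test below starts decoding as soon as a full snapshot exists, even + * before a consistent point is reached; whether an individual change may + * really be decoded is then refined per change via SnapBuildProcessChange().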
+ */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT || + ctx->fast_forward) + return; + + switch (info) + { + case XLOG_HEAP2_MULTI_INSERT: + if (!ctx->fast_forward && + SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeMultiInsert(ctx, buf); + break; + case XLOG_HEAP2_NEW_CID: + { + xl_heap_new_cid *xlrec; + + xlrec = (xl_heap_new_cid *) XLogRecGetData(buf->record); + SnapBuildProcessNewCid(builder, xid, buf->origptr, xlrec); + + break; + } + case XLOG_HEAP2_REWRITE: + + /* + * Although these records only exist to serve the needs of logical + * decoding, all the work happens as part of crash or archive + * recovery, so we don't need to do anything here. + */ + break; + + /* + * Everything else here is just low level physical stuff we're not + * interested in. + */ + case XLOG_HEAP2_FREEZE_PAGE: + case XLOG_HEAP2_PRUNE: + case XLOG_HEAP2_VACUUM: + case XLOG_HEAP2_VISIBLE: + case XLOG_HEAP2_LOCK_UPDATED: + break; + default: + elog(ERROR, "unexpected RM_HEAP2_ID record type: %u", info); + } +} + +/* + * Handle rmgr HEAP_ID records for LogicalDecodingProcessRecord(). + */ +void +heap_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + uint8 info = XLogRecGetInfo(buf->record) & XLOG_HEAP_OPMASK; + TransactionId xid = XLogRecGetXid(buf->record); + SnapBuild *builder = ctx->snapshot_builder; + + ReorderBufferProcessXid(ctx->reorder, xid, buf->origptr); + + /* + * If we don't have snapshot or we are just fast-forwarding, there is no + * point in decoding data changes. + */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT || + ctx->fast_forward) + return; + + switch (info) + { + case XLOG_HEAP_INSERT: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeInsert(ctx, buf); + break; + + /* + * Treat HOT update as normal updates. There is no useful + * information in the fact that we could make it a HOT update + * locally and the WAL layout is compatible. + */ + case XLOG_HEAP_HOT_UPDATE: + case XLOG_HEAP_UPDATE: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeUpdate(ctx, buf); + break; + + case XLOG_HEAP_DELETE: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeDelete(ctx, buf); + break; + + case XLOG_HEAP_TRUNCATE: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeTruncate(ctx, buf); + break; + + case XLOG_HEAP_INPLACE: + + /* + * Inplace updates are only ever performed on catalog tuples and + * can, per definition, not change tuple visibility. Since we + * don't decode catalog tuples, we're not interested in the + * record's contents. + * + * In-place updates can be used either by XID-bearing transactions + * (e.g. in CREATE INDEX CONCURRENTLY) or by XID-less + * transactions (e.g. VACUUM). In the former case, the commit + * record will include cache invalidations, so we mark the + * transaction as catalog modifying here. Currently that's + * redundant because the commit will do that as well, but once we + * support decoding in-progress relations, this will be important. 
+ */ + if (!TransactionIdIsValid(xid)) + break; + + (void) SnapBuildProcessChange(builder, xid, buf->origptr); + ReorderBufferXidSetCatalogChanges(ctx->reorder, xid, buf->origptr); + break; + + case XLOG_HEAP_CONFIRM: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeSpecConfirm(ctx, buf); + break; + + case XLOG_HEAP_LOCK: + /* we don't care about row level locks for now */ + break; + + default: + elog(ERROR, "unexpected RM_HEAP_ID record type: %u", info); + break; + } +} + +/* + * Ask output plugin whether we want to skip this PREPARE and send + * this transaction as a regular commit later. + */ +static inline bool +FilterPrepare(LogicalDecodingContext *ctx, TransactionId xid, + const char *gid) +{ + /* + * Skip if decoding of two-phase transactions at PREPARE time is not + * enabled. In that case, all two-phase transactions are considered + * filtered out and will be applied as regular transactions at COMMIT + * PREPARED. + */ + if (!ctx->twophase) + return true; + + /* + * The filter_prepare callback is optional. When not supplied, all + * prepared transactions should go through. + */ + if (ctx->callbacks.filter_prepare_cb == NULL) + return false; + + return filter_prepare_cb_wrapper(ctx, xid, gid); +} + +static inline bool +FilterByOrigin(LogicalDecodingContext *ctx, RepOriginId origin_id) +{ + if (ctx->callbacks.filter_by_origin_cb == NULL) + return false; + + return filter_by_origin_cb_wrapper(ctx, origin_id); +} + +/* + * Handle rmgr LOGICALMSG_ID records for LogicalDecodingProcessRecord(). + */ +void +logicalmsg_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + SnapBuild *builder = ctx->snapshot_builder; + XLogReaderState *r = buf->record; + TransactionId xid = XLogRecGetXid(r); + uint8 info = XLogRecGetInfo(r) & ~XLR_INFO_MASK; + RepOriginId origin_id = XLogRecGetOrigin(r); + Snapshot snapshot = NULL; + xl_logical_message *message; + + if (info != XLOG_LOGICAL_MESSAGE) + elog(ERROR, "unexpected RM_LOGICALMSG_ID record type: %u", info); + + ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(r), buf->origptr); + + /* + * If we don't have snapshot or we are just fast-forwarding, there is no + * point in decoding messages. + */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT || + ctx->fast_forward) + return; + + message = (xl_logical_message *) XLogRecGetData(r); + + if (message->dbId != ctx->slot->data.database || + FilterByOrigin(ctx, origin_id)) + return; + + if (message->transactional && + !SnapBuildProcessChange(builder, xid, buf->origptr)) + return; + else if (!message->transactional && + (SnapBuildCurrentState(builder) != SNAPBUILD_CONSISTENT || + SnapBuildXactNeedsSkip(builder, buf->origptr))) + return; + + /* + * If this is a non-transactional change, get the snapshot we're expected + * to use. We only get here when the snapshot is consistent, and the + * change is not meant to be skipped. + * + * For transactional changes we don't need a snapshot, we'll use the + * regular snapshot maintained by ReorderBuffer. We just leave it NULL. + */ + if (!message->transactional) + snapshot = SnapBuildGetOrBuildSnapshot(builder); + + ReorderBufferQueueMessage(ctx->reorder, xid, snapshot, buf->endptr, + message->transactional, + message->message, /* first part of message is + * prefix */ + message->message_size, + message->message + message->prefix_size); +} + +/* + * Consolidated commit record handling between the different form of commit + * records. 
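+ * + * The 'two_phase' argument below is computed in xact_decode() via + * FilterPrepare(), which consults the output plugin's optional + * filter_prepare_cb when two-phase decoding is enabled for the slot + * (ctx->twophase). As an illustrative sketch only (hypothetical plugin + * code, not part of this file, assuming the standard + * LogicalDecodeFilterPrepareCB signature), a plugin could defer two-phase + * handling for selected GIDs with: + * + *   static bool + *   my_filter_prepare(LogicalDecodingContext *ctx, + *                     TransactionId xid, const char *gid) + *   { + *       return strncmp(gid, "nodecode_", 9) == 0; + *   } + * + * registered in _PG_output_plugin_init() via + * cb->filter_prepare_cb = my_filter_prepare; returning true means the + * PREPARE is filtered out and the xact is decoded at COMMIT PREPARED instead.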
+ * + * 'two_phase' indicates that caller wants to process the transaction in two + * phases, first process prepare if not already done and then process + * commit_prepared. + */ +static void +DecodeCommit(LogicalDecodingContext *ctx, XLogRecordBuffer *buf, + xl_xact_parsed_commit *parsed, TransactionId xid, + bool two_phase) +{ + XLogRecPtr origin_lsn = InvalidXLogRecPtr; + TimestampTz commit_time = parsed->xact_time; + RepOriginId origin_id = XLogRecGetOrigin(buf->record); + int i; + + if (parsed->xinfo & XACT_XINFO_HAS_ORIGIN) + { + origin_lsn = parsed->origin_lsn; + commit_time = parsed->origin_timestamp; + } + + SnapBuildCommitTxn(ctx->snapshot_builder, buf->origptr, xid, + parsed->nsubxacts, parsed->subxacts, + parsed->xinfo); + + /* ---- + * Check whether we are interested in this specific transaction, and tell + * the reorderbuffer to forget the content of the (sub-)transactions + * if not. + * + * We can't just use ReorderBufferAbort() here, because we need to execute + * the transaction's invalidations. This currently won't be needed if + * we're just skipping over the transaction because currently we only do + * so during startup, to get to the first transaction the client needs. As + * we have reset the catalog caches before starting to read WAL, and we + * haven't yet touched any catalogs, there can't be anything to invalidate. + * But if we're "forgetting" this commit because it happened in another + * database, the invalidations might be important, because they could be + * for shared catalogs and we might have loaded data into the relevant + * syscaches. + * --- + */ + if (DecodeTXNNeedSkip(ctx, buf, parsed->dbId, origin_id)) + { + for (i = 0; i < parsed->nsubxacts; i++) + { + ReorderBufferForget(ctx->reorder, parsed->subxacts[i], buf->origptr); + } + ReorderBufferForget(ctx->reorder, xid, buf->origptr); + + return; + } + + /* tell the reorderbuffer about the surviving subtransactions */ + for (i = 0; i < parsed->nsubxacts; i++) + { + ReorderBufferCommitChild(ctx->reorder, xid, parsed->subxacts[i], + buf->origptr, buf->endptr); + } + + /* + * Send the final commit record if the transaction data is already + * decoded, otherwise, process the entire transaction. + */ + if (two_phase) + { + ReorderBufferFinishPrepared(ctx->reorder, xid, buf->origptr, buf->endptr, + SnapBuildGetTwoPhaseAt(ctx->snapshot_builder), + commit_time, origin_id, origin_lsn, + parsed->twophase_gid, true); + } + else + { + ReorderBufferCommit(ctx->reorder, xid, buf->origptr, buf->endptr, + commit_time, origin_id, origin_lsn); + } + + /* + * Update the decoding stats at transaction prepare/commit/abort. + * Additionally we send the stats when we spill or stream the changes to + * avoid losing them in case the decoding is interrupted. It is not clear + * that sending more or less frequently than this would be better. + */ + UpdateDecodingStats(ctx); +} + +/* + * Decode PREPARE record. Similar logic as in DecodeCommit. + * + * Note that we don't skip prepare even if we have detected concurrent abort + * because it is quite possible that we had already sent some changes before we + * detect abort in which case we need to abort those changes in the subscriber. + * To abort such changes, we do send the prepare and then the rollback prepared + * which is what happened on the publisher-side as well.
Now, we can invent a + * new abort API wherein in such cases we send abort and skip sending prepared + * and rollback prepared but then it is not that straightforward because we + * might have streamed this transaction by that time in which case it is + * handled when the rollback is encountered. It is not impossible to optimize + * the concurrent abort case but it can introduce design complexity w.r.t + * handling different cases so leaving it for now as it doesn't seem worth it. + */ +static void +DecodePrepare(LogicalDecodingContext *ctx, XLogRecordBuffer *buf, + xl_xact_parsed_prepare *parsed) +{ + SnapBuild *builder = ctx->snapshot_builder; + XLogRecPtr origin_lsn = parsed->origin_lsn; + TimestampTz prepare_time = parsed->xact_time; + RepOriginId origin_id = XLogRecGetOrigin(buf->record); + int i; + TransactionId xid = parsed->twophase_xid; + + if (parsed->origin_timestamp != 0) + prepare_time = parsed->origin_timestamp; + + /* + * Remember the prepare info for a txn so that it can be used later in + * commit prepared if required. See ReorderBufferFinishPrepared. + */ + if (!ReorderBufferRememberPrepareInfo(ctx->reorder, xid, buf->origptr, + buf->endptr, prepare_time, origin_id, + origin_lsn)) + return; + + /* We can't start streaming unless a consistent state is reached. */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT) + { + ReorderBufferSkipPrepare(ctx->reorder, xid); + return; + } + + /* + * Check whether we need to process this transaction. See + * DecodeTXNNeedSkip for the reasons why we sometimes want to skip the + * transaction. + * + * We can't call ReorderBufferForget as we did in DecodeCommit as the txn + * hasn't yet been committed, removing this txn before a commit might + * result in the computation of an incorrect restart_lsn. See + * SnapBuildProcessRunningXacts. But we need to process cache + * invalidations if there are any for the reasons mentioned in + * DecodeCommit. + */ + if (DecodeTXNNeedSkip(ctx, buf, parsed->dbId, origin_id)) + { + ReorderBufferSkipPrepare(ctx->reorder, xid); + ReorderBufferInvalidate(ctx->reorder, xid, buf->origptr); + return; + } + + /* Tell the reorderbuffer about the surviving subtransactions. */ + for (i = 0; i < parsed->nsubxacts; i++) + { + ReorderBufferCommitChild(ctx->reorder, xid, parsed->subxacts[i], + buf->origptr, buf->endptr); + } + + /* replay actions of all transaction + subtransactions in order */ + ReorderBufferPrepare(ctx->reorder, xid, parsed->twophase_gid); + + /* + * Update the decoding stats at transaction prepare/commit/abort. + * Additionally we send the stats when we spill or stream the changes to + * avoid losing them in case the decoding is interrupted. It is not clear + * that sending more or less frequently than this would be better. + */ + UpdateDecodingStats(ctx); +} + + +/* + * Get the data from the various forms of abort records and pass it on to + * snapbuild.c and reorderbuffer.c. + * + * 'two_phase' indicates to finish prepared transaction. + */ +static void +DecodeAbort(LogicalDecodingContext *ctx, XLogRecordBuffer *buf, + xl_xact_parsed_abort *parsed, TransactionId xid, + bool two_phase) +{ + int i; + XLogRecPtr origin_lsn = InvalidXLogRecPtr; + TimestampTz abort_time = parsed->xact_time; + RepOriginId origin_id = XLogRecGetOrigin(buf->record); + bool skip_xact; + + if (parsed->xinfo & XACT_XINFO_HAS_ORIGIN) + { + origin_lsn = parsed->origin_lsn; + abort_time = parsed->origin_timestamp; + } + + /* + * Check whether we need to process this transaction. 
See + * DecodeTXNNeedSkip for the reasons why we sometimes want to skip the + * transaction. + */ + skip_xact = DecodeTXNNeedSkip(ctx, buf, parsed->dbId, origin_id); + + /* + * Send the final rollback record for a prepared transaction unless we + * need to skip it. For non-two-phase xacts, simply forget the xact. + */ + if (two_phase && !skip_xact) + { + ReorderBufferFinishPrepared(ctx->reorder, xid, buf->origptr, buf->endptr, + InvalidXLogRecPtr, + abort_time, origin_id, origin_lsn, + parsed->twophase_gid, false); + } + else + { + for (i = 0; i < parsed->nsubxacts; i++) + { + ReorderBufferAbort(ctx->reorder, parsed->subxacts[i], + buf->record->EndRecPtr, abort_time); + } + + ReorderBufferAbort(ctx->reorder, xid, buf->record->EndRecPtr, + abort_time); + } + + /* update the decoding stats */ + UpdateDecodingStats(ctx); +} + +/* + * Parse XLOG_HEAP_INSERT (not MULTI_INSERT!) records into tuplebufs. + * + * Inserts can contain the new tuple. + */ +static void +DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + Size datalen; + char *tupledata; + Size tuplelen; + XLogReaderState *r = buf->record; + xl_heap_insert *xlrec; + ReorderBufferChange *change; + RelFileLocator target_locator; + + xlrec = (xl_heap_insert *) XLogRecGetData(r); + + /* + * Ignore insert records without new tuples (this does happen when + * raw_heap_insert marks the TOAST record as HEAP_INSERT_NO_LOGICAL). + */ + if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE)) + return; + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + if (!(xlrec->flags & XLH_INSERT_IS_SPECULATIVE)) + change->action = REORDER_BUFFER_CHANGE_INSERT; + else + change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT; + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + tupledata = XLogRecGetBlockData(r, 0, &datalen); + tuplelen = datalen - SizeOfHeapHeader; + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple(tupledata, datalen, change->data.tp.newtuple); + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, + xlrec->flags & XLH_INSERT_ON_TOAST_RELATION); +} + +/* + * Parse XLOG_HEAP_UPDATE and XLOG_HEAP_HOT_UPDATE, which have the same layout + * in the record, from wal into proper tuplebufs. + * + * Updates can possibly contain a new tuple and the old primary key.
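+ * + * The two pieces live in different parts of the record, matching the code + * below: the new tuple (XLH_UPDATE_CONTAINS_NEW_TUPLE) arrives as an + * xl_heap_header plus tuple data in block 0's data, while the old key/tuple + * (XLH_UPDATE_CONTAINS_OLD) sits unaligned in the main data area directly + * after the fixed-size struct, at XLogRecGetData(r) + SizeOfHeapUpdate.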
+ */ +static void +DecodeUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_heap_update *xlrec; + ReorderBufferChange *change; + char *data; + RelFileLocator target_locator; + + xlrec = (xl_heap_update *) XLogRecGetData(r); + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_UPDATE; + change->origin_id = XLogRecGetOrigin(r); + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + if (xlrec->flags & XLH_UPDATE_CONTAINS_NEW_TUPLE) + { + Size datalen; + Size tuplelen; + + data = XLogRecGetBlockData(r, 0, &datalen); + + tuplelen = datalen - SizeOfHeapHeader; + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple(data, datalen, change->data.tp.newtuple); + } + + if (xlrec->flags & XLH_UPDATE_CONTAINS_OLD) + { + Size datalen; + Size tuplelen; + + /* caution, remaining data in record is not aligned */ + data = XLogRecGetData(r) + SizeOfHeapUpdate; + datalen = XLogRecGetDataLen(r) - SizeOfHeapUpdate; + tuplelen = datalen - SizeOfHeapHeader; + + change->data.tp.oldtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple(data, datalen, change->data.tp.oldtuple); + } + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); +} + +/* + * Parse XLOG_HEAP_DELETE from wal into proper tuplebufs. + * + * Deletes can possibly contain the old primary key. 
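+ * + * When XLH_DELETE_CONTAINS_OLD is set, the old key/tuple follows the + * fixed-size struct in the main data area, i.e. it is read below from + * (char *) xlrec + SizeOfHeapDelete. A super-deletion of a speculative + * insertion (XLH_DELETE_IS_SUPER) carries no tuple and is turned into an + * internal REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT change instead.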
+ */ +static void +DecodeDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_heap_delete *xlrec; + ReorderBufferChange *change; + RelFileLocator target_locator; + + xlrec = (xl_heap_delete *) XLogRecGetData(r); + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + + if (xlrec->flags & XLH_DELETE_IS_SUPER) + change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT; + else + change->action = REORDER_BUFFER_CHANGE_DELETE; + + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + /* old primary key stored */ + if (xlrec->flags & XLH_DELETE_CONTAINS_OLD) + { + Size datalen = XLogRecGetDataLen(r) - SizeOfHeapDelete; + Size tuplelen = datalen - SizeOfHeapHeader; + + Assert(XLogRecGetDataLen(r) > (SizeOfHeapDelete + SizeOfHeapHeader)); + + change->data.tp.oldtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple((char *) xlrec + SizeOfHeapDelete, + datalen, change->data.tp.oldtuple); + } + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); +} + +/* + * Parse XLOG_HEAP_TRUNCATE from wal + */ +static void +DecodeTruncate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_heap_truncate *xlrec; + ReorderBufferChange *change; + + xlrec = (xl_heap_truncate *) XLogRecGetData(r); + + /* only interested in our database */ + if (xlrec->dbId != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_TRUNCATE; + change->origin_id = XLogRecGetOrigin(r); + if (xlrec->flags & XLH_TRUNCATE_CASCADE) + change->data.truncate.cascade = true; + if (xlrec->flags & XLH_TRUNCATE_RESTART_SEQS) + change->data.truncate.restart_seqs = true; + change->data.truncate.nrelids = xlrec->nrelids; + change->data.truncate.relids = ReorderBufferGetRelids(ctx->reorder, + xlrec->nrelids); + memcpy(change->data.truncate.relids, xlrec->relids, + xlrec->nrelids * sizeof(Oid)); + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), + buf->origptr, change, false); +} + +/* + * Decode XLOG_HEAP2_MULTI_INSERT record into multiple tuplebufs. + * + * Currently MULTI_INSERT will always contain the full tuples. + */ +static void +DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_heap_multi_insert *xlrec; + int i; + char *data; + char *tupledata; + Size tuplelen; + RelFileLocator rlocator; + + xlrec = (xl_heap_multi_insert *) XLogRecGetData(r); + + /* + * Ignore insert records without new tuples. This happens when a + * multi_insert is done on a catalog or on a non-persistent relation.
+ */ + if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE)) + return; + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &rlocator, NULL, NULL); + if (rlocator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + /* + * We know that this multi_insert isn't for a catalog, so the block should + * always have data even if a full-page write of it is taken. + */ + tupledata = XLogRecGetBlockData(r, 0, &tuplelen); + Assert(tupledata != NULL); + + data = tupledata; + for (i = 0; i < xlrec->ntuples; i++) + { + ReorderBufferChange *change; + xl_multi_insert_tuple *xlhdr; + int datalen; + ReorderBufferTupleBuf *tuple; + HeapTupleHeader header; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_INSERT; + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &rlocator, sizeof(RelFileLocator)); + + xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(data); + data = ((char *) xlhdr) + SizeOfMultiInsertTuple; + datalen = xlhdr->datalen; + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(ctx->reorder, datalen); + + tuple = change->data.tp.newtuple; + header = tuple->tuple.t_data; + + /* not a disk based tuple */ + ItemPointerSetInvalid(&tuple->tuple.t_self); + + /* + * We can only figure this out after reassembling the transactions. + */ + tuple->tuple.t_tableOid = InvalidOid; + + tuple->tuple.t_len = datalen + SizeofHeapTupleHeader; + + memset(header, 0, SizeofHeapTupleHeader); + + memcpy((char *) tuple->tuple.t_data + SizeofHeapTupleHeader, + (char *) data, + datalen); + header->t_infomask = xlhdr->t_infomask; + header->t_infomask2 = xlhdr->t_infomask2; + header->t_hoff = xlhdr->t_hoff; + + /* + * Reset toast reassembly state only after the last row in the last + * xl_multi_insert_tuple record emitted by one heap_multi_insert() + * call. + */ + if (xlrec->flags & XLH_INSERT_LAST_IN_MULTI && + (i + 1) == xlrec->ntuples) + change->data.tp.clear_toast_afterwards = true; + else + change->data.tp.clear_toast_afterwards = false; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), + buf->origptr, change, false); + + /* move to the next xl_multi_insert_tuple entry */ + data += datalen; + } + Assert(data == tupledata + tuplelen); +} + +/* + * Parse XLOG_HEAP_CONFIRM from wal into a confirmation change. + * + * This is pretty trivial, all the state is essentially already set up by the + * speculative insertion. + */ +static void +DecodeSpecConfirm(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + ReorderBufferChange *change; + RelFileLocator target_locator; + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM; + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); +} + + +/* + * Read a HeapTuple as WAL logged by heap_insert, heap_update and heap_delete + * (but not by heap_multi_insert) into a tuplebuf.
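+ * + * The WAL layout and the reconstructed in-memory tuple relate as follows, + * mirroring the memcpy() calls below: + * + *   WAL:    [xl_heap_header (SizeOfHeapHeader)][tuple data] + *   memory: [HeapTupleHeaderData (SizeofHeapTupleHeader, zeroed)][tuple data] + * + * hence t_len = (len - SizeOfHeapHeader) + SizeofHeapTupleHeader, with + * t_infomask, t_infomask2 and t_hoff copied over from the WAL header.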
+ * + * The size 'len' and the pointer 'data' in the record need to be + * computed outside as they are record specific. + */ +static void +DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tuple) +{ + xl_heap_header xlhdr; + int datalen = len - SizeOfHeapHeader; + HeapTupleHeader header; + + Assert(datalen >= 0); + + tuple->tuple.t_len = datalen + SizeofHeapTupleHeader; + header = tuple->tuple.t_data; + + /* not a disk based tuple */ + ItemPointerSetInvalid(&tuple->tuple.t_self); + + /* we can only figure this out after reassembling the transactions */ + tuple->tuple.t_tableOid = InvalidOid; + + /* data is not stored aligned, copy to aligned storage */ + memcpy((char *) &xlhdr, + data, + SizeOfHeapHeader); + + memset(header, 0, SizeofHeapTupleHeader); + + memcpy(((char *) tuple->tuple.t_data) + SizeofHeapTupleHeader, + data + SizeOfHeapHeader, + datalen); + + header->t_infomask = xlhdr.t_infomask; + header->t_infomask2 = xlhdr.t_infomask2; + header->t_hoff = xlhdr.t_hoff; +} + +/* + * Check whether we are interested in this specific transaction. + * + * There can be several reasons we might not be interested in this + * transaction: + * 1) We might not be interested in decoding transactions up to this + * LSN. This can happen because we previously decoded it and now just + * are restarting or if we haven't assembled a consistent snapshot yet. + * 2) The transaction happened in another database. + * 3) The output plugin is not interested in the origin. + * 4) We are doing fast-forwarding + */ +static bool +DecodeTXNNeedSkip(LogicalDecodingContext *ctx, XLogRecordBuffer *buf, + Oid txn_dbid, RepOriginId origin_id) +{ + return (SnapBuildXactNeedsSkip(ctx->snapshot_builder, buf->origptr) || + (txn_dbid != InvalidOid && txn_dbid != ctx->slot->data.database) || + ctx->fast_forward || FilterByOrigin(ctx, origin_id)); +} diff --git a/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/launcher.c b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/launcher.c new file mode 100644 index 00000000000..b7fa1cf652f --- /dev/null +++ b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/launcher.c @@ -0,0 +1,1333 @@ +/*------------------------------------------------------------------------- + * launcher.c + * PostgreSQL logical replication worker launcher process + * + * Copyright (c) 2016-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/launcher.c + * + * NOTES + * This module contains the logical replication worker launcher which + * uses the background worker infrastructure to start the logical + * replication workers for every enabled subscription. 
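+ * + * In rough outline, using only functions defined later in this file: + * ApplyLauncherRegister() registers the launcher bgworker at server start; + * ApplyLauncherMain() then loops over get_subscription_list() and calls + * logicalrep_worker_launch() for every enabled subscription that has no + * live apply worker, which in turn registers a dynamic background worker + * running ApplyWorkerMain() (or ParallelApplyWorkerMain()).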
+ * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/heapam.h" +#include "access/htup.h" +#include "access/htup_details.h" +#include "access/tableam.h" +#include "access/xact.h" +#include "catalog/pg_subscription.h" +#include "catalog/pg_subscription_rel.h" +#include "funcapi.h" +#include "lib/dshash.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/bgworker.h" +#include "postmaster/fork_process.h" +#include "postmaster/interrupt.h" +#include "postmaster/postmaster.h" +#include "replication/logicallauncher.h" +#include "replication/logicalworker.h" +#include "replication/slot.h" +#include "replication/walreceiver.h" +#include "replication/worker_internal.h" +#include "storage/ipc.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/procsignal.h" +#include "tcop/tcopprot.h" +#include "utils/builtins.h" +#include "utils/memutils.h" +#include "utils/pg_lsn.h" +#include "utils/ps_status.h" +#include "utils/snapmgr.h" +#include "utils/timeout.h" + +/* max sleep time between cycles (3min) */ +#define DEFAULT_NAPTIME_PER_CYCLE 180000L + +/* GUC variables */ +__thread int max_logical_replication_workers = 4; +__thread int max_sync_workers_per_subscription = 2; +__thread int max_parallel_apply_workers_per_subscription = 2; + +__thread LogicalRepWorker *MyLogicalRepWorker = NULL; + +typedef struct LogicalRepCtxStruct +{ + /* Supervisor process. */ + pid_t launcher_pid; + + /* Hash table holding last start times of subscriptions' apply workers. */ + dsa_handle last_start_dsa; + dshash_table_handle last_start_dsh; + + /* Background workers. */ + LogicalRepWorker workers[FLEXIBLE_ARRAY_MEMBER]; +} LogicalRepCtxStruct; + +static __thread LogicalRepCtxStruct *LogicalRepCtx; + +/* an entry in the last-start-times shared hash table */ +typedef struct LauncherLastStartTimesEntry +{ + Oid subid; /* OID of logrep subscription (hash key) */ + TimestampTz last_start_time; /* last time its apply worker was started */ +} LauncherLastStartTimesEntry; + +/* parameters for the last-start-times shared hash table */ +static const dshash_parameters dsh_params = { + sizeof(Oid), + sizeof(LauncherLastStartTimesEntry), + dshash_memcmp, + dshash_memhash, + LWTRANCHE_LAUNCHER_HASH +}; + +static __thread dsa_area *last_start_times_dsa = NULL; +static __thread dshash_table *last_start_times = NULL; + +static __thread bool on_commit_launcher_wakeup = false; + + +static void ApplyLauncherWakeup(void); +static void logicalrep_launcher_onexit(int code, Datum arg); +static void logicalrep_worker_onexit(int code, Datum arg); +static void logicalrep_worker_detach(void); +static void logicalrep_worker_cleanup(LogicalRepWorker *worker); +static int logicalrep_pa_worker_count(Oid subid); +static void logicalrep_launcher_attach_dshmem(void); +static void ApplyLauncherSetWorkerStartTime(Oid subid, TimestampTz start_time); +static TimestampTz ApplyLauncherGetWorkerStartTime(Oid subid); + + +/* + * Load the list of subscriptions. + * + * Only the fields interesting for worker start/stop functions are filled for + * each subscription. + */ +static List * +get_subscription_list(void) +{ + List *res = NIL; + Relation rel; + TableScanDesc scan; + HeapTuple tup; + MemoryContext resultcxt; + + /* This is the context that we will allocate our output data in */ + resultcxt = CurrentMemoryContext; + + /* + * Start a transaction so we can access pg_database, and get a snapshot. 
+ * We don't have a use for the snapshot itself, but we're interested in + * the secondary effect that it sets RecentGlobalXmin. (This is critical + * for anything that reads heap pages, because HOT may decide to prune + * them even if the process doesn't attempt to modify any tuples.) + * + * FIXME: This comment is inaccurate / the code buggy. A snapshot that is + * not pushed/active does not reliably prevent HOT pruning (->xmin could + * e.g. be cleared when cache invalidations are processed). + */ + StartTransactionCommand(); + (void) GetTransactionSnapshot(); + + rel = table_open(SubscriptionRelationId, AccessShareLock); + scan = table_beginscan_catalog(rel, 0, NULL); + + while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection))) + { + Form_pg_subscription subform = (Form_pg_subscription) GETSTRUCT(tup); + Subscription *sub; + MemoryContext oldcxt; + + /* + * Allocate our results in the caller's context, not the + * transaction's. We do this inside the loop, and restore the original + * context at the end, so that leaky things like heap_getnext() are + * not called in a potentially long-lived context. + */ + oldcxt = MemoryContextSwitchTo(resultcxt); + + sub = (Subscription *) palloc0(sizeof(Subscription)); + sub->oid = subform->oid; + sub->dbid = subform->subdbid; + sub->owner = subform->subowner; + sub->enabled = subform->subenabled; + sub->name = pstrdup(NameStr(subform->subname)); + /* We don't fill fields we are not interested in. */ + + res = lappend(res, sub); + MemoryContextSwitchTo(oldcxt); + } + + table_endscan(scan); + table_close(rel, AccessShareLock); + + CommitTransactionCommand(); + + return res; +} + +/* + * Wait for a background worker to start up and attach to the shmem context. + * + * This is only needed for cleaning up the shared memory in case the worker + * fails to attach. + * + * Returns whether the attach was successful. + */ +static bool +WaitForReplicationWorkerAttach(LogicalRepWorker *worker, + uint16 generation, + BackgroundWorkerHandle *handle) +{ + BgwHandleStatus status; + int rc; + + for (;;) + { + pid_t pid; + + CHECK_FOR_INTERRUPTS(); + + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + + /* Worker either died or has started. Return false if died. */ + if (!worker->in_use || worker->proc) + { + LWLockRelease(LogicalRepWorkerLock); + return worker->in_use; + } + + LWLockRelease(LogicalRepWorkerLock); + + /* Check if worker has died before attaching, and clean up after it. */ + status = GetBackgroundWorkerPid(handle, &pid); + + if (status == BGWH_STOPPED) + { + LWLockAcquire(LogicalRepWorkerLock, LW_EXCLUSIVE); + /* Ensure that this was indeed the worker we waited for. */ + if (generation == worker->generation) + logicalrep_worker_cleanup(worker); + LWLockRelease(LogicalRepWorkerLock); + return false; + } + + /* + * We need timeout because we generally don't get notified via latch + * about the worker attach. But we don't expect to have to wait long. + */ + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 10L, WAIT_EVENT_BGWORKER_STARTUP); + + if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + CHECK_FOR_INTERRUPTS(); + } + } +} + +/* + * Walks the workers array and searches for one that matches given + * subscription id and relid. + * + * We are only interested in the leader apply worker or table sync worker. 
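+ * + * Callers must hold LogicalRepWorkerLock (asserted below). A typical + * lookup, as used by the wakeup and stop paths later in this file: + * + *   LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + *   worker = logicalrep_worker_find(subid, relid, true); + *   if (worker) + *       logicalrep_worker_wakeup_ptr(worker); + *   LWLockRelease(LogicalRepWorkerLock);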
+ */ +LogicalRepWorker * +logicalrep_worker_find(Oid subid, Oid relid, bool only_running) +{ + int i; + LogicalRepWorker *res = NULL; + + Assert(LWLockHeldByMe(LogicalRepWorkerLock)); + + /* Search for attached worker for a given subscription id. */ + for (i = 0; i < max_logical_replication_workers; i++) + { + LogicalRepWorker *w = &LogicalRepCtx->workers[i]; + + /* Skip parallel apply workers. */ + if (isParallelApplyWorker(w)) + continue; + + if (w->in_use && w->subid == subid && w->relid == relid && + (!only_running || w->proc)) + { + res = w; + break; + } + } + + return res; +} + +/* + * Similar to logicalrep_worker_find(), but returns a list of all workers for + * the subscription, instead of just one. + */ +List * +logicalrep_workers_find(Oid subid, bool only_running) +{ + int i; + List *res = NIL; + + Assert(LWLockHeldByMe(LogicalRepWorkerLock)); + + /* Search for attached worker for a given subscription id. */ + for (i = 0; i < max_logical_replication_workers; i++) + { + LogicalRepWorker *w = &LogicalRepCtx->workers[i]; + + if (w->in_use && w->subid == subid && (!only_running || w->proc)) + res = lappend(res, w); + } + + return res; +} + +/* + * Start new logical replication background worker, if possible. + * + * Returns true on success, false on failure. + */ +bool +logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, Oid userid, + Oid relid, dsm_handle subworker_dsm) +{ + BackgroundWorker bgw; + BackgroundWorkerHandle *bgw_handle; + uint16 generation; + int i; + int slot = 0; + LogicalRepWorker *worker = NULL; + int nsyncworkers; + int nparallelapplyworkers; + TimestampTz now; + bool is_parallel_apply_worker = (subworker_dsm != DSM_HANDLE_INVALID); + + /* Sanity check - tablesync worker cannot be a subworker */ + Assert(!(is_parallel_apply_worker && OidIsValid(relid))); + + ereport(DEBUG1, + (errmsg_internal("starting logical replication worker for subscription \"%s\"", + subname))); + + /* Report this after the initial starting message for consistency. */ + if (max_replication_slots == 0) + ereport(ERROR, + (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), + errmsg("cannot start logical replication workers when max_replication_slots = 0"))); + + /* + * We need to do the modification of the shared memory under lock so that + * we have consistent view. + */ + LWLockAcquire(LogicalRepWorkerLock, LW_EXCLUSIVE); + +retry: + /* Find unused worker slot. */ + for (i = 0; i < max_logical_replication_workers; i++) + { + LogicalRepWorker *w = &LogicalRepCtx->workers[i]; + + if (!w->in_use) + { + worker = w; + slot = i; + break; + } + } + + nsyncworkers = logicalrep_sync_worker_count(subid); + + now = GetCurrentTimestamp(); + + /* + * If we didn't find a free slot, try to do garbage collection. The + * reason we do this is because if some worker failed to start up and its + * parent has crashed while waiting, the in_use state was never cleared. + */ + if (worker == NULL || nsyncworkers >= max_sync_workers_per_subscription) + { + bool did_cleanup = false; + + for (i = 0; i < max_logical_replication_workers; i++) + { + LogicalRepWorker *w = &LogicalRepCtx->workers[i]; + + /* + * If the worker was marked in use but didn't manage to attach in + * time, clean it up. 
+ */ + if (w->in_use && !w->proc && + TimestampDifferenceExceeds(w->launch_time, now, + wal_receiver_timeout)) + { + elog(WARNING, + "logical replication worker for subscription %u took too long to start; canceled", + w->subid); + + logicalrep_worker_cleanup(w); + did_cleanup = true; + } + } + + if (did_cleanup) + goto retry; + } + + /* + * We don't allow invoking more sync workers once we have reached the + * sync worker limit per subscription. So, just return silently as we + * might get here because of an otherwise harmless race condition. + */ + if (OidIsValid(relid) && nsyncworkers >= max_sync_workers_per_subscription) + { + LWLockRelease(LogicalRepWorkerLock); + return false; + } + + nparallelapplyworkers = logicalrep_pa_worker_count(subid); + + /* + * Return false if the number of parallel apply workers has reached the + * limit per subscription. + */ + if (is_parallel_apply_worker && + nparallelapplyworkers >= max_parallel_apply_workers_per_subscription) + { + LWLockRelease(LogicalRepWorkerLock); + return false; + } + + /* + * However, if there are no more free worker slots, inform the user + * before exiting. + */ + if (worker == NULL) + { + LWLockRelease(LogicalRepWorkerLock); + ereport(WARNING, + (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), + errmsg("out of logical replication worker slots"), + errhint("You might need to increase %s.", "max_logical_replication_workers"))); + return false; + } + + /* Prepare the worker slot. */ + worker->launch_time = now; + worker->in_use = true; + worker->generation++; + worker->proc = NULL; + worker->dbid = dbid; + worker->userid = userid; + worker->subid = subid; + worker->relid = relid; + worker->relstate = SUBREL_STATE_UNKNOWN; + worker->relstate_lsn = InvalidXLogRecPtr; + worker->stream_fileset = NULL; + worker->leader_pid = is_parallel_apply_worker ? MyProcPid : InvalidPid; + worker->parallel_apply = is_parallel_apply_worker; + worker->last_lsn = InvalidXLogRecPtr; + TIMESTAMP_NOBEGIN(worker->last_send_time); + TIMESTAMP_NOBEGIN(worker->last_recv_time); + worker->reply_lsn = InvalidXLogRecPtr; + TIMESTAMP_NOBEGIN(worker->reply_time); + + /* Before releasing lock, remember generation for future identification. */ + generation = worker->generation; + + LWLockRelease(LogicalRepWorkerLock); + + /* Register the new dynamic worker.
*/ + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | + BGWORKER_BACKEND_DATABASE_CONNECTION; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); + + if (is_parallel_apply_worker) + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "ParallelApplyWorkerMain"); + else + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "ApplyWorkerMain"); + + if (OidIsValid(relid)) + snprintf(bgw.bgw_name, BGW_MAXLEN, + "logical replication worker for subscription %u sync %u", subid, relid); + else if (is_parallel_apply_worker) + snprintf(bgw.bgw_name, BGW_MAXLEN, + "logical replication parallel apply worker for subscription %u", subid); + else + snprintf(bgw.bgw_name, BGW_MAXLEN, + "logical replication apply worker for subscription %u", subid); + + if (is_parallel_apply_worker) + snprintf(bgw.bgw_type, BGW_MAXLEN, "logical replication parallel worker"); + else + snprintf(bgw.bgw_type, BGW_MAXLEN, "logical replication worker"); + + bgw.bgw_restart_time = BGW_NEVER_RESTART; + bgw.bgw_notify_pid = MyProcPid; + bgw.bgw_main_arg = Int32GetDatum(slot); + + if (is_parallel_apply_worker) + memcpy(bgw.bgw_extra, &subworker_dsm, sizeof(dsm_handle)); + + if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle)) + { + /* Failed to start worker, so clean up the worker slot. */ + LWLockAcquire(LogicalRepWorkerLock, LW_EXCLUSIVE); + Assert(generation == worker->generation); + logicalrep_worker_cleanup(worker); + LWLockRelease(LogicalRepWorkerLock); + + ereport(WARNING, + (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), + errmsg("out of background worker slots"), + errhint("You might need to increase %s.", "max_worker_processes"))); + return false; + } + + /* Now wait until it attaches. */ + return WaitForReplicationWorkerAttach(worker, generation, bgw_handle); +} + +/* + * Internal function to stop the worker and wait until it detaches from the + * slot. + */ +static void +logicalrep_worker_stop_internal(LogicalRepWorker *worker, int signo) +{ + uint16 generation; + + Assert(LWLockHeldByMeInMode(LogicalRepWorkerLock, LW_SHARED)); + + /* + * Remember which generation was our worker so we can check if what we see + * is still the same one. + */ + generation = worker->generation; + + /* + * If we found a worker but it does not have proc set then it is still + * starting up; wait for it to finish starting and then kill it. + */ + while (worker->in_use && !worker->proc) + { + int rc; + + LWLockRelease(LogicalRepWorkerLock); + + /* Wait a bit --- we don't expect to have to wait long. */ + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 10L, WAIT_EVENT_BGWORKER_STARTUP); + + if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + CHECK_FOR_INTERRUPTS(); + } + + /* Recheck worker status. */ + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + + /* + * Check whether the worker slot is no longer used, which would mean + * that the worker has exited, or whether the worker generation is + * different, meaning that a different worker has taken the slot. + */ + if (!worker->in_use || worker->generation != generation) + return; + + /* Worker has assigned proc, so it has started. */ + if (worker->proc) + break; + } + + /* Now terminate the worker ... */ + kill(worker->proc->pid, signo); + + /* ... and wait for it to die. */ + for (;;) + { + int rc; + + /* is it gone? */ + if (!worker->proc || worker->generation != generation) + break; + + LWLockRelease(LogicalRepWorkerLock); + + /* Wait a bit --- we don't expect to have to wait long. 
*/ + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 10L, WAIT_EVENT_BGWORKER_SHUTDOWN); + + if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + CHECK_FOR_INTERRUPTS(); + } + + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + } +} + +/* + * Stop the logical replication worker for subid/relid, if any. + */ +void +logicalrep_worker_stop(Oid subid, Oid relid) +{ + LogicalRepWorker *worker; + + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + + worker = logicalrep_worker_find(subid, relid, false); + + if (worker) + { + Assert(!isParallelApplyWorker(worker)); + logicalrep_worker_stop_internal(worker, SIGTERM); + } + + LWLockRelease(LogicalRepWorkerLock); +} + +/* + * Stop the given logical replication parallel apply worker. + * + * Note that the function sends SIGINT instead of SIGTERM to the parallel apply + * worker so that the worker exits cleanly. + */ +void +logicalrep_pa_worker_stop(ParallelApplyWorkerInfo *winfo) +{ + int slot_no; + uint16 generation; + LogicalRepWorker *worker; + + SpinLockAcquire(&winfo->shared->mutex); + generation = winfo->shared->logicalrep_worker_generation; + slot_no = winfo->shared->logicalrep_worker_slot_no; + SpinLockRelease(&winfo->shared->mutex); + + Assert(slot_no >= 0 && slot_no < max_logical_replication_workers); + + /* + * Detach from the error_mq_handle for the parallel apply worker before + * stopping it. This prevents the leader apply worker from trying to + * receive the message from the error queue that might already be detached + * by the parallel apply worker. + */ + if (winfo->error_mq_handle) + { + shm_mq_detach(winfo->error_mq_handle); + winfo->error_mq_handle = NULL; + } + + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + + worker = &LogicalRepCtx->workers[slot_no]; + Assert(isParallelApplyWorker(worker)); + + /* + * Only stop the worker if the generation matches and the worker is alive. + */ + if (worker->generation == generation && worker->proc) + logicalrep_worker_stop_internal(worker, SIGINT); + + LWLockRelease(LogicalRepWorkerLock); +} + +/* + * Wake up (using latch) any logical replication worker for specified sub/rel. + */ +void +logicalrep_worker_wakeup(Oid subid, Oid relid) +{ + LogicalRepWorker *worker; + + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + + worker = logicalrep_worker_find(subid, relid, true); + + if (worker) + logicalrep_worker_wakeup_ptr(worker); + + LWLockRelease(LogicalRepWorkerLock); +} + +/* + * Wake up (using latch) the specified logical replication worker. + * + * Caller must hold lock, else worker->proc could change under us. + */ +void +logicalrep_worker_wakeup_ptr(LogicalRepWorker *worker) +{ + Assert(LWLockHeldByMe(LogicalRepWorkerLock)); + + SetLatch(&worker->proc->procLatch); +} + +/* + * Attach to a slot. + */ +void +logicalrep_worker_attach(int slot) +{ + /* Block concurrent access.
*/ + LWLockAcquire(LogicalRepWorkerLock, LW_EXCLUSIVE); + + Assert(slot >= 0 && slot < max_logical_replication_workers); + MyLogicalRepWorker = &LogicalRepCtx->workers[slot]; + + if (!MyLogicalRepWorker->in_use) + { + LWLockRelease(LogicalRepWorkerLock); + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication worker slot %d is empty, cannot attach", + slot))); + } + + if (MyLogicalRepWorker->proc) + { + LWLockRelease(LogicalRepWorkerLock); + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication worker slot %d is already used by " + "another worker, cannot attach", slot))); + } + + MyLogicalRepWorker->proc = MyProc; + before_shmem_exit(logicalrep_worker_onexit, (Datum) 0); + + LWLockRelease(LogicalRepWorkerLock); +} + +/* + * Stop the parallel apply workers if any, and detach the leader apply worker + * (cleans up the worker info). + */ +static void +logicalrep_worker_detach(void) +{ + /* Stop the parallel apply workers. */ + if (am_leader_apply_worker()) + { + List *workers; + ListCell *lc; + + /* + * Detach from the error_mq_handle for all parallel apply workers + * before terminating them. This prevents the leader apply worker from + * receiving the worker termination message and sending it to logs + * when the same is already done by the parallel worker. + */ + pa_detach_all_error_mq(); + + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + + workers = logicalrep_workers_find(MyLogicalRepWorker->subid, true); + foreach(lc, workers) + { + LogicalRepWorker *w = (LogicalRepWorker *) lfirst(lc); + + if (isParallelApplyWorker(w)) + logicalrep_worker_stop_internal(w, SIGTERM); + } + + LWLockRelease(LogicalRepWorkerLock); + } + + /* Block concurrent access. */ + LWLockAcquire(LogicalRepWorkerLock, LW_EXCLUSIVE); + + logicalrep_worker_cleanup(MyLogicalRepWorker); + + LWLockRelease(LogicalRepWorkerLock); +} + +/* + * Clean up worker info. + */ +static void +logicalrep_worker_cleanup(LogicalRepWorker *worker) +{ + Assert(LWLockHeldByMeInMode(LogicalRepWorkerLock, LW_EXCLUSIVE)); + + worker->in_use = false; + worker->proc = NULL; + worker->dbid = InvalidOid; + worker->userid = InvalidOid; + worker->subid = InvalidOid; + worker->relid = InvalidOid; + worker->leader_pid = InvalidPid; + worker->parallel_apply = false; +} + +/* + * Cleanup function for logical replication launcher. + * + * Called on logical replication launcher exit. + */ +static void +logicalrep_launcher_onexit(int code, Datum arg) +{ + LogicalRepCtx->launcher_pid = 0; +} + +/* + * Cleanup function. + * + * Called on logical replication worker exit. + */ +static void +logicalrep_worker_onexit(int code, Datum arg) +{ + /* Disconnect gracefully from the remote side. */ + if (LogRepWorkerWalRcvConn) + walrcv_disconnect(LogRepWorkerWalRcvConn); + + logicalrep_worker_detach(); + + /* Cleanup fileset used for streaming transactions. */ + if (MyLogicalRepWorker->stream_fileset != NULL) + FileSetDeleteAll(MyLogicalRepWorker->stream_fileset); + + /* + * Session level locks may be acquired outside of a transaction in + * parallel apply mode and will not be released when the worker + * terminates, so manually release all locks before the worker exits. + * + * The locks will be acquired once the worker is initialized. + */ + if (!InitializingApplyWorker) + LockReleaseAll(DEFAULT_LOCKMETHOD, true); + + ApplyLauncherWakeup(); +} + +/* + * Count the number of registered (not necessarily running) sync workers + * for a subscription. 
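+ * + * Must be called with LogicalRepWorkerLock held, like + * logicalrep_worker_find(). logicalrep_worker_launch() above uses the + * result to enforce the per-subscription limit, roughly: + * + *   nsyncworkers = logicalrep_sync_worker_count(subid); + *   if (OidIsValid(relid) && nsyncworkers >= max_sync_workers_per_subscription) + *       return false;   (give up silently)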
+ */ +int +logicalrep_sync_worker_count(Oid subid) +{ + int i; + int res = 0; + + Assert(LWLockHeldByMe(LogicalRepWorkerLock)); + + /* Search for attached worker for a given subscription id. */ + for (i = 0; i < max_logical_replication_workers; i++) + { + LogicalRepWorker *w = &LogicalRepCtx->workers[i]; + + if (w->subid == subid && OidIsValid(w->relid)) + res++; + } + + return res; +} + +/* + * Count the number of registered (but not necessarily running) parallel apply + * workers for a subscription. + */ +static int +logicalrep_pa_worker_count(Oid subid) +{ + int i; + int res = 0; + + Assert(LWLockHeldByMe(LogicalRepWorkerLock)); + + /* + * Scan all attached parallel apply workers, only counting those which + * have the given subscription id. + */ + for (i = 0; i < max_logical_replication_workers; i++) + { + LogicalRepWorker *w = &LogicalRepCtx->workers[i]; + + if (w->subid == subid && isParallelApplyWorker(w)) + res++; + } + + return res; +} + +/* + * ApplyLauncherShmemSize + * Compute space needed for replication launcher shared memory + */ +Size +ApplyLauncherShmemSize(void) +{ + Size size; + + /* + * Need the fixed struct and the array of LogicalRepWorker. + */ + size = sizeof(LogicalRepCtxStruct); + size = MAXALIGN(size); + size = add_size(size, mul_size(max_logical_replication_workers, + sizeof(LogicalRepWorker))); + return size; +} + +/* + * ApplyLauncherRegister + * Register a background worker running the logical replication launcher. + */ +void +ApplyLauncherRegister(void) +{ + BackgroundWorker bgw; + + if (max_logical_replication_workers == 0) + return; + + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | + BGWORKER_BACKEND_DATABASE_CONNECTION; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "ApplyLauncherMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, + "logical replication launcher"); + snprintf(bgw.bgw_type, BGW_MAXLEN, + "logical replication launcher"); + bgw.bgw_restart_time = 5; + bgw.bgw_notify_pid = 0; + bgw.bgw_main_arg = (Datum) 0; + + RegisterBackgroundWorker(&bgw); +} + +/* + * ApplyLauncherShmemInit + * Allocate and initialize replication launcher shared memory + */ +void +ApplyLauncherShmemInit(void) +{ + bool found; + + LogicalRepCtx = (LogicalRepCtxStruct *) + ShmemInitStruct("Logical Replication Launcher Data", + ApplyLauncherShmemSize(), + &found); + + if (!found) + { + int slot; + + memset(LogicalRepCtx, 0, ApplyLauncherShmemSize()); + + LogicalRepCtx->last_start_dsa = DSA_HANDLE_INVALID; + LogicalRepCtx->last_start_dsh = DSHASH_HANDLE_INVALID; + + /* Initialize memory and spin locks for each worker slot. */ + for (slot = 0; slot < max_logical_replication_workers; slot++) + { + LogicalRepWorker *worker = &LogicalRepCtx->workers[slot]; + + memset(worker, 0, sizeof(LogicalRepWorker)); + SpinLockInit(&worker->relmutex); + } + } +} + +/* + * Initialize or attach to the dynamic shared hash table that stores the + * last-start times, if not already done. + * This must be called before accessing the table. + */ +static void +logicalrep_launcher_attach_dshmem(void) +{ + MemoryContext oldcontext; + + /* Quick exit if we already did this. */ + if (LogicalRepCtx->last_start_dsh != DSHASH_HANDLE_INVALID && + last_start_times != NULL) + return; + + /* Otherwise, use a lock to ensure only one process creates the table. 
*/ + LWLockAcquire(LogicalRepWorkerLock, LW_EXCLUSIVE); + + /* Be sure any local memory allocated by DSA routines is persistent. */ + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + + if (LogicalRepCtx->last_start_dsh == DSHASH_HANDLE_INVALID) + { + /* Initialize dynamic shared hash table for last-start times. */ + last_start_times_dsa = dsa_create(LWTRANCHE_LAUNCHER_DSA); + dsa_pin(last_start_times_dsa); + dsa_pin_mapping(last_start_times_dsa); + last_start_times = dshash_create(last_start_times_dsa, &dsh_params, 0); + + /* Store handles in shared memory for other backends to use. */ + LogicalRepCtx->last_start_dsa = dsa_get_handle(last_start_times_dsa); + LogicalRepCtx->last_start_dsh = dshash_get_hash_table_handle(last_start_times); + } + else if (!last_start_times) + { + /* Attach to existing dynamic shared hash table. */ + last_start_times_dsa = dsa_attach(LogicalRepCtx->last_start_dsa); + dsa_pin_mapping(last_start_times_dsa); + last_start_times = dshash_attach(last_start_times_dsa, &dsh_params, + LogicalRepCtx->last_start_dsh, 0); + } + + MemoryContextSwitchTo(oldcontext); + LWLockRelease(LogicalRepWorkerLock); +} + +/* + * Set the last-start time for the subscription. + */ +static void +ApplyLauncherSetWorkerStartTime(Oid subid, TimestampTz start_time) +{ + LauncherLastStartTimesEntry *entry; + bool found; + + logicalrep_launcher_attach_dshmem(); + + entry = dshash_find_or_insert(last_start_times, &subid, &found); + entry->last_start_time = start_time; + dshash_release_lock(last_start_times, entry); +} + +/* + * Return the last-start time for the subscription, or 0 if there isn't one. + */ +static TimestampTz +ApplyLauncherGetWorkerStartTime(Oid subid) +{ + LauncherLastStartTimesEntry *entry; + TimestampTz ret; + + logicalrep_launcher_attach_dshmem(); + + entry = dshash_find(last_start_times, &subid, false); + if (entry == NULL) + return 0; + + ret = entry->last_start_time; + dshash_release_lock(last_start_times, entry); + + return ret; +} + +/* + * Remove the last-start-time entry for the subscription, if one exists. + * + * This has two use-cases: to remove the entry related to a subscription + * that's been deleted or disabled (just to avoid leaking shared memory), + * and to allow immediate restart of an apply worker that has exited + * due to subscription parameter changes. + */ +void +ApplyLauncherForgetWorkerStartTime(Oid subid) +{ + logicalrep_launcher_attach_dshmem(); + + (void) dshash_delete_key(last_start_times, &subid); +} + +/* + * Wakeup the launcher on commit if requested. + */ +void +AtEOXact_ApplyLauncher(bool isCommit) +{ + if (isCommit) + { + if (on_commit_launcher_wakeup) + ApplyLauncherWakeup(); + } + + on_commit_launcher_wakeup = false; +} + +/* + * Request wakeup of the launcher on commit of the transaction. + * + * This is used to send launcher signal to stop sleeping and process the + * subscriptions when current transaction commits. Should be used when new + * tuple was added to the pg_subscription catalog. +*/ +void +ApplyLauncherWakeupAtCommit(void) +{ + if (!on_commit_launcher_wakeup) + on_commit_launcher_wakeup = true; +} + +static void +ApplyLauncherWakeup(void) +{ + if (LogicalRepCtx->launcher_pid != 0) + kill(LogicalRepCtx->launcher_pid, SIGUSR1); +} + +/* + * Main loop for the apply launcher process. 
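+ *
+ * A worked example of the restart throttling in this loop, with made-up
+ * numbers: if wal_retrieve_retry_interval = 5000 ms and a subscription's
+ * worker last started 2000 ms ago, the launch is skipped for this cycle
+ * and wait_time becomes Min(DEFAULT_NAPTIME_PER_CYCLE, 5000 - 2000) =
+ * 3000 ms, so the launcher wakes up just as a retry becomes permissible.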
+ */ +void +ApplyLauncherMain(Datum main_arg) +{ + ereport(DEBUG1, + (errmsg_internal("logical replication launcher started"))); + + before_shmem_exit(logicalrep_launcher_onexit, (Datum) 0); + + Assert(LogicalRepCtx->launcher_pid == 0); + LogicalRepCtx->launcher_pid = MyProcPid; + + /* Establish signal handlers. */ + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGTERM, die); + BackgroundWorkerUnblockSignals(); + + /* + * Establish connection to nailed catalogs (we only ever access + * pg_subscription). + */ + BackgroundWorkerInitializeConnection(NULL, NULL, 0); + + /* Enter main loop */ + for (;;) + { + int rc; + List *sublist; + ListCell *lc; + MemoryContext subctx; + MemoryContext oldctx; + long wait_time = DEFAULT_NAPTIME_PER_CYCLE; + + CHECK_FOR_INTERRUPTS(); + + /* Use temporary context to avoid leaking memory across cycles. */ + subctx = AllocSetContextCreate(TopMemoryContext, + "Logical Replication Launcher sublist", + ALLOCSET_DEFAULT_SIZES); + oldctx = MemoryContextSwitchTo(subctx); + + /* Start any missing workers for enabled subscriptions. */ + sublist = get_subscription_list(); + foreach(lc, sublist) + { + Subscription *sub = (Subscription *) lfirst(lc); + LogicalRepWorker *w; + TimestampTz last_start; + TimestampTz now; + long elapsed; + + if (!sub->enabled) + continue; + + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + w = logicalrep_worker_find(sub->oid, InvalidOid, false); + LWLockRelease(LogicalRepWorkerLock); + + if (w != NULL) + continue; /* worker is running already */ + + /* + * If the worker is eligible to start now, launch it. Otherwise, + * adjust wait_time so that we'll wake up as soon as it can be + * started. + * + * Each subscription's apply worker can only be restarted once per + * wal_retrieve_retry_interval, so that errors do not cause us to + * repeatedly restart the worker as fast as possible. In cases + * where a restart is expected (e.g., subscription parameter + * changes), another process should remove the last-start entry + * for the subscription so that the worker can be restarted + * without waiting for wal_retrieve_retry_interval to elapse. + */ + last_start = ApplyLauncherGetWorkerStartTime(sub->oid); + now = GetCurrentTimestamp(); + if (last_start == 0 || + (elapsed = TimestampDifferenceMilliseconds(last_start, now)) >= wal_retrieve_retry_interval) + { + ApplyLauncherSetWorkerStartTime(sub->oid, now); + logicalrep_worker_launch(sub->dbid, sub->oid, sub->name, + sub->owner, InvalidOid, + DSM_HANDLE_INVALID); + } + else + { + wait_time = Min(wait_time, + wal_retrieve_retry_interval - elapsed); + } + } + + /* Switch back to original memory context. */ + MemoryContextSwitchTo(oldctx); + /* Clean the temporary memory. */ + MemoryContextDelete(subctx); + + /* Wait for more work. */ + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + wait_time, + WAIT_EVENT_LOGICAL_LAUNCHER_MAIN); + + if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + CHECK_FOR_INTERRUPTS(); + } + + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + } + + /* Not reachable */ +} + +/* + * Is current process the logical replication launcher? + */ +bool +IsLogicalLauncher(void) +{ + return LogicalRepCtx->launcher_pid == MyProcPid; +} + +/* + * Return the pid of the leader apply worker if the given pid is the pid of a + * parallel apply worker, otherwise, return InvalidPid. 
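+ *
+ * A hedged usage sketch (caller-side names are illustrative):
+ *
+ *     pid_t leader = GetLeaderApplyWorkerPid(some_pid);
+ *     if (leader != InvalidPid)
+ *         ... some_pid belongs to a parallel apply worker of leader ...
+ *
+ * The lookup takes LogicalRepWorkerLock internally, so callers need no
+ * locking of their own.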
+ */ +pid_t +GetLeaderApplyWorkerPid(pid_t pid) +{ + int leader_pid = InvalidPid; + int i; + + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + + for (i = 0; i < max_logical_replication_workers; i++) + { + LogicalRepWorker *w = &LogicalRepCtx->workers[i]; + + if (isParallelApplyWorker(w) && w->proc && pid == w->proc->pid) + { + leader_pid = w->leader_pid; + break; + } + } + + LWLockRelease(LogicalRepWorkerLock); + + return leader_pid; +} + +/* + * Returns state of the subscriptions. + */ +Datum +pg_stat_get_subscription(PG_FUNCTION_ARGS) +{ +#define PG_STAT_GET_SUBSCRIPTION_COLS 9 + Oid subid = PG_ARGISNULL(0) ? InvalidOid : PG_GETARG_OID(0); + int i; + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + + InitMaterializedSRF(fcinfo, 0); + + /* Make sure we get consistent view of the workers. */ + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + + for (i = 0; i < max_logical_replication_workers; i++) + { + /* for each row */ + Datum values[PG_STAT_GET_SUBSCRIPTION_COLS] = {0}; + bool nulls[PG_STAT_GET_SUBSCRIPTION_COLS] = {0}; + int worker_pid; + LogicalRepWorker worker; + + memcpy(&worker, &LogicalRepCtx->workers[i], + sizeof(LogicalRepWorker)); + if (!worker.proc || !IsBackendPid(worker.proc->pid)) + continue; + + if (OidIsValid(subid) && worker.subid != subid) + continue; + + worker_pid = worker.proc->pid; + + values[0] = ObjectIdGetDatum(worker.subid); + if (OidIsValid(worker.relid)) + values[1] = ObjectIdGetDatum(worker.relid); + else + nulls[1] = true; + values[2] = Int32GetDatum(worker_pid); + + if (isParallelApplyWorker(&worker)) + values[3] = Int32GetDatum(worker.leader_pid); + else + nulls[3] = true; + + if (XLogRecPtrIsInvalid(worker.last_lsn)) + nulls[4] = true; + else + values[4] = LSNGetDatum(worker.last_lsn); + if (worker.last_send_time == 0) + nulls[5] = true; + else + values[5] = TimestampTzGetDatum(worker.last_send_time); + if (worker.last_recv_time == 0) + nulls[6] = true; + else + values[6] = TimestampTzGetDatum(worker.last_recv_time); + if (XLogRecPtrIsInvalid(worker.reply_lsn)) + nulls[7] = true; + else + values[7] = LSNGetDatum(worker.reply_lsn); + if (worker.reply_time == 0) + nulls[8] = true; + else + values[8] = TimestampTzGetDatum(worker.reply_time); + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, + values, nulls); + + /* + * If only a single subscription was requested, and we found it, + * break. + */ + if (OidIsValid(subid)) + break; + } + + LWLockRelease(LogicalRepWorkerLock); + + return (Datum) 0; +} diff --git a/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/logical.c b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/logical.c new file mode 100644 index 00000000000..41243d0187a --- /dev/null +++ b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/logical.c @@ -0,0 +1,1951 @@ +/*------------------------------------------------------------------------- + * logical.c + * PostgreSQL logical decoding coordination + * + * Copyright (c) 2012-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/logical.c + * + * NOTES + * This file coordinates interaction between the various modules that + * together provide logical decoding, primarily by providing so + * called LogicalDecodingContexts. The goal is to encapsulate most of the + * internal complexity for consumers of logical decoding, so they can + * create and consume a changestream with a low amount of code. 
Builtin + * consumers are the walsender and SQL SRF interface, but it's possible to + * add further ones without changing core code, e.g. to consume changes in + * a bgworker. + * + * The idea is that a consumer provides three callbacks, one to read WAL, + * one to prepare a data write, and a final one for actually writing since + * their implementation depends on the type of consumer. Check + * logicalfuncs.c for an example implementation of a fairly simple consumer + * and an implementation of a WAL reading callback that's suitable for + * simple consumers. + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/xact.h" +#include "access/xlog_internal.h" +#include "fmgr.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "replication/decode.h" +#include "replication/logical.h" +#include "replication/origin.h" +#include "replication/reorderbuffer.h" +#include "replication/snapbuild.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "utils/builtins.h" +#include "utils/memutils.h" + +/* data for errcontext callback */ +typedef struct LogicalErrorCallbackState +{ + LogicalDecodingContext *ctx; + const char *callback_name; + XLogRecPtr report_location; +} LogicalErrorCallbackState; + +/* wrappers around output plugin callbacks */ +static void output_plugin_error_callback(void *arg); +static void startup_cb_wrapper(LogicalDecodingContext *ctx, OutputPluginOptions *opt, + bool is_init); +static void shutdown_cb_wrapper(LogicalDecodingContext *ctx); +static void begin_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn); +static void commit_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn); +static void begin_prepare_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn); +static void prepare_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr prepare_lsn); +static void commit_prepared_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn); +static void rollback_prepared_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr prepare_end_lsn, TimestampTz prepare_time); +static void change_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change); +static void truncate_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + int nrelations, Relation relations[], ReorderBufferChange *change); +static void message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr message_lsn, bool transactional, + const char *prefix, Size message_size, const char *message); + +/* streaming callbacks */ +static void stream_start_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr first_lsn); +static void stream_stop_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr last_lsn); +static void stream_abort_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr abort_lsn); +static void stream_prepare_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr prepare_lsn); +static void stream_commit_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn); +static void stream_change_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change); +static void stream_message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr message_lsn, bool transactional, + const char *prefix, Size message_size, const char *message); +static 
void stream_truncate_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + int nrelations, Relation relations[], ReorderBufferChange *change); + +/* callback to update txn's progress */ +static void update_progress_txn_cb_wrapper(ReorderBuffer *cache, + ReorderBufferTXN *txn, + XLogRecPtr lsn); + +static void LoadOutputPlugin(OutputPluginCallbacks *callbacks, const char *plugin); + +/* + * Make sure the current settings & environment are capable of doing logical + * decoding. + */ +void +CheckLogicalDecodingRequirements(void) +{ + CheckSlotRequirements(); + + /* + * NB: Adding a new requirement likely means that RestoreSlotFromDisk() + * needs the same check. + */ + + if (wal_level < WAL_LEVEL_LOGICAL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical decoding requires wal_level >= logical"))); + + if (MyDatabaseId == InvalidOid) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical decoding requires a database connection"))); + + if (RecoveryInProgress()) + { + /* + * This check may have race conditions, but whenever + * XLOG_PARAMETER_CHANGE indicates that wal_level has changed, we + * verify that there are no existing logical replication slots. And to + * avoid races around creating a new slot, + * CheckLogicalDecodingRequirements() is called once before creating + * the slot, and once when logical decoding is initially starting up. + */ + if (GetActiveWalLevelOnStandby() < WAL_LEVEL_LOGICAL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical decoding on standby requires wal_level >= logical on the primary"))); + } +} + +/* + * Helper function for CreateInitDecodingContext() and + * CreateDecodingContext() performing common tasks. + */ +static LogicalDecodingContext * +StartupDecodingContext(List *output_plugin_options, + XLogRecPtr start_lsn, + TransactionId xmin_horizon, + bool need_full_snapshot, + bool fast_forward, + XLogReaderRoutine *xl_routine, + LogicalOutputPluginWriterPrepareWrite prepare_write, + LogicalOutputPluginWriterWrite do_write, + LogicalOutputPluginWriterUpdateProgress update_progress) +{ + ReplicationSlot *slot; + MemoryContext context, + old_context; + LogicalDecodingContext *ctx; + + /* shorter lines... */ + slot = MyReplicationSlot; + + context = AllocSetContextCreate(CurrentMemoryContext, + "Logical decoding context", + ALLOCSET_DEFAULT_SIZES); + old_context = MemoryContextSwitchTo(context); + ctx = palloc0(sizeof(LogicalDecodingContext)); + + ctx->context = context; + + /* + * (re-)load output plugins, so we detect a bad (removed) output plugin + * now. + */ + if (!fast_forward) + LoadOutputPlugin(&ctx->callbacks, NameStr(slot->data.plugin)); + + /* + * Now that the slot's xmin has been set, we can announce ourselves as a + * logical decoding backend which doesn't need to be checked individually + * when computing the xmin horizon because the xmin is enforced via + * replication slots. + * + * We can only do so if we're outside of a transaction (i.e. the case when + * streaming changes via walsender), otherwise an already setup + * snapshot/xid would end up being ignored. That's not a particularly + * bothersome restriction since the SQL interface can't be used for + * streaming anyway. 
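+ *
+ * (A loose sketch of the effect: with PROC_IN_LOGICAL_DECODING set, the
+ * horizon computations in procarray.c skip this backend's own xmin and
+ * only the slot's xmin/catalog_xmin hold back cleanup.)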
+ */ + if (!IsTransactionOrTransactionBlock()) + { + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + MyProc->statusFlags |= PROC_IN_LOGICAL_DECODING; + ProcGlobal->statusFlags[MyProc->pgxactoff] = MyProc->statusFlags; + LWLockRelease(ProcArrayLock); + } + + ctx->slot = slot; + + ctx->reader = XLogReaderAllocate(wal_segment_size, NULL, xl_routine, ctx); + if (!ctx->reader) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed while allocating a WAL reading processor."))); + + ctx->reorder = ReorderBufferAllocate(); + ctx->snapshot_builder = + AllocateSnapshotBuilder(ctx->reorder, xmin_horizon, start_lsn, + need_full_snapshot, slot->data.two_phase_at); + + ctx->reorder->private_data = ctx; + + /* wrap output plugin callbacks, so we can add error context information */ + ctx->reorder->begin = begin_cb_wrapper; + ctx->reorder->apply_change = change_cb_wrapper; + ctx->reorder->apply_truncate = truncate_cb_wrapper; + ctx->reorder->commit = commit_cb_wrapper; + ctx->reorder->message = message_cb_wrapper; + + /* + * To support streaming, we require start/stop/abort/commit/change + * callbacks. The message and truncate callbacks are optional, similar to + * regular output plugins. We however enable streaming when at least one + * of the methods is enabled so that we can easily identify missing + * methods. + * + * We decide it here, but only check it later in the wrappers. + */ + ctx->streaming = (ctx->callbacks.stream_start_cb != NULL) || + (ctx->callbacks.stream_stop_cb != NULL) || + (ctx->callbacks.stream_abort_cb != NULL) || + (ctx->callbacks.stream_commit_cb != NULL) || + (ctx->callbacks.stream_change_cb != NULL) || + (ctx->callbacks.stream_message_cb != NULL) || + (ctx->callbacks.stream_truncate_cb != NULL); + + /* + * streaming callbacks + * + * stream_message and stream_truncate callbacks are optional, so we do not + * fail with ERROR when missing, but the wrappers simply do nothing. We + * must set the ReorderBuffer callbacks to something, otherwise the calls + * from there will crash (we don't want to move the checks there). + */ + ctx->reorder->stream_start = stream_start_cb_wrapper; + ctx->reorder->stream_stop = stream_stop_cb_wrapper; + ctx->reorder->stream_abort = stream_abort_cb_wrapper; + ctx->reorder->stream_prepare = stream_prepare_cb_wrapper; + ctx->reorder->stream_commit = stream_commit_cb_wrapper; + ctx->reorder->stream_change = stream_change_cb_wrapper; + ctx->reorder->stream_message = stream_message_cb_wrapper; + ctx->reorder->stream_truncate = stream_truncate_cb_wrapper; + + + /* + * To support two-phase logical decoding, we require + * begin_prepare/prepare/commit-prepare/abort-prepare callbacks. The + * filter_prepare callback is optional. We however enable two-phase + * logical decoding when at least one of the methods is enabled so that we + * can easily identify missing methods. + * + * We decide it here, but only check it later in the wrappers. + */ + ctx->twophase = (ctx->callbacks.begin_prepare_cb != NULL) || + (ctx->callbacks.prepare_cb != NULL) || + (ctx->callbacks.commit_prepared_cb != NULL) || + (ctx->callbacks.rollback_prepared_cb != NULL) || + (ctx->callbacks.stream_prepare_cb != NULL) || + (ctx->callbacks.filter_prepare_cb != NULL); + + /* + * Callback to support decoding at prepare time. 
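+ *
+ * As with streaming above, ctx->twophase was just set if the plugin
+ * registered at least one two-phase callback; e.g. a plugin providing
+ * only filter_prepare_cb still gets ctx->twophase = true, and the
+ * mandatory prepare-time callbacks are enforced by their wrappers only
+ * when a prepared transaction is actually decoded.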
+ */ + ctx->reorder->begin_prepare = begin_prepare_cb_wrapper; + ctx->reorder->prepare = prepare_cb_wrapper; + ctx->reorder->commit_prepared = commit_prepared_cb_wrapper; + ctx->reorder->rollback_prepared = rollback_prepared_cb_wrapper; + + /* + * Callback to support updating progress during sending data of a + * transaction (and its subtransactions) to the output plugin. + */ + ctx->reorder->update_progress_txn = update_progress_txn_cb_wrapper; + + ctx->out = makeStringInfo(); + ctx->prepare_write = prepare_write; + ctx->write = do_write; + ctx->update_progress = update_progress; + + ctx->output_plugin_options = output_plugin_options; + + ctx->fast_forward = fast_forward; + + MemoryContextSwitchTo(old_context); + + return ctx; +} + +/* + * Create a new decoding context, for a new logical slot. + * + * plugin -- contains the name of the output plugin + * output_plugin_options -- contains options passed to the output plugin + * need_full_snapshot -- if true, must obtain a snapshot able to read all + * tables; if false, one that can read only catalogs is acceptable. + * restart_lsn -- if given as invalid, it's this routine's responsibility to + * mark WAL as reserved by setting a convenient restart_lsn for the slot. + * Otherwise, we set for decoding to start from the given LSN without + * marking WAL reserved beforehand. In that scenario, it's up to the + * caller to guarantee that WAL remains available. + * xl_routine -- XLogReaderRoutine for underlying XLogReader + * prepare_write, do_write, update_progress -- + * callbacks that perform the use-case dependent, actual, work. + * + * Needs to be called while in a memory context that's at least as long lived + * as the decoding context because further memory contexts will be created + * inside it. + * + * Returns an initialized decoding context after calling the output plugin's + * startup function. + */ +LogicalDecodingContext * +CreateInitDecodingContext(const char *plugin, + List *output_plugin_options, + bool need_full_snapshot, + XLogRecPtr restart_lsn, + XLogReaderRoutine *xl_routine, + LogicalOutputPluginWriterPrepareWrite prepare_write, + LogicalOutputPluginWriterWrite do_write, + LogicalOutputPluginWriterUpdateProgress update_progress) +{ + TransactionId xmin_horizon = InvalidTransactionId; + ReplicationSlot *slot; + NameData plugin_name; + LogicalDecodingContext *ctx; + MemoryContext old_context; + + /* + * On a standby, this check is also required while creating the slot. + * Check the comments in the function. + */ + CheckLogicalDecodingRequirements(); + + /* shorter lines... */ + slot = MyReplicationSlot; + + /* first some sanity checks that are unlikely to be violated */ + if (slot == NULL) + elog(ERROR, "cannot perform logical decoding without an acquired slot"); + + if (plugin == NULL) + elog(ERROR, "cannot initialize logical decoding without a specified plugin"); + + /* Make sure the passed slot is suitable. These are user facing errors. 
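+ * Three misuses are rejected below, in order: using a physical slot,
+ * using a slot that was created in a different database, and creating a
+ * slot inside a transaction that has already performed writes.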
*/ + if (SlotIsPhysical(slot)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot use physical replication slot for logical decoding"))); + + if (slot->data.database != MyDatabaseId) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("replication slot \"%s\" was not created in this database", + NameStr(slot->data.name)))); + + if (IsTransactionState() && + GetTopTransactionIdIfAny() != InvalidTransactionId) + ereport(ERROR, + (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), + errmsg("cannot create logical replication slot in transaction that has performed writes"))); + + /* + * Register output plugin name with slot. We need the mutex to avoid + * concurrent reading of a partially copied string. But we don't want any + * complicated code while holding a spinlock, so do namestrcpy() outside. + */ + namestrcpy(&plugin_name, plugin); + SpinLockAcquire(&slot->mutex); + slot->data.plugin = plugin_name; + SpinLockRelease(&slot->mutex); + + if (XLogRecPtrIsInvalid(restart_lsn)) + ReplicationSlotReserveWal(); + else + { + SpinLockAcquire(&slot->mutex); + slot->data.restart_lsn = restart_lsn; + SpinLockRelease(&slot->mutex); + } + + /* ---- + * This is a bit tricky: We need to determine a safe xmin horizon to start + * decoding from, to avoid starting from a running xacts record referring + * to xids whose rows have been vacuumed or pruned + * already. GetOldestSafeDecodingTransactionId() returns such a value, but + * without further interlock its return value might immediately be out of + * date. + * + * So we have to acquire the ProcArrayLock to prevent computation of new + * xmin horizons by other backends, get the safe decoding xid, and inform + * the slot machinery about the new limit. Once that's done the + * ProcArrayLock can be released as the slot machinery now is + * protecting against vacuum. + * + * Note that, temporarily, the data, not just the catalog, xmin has to be + * reserved if a data snapshot is to be exported. Otherwise the initial + * data snapshot created here is not guaranteed to be valid. After that + * the data xmin doesn't need to be managed anymore and the global xmin + * should be recomputed. As we are fine with losing the pegged data xmin + * after crash - no chance a snapshot would get exported anymore - we can + * get away with just setting the slot's + * effective_xmin. ReplicationSlotRelease will reset it again. 
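+ *
+ * Sketch of the race being closed here: without holding ProcArrayLock
+ * across both steps, another backend could advance the global xmin
+ * horizon in the window between GetOldestSafeDecodingTransactionId()
+ * and the slot update below, letting VACUUM remove catalog rows this
+ * slot still needs.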
+ * + * ---- + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + xmin_horizon = GetOldestSafeDecodingTransactionId(!need_full_snapshot); + + SpinLockAcquire(&slot->mutex); + slot->effective_catalog_xmin = xmin_horizon; + slot->data.catalog_xmin = xmin_horizon; + if (need_full_snapshot) + slot->effective_xmin = xmin_horizon; + SpinLockRelease(&slot->mutex); + + ReplicationSlotsComputeRequiredXmin(true); + + LWLockRelease(ProcArrayLock); + + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); + + ctx = StartupDecodingContext(NIL, restart_lsn, xmin_horizon, + need_full_snapshot, false, + xl_routine, prepare_write, do_write, + update_progress); + + /* call output plugin initialization callback */ + old_context = MemoryContextSwitchTo(ctx->context); + if (ctx->callbacks.startup_cb != NULL) + startup_cb_wrapper(ctx, &ctx->options, true); + MemoryContextSwitchTo(old_context); + + /* + * We allow decoding of prepared transactions when the two_phase is + * enabled at the time of slot creation, or when the two_phase option is + * given at the streaming start, provided the plugin supports all the + * callbacks for two-phase. + */ + ctx->twophase &= slot->data.two_phase; + + ctx->reorder->output_rewrites = ctx->options.receive_rewrites; + + return ctx; +} + +/* + * Create a new decoding context, for a logical slot that has previously been + * used already. + * + * start_lsn + * The LSN at which to start decoding. If InvalidXLogRecPtr, restart + * from the slot's confirmed_flush; otherwise, start from the specified + * location (but move it forwards to confirmed_flush if it's older than + * that, see below). + * + * output_plugin_options + * options passed to the output plugin. + * + * fast_forward + * bypass the generation of logical changes. + * + * xl_routine + * XLogReaderRoutine used by underlying xlogreader + * + * prepare_write, do_write, update_progress + * callbacks that have to be filled to perform the use-case dependent, + * actual work. + * + * Needs to be called while in a memory context that's at least as long lived + * as the decoding context because further memory contexts will be created + * inside it. + * + * Returns an initialized decoding context after calling the output plugin's + * startup function. + */ +LogicalDecodingContext * +CreateDecodingContext(XLogRecPtr start_lsn, + List *output_plugin_options, + bool fast_forward, + XLogReaderRoutine *xl_routine, + LogicalOutputPluginWriterPrepareWrite prepare_write, + LogicalOutputPluginWriterWrite do_write, + LogicalOutputPluginWriterUpdateProgress update_progress) +{ + LogicalDecodingContext *ctx; + ReplicationSlot *slot; + MemoryContext old_context; + + /* shorter lines... */ + slot = MyReplicationSlot; + + /* first some sanity checks that are unlikely to be violated */ + if (slot == NULL) + elog(ERROR, "cannot perform logical decoding without an acquired slot"); + + /* make sure the passed slot is suitable, these are user facing errors */ + if (SlotIsPhysical(slot)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot use physical replication slot for logical decoding"))); + + if (slot->data.database != MyDatabaseId) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("replication slot \"%s\" was not created in this database", + NameStr(slot->data.name)))); + + /* + * Check if slot has been invalidated due to max_slot_wal_keep_size. 
Avoid + * "cannot get changes" wording in this errmsg because that'd be + * confusingly ambiguous about no changes being available when called from + * pg_logical_slot_get_changes_guts(). + */ + if (MyReplicationSlot->data.invalidated == RS_INVAL_WAL_REMOVED) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("can no longer get changes from replication slot \"%s\"", + NameStr(MyReplicationSlot->data.name)), + errdetail("This slot has been invalidated because it exceeded the maximum reserved size."))); + + if (MyReplicationSlot->data.invalidated != RS_INVAL_NONE) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("can no longer get changes from replication slot \"%s\"", + NameStr(MyReplicationSlot->data.name)), + errdetail("This slot has been invalidated because it was conflicting with recovery."))); + + Assert(MyReplicationSlot->data.invalidated == RS_INVAL_NONE); + Assert(MyReplicationSlot->data.restart_lsn != InvalidXLogRecPtr); + + if (start_lsn == InvalidXLogRecPtr) + { + /* continue from last position */ + start_lsn = slot->data.confirmed_flush; + } + else if (start_lsn < slot->data.confirmed_flush) + { + /* + * It might seem like we should error out in this case, but it's + * pretty common for a client to acknowledge a LSN it doesn't have to + * do anything for, and thus didn't store persistently, because the + * xlog records didn't result in anything relevant for logical + * decoding. Clients have to be able to do that to support synchronous + * replication. + * + * Starting at a different LSN than requested might not catch certain + * kinds of client errors; so the client may wish to check that + * confirmed_flush_lsn matches its expectations. + */ + elog(LOG, "%X/%X has been already streamed, forwarding to %X/%X", + LSN_FORMAT_ARGS(start_lsn), + LSN_FORMAT_ARGS(slot->data.confirmed_flush)); + + start_lsn = slot->data.confirmed_flush; + } + + ctx = StartupDecodingContext(output_plugin_options, + start_lsn, InvalidTransactionId, false, + fast_forward, xl_routine, prepare_write, + do_write, update_progress); + + /* call output plugin initialization callback */ + old_context = MemoryContextSwitchTo(ctx->context); + if (ctx->callbacks.startup_cb != NULL) + startup_cb_wrapper(ctx, &ctx->options, false); + MemoryContextSwitchTo(old_context); + + /* + * We allow decoding of prepared transactions when the two_phase is + * enabled at the time of slot creation, or when the two_phase option is + * given at the streaming start, provided the plugin supports all the + * callbacks for two-phase. + */ + ctx->twophase &= (slot->data.two_phase || ctx->twophase_opt_given); + + /* Mark slot to allow two_phase decoding if not already marked */ + if (ctx->twophase && !slot->data.two_phase) + { + SpinLockAcquire(&slot->mutex); + slot->data.two_phase = true; + slot->data.two_phase_at = start_lsn; + SpinLockRelease(&slot->mutex); + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); + SnapBuildSetTwoPhaseAt(ctx->snapshot_builder, start_lsn); + } + + ctx->reorder->output_rewrites = ctx->options.receive_rewrites; + + ereport(LOG, + (errmsg("starting logical decoding for slot \"%s\"", + NameStr(slot->data.name)), + errdetail("Streaming transactions committing after %X/%X, reading WAL from %X/%X.", + LSN_FORMAT_ARGS(slot->data.confirmed_flush), + LSN_FORMAT_ARGS(slot->data.restart_lsn)))); + + return ctx; +} + +/* + * Returns true if a consistent initial decoding snapshot has been built. 
+ */ +bool +DecodingContextReady(LogicalDecodingContext *ctx) +{ + return SnapBuildCurrentState(ctx->snapshot_builder) == SNAPBUILD_CONSISTENT; +} + +/* + * Read from the decoding slot, until it is ready to start extracting changes. + */ +void +DecodingContextFindStartpoint(LogicalDecodingContext *ctx) +{ + ReplicationSlot *slot = ctx->slot; + + /* Initialize from where to start reading WAL. */ + XLogBeginRead(ctx->reader, slot->data.restart_lsn); + + elog(DEBUG1, "searching for logical decoding starting point, starting at %X/%X", + LSN_FORMAT_ARGS(slot->data.restart_lsn)); + + /* Wait for a consistent starting point */ + for (;;) + { + XLogRecord *record; + char *err = NULL; + + /* the read_page callback waits for new WAL */ + record = XLogReadRecord(ctx->reader, &err); + if (err) + elog(ERROR, "could not find logical decoding starting point: %s", err); + if (!record) + elog(ERROR, "could not find logical decoding starting point"); + + LogicalDecodingProcessRecord(ctx, ctx->reader); + + /* only continue till we found a consistent spot */ + if (DecodingContextReady(ctx)) + break; + + CHECK_FOR_INTERRUPTS(); + } + + SpinLockAcquire(&slot->mutex); + slot->data.confirmed_flush = ctx->reader->EndRecPtr; + if (slot->data.two_phase) + slot->data.two_phase_at = ctx->reader->EndRecPtr; + SpinLockRelease(&slot->mutex); +} + +/* + * Free a previously allocated decoding context, invoking the shutdown + * callback if necessary. + */ +void +FreeDecodingContext(LogicalDecodingContext *ctx) +{ + if (ctx->callbacks.shutdown_cb != NULL) + shutdown_cb_wrapper(ctx); + + ReorderBufferFree(ctx->reorder); + FreeSnapshotBuilder(ctx->snapshot_builder); + XLogReaderFree(ctx->reader); + MemoryContextDelete(ctx->context); +} + +/* + * Prepare a write using the context's output routine. + */ +void +OutputPluginPrepareWrite(struct LogicalDecodingContext *ctx, bool last_write) +{ + if (!ctx->accept_writes) + elog(ERROR, "writes are only accepted in commit, begin and change callbacks"); + + ctx->prepare_write(ctx, ctx->write_location, ctx->write_xid, last_write); + ctx->prepared_write = true; +} + +/* + * Perform a write using the context's output routine. + */ +void +OutputPluginWrite(struct LogicalDecodingContext *ctx, bool last_write) +{ + if (!ctx->prepared_write) + elog(ERROR, "OutputPluginPrepareWrite needs to be called before OutputPluginWrite"); + + ctx->write(ctx, ctx->write_location, ctx->write_xid, last_write); + ctx->prepared_write = false; +} + +/* + * Update progress tracking (if supported). + */ +void +OutputPluginUpdateProgress(struct LogicalDecodingContext *ctx, + bool skipped_xact) +{ + if (!ctx->update_progress) + return; + + ctx->update_progress(ctx, ctx->write_location, ctx->write_xid, + skipped_xact); +} + +/* + * Load the output plugin, lookup its output plugin init function, and check + * that it provides the required callbacks. 
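+ *
+ * For reference, a minimal plugin satisfying the contract checked below
+ * looks roughly like this (the my_* names are hypothetical):
+ *
+ *     void
+ *     _PG_output_plugin_init(OutputPluginCallbacks *cb)
+ *     {
+ *         cb->begin_cb = my_begin;
+ *         cb->change_cb = my_change;
+ *         cb->commit_cb = my_commit;
+ *     }
+ *
+ * All other callbacks (startup, shutdown, streaming, two-phase, ...) are
+ * optional.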
+ */ +static void +LoadOutputPlugin(OutputPluginCallbacks *callbacks, const char *plugin) +{ + LogicalOutputPluginInit plugin_init; + + plugin_init = (LogicalOutputPluginInit) + load_external_function(plugin, "_PG_output_plugin_init", false, NULL); + + if (plugin_init == NULL) + elog(ERROR, "output plugins have to declare the _PG_output_plugin_init symbol"); + + /* ask the output plugin to fill the callback struct */ + plugin_init(callbacks); + + if (callbacks->begin_cb == NULL) + elog(ERROR, "output plugins have to register a begin callback"); + if (callbacks->change_cb == NULL) + elog(ERROR, "output plugins have to register a change callback"); + if (callbacks->commit_cb == NULL) + elog(ERROR, "output plugins have to register a commit callback"); +} + +static void +output_plugin_error_callback(void *arg) +{ + LogicalErrorCallbackState *state = (LogicalErrorCallbackState *) arg; + + /* not all callbacks have an associated LSN */ + if (state->report_location != InvalidXLogRecPtr) + errcontext("slot \"%s\", output plugin \"%s\", in the %s callback, associated LSN %X/%X", + NameStr(state->ctx->slot->data.name), + NameStr(state->ctx->slot->data.plugin), + state->callback_name, + LSN_FORMAT_ARGS(state->report_location)); + else + errcontext("slot \"%s\", output plugin \"%s\", in the %s callback", + NameStr(state->ctx->slot->data.name), + NameStr(state->ctx->slot->data.plugin), + state->callback_name); +} + +static void +startup_cb_wrapper(LogicalDecodingContext *ctx, OutputPluginOptions *opt, bool is_init) +{ + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "startup"; + state.report_location = InvalidXLogRecPtr; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = false; + ctx->end_xact = false; + + /* do the actual work: call callback */ + ctx->callbacks.startup_cb(ctx, opt, is_init); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +shutdown_cb_wrapper(LogicalDecodingContext *ctx) +{ + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "shutdown"; + state.report_location = InvalidXLogRecPtr; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = false; + ctx->end_xact = false; + + /* do the actual work: call callback */ + ctx->callbacks.shutdown_cb(ctx); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + + +/* + * Callbacks for ReorderBuffer which add in some more information and then call + * output_plugin.h plugins. 
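+ *
+ * Every wrapper below follows the same shape, sketched:
+ *
+ *     1. push output_plugin_error_callback, with the callback name and a
+ *        report LSN, onto error_context_stack;
+ *     2. set the output state (accept_writes, write_xid, write_location,
+ *        end_xact);
+ *     3. invoke the plugin callback;
+ *     4. pop the error context stack.
+ *
+ * Step 1 is what makes an ERROR thrown inside a plugin come out annotated
+ * with slot, plugin, callback name and LSN.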
+ */ +static void +begin_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "begin"; + state.report_location = txn->first_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = txn->first_lsn; + ctx->end_xact = false; + + /* do the actual work: call callback */ + ctx->callbacks.begin_cb(ctx, txn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +commit_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "commit"; + state.report_location = txn->final_lsn; /* beginning of commit record */ + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = txn->end_lsn; /* points to the end of the record */ + ctx->end_xact = true; + + /* do the actual work: call callback */ + ctx->callbacks.commit_cb(ctx, txn, commit_lsn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +/* + * The functionality of begin_prepare is quite similar to begin with the + * exception that this will have gid (global transaction id) information which + * can be used by plugin. Now, we thought about extending the existing begin + * but that would break the replication protocol and additionally this looks + * cleaner. 
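+ *
+ * Illustrative example: with two-phase decoding enabled, a
+ * PREPARE TRANSACTION 'tx1' on the origin reaches the plugin as a
+ * begin_prepare call whose txn->gid is "tx1", which the plugin can use
+ * to issue the matching PREPARE TRANSACTION downstream.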
+ */ +static void +begin_prepare_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when two-phase commits are supported */ + Assert(ctx->twophase); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "begin_prepare"; + state.report_location = txn->first_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = txn->first_lsn; + ctx->end_xact = false; + + /* + * If the plugin supports two-phase commits then begin prepare callback is + * mandatory + */ + if (ctx->callbacks.begin_prepare_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication at prepare time requires a %s callback", + "begin_prepare_cb"))); + + /* do the actual work: call callback */ + ctx->callbacks.begin_prepare_cb(ctx, txn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +prepare_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr prepare_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when two-phase commits are supported */ + Assert(ctx->twophase); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "prepare"; + state.report_location = txn->final_lsn; /* beginning of prepare record */ + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = txn->end_lsn; /* points to the end of the record */ + ctx->end_xact = true; + + /* + * If the plugin supports two-phase commits then prepare callback is + * mandatory + */ + if (ctx->callbacks.prepare_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication at prepare time requires a %s callback", + "prepare_cb"))); + + /* do the actual work: call callback */ + ctx->callbacks.prepare_cb(ctx, txn, prepare_lsn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +commit_prepared_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when two-phase commits are supported */ + Assert(ctx->twophase); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "commit_prepared"; + state.report_location = txn->final_lsn; /* beginning of commit record */ + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = 
txn->xid; + ctx->write_location = txn->end_lsn; /* points to the end of the record */ + ctx->end_xact = true; + + /* + * If the plugin supports two-phase commits then commit prepared callback + * is mandatory + */ + if (ctx->callbacks.commit_prepared_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication at prepare time requires a %s callback", + "commit_prepared_cb"))); + + /* do the actual work: call callback */ + ctx->callbacks.commit_prepared_cb(ctx, txn, commit_lsn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +rollback_prepared_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr prepare_end_lsn, + TimestampTz prepare_time) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when two-phase commits are supported */ + Assert(ctx->twophase); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "rollback_prepared"; + state.report_location = txn->final_lsn; /* beginning of commit record */ + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = txn->end_lsn; /* points to the end of the record */ + ctx->end_xact = true; + + /* + * If the plugin supports two-phase commits then rollback prepared callback + * is mandatory + */ + if (ctx->callbacks.rollback_prepared_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication at prepare time requires a %s callback", + "rollback_prepared_cb"))); + + /* do the actual work: call callback */ + ctx->callbacks.rollback_prepared_cb(ctx, txn, prepare_end_lsn, + prepare_time); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +change_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "change"; + state.report_location = change->lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + + /* + * Report this change's lsn so replies from clients can give an up-to-date + * answer. This won't ever be enough (and shouldn't be!) to confirm + * receipt of this transaction, but it might allow another transaction's + * commit to be confirmed with one message. 
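+ *
+ * Made-up example: if transaction A committed at 0/1000 and this change
+ * of transaction B sits at 0/1500, a client reply confirming 0/1500
+ * implicitly confirms A's commit too, saving a separate round trip.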
+ */ + ctx->write_location = change->lsn; + + ctx->end_xact = false; + + ctx->callbacks.change_cb(ctx, txn, relation, change); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +truncate_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + int nrelations, Relation relations[], ReorderBufferChange *change) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + if (!ctx->callbacks.truncate_cb) + return; + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "truncate"; + state.report_location = change->lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + + /* + * Report this change's lsn so replies from clients can give an up-to-date + * answer. This won't ever be enough (and shouldn't be!) to confirm + * receipt of this transaction, but it might allow another transaction's + * commit to be confirmed with one message. + */ + ctx->write_location = change->lsn; + + ctx->end_xact = false; + + ctx->callbacks.truncate_cb(ctx, txn, nrelations, relations, change); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +bool +filter_prepare_cb_wrapper(LogicalDecodingContext *ctx, TransactionId xid, + const char *gid) +{ + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + bool ret; + + Assert(!ctx->fast_forward); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "filter_prepare"; + state.report_location = InvalidXLogRecPtr; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = false; + ctx->end_xact = false; + + /* do the actual work: call callback */ + ret = ctx->callbacks.filter_prepare_cb(ctx, xid, gid); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; + + return ret; +} + +bool +filter_by_origin_cb_wrapper(LogicalDecodingContext *ctx, RepOriginId origin_id) +{ + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + bool ret; + + Assert(!ctx->fast_forward); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "filter_by_origin"; + state.report_location = InvalidXLogRecPtr; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = false; + ctx->end_xact = false; + + /* do the actual work: call callback */ + ret = ctx->callbacks.filter_by_origin_cb(ctx, origin_id); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; + + return ret; +} + +static void +message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr message_lsn, bool transactional, + const char *prefix, Size message_size, const char *message) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + if (ctx->callbacks.message_cb == 
NULL) + return; + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "message"; + state.report_location = message_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn != NULL ? txn->xid : InvalidTransactionId; + ctx->write_location = message_lsn; + ctx->end_xact = false; + + /* do the actual work: call callback */ + ctx->callbacks.message_cb(ctx, txn, message_lsn, transactional, prefix, + message_size, message); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_start_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr first_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when streaming is supported. */ + Assert(ctx->streaming); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_start"; + state.report_location = first_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + + /* + * Report this message's lsn so replies from clients can give an + * up-to-date answer. This won't ever be enough (and shouldn't be!) to + * confirm receipt of this transaction, but it might allow another + * transaction's commit to be confirmed with one message. + */ + ctx->write_location = first_lsn; + + ctx->end_xact = false; + + /* in streaming mode, stream_start_cb is required */ + if (ctx->callbacks.stream_start_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical streaming requires a %s callback", + "stream_start_cb"))); + + ctx->callbacks.stream_start_cb(ctx, txn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_stop_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr last_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when streaming is supported. */ + Assert(ctx->streaming); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_stop"; + state.report_location = last_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + + /* + * Report this message's lsn so replies from clients can give an + * up-to-date answer. This won't ever be enough (and shouldn't be!) to + * confirm receipt of this transaction, but it might allow another + * transaction's commit to be confirmed with one message. 
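+ *
+ * A streamed transaction therefore reaches the plugin as one or more
+ * stream_start ... stream_stop blocks, terminated eventually by
+ * stream_commit, stream_prepare or stream_abort.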
+ */ + ctx->write_location = last_lsn; + + ctx->end_xact = false; + + /* in streaming mode, stream_stop_cb is required */ + if (ctx->callbacks.stream_stop_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical streaming requires a %s callback", + "stream_stop_cb"))); + + ctx->callbacks.stream_stop_cb(ctx, txn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_abort_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr abort_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when streaming is supported. */ + Assert(ctx->streaming); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_abort"; + state.report_location = abort_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = abort_lsn; + ctx->end_xact = true; + + /* in streaming mode, stream_abort_cb is required */ + if (ctx->callbacks.stream_abort_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical streaming requires a %s callback", + "stream_abort_cb"))); + + ctx->callbacks.stream_abort_cb(ctx, txn, abort_lsn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_prepare_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr prepare_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* + * We're only supposed to call this when streaming and two-phase commits + * are supported. + */ + Assert(ctx->streaming); + Assert(ctx->twophase); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_prepare"; + state.report_location = txn->final_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = txn->end_lsn; + ctx->end_xact = true; + + /* in streaming mode with two-phase commits, stream_prepare_cb is required */ + if (ctx->callbacks.stream_prepare_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical streaming at prepare time requires a %s callback", + "stream_prepare_cb"))); + + ctx->callbacks.stream_prepare_cb(ctx, txn, prepare_lsn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_commit_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when streaming is supported. 
*/ + Assert(ctx->streaming); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_commit"; + state.report_location = txn->final_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = txn->end_lsn; + ctx->end_xact = true; + + /* in streaming mode, stream_commit_cb is required */ + if (ctx->callbacks.stream_commit_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical streaming requires a %s callback", + "stream_commit_cb"))); + + ctx->callbacks.stream_commit_cb(ctx, txn, commit_lsn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_change_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when streaming is supported. */ + Assert(ctx->streaming); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_change"; + state.report_location = change->lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + + /* + * Report this change's lsn so replies from clients can give an up-to-date + * answer. This won't ever be enough (and shouldn't be!) to confirm + * receipt of this transaction, but it might allow another transaction's + * commit to be confirmed with one message. + */ + ctx->write_location = change->lsn; + + ctx->end_xact = false; + + /* in streaming mode, stream_change_cb is required */ + if (ctx->callbacks.stream_change_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical streaming requires a %s callback", + "stream_change_cb"))); + + ctx->callbacks.stream_change_cb(ctx, txn, relation, change); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr message_lsn, bool transactional, + const char *prefix, Size message_size, const char *message) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when streaming is supported. */ + Assert(ctx->streaming); + + /* this callback is optional */ + if (ctx->callbacks.stream_message_cb == NULL) + return; + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_message"; + state.report_location = message_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn != NULL ? 
txn->xid : InvalidTransactionId; + ctx->write_location = message_lsn; + ctx->end_xact = false; + + /* do the actual work: call callback */ + ctx->callbacks.stream_message_cb(ctx, txn, message_lsn, transactional, prefix, + message_size, message); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_truncate_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + int nrelations, Relation relations[], + ReorderBufferChange *change) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when streaming is supported. */ + Assert(ctx->streaming); + + /* this callback is optional */ + if (!ctx->callbacks.stream_truncate_cb) + return; + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_truncate"; + state.report_location = change->lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + + /* + * Report this change's lsn so replies from clients can give an up-to-date + * answer. This won't ever be enough (and shouldn't be!) to confirm + * receipt of this transaction, but it might allow another transaction's + * commit to be confirmed with one message. + */ + ctx->write_location = change->lsn; + + ctx->end_xact = false; + + ctx->callbacks.stream_truncate_cb(ctx, txn, nrelations, relations, change); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +update_progress_txn_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "update_progress_txn"; + state.report_location = lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = false; + ctx->write_xid = txn->xid; + + /* + * Report this change's lsn so replies from clients can give an up-to-date + * answer. This won't ever be enough (and shouldn't be!) to confirm + * receipt of this transaction, but it might allow another transaction's + * commit to be confirmed with one message. + */ + ctx->write_location = lsn; + + ctx->end_xact = false; + + OutputPluginUpdateProgress(ctx, false); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +/* + * Set the required catalog xmin horizon for historic snapshots in the current + * replication slot. + * + * Note that in the most cases, we won't be able to immediately use the xmin + * to increase the xmin horizon: we need to wait till the client has confirmed + * receiving current_lsn with LogicalConfirmReceivedLocation(). 
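+ *
+ * To illustrate with hypothetical numbers: if decoding proposes xmin 1000
+ * at current_lsn 0/2000 while the client has confirmed only up to 0/1000,
+ * the pair is parked in candidate_catalog_xmin / candidate_xmin_lsn.  Once
+ * the client confirms an LSN >= 0/2000, LogicalConfirmReceivedLocation()
+ * copies the candidate into data.catalog_xmin and persists the slot.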
+ */ +void +LogicalIncreaseXminForSlot(XLogRecPtr current_lsn, TransactionId xmin) +{ + bool updated_xmin = false; + ReplicationSlot *slot; + bool got_new_xmin = false; + + slot = MyReplicationSlot; + + Assert(slot != NULL); + + SpinLockAcquire(&slot->mutex); + + /* + * don't overwrite if we already have a newer xmin. This can happen if we + * restart decoding in a slot. + */ + if (TransactionIdPrecedesOrEquals(xmin, slot->data.catalog_xmin)) + { + } + + /* + * If the client has already confirmed up to this lsn, we directly can + * mark this as accepted. This can happen if we restart decoding in a + * slot. + */ + else if (current_lsn <= slot->data.confirmed_flush) + { + slot->candidate_catalog_xmin = xmin; + slot->candidate_xmin_lsn = current_lsn; + + /* our candidate can directly be used */ + updated_xmin = true; + } + + /* + * Only increase if the previous values have been applied, otherwise we + * might never end up updating if the receiver acks too slowly. + */ + else if (slot->candidate_xmin_lsn == InvalidXLogRecPtr) + { + slot->candidate_catalog_xmin = xmin; + slot->candidate_xmin_lsn = current_lsn; + + /* + * Log new xmin at an appropriate log level after releasing the + * spinlock. + */ + got_new_xmin = true; + } + SpinLockRelease(&slot->mutex); + + if (got_new_xmin) + elog(DEBUG1, "got new catalog xmin %u at %X/%X", xmin, + LSN_FORMAT_ARGS(current_lsn)); + + /* candidate already valid with the current flush position, apply */ + if (updated_xmin) + LogicalConfirmReceivedLocation(slot->data.confirmed_flush); +} + +/* + * Mark the minimal LSN (restart_lsn) we need to read to replay all + * transactions that have not yet committed at current_lsn. + * + * Just like LogicalIncreaseXminForSlot this only takes effect when the + * client has confirmed to have received current_lsn. + */ +void +LogicalIncreaseRestartDecodingForSlot(XLogRecPtr current_lsn, XLogRecPtr restart_lsn) +{ + bool updated_lsn = false; + ReplicationSlot *slot; + + slot = MyReplicationSlot; + + Assert(slot != NULL); + Assert(restart_lsn != InvalidXLogRecPtr); + Assert(current_lsn != InvalidXLogRecPtr); + + SpinLockAcquire(&slot->mutex); + + /* don't overwrite if have a newer restart lsn */ + if (restart_lsn <= slot->data.restart_lsn) + { + } + + /* + * We might have already flushed far enough to directly accept this lsn, + * in this case there is no need to check for existing candidate LSNs + */ + else if (current_lsn <= slot->data.confirmed_flush) + { + slot->candidate_restart_valid = current_lsn; + slot->candidate_restart_lsn = restart_lsn; + + /* our candidate can directly be used */ + updated_lsn = true; + } + + /* + * Only increase if the previous values have been applied, otherwise we + * might never end up updating if the receiver acks too slowly. A missed + * value here will just cause some extra effort after reconnecting. 
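+ *
+ * Whichever of the two branches below runs also releases the slot's
+ * spinlock, so every path through this function drops the lock exactly
+ * once before the DEBUG1 reporting.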
+ */ + if (slot->candidate_restart_valid == InvalidXLogRecPtr) + { + slot->candidate_restart_valid = current_lsn; + slot->candidate_restart_lsn = restart_lsn; + SpinLockRelease(&slot->mutex); + + elog(DEBUG1, "got new restart lsn %X/%X at %X/%X", + LSN_FORMAT_ARGS(restart_lsn), + LSN_FORMAT_ARGS(current_lsn)); + } + else + { + XLogRecPtr candidate_restart_lsn; + XLogRecPtr candidate_restart_valid; + XLogRecPtr confirmed_flush; + + candidate_restart_lsn = slot->candidate_restart_lsn; + candidate_restart_valid = slot->candidate_restart_valid; + confirmed_flush = slot->data.confirmed_flush; + SpinLockRelease(&slot->mutex); + + elog(DEBUG1, "failed to increase restart lsn: proposed %X/%X, after %X/%X, current candidate %X/%X, current after %X/%X, flushed up to %X/%X", + LSN_FORMAT_ARGS(restart_lsn), + LSN_FORMAT_ARGS(current_lsn), + LSN_FORMAT_ARGS(candidate_restart_lsn), + LSN_FORMAT_ARGS(candidate_restart_valid), + LSN_FORMAT_ARGS(confirmed_flush)); + } + + /* candidates are already valid with the current flush position, apply */ + if (updated_lsn) + LogicalConfirmReceivedLocation(slot->data.confirmed_flush); +} + +/* + * Handle a consumer's confirmation having received all changes up to lsn. + */ +void +LogicalConfirmReceivedLocation(XLogRecPtr lsn) +{ + Assert(lsn != InvalidXLogRecPtr); + + /* Do an unlocked check for candidate_lsn first. */ + if (MyReplicationSlot->candidate_xmin_lsn != InvalidXLogRecPtr || + MyReplicationSlot->candidate_restart_valid != InvalidXLogRecPtr) + { + bool updated_xmin = false; + bool updated_restart = false; + + SpinLockAcquire(&MyReplicationSlot->mutex); + + MyReplicationSlot->data.confirmed_flush = lsn; + + /* if we're past the location required for bumping xmin, do so */ + if (MyReplicationSlot->candidate_xmin_lsn != InvalidXLogRecPtr && + MyReplicationSlot->candidate_xmin_lsn <= lsn) + { + /* + * We have to write the changed xmin to disk *before* we change + * the in-memory value, otherwise after a crash we wouldn't know + * that some catalog tuples might have been removed already. + * + * Ensure that by first writing to ->xmin and only update + * ->effective_xmin once the new state is synced to disk. After a + * crash ->effective_xmin is set to ->xmin. + */ + if (TransactionIdIsValid(MyReplicationSlot->candidate_catalog_xmin) && + MyReplicationSlot->data.catalog_xmin != MyReplicationSlot->candidate_catalog_xmin) + { + MyReplicationSlot->data.catalog_xmin = MyReplicationSlot->candidate_catalog_xmin; + MyReplicationSlot->candidate_catalog_xmin = InvalidTransactionId; + MyReplicationSlot->candidate_xmin_lsn = InvalidXLogRecPtr; + updated_xmin = true; + } + } + + if (MyReplicationSlot->candidate_restart_valid != InvalidXLogRecPtr && + MyReplicationSlot->candidate_restart_valid <= lsn) + { + Assert(MyReplicationSlot->candidate_restart_lsn != InvalidXLogRecPtr); + + MyReplicationSlot->data.restart_lsn = MyReplicationSlot->candidate_restart_lsn; + MyReplicationSlot->candidate_restart_lsn = InvalidXLogRecPtr; + MyReplicationSlot->candidate_restart_valid = InvalidXLogRecPtr; + updated_restart = true; + } + + SpinLockRelease(&MyReplicationSlot->mutex); + + /* first write new xmin to disk, so we know what's up after a crash */ + if (updated_xmin || updated_restart) + { + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); + elog(DEBUG1, "updated xmin: %u restart: %u", updated_xmin, updated_restart); + } + + /* + * Now the new xmin is safely on disk, we can let the global value + * advance. 
We do not take ProcArrayLock or similar since we only + * advance xmin here and there's not much harm done by a concurrent + * computation missing that. + */ + if (updated_xmin) + { + SpinLockAcquire(&MyReplicationSlot->mutex); + MyReplicationSlot->effective_catalog_xmin = MyReplicationSlot->data.catalog_xmin; + SpinLockRelease(&MyReplicationSlot->mutex); + + ReplicationSlotsComputeRequiredXmin(false); + ReplicationSlotsComputeRequiredLSN(); + } + } + else + { + SpinLockAcquire(&MyReplicationSlot->mutex); + MyReplicationSlot->data.confirmed_flush = lsn; + SpinLockRelease(&MyReplicationSlot->mutex); + } +} + +/* + * Clear logical streaming state during (sub)transaction abort. + */ +void +ResetLogicalStreamingState(void) +{ + CheckXidAlive = InvalidTransactionId; + bsysscan = false; +} + +/* + * Report stats for a slot. + */ +void +UpdateDecodingStats(LogicalDecodingContext *ctx) +{ + ReorderBuffer *rb = ctx->reorder; + PgStat_StatReplSlotEntry repSlotStat; + + /* Nothing to do if we don't have any replication stats to be sent. */ + if (rb->spillBytes <= 0 && rb->streamBytes <= 0 && rb->totalBytes <= 0) + return; + + elog(DEBUG2, "UpdateDecodingStats: updating stats %p %lld %lld %lld %lld %lld %lld %lld %lld", + rb, + (long long) rb->spillTxns, + (long long) rb->spillCount, + (long long) rb->spillBytes, + (long long) rb->streamTxns, + (long long) rb->streamCount, + (long long) rb->streamBytes, + (long long) rb->totalTxns, + (long long) rb->totalBytes); + + repSlotStat.spill_txns = rb->spillTxns; + repSlotStat.spill_count = rb->spillCount; + repSlotStat.spill_bytes = rb->spillBytes; + repSlotStat.stream_txns = rb->streamTxns; + repSlotStat.stream_count = rb->streamCount; + repSlotStat.stream_bytes = rb->streamBytes; + repSlotStat.total_txns = rb->totalTxns; + repSlotStat.total_bytes = rb->totalBytes; + + pgstat_report_replslot(ctx->slot, &repSlotStat); + + rb->spillTxns = 0; + rb->spillCount = 0; + rb->spillBytes = 0; + rb->streamTxns = 0; + rb->streamCount = 0; + rb->streamBytes = 0; + rb->totalTxns = 0; + rb->totalBytes = 0; +} diff --git a/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/logicalfuncs.c b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/logicalfuncs.c new file mode 100644 index 00000000000..55a24c02c94 --- /dev/null +++ b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/logicalfuncs.c @@ -0,0 +1,377 @@ +/*------------------------------------------------------------------------- + * + * logicalfuncs.c + * + * Support functions for using logical decoding and management of + * logical replication slots via SQL. 
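+ * (Namely pg_logical_slot_get_changes() and its peek/binary variants,
+ * plus the pg_logical_emit_message() entry points, all defined below.)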
+ * + * + * Copyright (c) 2012-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/logicalfuncs.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <unistd.h> + +#include "access/xact.h" +#include "access/xlog_internal.h" +#include "access/xlogrecovery.h" +#include "access/xlogutils.h" +#include "catalog/pg_type.h" +#include "fmgr.h" +#include "funcapi.h" +#include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "replication/decode.h" +#include "replication/logical.h" +#include "replication/message.h" +#include "storage/fd.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/inval.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/pg_lsn.h" +#include "utils/regproc.h" +#include "utils/resowner.h" + +/* Private data for writing out data */ +typedef struct DecodingOutputState +{ + Tuplestorestate *tupstore; + TupleDesc tupdesc; + bool binary_output; + int64 returned_rows; +} DecodingOutputState; + +/* + * Prepare for an output plugin write. + */ +static void +LogicalOutputPrepareWrite(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, + bool last_write) +{ + resetStringInfo(ctx->out); +} + +/* + * Perform output plugin write into tuplestore. + */ +static void +LogicalOutputWrite(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, + bool last_write) +{ + Datum values[3]; + bool nulls[3]; + DecodingOutputState *p; + + /* SQL Datums can only be of a limited length... */ + if (ctx->out->len > MaxAllocSize - VARHDRSZ) + elog(ERROR, "too much output for sql interface"); + + p = (DecodingOutputState *) ctx->output_writer_private; + + memset(nulls, 0, sizeof(nulls)); + values[0] = LSNGetDatum(lsn); + values[1] = TransactionIdGetDatum(xid); + + /* + * Assert ctx->out is in database encoding when we're writing textual + * output. + */ + if (!p->binary_output) + Assert(pg_verify_mbstr(GetDatabaseEncoding(), + ctx->out->data, ctx->out->len, + false)); + + /* ick, but cstring_to_text_with_len works for bytea perfectly fine */ + values[2] = PointerGetDatum(cstring_to_text_with_len(ctx->out->data, ctx->out->len)); + + tuplestore_putvalues(p->tupstore, p->tupdesc, values, nulls); + p->returned_rows++; +} + +/* + * Helper function for the various SQL callable logical decoding functions. 
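+ *
+ * All four SQL-callable wrappers below funnel into this routine and only
+ * differ in the two flags: e.g. pg_logical_slot_peek_changes() maps to
+ * confirm = false, binary = false, while
+ * pg_logical_slot_get_binary_changes() maps to confirm = true,
+ * binary = true.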
+ */ +static Datum +pg_logical_slot_get_changes_guts(FunctionCallInfo fcinfo, bool confirm, bool binary) +{ + Name name; + XLogRecPtr upto_lsn; + int32 upto_nchanges; + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + MemoryContext per_query_ctx; + MemoryContext oldcontext; + XLogRecPtr end_of_wal; + LogicalDecodingContext *ctx; + ResourceOwner old_resowner = CurrentResourceOwner; + ArrayType *arr; + Size ndim; + List *options = NIL; + DecodingOutputState *p; + + CheckSlotPermissions(); + + CheckLogicalDecodingRequirements(); + + if (PG_ARGISNULL(0)) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("slot name must not be null"))); + name = PG_GETARG_NAME(0); + + if (PG_ARGISNULL(1)) + upto_lsn = InvalidXLogRecPtr; + else + upto_lsn = PG_GETARG_LSN(1); + + if (PG_ARGISNULL(2)) + upto_nchanges = InvalidXLogRecPtr; + else + upto_nchanges = PG_GETARG_INT32(2); + + if (PG_ARGISNULL(3)) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("options array must not be null"))); + arr = PG_GETARG_ARRAYTYPE_P(3); + + /* state to write output to */ + p = palloc0(sizeof(DecodingOutputState)); + + p->binary_output = binary; + + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; + oldcontext = MemoryContextSwitchTo(per_query_ctx); + + /* Deconstruct options array */ + ndim = ARR_NDIM(arr); + if (ndim > 1) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("array must be one-dimensional"))); + } + else if (array_contains_nulls(arr)) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("array must not contain nulls"))); + } + else if (ndim == 1) + { + int nelems; + Datum *datum_opts; + int i; + + Assert(ARR_ELEMTYPE(arr) == TEXTOID); + + deconstruct_array_builtin(arr, TEXTOID, &datum_opts, NULL, &nelems); + + if (nelems % 2 != 0) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("array must have even number of elements"))); + + for (i = 0; i < nelems; i += 2) + { + char *name = TextDatumGetCString(datum_opts[i]); + char *opt = TextDatumGetCString(datum_opts[i + 1]); + + options = lappend(options, makeDefElem(name, (Node *) makeString(opt), -1)); + } + } + + InitMaterializedSRF(fcinfo, 0); + p->tupstore = rsinfo->setResult; + p->tupdesc = rsinfo->setDesc; + + /* + * Compute the current end-of-wal. + */ + if (!RecoveryInProgress()) + end_of_wal = GetFlushRecPtr(NULL); + else + end_of_wal = GetXLogReplayRecPtr(NULL); + + ReplicationSlotAcquire(NameStr(*name), true); + + PG_TRY(); + { + /* restart at slot's confirmed_flush */ + ctx = CreateDecodingContext(InvalidXLogRecPtr, + options, + false, + XL_ROUTINE(.page_read = read_local_xlog_page, + .segment_open = wal_segment_open, + .segment_close = wal_segment_close), + LogicalOutputPrepareWrite, + LogicalOutputWrite, NULL); + + MemoryContextSwitchTo(oldcontext); + + /* + * Check whether the output plugin writes textual output if that's + * what we need. + */ + if (!binary && + ctx->options.output_type !=OUTPUT_PLUGIN_TEXTUAL_OUTPUT) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("logical decoding output plugin \"%s\" produces binary output, but function \"%s\" expects textual data", + NameStr(MyReplicationSlot->data.plugin), + format_procedure(fcinfo->flinfo->fn_oid)))); + + ctx->output_writer_private = p; + + /* + * Decoding of WAL must start at restart_lsn so that the entirety of + * xacts that committed after the slot's confirmed_flush can be + * accumulated into reorder buffers. 
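+ *
+ * Hypothetical example: with restart_lsn 0/100 and confirmed_flush 0/500,
+ * a transaction that started at 0/200 and commits at 0/600 must be read
+ * from 0/200 onwards; starting at confirmed_flush would lose its earlier
+ * changes.  Reading therefore starts at restart_lsn, while output of
+ * already confirmed transactions is suppressed separately.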
+ */ + XLogBeginRead(ctx->reader, MyReplicationSlot->data.restart_lsn); + + /* invalidate non-timetravel entries */ + InvalidateSystemCaches(); + + /* Decode until we run out of records */ + while (ctx->reader->EndRecPtr < end_of_wal) + { + XLogRecord *record; + char *errm = NULL; + + record = XLogReadRecord(ctx->reader, &errm); + if (errm) + elog(ERROR, "could not find record for logical decoding: %s", errm); + + /* + * The {begin_txn,change,commit_txn}_wrapper callbacks above will + * store the description into our tuplestore. + */ + if (record != NULL) + LogicalDecodingProcessRecord(ctx, ctx->reader); + + /* check limits */ + if (upto_lsn != InvalidXLogRecPtr && + upto_lsn <= ctx->reader->EndRecPtr) + break; + if (upto_nchanges != 0 && + upto_nchanges <= p->returned_rows) + break; + CHECK_FOR_INTERRUPTS(); + } + + /* + * Logical decoding could have clobbered CurrentResourceOwner during + * transaction management, so restore the executor's value. (This is + * a kluge, but it's not worth cleaning up right now.) + */ + CurrentResourceOwner = old_resowner; + + /* + * Next time, start where we left off. (Hunting things, the family + * business..) + */ + if (ctx->reader->EndRecPtr != InvalidXLogRecPtr && confirm) + { + LogicalConfirmReceivedLocation(ctx->reader->EndRecPtr); + + /* + * If only the confirmed_flush_lsn has changed the slot won't get + * marked as dirty by the above. Callers on the walsender + * interface are expected to keep track of their own progress and + * don't need it written out. But SQL-interface users cannot + * specify their own start positions and it's harder for them to + * keep track of their progress, so we should make more of an + * effort to save it for them. + * + * Dirty the slot so it's written out at the next checkpoint. + * We'll still lose its position on crash, as documented, but it's + * better than always losing the position even on clean restart. + */ + ReplicationSlotMarkDirty(); + } + + /* free context, call shutdown callback */ + FreeDecodingContext(ctx); + + ReplicationSlotRelease(); + InvalidateSystemCaches(); + } + PG_CATCH(); + { + /* clear all timetravel entries */ + InvalidateSystemCaches(); + + PG_RE_THROW(); + } + PG_END_TRY(); + + return (Datum) 0; +} + +/* + * SQL function returning the changestream as text, consuming the data. + */ +Datum +pg_logical_slot_get_changes(PG_FUNCTION_ARGS) +{ + return pg_logical_slot_get_changes_guts(fcinfo, true, false); +} + +/* + * SQL function returning the changestream as text, only peeking ahead. + */ +Datum +pg_logical_slot_peek_changes(PG_FUNCTION_ARGS) +{ + return pg_logical_slot_get_changes_guts(fcinfo, false, false); +} + +/* + * SQL function returning the changestream in binary, consuming the data. + */ +Datum +pg_logical_slot_get_binary_changes(PG_FUNCTION_ARGS) +{ + return pg_logical_slot_get_changes_guts(fcinfo, true, true); +} + +/* + * SQL function returning the changestream in binary, only peeking ahead. + */ +Datum +pg_logical_slot_peek_binary_changes(PG_FUNCTION_ARGS) +{ + return pg_logical_slot_get_changes_guts(fcinfo, false, true); +} + + +/* + * SQL function for writing logical decoding message into WAL. 
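+ *
+ * Exposed at the SQL level as pg_logical_emit_message(); for example:
+ *   SELECT pg_logical_emit_message(true, 'myprefix', 'payload');
+ * returns the LSN of the inserted record, with 'myprefix' being a
+ * caller-chosen prefix as described in message.c.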
+ */ +Datum +pg_logical_emit_message_bytea(PG_FUNCTION_ARGS) +{ + bool transactional = PG_GETARG_BOOL(0); + char *prefix = text_to_cstring(PG_GETARG_TEXT_PP(1)); + bytea *data = PG_GETARG_BYTEA_PP(2); + XLogRecPtr lsn; + + lsn = LogLogicalMessage(prefix, VARDATA_ANY(data), VARSIZE_ANY_EXHDR(data), + transactional); + PG_RETURN_LSN(lsn); +} + +Datum +pg_logical_emit_message_text(PG_FUNCTION_ARGS) +{ + /* bytea and text are compatible */ + return pg_logical_emit_message_bytea(fcinfo); +} diff --git a/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/message.c b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/message.c new file mode 100644 index 00000000000..c5de14afc65 --- /dev/null +++ b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/message.c @@ -0,0 +1,89 @@ +/*------------------------------------------------------------------------- + * + * message.c + * Generic logical messages. + * + * Copyright (c) 2013-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/message.c + * + * NOTES + * + * Generic logical messages allow XLOG logging of arbitrary binary blobs that + * get passed to the logical decoding plugin. In normal XLOG processing they + * are same as NOOP. + * + * These messages can be either transactional or non-transactional. + * Transactional messages are part of current transaction and will be sent to + * decoding plugin using in a same way as DML operations. + * Non-transactional messages are sent to the plugin at the time when the + * logical decoding reads them from XLOG. This also means that transactional + * messages won't be delivered if the transaction was rolled back but the + * non-transactional one will always be delivered. + * + * Every message carries prefix to avoid conflicts between different decoding + * plugins. The plugin authors must take extra care to use unique prefix, + * good options seems to be for example to use the name of the extension. + * + * --------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/xact.h" +#include "access/xloginsert.h" +#include "miscadmin.h" +#include "nodes/execnodes.h" +#include "replication/logical.h" +#include "replication/message.h" +#include "utils/memutils.h" + +/* + * Write logical decoding message into XLog. + */ +XLogRecPtr +LogLogicalMessage(const char *prefix, const char *message, size_t size, + bool transactional) +{ + xl_logical_message xlrec; + + /* + * Force xid to be allocated if we're emitting a transactional message. + */ + if (transactional) + { + Assert(IsTransactionState()); + GetCurrentTransactionId(); + } + + xlrec.dbId = MyDatabaseId; + xlrec.transactional = transactional; + /* trailing zero is critical; see logicalmsg_desc */ + xlrec.prefix_size = strlen(prefix) + 1; + xlrec.message_size = size; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfLogicalMessage); + XLogRegisterData(unconstify(char *, prefix), xlrec.prefix_size); + XLogRegisterData(unconstify(char *, message), size); + + /* allow origin filtering */ + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); + + return XLogInsert(RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE); +} + +/* + * Redo is basically just noop for logical decoding messages. 
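+ *
+ * There is no table data to restore here: the record exists solely so
+ * that logical decoding can see it, so redo only has to validate the
+ * record type.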
+ */ +void +logicalmsg_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + if (info != XLOG_LOGICAL_MESSAGE) + elog(PANIC, "logicalmsg_redo: unknown op code %u", info); + + /* This is only interesting for logical decoding, see decode.c. */ +} diff --git a/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/origin.c b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/origin.c new file mode 100644 index 00000000000..c43db00a42c --- /dev/null +++ b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/origin.c @@ -0,0 +1,1581 @@ +/*------------------------------------------------------------------------- + * + * origin.c + * Logical replication progress tracking support. + * + * Copyright (c) 2013-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/origin.c + * + * NOTES + * + * This file provides the following: + * * An infrastructure to name nodes in a replication setup + * * A facility to efficiently store and persist replication progress in an + * efficient and durable manner. + * + * Replication origin consist out of a descriptive, user defined, external + * name and a short, thus space efficient, internal 2 byte one. This split + * exists because replication origin have to be stored in WAL and shared + * memory and long descriptors would be inefficient. For now only use 2 bytes + * for the internal id of a replication origin as it seems unlikely that there + * soon will be more than 65k nodes in one replication setup; and using only + * two bytes allow us to be more space efficient. + * + * Replication progress is tracked in a shared memory table + * (ReplicationState) that's dumped to disk every checkpoint. Entries + * ('slots') in this table are identified by the internal id. That's the case + * because it allows to increase replication progress during crash + * recovery. To allow doing so we store the original LSN (from the originating + * system) of a transaction in the commit record. That allows to recover the + * precise replayed state after crash recovery; without requiring synchronous + * commits. Allowing logical replication to use asynchronous commit is + * generally good for performance, but especially important as it allows a + * single threaded replay process to keep up with a source that has multiple + * backends generating changes concurrently. For efficiency and simplicity + * reasons a backend can setup one replication origin that's from then used as + * the source of changes produced by the backend, until reset again. + * + * This infrastructure is intended to be used in cooperation with logical + * decoding. When replaying from a remote system the configured origin is + * provided to output plugins, allowing prevention of replication loops and + * other filtering. + * + * There are several levels of locking at work: + * + * * To create and drop replication origins an exclusive lock on + * pg_replication_slot is required for the duration. That allows us to + * safely and conflict free assign new origins using a dirty snapshot. + * + * * When creating an in-memory replication progress slot the ReplicationOrigin + * LWLock has to be held exclusively; when iterating over the replication + * progress a shared lock has to be held, the same when advancing the + * replication progress of an individual backend that has not setup as the + * session's replication origin. 
+ * + * * When manipulating or looking at the remote_lsn and local_lsn fields of a + * replication progress slot that slot's lwlock has to be held. That's + * primarily because we do not assume 8 byte writes (the LSN) is atomic on + * all our platforms, but it also simplifies memory ordering concerns + * between the remote and local lsn. We use a lwlock instead of a spinlock + * so it's less harmful to hold the lock over a WAL write + * (cf. AdvanceReplicationProgress). + * + * --------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <unistd.h> +#include <sys/stat.h> + +#include "access/genam.h" +#include "access/htup_details.h" +#include "access/table.h" +#include "access/xact.h" +#include "access/xloginsert.h" +#include "catalog/catalog.h" +#include "catalog/indexing.h" +#include "catalog/pg_subscription.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "nodes/execnodes.h" +#include "pgstat.h" +#include "replication/logical.h" +#include "replication/origin.h" +#include "storage/condition_variable.h" +#include "storage/copydir.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/lmgr.h" +#include "utils/builtins.h" +#include "utils/fmgroids.h" +#include "utils/pg_lsn.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" +#include "utils/syscache.h" + +/* + * Replay progress of a single remote node. + */ +typedef struct ReplicationState +{ + /* + * Local identifier for the remote node. + */ + RepOriginId roident; + + /* + * Location of the latest commit from the remote side. + */ + XLogRecPtr remote_lsn; + + /* + * Remember the local lsn of the commit record so we can XLogFlush() to it + * during a checkpoint so we know the commit record actually is safe on + * disk. + */ + XLogRecPtr local_lsn; + + /* + * PID of backend that's acquired slot, or 0 if none. + */ + int acquired_by; + + /* + * Condition variable that's signaled when acquired_by changes. + */ + ConditionVariable origin_cv; + + /* + * Lock protecting remote_lsn and local_lsn. + */ + LWLock lock; +} ReplicationState; + +/* + * On disk version of ReplicationState. + */ +typedef struct ReplicationStateOnDisk +{ + RepOriginId roident; + XLogRecPtr remote_lsn; +} ReplicationStateOnDisk; + + +typedef struct ReplicationStateCtl +{ + /* Tranche to use for per-origin LWLocks */ + int tranche_id; + /* Array of length max_replication_slots */ + ReplicationState states[FLEXIBLE_ARRAY_MEMBER]; +} ReplicationStateCtl; + +/* external variables */ +__thread RepOriginId replorigin_session_origin = InvalidRepOriginId; /* assumed identity */ +__thread XLogRecPtr replorigin_session_origin_lsn = InvalidXLogRecPtr; +__thread TimestampTz replorigin_session_origin_timestamp = 0; + +/* + * Base address into a shared memory array of replication states of size + * max_replication_slots. + * + * XXX: Should we use a separate variable to size this rather than + * max_replication_slots? + */ +static __thread ReplicationState *replication_states; + +/* + * Actual shared memory block (replication_states[] is now part of this). + */ +static __thread ReplicationStateCtl *replication_states_ctl; + +/* + * Backend-local, cached element from ReplicationState for use in a backend + * replaying remote commits, so we don't have to search ReplicationState for + * the backends current RepOriginId. + */ +static __thread ReplicationState *session_replication_state = NULL; + +/* Magic for on disk files. 
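+ * The magic is written as the first four bytes of the checkpoint file so
+ * that StartupReplicationOrigin() can detect files written in a
+ * different format.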
*/ +#define REPLICATION_STATE_MAGIC ((uint32) 0x1257DADE) + +static void +replorigin_check_prerequisites(bool check_slots, bool recoveryOK) +{ + if (check_slots && max_replication_slots == 0) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot query or manipulate replication origin when max_replication_slots = 0"))); + + if (!recoveryOK && RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION), + errmsg("cannot manipulate replication origins during recovery"))); +} + + +/* + * IsReservedOriginName + * True iff name is either "none" or "any". + */ +static bool +IsReservedOriginName(const char *name) +{ + return ((pg_strcasecmp(name, LOGICALREP_ORIGIN_NONE) == 0) || + (pg_strcasecmp(name, LOGICALREP_ORIGIN_ANY) == 0)); +} + +/* --------------------------------------------------------------------------- + * Functions for working with replication origins themselves. + * --------------------------------------------------------------------------- + */ + +/* + * Check for a persistent replication origin identified by name. + * + * Returns InvalidOid if the node isn't known yet and missing_ok is true. + */ +RepOriginId +replorigin_by_name(const char *roname, bool missing_ok) +{ + Form_pg_replication_origin ident; + Oid roident = InvalidOid; + HeapTuple tuple; + Datum roname_d; + + roname_d = CStringGetTextDatum(roname); + + tuple = SearchSysCache1(REPLORIGNAME, roname_d); + if (HeapTupleIsValid(tuple)) + { + ident = (Form_pg_replication_origin) GETSTRUCT(tuple); + roident = ident->roident; + ReleaseSysCache(tuple); + } + else if (!missing_ok) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("replication origin \"%s\" does not exist", + roname))); + + return roident; +} + +/* + * Create a replication origin. + * + * Needs to be called in a transaction. + */ +RepOriginId +replorigin_create(const char *roname) +{ + Oid roident; + HeapTuple tuple = NULL; + Relation rel; + Datum roname_d; + SnapshotData SnapshotDirty; + SysScanDesc scan; + ScanKeyData key; + + roname_d = CStringGetTextDatum(roname); + + Assert(IsTransactionState()); + + /* + * We need the numeric replication origin to be 16bit wide, so we cannot + * rely on the normal oid allocation. Instead we simply scan + * pg_replication_origin for the first unused id. That's not particularly + * efficient, but this should be a fairly infrequent operation - we can + * easily spend a bit more code on this when it turns out it needs to be + * faster. + * + * We handle concurrency by taking an exclusive lock (allowing reads!) + * over the table for the duration of the search. Because we use a "dirty + * snapshot" we can read rows that other in-progress sessions have + * written, even though they would be invisible with normal snapshots. Due + * to the exclusive lock there's no danger that new rows can appear while + * we're checking. 
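+ *
+ * Concretely: if another backend has inserted an origin with roident N
+ * but not yet committed, the dirty-snapshot scan below still sees that
+ * row, so we skip N instead of colliding with it.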
+ */ + InitDirtySnapshot(SnapshotDirty); + + rel = table_open(ReplicationOriginRelationId, ExclusiveLock); + + for (roident = InvalidOid + 1; roident < PG_UINT16_MAX; roident++) + { + bool nulls[Natts_pg_replication_origin]; + Datum values[Natts_pg_replication_origin]; + bool collides; + + CHECK_FOR_INTERRUPTS(); + + ScanKeyInit(&key, + Anum_pg_replication_origin_roident, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(roident)); + + scan = systable_beginscan(rel, ReplicationOriginIdentIndex, + true /* indexOK */ , + &SnapshotDirty, + 1, &key); + + collides = HeapTupleIsValid(systable_getnext(scan)); + + systable_endscan(scan); + + if (!collides) + { + /* + * Ok, found an unused roident, insert the new row and do a CCI, + * so our callers can look it up if they want to. + */ + memset(&nulls, 0, sizeof(nulls)); + + values[Anum_pg_replication_origin_roident - 1] = ObjectIdGetDatum(roident); + values[Anum_pg_replication_origin_roname - 1] = roname_d; + + tuple = heap_form_tuple(RelationGetDescr(rel), values, nulls); + CatalogTupleInsert(rel, tuple); + CommandCounterIncrement(); + break; + } + } + + /* now release lock again, */ + table_close(rel, ExclusiveLock); + + if (tuple == NULL) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("could not find free replication origin ID"))); + + heap_freetuple(tuple); + return roident; +} + +/* + * Helper function to drop a replication origin. + */ +static void +replorigin_state_clear(RepOriginId roident, bool nowait) +{ + int i; + + /* + * Clean up the slot state info, if there is any matching slot. + */ +restart: + LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE); + + for (i = 0; i < max_replication_slots; i++) + { + ReplicationState *state = &replication_states[i]; + + if (state->roident == roident) + { + /* found our slot, is it busy? */ + if (state->acquired_by != 0) + { + ConditionVariable *cv; + + if (nowait) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("could not drop replication origin with ID %d, in use by PID %d", + state->roident, + state->acquired_by))); + + /* + * We must wait and then retry. Since we don't know which CV + * to wait on until here, we can't readily use + * ConditionVariablePrepareToSleep (calling it here would be + * wrong, since we could miss the signal if we did so); just + * use ConditionVariableSleep directly. + */ + cv = &state->origin_cv; + + LWLockRelease(ReplicationOriginLock); + + ConditionVariableSleep(cv, WAIT_EVENT_REPLICATION_ORIGIN_DROP); + goto restart; + } + + /* first make a WAL log entry */ + { + xl_replorigin_drop xlrec; + + xlrec.node_id = roident; + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), sizeof(xlrec)); + XLogInsert(RM_REPLORIGIN_ID, XLOG_REPLORIGIN_DROP); + } + + /* then clear the in-memory slot */ + state->roident = InvalidRepOriginId; + state->remote_lsn = InvalidXLogRecPtr; + state->local_lsn = InvalidXLogRecPtr; + break; + } + } + LWLockRelease(ReplicationOriginLock); + ConditionVariableCancelSleep(); +} + +/* + * Drop replication origin (by name). + * + * Needs to be called in a transaction. + */ +void +replorigin_drop_by_name(const char *name, bool missing_ok, bool nowait) +{ + RepOriginId roident; + Relation rel; + HeapTuple tuple; + + Assert(IsTransactionState()); + + rel = table_open(ReplicationOriginRelationId, RowExclusiveLock); + + roident = replorigin_by_name(name, missing_ok); + + /* Lock the origin to prevent concurrent drops. 
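+ * (An AccessExclusiveLock on the origin's ID, taken via
+ * LockSharedObject(); a concurrent drop of the same origin blocks here
+ * until our transaction ends.)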
*/ + LockSharedObject(ReplicationOriginRelationId, roident, 0, + AccessExclusiveLock); + + tuple = SearchSysCache1(REPLORIGIDENT, ObjectIdGetDatum(roident)); + if (!HeapTupleIsValid(tuple)) + { + if (!missing_ok) + elog(ERROR, "cache lookup failed for replication origin with ID %d", + roident); + + /* + * We don't need to retain the locks if the origin is already dropped. + */ + UnlockSharedObject(ReplicationOriginRelationId, roident, 0, + AccessExclusiveLock); + table_close(rel, RowExclusiveLock); + return; + } + + replorigin_state_clear(roident, nowait); + + /* + * Now, we can delete the catalog entry. + */ + CatalogTupleDelete(rel, &tuple->t_self); + ReleaseSysCache(tuple); + + CommandCounterIncrement(); + + /* We keep the lock on pg_replication_origin until commit */ + table_close(rel, NoLock); +} + +/* + * Lookup replication origin via its oid and return the name. + * + * The external name is palloc'd in the calling context. + * + * Returns true if the origin is known, false otherwise. + */ +bool +replorigin_by_oid(RepOriginId roident, bool missing_ok, char **roname) +{ + HeapTuple tuple; + Form_pg_replication_origin ric; + + Assert(OidIsValid((Oid) roident)); + Assert(roident != InvalidRepOriginId); + Assert(roident != DoNotReplicateId); + + tuple = SearchSysCache1(REPLORIGIDENT, + ObjectIdGetDatum((Oid) roident)); + + if (HeapTupleIsValid(tuple)) + { + ric = (Form_pg_replication_origin) GETSTRUCT(tuple); + *roname = text_to_cstring(&ric->roname); + ReleaseSysCache(tuple); + + return true; + } + else + { + *roname = NULL; + + if (!missing_ok) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("replication origin with ID %d does not exist", + roident))); + + return false; + } +} + + +/* --------------------------------------------------------------------------- + * Functions for handling replication progress. + * --------------------------------------------------------------------------- + */ + +Size +ReplicationOriginShmemSize(void) +{ + Size size = 0; + + /* + * XXX: max_replication_slots is arguably the wrong thing to use, as here + * we keep the replay state of *remote* transactions. But for now it seems + * sufficient to reuse it, rather than introduce a separate GUC. + */ + if (max_replication_slots == 0) + return size; + + size = add_size(size, offsetof(ReplicationStateCtl, states)); + + size = add_size(size, + mul_size(max_replication_slots, sizeof(ReplicationState))); + return size; +} + +void +ReplicationOriginShmemInit(void) +{ + bool found; + + if (max_replication_slots == 0) + return; + + replication_states_ctl = (ReplicationStateCtl *) + ShmemInitStruct("ReplicationOriginState", + ReplicationOriginShmemSize(), + &found); + replication_states = replication_states_ctl->states; + + if (!found) + { + int i; + + MemSet(replication_states_ctl, 0, ReplicationOriginShmemSize()); + + replication_states_ctl->tranche_id = LWTRANCHE_REPLICATION_ORIGIN_STATE; + + for (i = 0; i < max_replication_slots; i++) + { + LWLockInitialize(&replication_states[i].lock, + replication_states_ctl->tranche_id); + ConditionVariableInit(&replication_states[i].origin_cv); + } + } +} + +/* --------------------------------------------------------------------------- + * Perform a checkpoint of each replication origin's progress with respect to + * the replayed remote_lsn. Make sure that all transactions we refer to in the + * checkpoint (local_lsn) are actually on-disk. This might not yet be the case + * if the transactions were originally committed asynchronously. 
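+ * (The XLogFlush(local_lsn) call in the loop below provides exactly that
+ * guarantee.)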
+ * + * We store checkpoints in the following format: + * +-------+------------------------+------------------+-----+--------+ + * | MAGIC | ReplicationStateOnDisk | struct Replic... | ... | CRC32C | EOF + * +-------+------------------------+------------------+-----+--------+ + * + * So its just the magic, followed by the statically sized + * ReplicationStateOnDisk structs. Note that the maximum number of + * ReplicationState is determined by max_replication_slots. + * --------------------------------------------------------------------------- + */ +void +CheckPointReplicationOrigin(void) +{ + const char *tmppath = "pg_logical/replorigin_checkpoint.tmp"; + const char *path = "pg_logical/replorigin_checkpoint"; + int tmpfd; + int i; + uint32 magic = REPLICATION_STATE_MAGIC; + pg_crc32c crc; + + if (max_replication_slots == 0) + return; + + INIT_CRC32C(crc); + + /* make sure no old temp file is remaining */ + if (unlink(tmppath) < 0 && errno != ENOENT) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", + tmppath))); + + /* + * no other backend can perform this at the same time; only one checkpoint + * can happen at a time. + */ + tmpfd = OpenTransientFile(tmppath, + O_CREAT | O_EXCL | O_WRONLY | PG_BINARY); + if (tmpfd < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", + tmppath))); + + /* write magic */ + errno = 0; + if ((write(tmpfd, &magic, sizeof(magic))) != sizeof(magic)) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", + tmppath))); + } + COMP_CRC32C(crc, &magic, sizeof(magic)); + + /* prevent concurrent creations/drops */ + LWLockAcquire(ReplicationOriginLock, LW_SHARED); + + /* write actual data */ + for (i = 0; i < max_replication_slots; i++) + { + ReplicationStateOnDisk disk_state; + ReplicationState *curstate = &replication_states[i]; + XLogRecPtr local_lsn; + + if (curstate->roident == InvalidRepOriginId) + continue; + + /* zero, to avoid uninitialized padding bytes */ + memset(&disk_state, 0, sizeof(disk_state)); + + LWLockAcquire(&curstate->lock, LW_SHARED); + + disk_state.roident = curstate->roident; + + disk_state.remote_lsn = curstate->remote_lsn; + local_lsn = curstate->local_lsn; + + LWLockRelease(&curstate->lock); + + /* make sure we only write out a commit that's persistent */ + XLogFlush(local_lsn); + + errno = 0; + if ((write(tmpfd, &disk_state, sizeof(disk_state))) != + sizeof(disk_state)) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", + tmppath))); + } + + COMP_CRC32C(crc, &disk_state, sizeof(disk_state)); + } + + LWLockRelease(ReplicationOriginLock); + + /* write out the CRC */ + FIN_CRC32C(crc); + errno = 0; + if ((write(tmpfd, &crc, sizeof(crc))) != sizeof(crc)) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", + tmppath))); + } + + if (CloseTransientFile(tmpfd) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", + tmppath))); + + /* fsync, rename to permanent file, fsync file and directory */ + durable_rename(tmppath, path, PANIC); +} + +/* + * Recover replication replay status from 
checkpoint data saved earlier by + * CheckPointReplicationOrigin. + * + * This only needs to be called at startup and *not* during every checkpoint + * read during recovery (e.g. in HS or PITR from a base backup) afterwards. All + * state thereafter can be recovered by looking at commit records. + */ +void +StartupReplicationOrigin(void) +{ + const char *path = "pg_logical/replorigin_checkpoint"; + int fd; + int readBytes; + uint32 magic = REPLICATION_STATE_MAGIC; + int last_state = 0; + pg_crc32c file_crc; + pg_crc32c crc; + + /* don't want to overwrite already existing state */ +#ifdef USE_ASSERT_CHECKING + static bool already_started = false; + + Assert(!already_started); + already_started = true; +#endif + + if (max_replication_slots == 0) + return; + + INIT_CRC32C(crc); + + elog(DEBUG2, "starting up replication origin progress state"); + + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + + /* + * might have had max_replication_slots == 0 last run, or we just brought + * up a standby. + */ + if (fd < 0 && errno == ENOENT) + return; + else if (fd < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + path))); + + /* verify magic, that is written even if nothing was active */ + readBytes = read(fd, &magic, sizeof(magic)); + if (readBytes != sizeof(magic)) + { + if (readBytes < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + path))); + else + ereport(PANIC, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read file \"%s\": read %d of %zu", + path, readBytes, sizeof(magic)))); + } + COMP_CRC32C(crc, &magic, sizeof(magic)); + + if (magic != REPLICATION_STATE_MAGIC) + ereport(PANIC, + (errmsg("replication checkpoint has wrong magic %u instead of %u", + magic, REPLICATION_STATE_MAGIC))); + + /* we can skip locking here, no other access is possible */ + + /* recover individual states, until there are no more to be found */ + while (true) + { + ReplicationStateOnDisk disk_state; + + readBytes = read(fd, &disk_state, sizeof(disk_state)); + + /* no further data */ + if (readBytes == sizeof(crc)) + { + /* not pretty, but simple ... 
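+ * The file's trailing datum is the 4-byte CRC, so a short read of
+ * exactly sizeof(crc) bytes can only be that trailer; reinterpret the
+ * start of the buffer accordingly.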
*/ + file_crc = *(pg_crc32c *) &disk_state; + break; + } + + if (readBytes < 0) + { + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + path))); + } + + if (readBytes != sizeof(disk_state)) + { + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": read %d of %zu", + path, readBytes, sizeof(disk_state)))); + } + + COMP_CRC32C(crc, &disk_state, sizeof(disk_state)); + + if (last_state == max_replication_slots) + ereport(PANIC, + (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), + errmsg("could not find free replication state, increase max_replication_slots"))); + + /* copy data to shared memory */ + replication_states[last_state].roident = disk_state.roident; + replication_states[last_state].remote_lsn = disk_state.remote_lsn; + last_state++; + + ereport(LOG, + (errmsg("recovered replication state of node %d to %X/%X", + disk_state.roident, + LSN_FORMAT_ARGS(disk_state.remote_lsn)))); + } + + /* now check checksum */ + FIN_CRC32C(crc); + if (file_crc != crc) + ereport(PANIC, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("replication slot checkpoint has wrong checksum %u, expected %u", + crc, file_crc))); + + if (CloseTransientFile(fd) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", + path))); +} + +void +replorigin_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_REPLORIGIN_SET: + { + xl_replorigin_set *xlrec = + (xl_replorigin_set *) XLogRecGetData(record); + + replorigin_advance(xlrec->node_id, + xlrec->remote_lsn, record->EndRecPtr, + xlrec->force /* backward */ , + false /* WAL log */ ); + break; + } + case XLOG_REPLORIGIN_DROP: + { + xl_replorigin_drop *xlrec; + int i; + + xlrec = (xl_replorigin_drop *) XLogRecGetData(record); + + for (i = 0; i < max_replication_slots; i++) + { + ReplicationState *state = &replication_states[i]; + + /* found our slot */ + if (state->roident == xlrec->node_id) + { + /* reset entry */ + state->roident = InvalidRepOriginId; + state->remote_lsn = InvalidXLogRecPtr; + state->local_lsn = InvalidXLogRecPtr; + break; + } + } + break; + } + default: + elog(PANIC, "replorigin_redo: unknown op code %u", info); + } +} + + +/* + * Tell the replication origin progress machinery that a commit from 'node' + * that originated at the LSN remote_commit on the remote node was replayed + * successfully and that we don't need to do so again. In combination with + * setting up replorigin_session_origin_lsn and replorigin_session_origin + * that ensures we won't lose knowledge about that after a crash if the + * transaction had a persistent effect (think of asynchronous commits). + * + * local_commit needs to be a local LSN of the commit so that we can make sure + * upon a checkpoint that enough WAL has been persisted to disk. + * + * Needs to be called with a RowExclusiveLock on pg_replication_origin, + * unless running in recovery. + */ +void +replorigin_advance(RepOriginId node, + XLogRecPtr remote_commit, XLogRecPtr local_commit, + bool go_backward, bool wal_log) +{ + int i; + ReplicationState *replication_state = NULL; + ReplicationState *free_state = NULL; + + Assert(node != InvalidRepOriginId); + + /* we don't track DoNotReplicateId */ + if (node == DoNotReplicateId) + return; + + /* + * XXX: For the case where this is called by WAL replay, it'd be more + * efficient to restore into a backend local hashtable and only dump into + * shmem after recovery is finished. 
Let's wait with implementing that + * till it's shown to be a measurable expense + */ + + /* Lock exclusively, as we may have to create a new table entry. */ + LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE); + + /* + * Search for either an existing slot for the origin, or a free one we can + * use. + */ + for (i = 0; i < max_replication_slots; i++) + { + ReplicationState *curstate = &replication_states[i]; + + /* remember where to insert if necessary */ + if (curstate->roident == InvalidRepOriginId && + free_state == NULL) + { + free_state = curstate; + continue; + } + + /* not our slot */ + if (curstate->roident != node) + { + continue; + } + + /* ok, found slot */ + replication_state = curstate; + + LWLockAcquire(&replication_state->lock, LW_EXCLUSIVE); + + /* Make sure it's not used by somebody else */ + if (replication_state->acquired_by != 0) + { + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("replication origin with ID %d is already active for PID %d", + replication_state->roident, + replication_state->acquired_by))); + } + + break; + } + + if (replication_state == NULL && free_state == NULL) + ereport(ERROR, + (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), + errmsg("could not find free replication state slot for replication origin with ID %d", + node), + errhint("Increase max_replication_slots and try again."))); + + if (replication_state == NULL) + { + /* initialize new slot */ + LWLockAcquire(&free_state->lock, LW_EXCLUSIVE); + replication_state = free_state; + Assert(replication_state->remote_lsn == InvalidXLogRecPtr); + Assert(replication_state->local_lsn == InvalidXLogRecPtr); + replication_state->roident = node; + } + + Assert(replication_state->roident != InvalidRepOriginId); + + /* + * If somebody "forcefully" sets this slot, WAL log it, so it's durable + * and the standby gets the message. Primarily this will be called during + * WAL replay (of commit records) where no WAL logging is necessary. + */ + if (wal_log) + { + xl_replorigin_set xlrec; + + xlrec.remote_lsn = remote_commit; + xlrec.node_id = node; + xlrec.force = go_backward; + + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), sizeof(xlrec)); + + XLogInsert(RM_REPLORIGIN_ID, XLOG_REPLORIGIN_SET); + } + + /* + * Due to - harmless - race conditions during a checkpoint we could see + * values here that are older than the ones we already have in memory. We + * could also see older values for prepared transactions when the prepare + * is sent at a later point of time along with commit prepared and there + * are other transactions commits between prepare and commit prepared. See + * ReorderBufferFinishPrepared. Don't overwrite those. + */ + if (go_backward || replication_state->remote_lsn < remote_commit) + replication_state->remote_lsn = remote_commit; + if (local_commit != InvalidXLogRecPtr && + (go_backward || replication_state->local_lsn < local_commit)) + replication_state->local_lsn = local_commit; + LWLockRelease(&replication_state->lock); + + /* + * Release *after* changing the LSNs, slot isn't acquired and thus could + * otherwise be dropped anytime. 
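+ * (The per-state lwlock was already released above; only
+ * ReplicationOriginLock remains held at this point.)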
+ */ + LWLockRelease(ReplicationOriginLock); +} + + +XLogRecPtr +replorigin_get_progress(RepOriginId node, bool flush) +{ + int i; + XLogRecPtr local_lsn = InvalidXLogRecPtr; + XLogRecPtr remote_lsn = InvalidXLogRecPtr; + + /* prevent slots from being concurrently dropped */ + LWLockAcquire(ReplicationOriginLock, LW_SHARED); + + for (i = 0; i < max_replication_slots; i++) + { + ReplicationState *state; + + state = &replication_states[i]; + + if (state->roident == node) + { + LWLockAcquire(&state->lock, LW_SHARED); + + remote_lsn = state->remote_lsn; + local_lsn = state->local_lsn; + + LWLockRelease(&state->lock); + + break; + } + } + + LWLockRelease(ReplicationOriginLock); + + if (flush && local_lsn != InvalidXLogRecPtr) + XLogFlush(local_lsn); + + return remote_lsn; +} + +/* + * Tear down a (possibly) configured session replication origin during process + * exit. + */ +static void +ReplicationOriginExitCleanup(int code, Datum arg) +{ + ConditionVariable *cv = NULL; + + LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE); + + if (session_replication_state != NULL && + session_replication_state->acquired_by == MyProcPid) + { + cv = &session_replication_state->origin_cv; + + session_replication_state->acquired_by = 0; + session_replication_state = NULL; + } + + LWLockRelease(ReplicationOriginLock); + + if (cv) + ConditionVariableBroadcast(cv); +} + +/* + * Setup a replication origin in the shared memory struct if it doesn't + * already exist and cache access to the specific ReplicationSlot so the + * array doesn't have to be searched when calling + * replorigin_session_advance(). + * + * Normally only one such cached origin can exist per process so the cached + * value can only be set again after the previous value is torn down with + * replorigin_session_reset(). For this normal case pass acquired_by = 0 + * (meaning the slot is not allowed to be already acquired by another process). + * + * However, sometimes multiple processes can safely re-use the same origin slot + * (for example, multiple parallel apply processes can safely use the same + * origin, provided they maintain commit order by allowing only one process to + * commit at a time). For this case the first process must pass acquired_by = + * 0, and then the other processes sharing that same origin can pass + * acquired_by = PID of the first process. + */ +void +replorigin_session_setup(RepOriginId node, int acquired_by) +{ + static __thread bool registered_cleanup; + int i; + int free_slot = -1; + + if (!registered_cleanup) + { + on_shmem_exit(ReplicationOriginExitCleanup, 0); + registered_cleanup = true; + } + + Assert(max_replication_slots > 0); + + if (session_replication_state != NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot setup replication origin when one is already setup"))); + + /* Lock exclusively, as we may have to create a new table entry. */ + LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE); + + /* + * Search for either an existing slot for the origin, or a free one we can + * use. 
+ */ + for (i = 0; i < max_replication_slots; i++) + { + ReplicationState *curstate = &replication_states[i]; + + /* remember where to insert if necessary */ + if (curstate->roident == InvalidRepOriginId && + free_slot == -1) + { + free_slot = i; + continue; + } + + /* not our slot */ + if (curstate->roident != node) + continue; + + else if (curstate->acquired_by != 0 && acquired_by == 0) + { + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("replication origin with ID %d is already active for PID %d", + curstate->roident, curstate->acquired_by))); + } + + /* ok, found slot */ + session_replication_state = curstate; + } + + + if (session_replication_state == NULL && free_slot == -1) + ereport(ERROR, + (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), + errmsg("could not find free replication state slot for replication origin with ID %d", + node), + errhint("Increase max_replication_slots and try again."))); + else if (session_replication_state == NULL) + { + /* initialize new slot */ + session_replication_state = &replication_states[free_slot]; + Assert(session_replication_state->remote_lsn == InvalidXLogRecPtr); + Assert(session_replication_state->local_lsn == InvalidXLogRecPtr); + session_replication_state->roident = node; + } + + + Assert(session_replication_state->roident != InvalidRepOriginId); + + if (acquired_by == 0) + session_replication_state->acquired_by = MyProcPid; + else if (session_replication_state->acquired_by != acquired_by) + elog(ERROR, "could not find replication state slot for replication origin with OID %u which was acquired by %d", + node, acquired_by); + + LWLockRelease(ReplicationOriginLock); + + /* probably this one is pointless */ + ConditionVariableBroadcast(&session_replication_state->origin_cv); +} + +/* + * Reset replay state previously setup in this session. + * + * This function may only be called if an origin was setup with + * replorigin_session_setup(). + */ +void +replorigin_session_reset(void) +{ + ConditionVariable *cv; + + Assert(max_replication_slots != 0); + + if (session_replication_state == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("no replication origin is configured"))); + + LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE); + + session_replication_state->acquired_by = 0; + cv = &session_replication_state->origin_cv; + session_replication_state = NULL; + + LWLockRelease(ReplicationOriginLock); + + ConditionVariableBroadcast(cv); +} + +/* + * Do the same work replorigin_advance() does, just on the session's + * configured origin. + * + * This is noticeably cheaper than using replorigin_advance(). + */ +void +replorigin_session_advance(XLogRecPtr remote_commit, XLogRecPtr local_commit) +{ + Assert(session_replication_state != NULL); + Assert(session_replication_state->roident != InvalidRepOriginId); + + LWLockAcquire(&session_replication_state->lock, LW_EXCLUSIVE); + if (session_replication_state->local_lsn < local_commit) + session_replication_state->local_lsn = local_commit; + if (session_replication_state->remote_lsn < remote_commit) + session_replication_state->remote_lsn = remote_commit; + LWLockRelease(&session_replication_state->lock); +} + +/* + * Ask the machinery about the point up to which we successfully replayed + * changes from an already setup replication origin. 
+ */ +XLogRecPtr +replorigin_session_get_progress(bool flush) +{ + XLogRecPtr remote_lsn; + XLogRecPtr local_lsn; + + Assert(session_replication_state != NULL); + + LWLockAcquire(&session_replication_state->lock, LW_SHARED); + remote_lsn = session_replication_state->remote_lsn; + local_lsn = session_replication_state->local_lsn; + LWLockRelease(&session_replication_state->lock); + + if (flush && local_lsn != InvalidXLogRecPtr) + XLogFlush(local_lsn); + + return remote_lsn; +} + + + +/* --------------------------------------------------------------------------- + * SQL functions for working with replication origin. + * + * These mostly should be fairly short wrappers around more generic functions. + * --------------------------------------------------------------------------- + */ + +/* + * Create replication origin for the passed in name, and return the assigned + * oid. + */ +Datum +pg_replication_origin_create(PG_FUNCTION_ARGS) +{ + char *name; + RepOriginId roident; + + replorigin_check_prerequisites(false, false); + + name = text_to_cstring((text *) DatumGetPointer(PG_GETARG_DATUM(0))); + + /* + * Replication origins "any and "none" are reserved for system options. + * The origins "pg_xxx" are reserved for internal use. + */ + if (IsReservedName(name) || IsReservedOriginName(name)) + ereport(ERROR, + (errcode(ERRCODE_RESERVED_NAME), + errmsg("replication origin name \"%s\" is reserved", + name), + errdetail("Origin names \"%s\", \"%s\", and names starting with \"pg_\" are reserved.", + LOGICALREP_ORIGIN_ANY, LOGICALREP_ORIGIN_NONE))); + + /* + * If built with appropriate switch, whine when regression-testing + * conventions for replication origin names are violated. + */ +#ifdef ENFORCE_REGRESSION_TEST_NAME_RESTRICTIONS + if (strncmp(name, "regress_", 8) != 0) + elog(WARNING, "replication origins created by regression test cases should have names starting with \"regress_\""); +#endif + + roident = replorigin_create(name); + + pfree(name); + + PG_RETURN_OID(roident); +} + +/* + * Drop replication origin. + */ +Datum +pg_replication_origin_drop(PG_FUNCTION_ARGS) +{ + char *name; + + replorigin_check_prerequisites(false, false); + + name = text_to_cstring((text *) DatumGetPointer(PG_GETARG_DATUM(0))); + + replorigin_drop_by_name(name, false, true); + + pfree(name); + + PG_RETURN_VOID(); +} + +/* + * Return oid of a replication origin. + */ +Datum +pg_replication_origin_oid(PG_FUNCTION_ARGS) +{ + char *name; + RepOriginId roident; + + replorigin_check_prerequisites(false, false); + + name = text_to_cstring((text *) DatumGetPointer(PG_GETARG_DATUM(0))); + roident = replorigin_by_name(name, true); + + pfree(name); + + if (OidIsValid(roident)) + PG_RETURN_OID(roident); + PG_RETURN_NULL(); +} + +/* + * Setup a replication origin for this session. 
+ */ +Datum +pg_replication_origin_session_setup(PG_FUNCTION_ARGS) +{ + char *name; + RepOriginId origin; + + replorigin_check_prerequisites(true, false); + + name = text_to_cstring((text *) DatumGetPointer(PG_GETARG_DATUM(0))); + origin = replorigin_by_name(name, false); + replorigin_session_setup(origin, 0); + + replorigin_session_origin = origin; + + pfree(name); + + PG_RETURN_VOID(); +} + +/* + * Reset previously setup origin in this session + */ +Datum +pg_replication_origin_session_reset(PG_FUNCTION_ARGS) +{ + replorigin_check_prerequisites(true, false); + + replorigin_session_reset(); + + replorigin_session_origin = InvalidRepOriginId; + replorigin_session_origin_lsn = InvalidXLogRecPtr; + replorigin_session_origin_timestamp = 0; + + PG_RETURN_VOID(); +} + +/* + * Has a replication origin been setup for this session. + */ +Datum +pg_replication_origin_session_is_setup(PG_FUNCTION_ARGS) +{ + replorigin_check_prerequisites(false, false); + + PG_RETURN_BOOL(replorigin_session_origin != InvalidRepOriginId); +} + + +/* + * Return the replication progress for origin setup in the current session. + * + * If 'flush' is set to true it is ensured that the returned value corresponds + * to a local transaction that has been flushed. This is useful if asynchronous + * commits are used when replaying replicated transactions. + */ +Datum +pg_replication_origin_session_progress(PG_FUNCTION_ARGS) +{ + XLogRecPtr remote_lsn = InvalidXLogRecPtr; + bool flush = PG_GETARG_BOOL(0); + + replorigin_check_prerequisites(true, false); + + if (session_replication_state == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("no replication origin is configured"))); + + remote_lsn = replorigin_session_get_progress(flush); + + if (remote_lsn == InvalidXLogRecPtr) + PG_RETURN_NULL(); + + PG_RETURN_LSN(remote_lsn); +} + +Datum +pg_replication_origin_xact_setup(PG_FUNCTION_ARGS) +{ + XLogRecPtr location = PG_GETARG_LSN(0); + + replorigin_check_prerequisites(true, false); + + if (session_replication_state == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("no replication origin is configured"))); + + replorigin_session_origin_lsn = location; + replorigin_session_origin_timestamp = PG_GETARG_TIMESTAMPTZ(1); + + PG_RETURN_VOID(); +} + +Datum +pg_replication_origin_xact_reset(PG_FUNCTION_ARGS) +{ + replorigin_check_prerequisites(true, false); + + replorigin_session_origin_lsn = InvalidXLogRecPtr; + replorigin_session_origin_timestamp = 0; + + PG_RETURN_VOID(); +} + + +Datum +pg_replication_origin_advance(PG_FUNCTION_ARGS) +{ + text *name = PG_GETARG_TEXT_PP(0); + XLogRecPtr remote_commit = PG_GETARG_LSN(1); + RepOriginId node; + + replorigin_check_prerequisites(true, false); + + /* lock to prevent the replication origin from vanishing */ + LockRelationOid(ReplicationOriginRelationId, RowExclusiveLock); + + node = replorigin_by_name(text_to_cstring(name), false); + + /* + * Can't sensibly pass a local commit to be flushed at checkpoint - this + * xact hasn't committed yet. This is why this function should be used to + * set up the initial replication state, but not for replay. + */ + replorigin_advance(node, remote_commit, InvalidXLogRecPtr, + true /* go backward */ , true /* WAL log */ ); + + UnlockRelationOid(ReplicationOriginRelationId, RowExclusiveLock); + + PG_RETURN_VOID(); +} + + +/* + * Return the replication progress for an individual replication origin. 
+ * + * If 'flush' is set to true it is ensured that the returned value corresponds + * to a local transaction that has been flushed. This is useful if asynchronous + * commits are used when replaying replicated transactions. + */ +Datum +pg_replication_origin_progress(PG_FUNCTION_ARGS) +{ + char *name; + bool flush; + RepOriginId roident; + XLogRecPtr remote_lsn = InvalidXLogRecPtr; + + replorigin_check_prerequisites(true, true); + + name = text_to_cstring((text *) DatumGetPointer(PG_GETARG_DATUM(0))); + flush = PG_GETARG_BOOL(1); + + roident = replorigin_by_name(name, false); + Assert(OidIsValid(roident)); + + remote_lsn = replorigin_get_progress(roident, flush); + + if (remote_lsn == InvalidXLogRecPtr) + PG_RETURN_NULL(); + + PG_RETURN_LSN(remote_lsn); +} + + +Datum +pg_show_replication_origin_status(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + int i; +#define REPLICATION_ORIGIN_PROGRESS_COLS 4 + + /* we want to return 0 rows if slot is set to zero */ + replorigin_check_prerequisites(false, true); + + InitMaterializedSRF(fcinfo, 0); + + /* prevent slots from being concurrently dropped */ + LWLockAcquire(ReplicationOriginLock, LW_SHARED); + + /* + * Iterate through all possible replication_states, display if they are + * filled. Note that we do not take any locks, so slightly corrupted/out + * of date values are a possibility. + */ + for (i = 0; i < max_replication_slots; i++) + { + ReplicationState *state; + Datum values[REPLICATION_ORIGIN_PROGRESS_COLS]; + bool nulls[REPLICATION_ORIGIN_PROGRESS_COLS]; + char *roname; + + state = &replication_states[i]; + + /* unused slot, nothing to display */ + if (state->roident == InvalidRepOriginId) + continue; + + memset(values, 0, sizeof(values)); + memset(nulls, 1, sizeof(nulls)); + + values[0] = ObjectIdGetDatum(state->roident); + nulls[0] = false; + + /* + * We're not preventing the origin to be dropped concurrently, so + * silently accept that it might be gone. + */ + if (replorigin_by_oid(state->roident, true, + &roname)) + { + values[1] = CStringGetTextDatum(roname); + nulls[1] = false; + } + + LWLockAcquire(&state->lock, LW_SHARED); + + values[2] = LSNGetDatum(state->remote_lsn); + nulls[2] = false; + + values[3] = LSNGetDatum(state->local_lsn); + nulls[3] = false; + + LWLockRelease(&state->lock); + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, + values, nulls); + } + + LWLockRelease(ReplicationOriginLock); + +#undef REPLICATION_ORIGIN_PROGRESS_COLS + + return (Datum) 0; +} diff --git a/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/proto.c b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/proto.c new file mode 100644 index 00000000000..ae2d754db95 --- /dev/null +++ b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/proto.c @@ -0,0 +1,1271 @@ +/*------------------------------------------------------------------------- + * + * proto.c + * logical replication protocol functions + * + * Copyright (c) 2015-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/proto.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/sysattr.h" +#include "catalog/pg_namespace.h" +#include "catalog/pg_type.h" +#include "libpq/pqformat.h" +#include "replication/logicalproto.h" +#include "utils/lsyscache.h" +#include "utils/syscache.h" + +/* + * Protocol message flags. 
+ */ +#define LOGICALREP_IS_REPLICA_IDENTITY 1 + +#define MESSAGE_TRANSACTIONAL (1<<0) +#define TRUNCATE_CASCADE (1<<0) +#define TRUNCATE_RESTART_SEQS (1<<1) + +static void logicalrep_write_attrs(StringInfo out, Relation rel, + Bitmapset *columns); +static void logicalrep_write_tuple(StringInfo out, Relation rel, + TupleTableSlot *slot, + bool binary, Bitmapset *columns); +static void logicalrep_read_attrs(StringInfo in, LogicalRepRelation *rel); +static void logicalrep_read_tuple(StringInfo in, LogicalRepTupleData *tuple); + +static void logicalrep_write_namespace(StringInfo out, Oid nspid); +static const char *logicalrep_read_namespace(StringInfo in); + +/* + * Check if a column is covered by a column list. + * + * Need to be careful about NULL, which is treated as a column list covering + * all columns. + */ +static bool +column_in_column_list(int attnum, Bitmapset *columns) +{ + return (columns == NULL || bms_is_member(attnum, columns)); +} + + +/* + * Write BEGIN to the output stream. + */ +void +logicalrep_write_begin(StringInfo out, ReorderBufferTXN *txn) +{ + pq_sendbyte(out, LOGICAL_REP_MSG_BEGIN); + + /* fixed fields */ + pq_sendint64(out, txn->final_lsn); + pq_sendint64(out, txn->xact_time.commit_time); + pq_sendint32(out, txn->xid); +} + +/* + * Read transaction BEGIN from the stream. + */ +void +logicalrep_read_begin(StringInfo in, LogicalRepBeginData *begin_data) +{ + /* read fields */ + begin_data->final_lsn = pq_getmsgint64(in); + if (begin_data->final_lsn == InvalidXLogRecPtr) + elog(ERROR, "final_lsn not set in begin message"); + begin_data->committime = pq_getmsgint64(in); + begin_data->xid = pq_getmsgint(in, 4); +} + + +/* + * Write COMMIT to the output stream. + */ +void +logicalrep_write_commit(StringInfo out, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn) +{ + uint8 flags = 0; + + pq_sendbyte(out, LOGICAL_REP_MSG_COMMIT); + + /* send the flags field (unused for now) */ + pq_sendbyte(out, flags); + + /* send fields */ + pq_sendint64(out, commit_lsn); + pq_sendint64(out, txn->end_lsn); + pq_sendint64(out, txn->xact_time.commit_time); +} + +/* + * Read transaction COMMIT from the stream. + */ +void +logicalrep_read_commit(StringInfo in, LogicalRepCommitData *commit_data) +{ + /* read flags (unused for now) */ + uint8 flags = pq_getmsgbyte(in); + + if (flags != 0) + elog(ERROR, "unrecognized flags %u in commit message", flags); + + /* read fields */ + commit_data->commit_lsn = pq_getmsgint64(in); + commit_data->end_lsn = pq_getmsgint64(in); + commit_data->committime = pq_getmsgint64(in); +} + +/* + * Write BEGIN PREPARE to the output stream. + */ +void +logicalrep_write_begin_prepare(StringInfo out, ReorderBufferTXN *txn) +{ + pq_sendbyte(out, LOGICAL_REP_MSG_BEGIN_PREPARE); + + /* fixed fields */ + pq_sendint64(out, txn->final_lsn); + pq_sendint64(out, txn->end_lsn); + pq_sendint64(out, txn->xact_time.prepare_time); + pq_sendint32(out, txn->xid); + + /* send gid */ + pq_sendstring(out, txn->gid); +} + +/* + * Read transaction BEGIN PREPARE from the stream. 
+ */ +void +logicalrep_read_begin_prepare(StringInfo in, LogicalRepPreparedTxnData *begin_data) +{ + /* read fields */ + begin_data->prepare_lsn = pq_getmsgint64(in); + if (begin_data->prepare_lsn == InvalidXLogRecPtr) + elog(ERROR, "prepare_lsn not set in begin prepare message"); + begin_data->end_lsn = pq_getmsgint64(in); + if (begin_data->end_lsn == InvalidXLogRecPtr) + elog(ERROR, "end_lsn not set in begin prepare message"); + begin_data->prepare_time = pq_getmsgint64(in); + begin_data->xid = pq_getmsgint(in, 4); + + /* read gid (copy it into a pre-allocated buffer) */ + strlcpy(begin_data->gid, pq_getmsgstring(in), sizeof(begin_data->gid)); +} + +/* + * The core functionality for logicalrep_write_prepare and + * logicalrep_write_stream_prepare. + */ +static void +logicalrep_write_prepare_common(StringInfo out, LogicalRepMsgType type, + ReorderBufferTXN *txn, XLogRecPtr prepare_lsn) +{ + uint8 flags = 0; + + pq_sendbyte(out, type); + + /* + * This should only ever happen for two-phase commit transactions, in + * which case we expect to have a valid GID. + */ + Assert(txn->gid != NULL); + Assert(rbtxn_prepared(txn)); + Assert(TransactionIdIsValid(txn->xid)); + + /* send the flags field */ + pq_sendbyte(out, flags); + + /* send fields */ + pq_sendint64(out, prepare_lsn); + pq_sendint64(out, txn->end_lsn); + pq_sendint64(out, txn->xact_time.prepare_time); + pq_sendint32(out, txn->xid); + + /* send gid */ + pq_sendstring(out, txn->gid); +} + +/* + * Write PREPARE to the output stream. + */ +void +logicalrep_write_prepare(StringInfo out, ReorderBufferTXN *txn, + XLogRecPtr prepare_lsn) +{ + logicalrep_write_prepare_common(out, LOGICAL_REP_MSG_PREPARE, + txn, prepare_lsn); +} + +/* + * The core functionality for logicalrep_read_prepare and + * logicalrep_read_stream_prepare. + */ +static void +logicalrep_read_prepare_common(StringInfo in, char *msgtype, + LogicalRepPreparedTxnData *prepare_data) +{ + /* read flags */ + uint8 flags = pq_getmsgbyte(in); + + if (flags != 0) + elog(ERROR, "unrecognized flags %u in %s message", flags, msgtype); + + /* read fields */ + prepare_data->prepare_lsn = pq_getmsgint64(in); + if (prepare_data->prepare_lsn == InvalidXLogRecPtr) + elog(ERROR, "prepare_lsn is not set in %s message", msgtype); + prepare_data->end_lsn = pq_getmsgint64(in); + if (prepare_data->end_lsn == InvalidXLogRecPtr) + elog(ERROR, "end_lsn is not set in %s message", msgtype); + prepare_data->prepare_time = pq_getmsgint64(in); + prepare_data->xid = pq_getmsgint(in, 4); + if (prepare_data->xid == InvalidTransactionId) + elog(ERROR, "invalid two-phase transaction ID in %s message", msgtype); + + /* read gid (copy it into a pre-allocated buffer) */ + strlcpy(prepare_data->gid, pq_getmsgstring(in), sizeof(prepare_data->gid)); +} + +/* + * Read transaction PREPARE from the stream. + */ +void +logicalrep_read_prepare(StringInfo in, LogicalRepPreparedTxnData *prepare_data) +{ + logicalrep_read_prepare_common(in, "prepare", prepare_data); +} + +/* + * Write COMMIT PREPARED to the output stream. + */ +void +logicalrep_write_commit_prepared(StringInfo out, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn) +{ + uint8 flags = 0; + + pq_sendbyte(out, LOGICAL_REP_MSG_COMMIT_PREPARED); + + /* + * This should only ever happen for two-phase commit transactions, in + * which case we expect to have a valid GID. 
+ */ + Assert(txn->gid != NULL); + + /* send the flags field */ + pq_sendbyte(out, flags); + + /* send fields */ + pq_sendint64(out, commit_lsn); + pq_sendint64(out, txn->end_lsn); + pq_sendint64(out, txn->xact_time.commit_time); + pq_sendint32(out, txn->xid); + + /* send gid */ + pq_sendstring(out, txn->gid); +} + +/* + * Read transaction COMMIT PREPARED from the stream. + */ +void +logicalrep_read_commit_prepared(StringInfo in, LogicalRepCommitPreparedTxnData *prepare_data) +{ + /* read flags */ + uint8 flags = pq_getmsgbyte(in); + + if (flags != 0) + elog(ERROR, "unrecognized flags %u in commit prepared message", flags); + + /* read fields */ + prepare_data->commit_lsn = pq_getmsgint64(in); + if (prepare_data->commit_lsn == InvalidXLogRecPtr) + elog(ERROR, "commit_lsn is not set in commit prepared message"); + prepare_data->end_lsn = pq_getmsgint64(in); + if (prepare_data->end_lsn == InvalidXLogRecPtr) + elog(ERROR, "end_lsn is not set in commit prepared message"); + prepare_data->commit_time = pq_getmsgint64(in); + prepare_data->xid = pq_getmsgint(in, 4); + + /* read gid (copy it into a pre-allocated buffer) */ + strlcpy(prepare_data->gid, pq_getmsgstring(in), sizeof(prepare_data->gid)); +} + +/* + * Write ROLLBACK PREPARED to the output stream. + */ +void +logicalrep_write_rollback_prepared(StringInfo out, ReorderBufferTXN *txn, + XLogRecPtr prepare_end_lsn, + TimestampTz prepare_time) +{ + uint8 flags = 0; + + pq_sendbyte(out, LOGICAL_REP_MSG_ROLLBACK_PREPARED); + + /* + * This should only ever happen for two-phase commit transactions, in + * which case we expect to have a valid GID. + */ + Assert(txn->gid != NULL); + + /* send the flags field */ + pq_sendbyte(out, flags); + + /* send fields */ + pq_sendint64(out, prepare_end_lsn); + pq_sendint64(out, txn->end_lsn); + pq_sendint64(out, prepare_time); + pq_sendint64(out, txn->xact_time.commit_time); + pq_sendint32(out, txn->xid); + + /* send gid */ + pq_sendstring(out, txn->gid); +} + +/* + * Read transaction ROLLBACK PREPARED from the stream. + */ +void +logicalrep_read_rollback_prepared(StringInfo in, + LogicalRepRollbackPreparedTxnData *rollback_data) +{ + /* read flags */ + uint8 flags = pq_getmsgbyte(in); + + if (flags != 0) + elog(ERROR, "unrecognized flags %u in rollback prepared message", flags); + + /* read fields */ + rollback_data->prepare_end_lsn = pq_getmsgint64(in); + if (rollback_data->prepare_end_lsn == InvalidXLogRecPtr) + elog(ERROR, "prepare_end_lsn is not set in rollback prepared message"); + rollback_data->rollback_end_lsn = pq_getmsgint64(in); + if (rollback_data->rollback_end_lsn == InvalidXLogRecPtr) + elog(ERROR, "rollback_end_lsn is not set in rollback prepared message"); + rollback_data->prepare_time = pq_getmsgint64(in); + rollback_data->rollback_time = pq_getmsgint64(in); + rollback_data->xid = pq_getmsgint(in, 4); + + /* read gid (copy it into a pre-allocated buffer) */ + strlcpy(rollback_data->gid, pq_getmsgstring(in), sizeof(rollback_data->gid)); +} + +/* + * Write STREAM PREPARE to the output stream. + */ +void +logicalrep_write_stream_prepare(StringInfo out, + ReorderBufferTXN *txn, + XLogRecPtr prepare_lsn) +{ + logicalrep_write_prepare_common(out, LOGICAL_REP_MSG_STREAM_PREPARE, + txn, prepare_lsn); +} + +/* + * Read STREAM PREPARE from the stream. + */ +void +logicalrep_read_stream_prepare(StringInfo in, LogicalRepPreparedTxnData *prepare_data) +{ + logicalrep_read_prepare_common(in, "stream prepare", prepare_data); +} + +/* + * Write ORIGIN to the output stream. 
+ */ +void +logicalrep_write_origin(StringInfo out, const char *origin, + XLogRecPtr origin_lsn) +{ + pq_sendbyte(out, LOGICAL_REP_MSG_ORIGIN); + + /* fixed fields */ + pq_sendint64(out, origin_lsn); + + /* origin string */ + pq_sendstring(out, origin); +} + +/* + * Read ORIGIN from the output stream. + */ +char * +logicalrep_read_origin(StringInfo in, XLogRecPtr *origin_lsn) +{ + /* fixed fields */ + *origin_lsn = pq_getmsgint64(in); + + /* return origin */ + return pstrdup(pq_getmsgstring(in)); +} + +/* + * Write INSERT to the output stream. + */ +void +logicalrep_write_insert(StringInfo out, TransactionId xid, Relation rel, + TupleTableSlot *newslot, bool binary, Bitmapset *columns) +{ + pq_sendbyte(out, LOGICAL_REP_MSG_INSERT); + + /* transaction ID (if not valid, we're not streaming) */ + if (TransactionIdIsValid(xid)) + pq_sendint32(out, xid); + + /* use Oid as relation identifier */ + pq_sendint32(out, RelationGetRelid(rel)); + + pq_sendbyte(out, 'N'); /* new tuple follows */ + logicalrep_write_tuple(out, rel, newslot, binary, columns); +} + +/* + * Read INSERT from stream. + * + * Fills the new tuple. + */ +LogicalRepRelId +logicalrep_read_insert(StringInfo in, LogicalRepTupleData *newtup) +{ + char action; + LogicalRepRelId relid; + + /* read the relation id */ + relid = pq_getmsgint(in, 4); + + action = pq_getmsgbyte(in); + if (action != 'N') + elog(ERROR, "expected new tuple but got %d", + action); + + logicalrep_read_tuple(in, newtup); + + return relid; +} + +/* + * Write UPDATE to the output stream. + */ +void +logicalrep_write_update(StringInfo out, TransactionId xid, Relation rel, + TupleTableSlot *oldslot, TupleTableSlot *newslot, + bool binary, Bitmapset *columns) +{ + pq_sendbyte(out, LOGICAL_REP_MSG_UPDATE); + + Assert(rel->rd_rel->relreplident == REPLICA_IDENTITY_DEFAULT || + rel->rd_rel->relreplident == REPLICA_IDENTITY_FULL || + rel->rd_rel->relreplident == REPLICA_IDENTITY_INDEX); + + /* transaction ID (if not valid, we're not streaming) */ + if (TransactionIdIsValid(xid)) + pq_sendint32(out, xid); + + /* use Oid as relation identifier */ + pq_sendint32(out, RelationGetRelid(rel)); + + if (oldslot != NULL) + { + if (rel->rd_rel->relreplident == REPLICA_IDENTITY_FULL) + pq_sendbyte(out, 'O'); /* old tuple follows */ + else + pq_sendbyte(out, 'K'); /* old key follows */ + logicalrep_write_tuple(out, rel, oldslot, binary, columns); + } + + pq_sendbyte(out, 'N'); /* new tuple follows */ + logicalrep_write_tuple(out, rel, newslot, binary, columns); +} + +/* + * Read UPDATE from stream. + */ +LogicalRepRelId +logicalrep_read_update(StringInfo in, bool *has_oldtuple, + LogicalRepTupleData *oldtup, + LogicalRepTupleData *newtup) +{ + char action; + LogicalRepRelId relid; + + /* read the relation id */ + relid = pq_getmsgint(in, 4); + + /* read and verify action */ + action = pq_getmsgbyte(in); + if (action != 'K' && action != 'O' && action != 'N') + elog(ERROR, "expected action 'N', 'O' or 'K', got %c", + action); + + /* check for old tuple */ + if (action == 'K' || action == 'O') + { + logicalrep_read_tuple(in, oldtup); + *has_oldtuple = true; + + action = pq_getmsgbyte(in); + } + else + *has_oldtuple = false; + + /* check for new tuple */ + if (action != 'N') + elog(ERROR, "expected action 'N', got %c", + action); + + logicalrep_read_tuple(in, newtup); + + return relid; +} + +/* + * Write DELETE to the output stream. 
+ */ +void +logicalrep_write_delete(StringInfo out, TransactionId xid, Relation rel, + TupleTableSlot *oldslot, bool binary, + Bitmapset *columns) +{ + Assert(rel->rd_rel->relreplident == REPLICA_IDENTITY_DEFAULT || + rel->rd_rel->relreplident == REPLICA_IDENTITY_FULL || + rel->rd_rel->relreplident == REPLICA_IDENTITY_INDEX); + + pq_sendbyte(out, LOGICAL_REP_MSG_DELETE); + + /* transaction ID (if not valid, we're not streaming) */ + if (TransactionIdIsValid(xid)) + pq_sendint32(out, xid); + + /* use Oid as relation identifier */ + pq_sendint32(out, RelationGetRelid(rel)); + + if (rel->rd_rel->relreplident == REPLICA_IDENTITY_FULL) + pq_sendbyte(out, 'O'); /* old tuple follows */ + else + pq_sendbyte(out, 'K'); /* old key follows */ + + logicalrep_write_tuple(out, rel, oldslot, binary, columns); +} + +/* + * Read DELETE from stream. + * + * Fills the old tuple. + */ +LogicalRepRelId +logicalrep_read_delete(StringInfo in, LogicalRepTupleData *oldtup) +{ + char action; + LogicalRepRelId relid; + + /* read the relation id */ + relid = pq_getmsgint(in, 4); + + /* read and verify action */ + action = pq_getmsgbyte(in); + if (action != 'K' && action != 'O') + elog(ERROR, "expected action 'O' or 'K', got %c", action); + + logicalrep_read_tuple(in, oldtup); + + return relid; +} + +/* + * Write TRUNCATE to the output stream. + */ +void +logicalrep_write_truncate(StringInfo out, + TransactionId xid, + int nrelids, + Oid relids[], + bool cascade, bool restart_seqs) +{ + int i; + uint8 flags = 0; + + pq_sendbyte(out, LOGICAL_REP_MSG_TRUNCATE); + + /* transaction ID (if not valid, we're not streaming) */ + if (TransactionIdIsValid(xid)) + pq_sendint32(out, xid); + + pq_sendint32(out, nrelids); + + /* encode and send truncate flags */ + if (cascade) + flags |= TRUNCATE_CASCADE; + if (restart_seqs) + flags |= TRUNCATE_RESTART_SEQS; + pq_sendint8(out, flags); + + for (i = 0; i < nrelids; i++) + pq_sendint32(out, relids[i]); +} + +/* + * Read TRUNCATE from stream. + */ +List * +logicalrep_read_truncate(StringInfo in, + bool *cascade, bool *restart_seqs) +{ + int i; + int nrelids; + List *relids = NIL; + uint8 flags; + + nrelids = pq_getmsgint(in, 4); + + /* read and decode truncate flags */ + flags = pq_getmsgint(in, 1); + *cascade = (flags & TRUNCATE_CASCADE) > 0; + *restart_seqs = (flags & TRUNCATE_RESTART_SEQS) > 0; + + for (i = 0; i < nrelids; i++) + relids = lappend_oid(relids, pq_getmsgint(in, 4)); + + return relids; +} + +/* + * Write MESSAGE to stream + */ +void +logicalrep_write_message(StringInfo out, TransactionId xid, XLogRecPtr lsn, + bool transactional, const char *prefix, Size sz, + const char *message) +{ + uint8 flags = 0; + + pq_sendbyte(out, LOGICAL_REP_MSG_MESSAGE); + + /* encode and send message flags */ + if (transactional) + flags |= MESSAGE_TRANSACTIONAL; + + /* transaction ID (if not valid, we're not streaming) */ + if (TransactionIdIsValid(xid)) + pq_sendint32(out, xid); + + pq_sendint8(out, flags); + pq_sendint64(out, lsn); + pq_sendstring(out, prefix); + pq_sendint32(out, sz); + pq_sendbytes(out, message, sz); +} + +/* + * Write relation description to the output stream. 
+ */ +void +logicalrep_write_rel(StringInfo out, TransactionId xid, Relation rel, + Bitmapset *columns) +{ + char *relname; + + pq_sendbyte(out, LOGICAL_REP_MSG_RELATION); + + /* transaction ID (if not valid, we're not streaming) */ + if (TransactionIdIsValid(xid)) + pq_sendint32(out, xid); + + /* use Oid as relation identifier */ + pq_sendint32(out, RelationGetRelid(rel)); + + /* send qualified relation name */ + logicalrep_write_namespace(out, RelationGetNamespace(rel)); + relname = RelationGetRelationName(rel); + pq_sendstring(out, relname); + + /* send replica identity */ + pq_sendbyte(out, rel->rd_rel->relreplident); + + /* send the attribute info */ + logicalrep_write_attrs(out, rel, columns); +} + +/* + * Read the relation info from stream and return as LogicalRepRelation. + */ +LogicalRepRelation * +logicalrep_read_rel(StringInfo in) +{ + LogicalRepRelation *rel = palloc(sizeof(LogicalRepRelation)); + + rel->remoteid = pq_getmsgint(in, 4); + + /* Read relation name from stream */ + rel->nspname = pstrdup(logicalrep_read_namespace(in)); + rel->relname = pstrdup(pq_getmsgstring(in)); + + /* Read the replica identity. */ + rel->replident = pq_getmsgbyte(in); + + /* Get attribute description */ + logicalrep_read_attrs(in, rel); + + return rel; +} + +/* + * Write type info to the output stream. + * + * This function will always write base type info. + */ +void +logicalrep_write_typ(StringInfo out, TransactionId xid, Oid typoid) +{ + Oid basetypoid = getBaseType(typoid); + HeapTuple tup; + Form_pg_type typtup; + + pq_sendbyte(out, LOGICAL_REP_MSG_TYPE); + + /* transaction ID (if not valid, we're not streaming) */ + if (TransactionIdIsValid(xid)) + pq_sendint32(out, xid); + + tup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(basetypoid)); + if (!HeapTupleIsValid(tup)) + elog(ERROR, "cache lookup failed for type %u", basetypoid); + typtup = (Form_pg_type) GETSTRUCT(tup); + + /* use Oid as relation identifier */ + pq_sendint32(out, typoid); + + /* send qualified type name */ + logicalrep_write_namespace(out, typtup->typnamespace); + pq_sendstring(out, NameStr(typtup->typname)); + + ReleaseSysCache(tup); +} + +/* + * Read type info from the output stream. + */ +void +logicalrep_read_typ(StringInfo in, LogicalRepTyp *ltyp) +{ + ltyp->remoteid = pq_getmsgint(in, 4); + + /* Read type name from stream */ + ltyp->nspname = pstrdup(logicalrep_read_namespace(in)); + ltyp->typname = pstrdup(pq_getmsgstring(in)); +} + +/* + * Write a tuple to the outputstream, in the most efficient format possible. 
+ */ +static void +logicalrep_write_tuple(StringInfo out, Relation rel, TupleTableSlot *slot, + bool binary, Bitmapset *columns) +{ + TupleDesc desc; + Datum *values; + bool *isnull; + int i; + uint16 nliveatts = 0; + + desc = RelationGetDescr(rel); + + for (i = 0; i < desc->natts; i++) + { + Form_pg_attribute att = TupleDescAttr(desc, i); + + if (att->attisdropped || att->attgenerated) + continue; + + if (!column_in_column_list(att->attnum, columns)) + continue; + + nliveatts++; + } + pq_sendint16(out, nliveatts); + + slot_getallattrs(slot); + values = slot->tts_values; + isnull = slot->tts_isnull; + + /* Write the values */ + for (i = 0; i < desc->natts; i++) + { + HeapTuple typtup; + Form_pg_type typclass; + Form_pg_attribute att = TupleDescAttr(desc, i); + + if (att->attisdropped || att->attgenerated) + continue; + + if (!column_in_column_list(att->attnum, columns)) + continue; + + if (isnull[i]) + { + pq_sendbyte(out, LOGICALREP_COLUMN_NULL); + continue; + } + + if (att->attlen == -1 && VARATT_IS_EXTERNAL_ONDISK(values[i])) + { + /* + * Unchanged toasted datum. (Note that we don't promise to detect + * unchanged data in general; this is just a cheap check to avoid + * sending large values unnecessarily.) + */ + pq_sendbyte(out, LOGICALREP_COLUMN_UNCHANGED); + continue; + } + + typtup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(att->atttypid)); + if (!HeapTupleIsValid(typtup)) + elog(ERROR, "cache lookup failed for type %u", att->atttypid); + typclass = (Form_pg_type) GETSTRUCT(typtup); + + /* + * Send in binary if requested and type has suitable send function. + */ + if (binary && OidIsValid(typclass->typsend)) + { + bytea *outputbytes; + int len; + + pq_sendbyte(out, LOGICALREP_COLUMN_BINARY); + outputbytes = OidSendFunctionCall(typclass->typsend, values[i]); + len = VARSIZE(outputbytes) - VARHDRSZ; + pq_sendint(out, len, 4); /* length */ + pq_sendbytes(out, VARDATA(outputbytes), len); /* data */ + pfree(outputbytes); + } + else + { + char *outputstr; + + pq_sendbyte(out, LOGICALREP_COLUMN_TEXT); + outputstr = OidOutputFunctionCall(typclass->typoutput, values[i]); + pq_sendcountedtext(out, outputstr, strlen(outputstr), false); + pfree(outputstr); + } + + ReleaseSysCache(typtup); + } +} + +/* + * Read tuple in logical replication format from stream. + */ +static void +logicalrep_read_tuple(StringInfo in, LogicalRepTupleData *tuple) +{ + int i; + int natts; + + /* Get number of attributes */ + natts = pq_getmsgint(in, 2); + + /* Allocate space for per-column values; zero out unused StringInfoDatas */ + tuple->colvalues = (StringInfoData *) palloc0(natts * sizeof(StringInfoData)); + tuple->colstatus = (char *) palloc(natts * sizeof(char)); + tuple->ncols = natts; + + /* Read the data */ + for (i = 0; i < natts; i++) + { + char kind; + int len; + StringInfo value = &tuple->colvalues[i]; + + kind = pq_getmsgbyte(in); + tuple->colstatus[i] = kind; + + switch (kind) + { + case LOGICALREP_COLUMN_NULL: + /* nothing more to do */ + break; + case LOGICALREP_COLUMN_UNCHANGED: + /* we don't receive the value of an unchanged column */ + break; + case LOGICALREP_COLUMN_TEXT: + case LOGICALREP_COLUMN_BINARY: + len = pq_getmsgint(in, 4); /* read length */ + + /* and data */ + value->data = palloc(len + 1); + pq_copymsgbytes(in, value->data, len); + + /* + * Not strictly necessary for LOGICALREP_COLUMN_BINARY, but + * per StringInfo practice. 
+ */ + value->data[len] = '\0'; + + /* make StringInfo fully valid */ + value->len = len; + value->cursor = 0; + value->maxlen = len; + break; + default: + elog(ERROR, "unrecognized data representation type '%c'", kind); + } + } +} + +/* + * Write relation attribute metadata to the stream. + */ +static void +logicalrep_write_attrs(StringInfo out, Relation rel, Bitmapset *columns) +{ + TupleDesc desc; + int i; + uint16 nliveatts = 0; + Bitmapset *idattrs = NULL; + bool replidentfull; + + desc = RelationGetDescr(rel); + + /* send number of live attributes */ + for (i = 0; i < desc->natts; i++) + { + Form_pg_attribute att = TupleDescAttr(desc, i); + + if (att->attisdropped || att->attgenerated) + continue; + + if (!column_in_column_list(att->attnum, columns)) + continue; + + nliveatts++; + } + pq_sendint16(out, nliveatts); + + /* fetch bitmap of REPLICATION IDENTITY attributes */ + replidentfull = (rel->rd_rel->relreplident == REPLICA_IDENTITY_FULL); + if (!replidentfull) + idattrs = RelationGetIdentityKeyBitmap(rel); + + /* send the attributes */ + for (i = 0; i < desc->natts; i++) + { + Form_pg_attribute att = TupleDescAttr(desc, i); + uint8 flags = 0; + + if (att->attisdropped || att->attgenerated) + continue; + + if (!column_in_column_list(att->attnum, columns)) + continue; + + /* REPLICA IDENTITY FULL means all columns are sent as part of key. */ + if (replidentfull || + bms_is_member(att->attnum - FirstLowInvalidHeapAttributeNumber, + idattrs)) + flags |= LOGICALREP_IS_REPLICA_IDENTITY; + + pq_sendbyte(out, flags); + + /* attribute name */ + pq_sendstring(out, NameStr(att->attname)); + + /* attribute type id */ + pq_sendint32(out, (int) att->atttypid); + + /* attribute mode */ + pq_sendint32(out, att->atttypmod); + } + + bms_free(idattrs); +} + +/* + * Read relation attribute metadata from the stream. + */ +static void +logicalrep_read_attrs(StringInfo in, LogicalRepRelation *rel) +{ + int i; + int natts; + char **attnames; + Oid *atttyps; + Bitmapset *attkeys = NULL; + + natts = pq_getmsgint(in, 2); + attnames = palloc(natts * sizeof(char *)); + atttyps = palloc(natts * sizeof(Oid)); + + /* read the attributes */ + for (i = 0; i < natts; i++) + { + uint8 flags; + + /* Check for replica identity column */ + flags = pq_getmsgbyte(in); + if (flags & LOGICALREP_IS_REPLICA_IDENTITY) + attkeys = bms_add_member(attkeys, i); + + /* attribute name */ + attnames[i] = pstrdup(pq_getmsgstring(in)); + + /* attribute type id */ + atttyps[i] = (Oid) pq_getmsgint(in, 4); + + /* we ignore attribute mode for now */ + (void) pq_getmsgint(in, 4); + } + + rel->attnames = attnames; + rel->atttyps = atttyps; + rel->attkeys = attkeys; + rel->natts = natts; +} + +/* + * Write the namespace name or empty string for pg_catalog (to save space). + */ +static void +logicalrep_write_namespace(StringInfo out, Oid nspid) +{ + if (nspid == PG_CATALOG_NAMESPACE) + pq_sendbyte(out, '\0'); + else + { + char *nspname = get_namespace_name(nspid); + + if (nspname == NULL) + elog(ERROR, "cache lookup failed for namespace %u", + nspid); + + pq_sendstring(out, nspname); + } +} + +/* + * Read the namespace name while treating empty string as pg_catalog. + */ +static const char * +logicalrep_read_namespace(StringInfo in) +{ + const char *nspname = pq_getmsgstring(in); + + if (nspname[0] == '\0') + nspname = "pg_catalog"; + + return nspname; +} + +/* + * Write the information for the start stream message to the output stream. 
+ */ +void +logicalrep_write_stream_start(StringInfo out, + TransactionId xid, bool first_segment) +{ + pq_sendbyte(out, LOGICAL_REP_MSG_STREAM_START); + + Assert(TransactionIdIsValid(xid)); + + /* transaction ID (we're starting to stream, so must be valid) */ + pq_sendint32(out, xid); + + /* 1 if this is the first streaming segment for this xid */ + pq_sendbyte(out, first_segment ? 1 : 0); +} + +/* + * Read the information about the start stream message from output stream. + */ +TransactionId +logicalrep_read_stream_start(StringInfo in, bool *first_segment) +{ + TransactionId xid; + + Assert(first_segment); + + xid = pq_getmsgint(in, 4); + *first_segment = (pq_getmsgbyte(in) == 1); + + return xid; +} + +/* + * Write the stop stream message to the output stream. + */ +void +logicalrep_write_stream_stop(StringInfo out) +{ + pq_sendbyte(out, LOGICAL_REP_MSG_STREAM_STOP); +} + +/* + * Write STREAM COMMIT to the output stream. + */ +void +logicalrep_write_stream_commit(StringInfo out, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn) +{ + uint8 flags = 0; + + pq_sendbyte(out, LOGICAL_REP_MSG_STREAM_COMMIT); + + Assert(TransactionIdIsValid(txn->xid)); + + /* transaction ID */ + pq_sendint32(out, txn->xid); + + /* send the flags field (unused for now) */ + pq_sendbyte(out, flags); + + /* send fields */ + pq_sendint64(out, commit_lsn); + pq_sendint64(out, txn->end_lsn); + pq_sendint64(out, txn->xact_time.commit_time); +} + +/* + * Read STREAM COMMIT from the output stream. + */ +TransactionId +logicalrep_read_stream_commit(StringInfo in, LogicalRepCommitData *commit_data) +{ + TransactionId xid; + uint8 flags; + + xid = pq_getmsgint(in, 4); + + /* read flags (unused for now) */ + flags = pq_getmsgbyte(in); + + if (flags != 0) + elog(ERROR, "unrecognized flags %u in commit message", flags); + + /* read fields */ + commit_data->commit_lsn = pq_getmsgint64(in); + commit_data->end_lsn = pq_getmsgint64(in); + commit_data->committime = pq_getmsgint64(in); + + return xid; +} + +/* + * Write STREAM ABORT to the output stream. Note that xid and subxid will be + * same for the top-level transaction abort. + * + * If write_abort_info is true, send the abort_lsn and abort_time fields, + * otherwise don't. + */ +void +logicalrep_write_stream_abort(StringInfo out, TransactionId xid, + TransactionId subxid, XLogRecPtr abort_lsn, + TimestampTz abort_time, bool write_abort_info) +{ + pq_sendbyte(out, LOGICAL_REP_MSG_STREAM_ABORT); + + Assert(TransactionIdIsValid(xid) && TransactionIdIsValid(subxid)); + + /* transaction ID */ + pq_sendint32(out, xid); + pq_sendint32(out, subxid); + + if (write_abort_info) + { + pq_sendint64(out, abort_lsn); + pq_sendint64(out, abort_time); + } +} + +/* + * Read STREAM ABORT from the output stream. + * + * If read_abort_info is true, read the abort_lsn and abort_time fields, + * otherwise don't. + */ +void +logicalrep_read_stream_abort(StringInfo in, + LogicalRepStreamAbortData *abort_data, + bool read_abort_info) +{ + Assert(abort_data); + + abort_data->xid = pq_getmsgint(in, 4); + abort_data->subxid = pq_getmsgint(in, 4); + + if (read_abort_info) + { + abort_data->abort_lsn = pq_getmsgint64(in); + abort_data->abort_time = pq_getmsgint64(in); + } + else + { + abort_data->abort_lsn = InvalidXLogRecPtr; + abort_data->abort_time = 0; + } +} + +/* + * Get string representing LogicalRepMsgType. 
+ */ +const char * +logicalrep_message_type(LogicalRepMsgType action) +{ + static __thread char err_unknown[20]; + + switch (action) + { + case LOGICAL_REP_MSG_BEGIN: + return "BEGIN"; + case LOGICAL_REP_MSG_COMMIT: + return "COMMIT"; + case LOGICAL_REP_MSG_ORIGIN: + return "ORIGIN"; + case LOGICAL_REP_MSG_INSERT: + return "INSERT"; + case LOGICAL_REP_MSG_UPDATE: + return "UPDATE"; + case LOGICAL_REP_MSG_DELETE: + return "DELETE"; + case LOGICAL_REP_MSG_TRUNCATE: + return "TRUNCATE"; + case LOGICAL_REP_MSG_RELATION: + return "RELATION"; + case LOGICAL_REP_MSG_TYPE: + return "TYPE"; + case LOGICAL_REP_MSG_MESSAGE: + return "MESSAGE"; + case LOGICAL_REP_MSG_BEGIN_PREPARE: + return "BEGIN PREPARE"; + case LOGICAL_REP_MSG_PREPARE: + return "PREPARE"; + case LOGICAL_REP_MSG_COMMIT_PREPARED: + return "COMMIT PREPARED"; + case LOGICAL_REP_MSG_ROLLBACK_PREPARED: + return "ROLLBACK PREPARED"; + case LOGICAL_REP_MSG_STREAM_START: + return "STREAM START"; + case LOGICAL_REP_MSG_STREAM_STOP: + return "STREAM STOP"; + case LOGICAL_REP_MSG_STREAM_COMMIT: + return "STREAM COMMIT"; + case LOGICAL_REP_MSG_STREAM_ABORT: + return "STREAM ABORT"; + case LOGICAL_REP_MSG_STREAM_PREPARE: + return "STREAM PREPARE"; + } + + /* + * This message provides context in the error raised when applying a + * logical message. So we can't throw an error here. Return an unknown + * indicator value so that the original error is still reported. + */ + snprintf(err_unknown, sizeof(err_unknown), "??? (%d)", action); + + return err_unknown; +} diff --git a/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/relation.c b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/relation.c new file mode 100644 index 00000000000..2d7ee69b8d3 --- /dev/null +++ b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/relation.c @@ -0,0 +1,889 @@ +/*------------------------------------------------------------------------- + * relation.c + * PostgreSQL logical replication relation mapping cache + * + * Copyright (c) 2016-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/relation.c + * + * NOTES + * Routines in this file mainly have to do with mapping the properties + * of local replication target relations to the properties of their + * remote counterpart. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/genam.h" +#include "access/table.h" +#include "catalog/namespace.h" +#include "catalog/pg_am_d.h" +#include "catalog/pg_subscription_rel.h" +#include "executor/executor.h" +#include "nodes/makefuncs.h" +#include "replication/logicalrelation.h" +#include "replication/worker_internal.h" +#include "utils/inval.h" + + +static __thread MemoryContext LogicalRepRelMapContext = NULL; + +static __thread HTAB *LogicalRepRelMap = NULL; + +/* + * Partition map (LogicalRepPartMap) + * + * When a partitioned table is used as replication target, replicated + * operations are actually performed on its leaf partitions, which requires + * the partitions to also be mapped to the remote relation. Parent's entry + * (LogicalRepRelMapEntry) cannot be used as-is for all partitions, because + * individual partitions may have different attribute numbers, which means + * attribute mappings to remote relation's attributes must be maintained + * separately for each partition. 
+ */ +static __thread MemoryContext LogicalRepPartMapContext = NULL; +static __thread HTAB *LogicalRepPartMap = NULL; +typedef struct LogicalRepPartMapEntry +{ + Oid partoid; /* LogicalRepPartMap's key */ + LogicalRepRelMapEntry relmapentry; +} LogicalRepPartMapEntry; + +static Oid FindLogicalRepLocalIndex(Relation localrel, LogicalRepRelation *remoterel, + AttrMap *attrMap); + +/* + * Relcache invalidation callback for our relation map cache. + */ +static void +logicalrep_relmap_invalidate_cb(Datum arg, Oid reloid) +{ + LogicalRepRelMapEntry *entry; + + /* Just to be sure. */ + if (LogicalRepRelMap == NULL) + return; + + if (reloid != InvalidOid) + { + HASH_SEQ_STATUS status; + + hash_seq_init(&status, LogicalRepRelMap); + + /* TODO, use inverse lookup hashtable? */ + while ((entry = (LogicalRepRelMapEntry *) hash_seq_search(&status)) != NULL) + { + if (entry->localreloid == reloid) + { + entry->localrelvalid = false; + hash_seq_term(&status); + break; + } + } + } + else + { + /* invalidate all cache entries */ + HASH_SEQ_STATUS status; + + hash_seq_init(&status, LogicalRepRelMap); + + while ((entry = (LogicalRepRelMapEntry *) hash_seq_search(&status)) != NULL) + entry->localrelvalid = false; + } +} + +/* + * Initialize the relation map cache. + */ +static void +logicalrep_relmap_init(void) +{ + HASHCTL ctl; + + if (!LogicalRepRelMapContext) + LogicalRepRelMapContext = + AllocSetContextCreate(CacheMemoryContext, + "LogicalRepRelMapContext", + ALLOCSET_DEFAULT_SIZES); + + /* Initialize the relation hash table. */ + ctl.keysize = sizeof(LogicalRepRelId); + ctl.entrysize = sizeof(LogicalRepRelMapEntry); + ctl.hcxt = LogicalRepRelMapContext; + + LogicalRepRelMap = hash_create("logicalrep relation map cache", 128, &ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + /* Watch for invalidation events. */ + CacheRegisterRelcacheCallback(logicalrep_relmap_invalidate_cb, + (Datum) 0); +} + +/* + * Free the entry of a relation map cache. + */ +static void +logicalrep_relmap_free_entry(LogicalRepRelMapEntry *entry) +{ + LogicalRepRelation *remoterel; + + remoterel = &entry->remoterel; + + pfree(remoterel->nspname); + pfree(remoterel->relname); + + if (remoterel->natts > 0) + { + int i; + + for (i = 0; i < remoterel->natts; i++) + pfree(remoterel->attnames[i]); + + pfree(remoterel->attnames); + pfree(remoterel->atttyps); + } + bms_free(remoterel->attkeys); + + if (entry->attrmap) + free_attrmap(entry->attrmap); +} + +/* + * Add new entry or update existing entry in the relation map cache. + * + * Called when new relation mapping is sent by the publisher to update + * our expected view of incoming data from said publisher. + */ +void +logicalrep_relmap_update(LogicalRepRelation *remoterel) +{ + MemoryContext oldctx; + LogicalRepRelMapEntry *entry; + bool found; + int i; + + if (LogicalRepRelMap == NULL) + logicalrep_relmap_init(); + + /* + * HASH_ENTER returns the existing entry if present or creates a new one. 
+ */ + entry = hash_search(LogicalRepRelMap, &remoterel->remoteid, + HASH_ENTER, &found); + + if (found) + logicalrep_relmap_free_entry(entry); + + memset(entry, 0, sizeof(LogicalRepRelMapEntry)); + + /* Make cached copy of the data */ + oldctx = MemoryContextSwitchTo(LogicalRepRelMapContext); + entry->remoterel.remoteid = remoterel->remoteid; + entry->remoterel.nspname = pstrdup(remoterel->nspname); + entry->remoterel.relname = pstrdup(remoterel->relname); + entry->remoterel.natts = remoterel->natts; + entry->remoterel.attnames = palloc(remoterel->natts * sizeof(char *)); + entry->remoterel.atttyps = palloc(remoterel->natts * sizeof(Oid)); + for (i = 0; i < remoterel->natts; i++) + { + entry->remoterel.attnames[i] = pstrdup(remoterel->attnames[i]); + entry->remoterel.atttyps[i] = remoterel->atttyps[i]; + } + entry->remoterel.replident = remoterel->replident; + entry->remoterel.attkeys = bms_copy(remoterel->attkeys); + MemoryContextSwitchTo(oldctx); +} + +/* + * Find attribute index in TupleDesc struct by attribute name. + * + * Returns -1 if not found. + */ +static int +logicalrep_rel_att_by_name(LogicalRepRelation *remoterel, const char *attname) +{ + int i; + + for (i = 0; i < remoterel->natts; i++) + { + if (strcmp(remoterel->attnames[i], attname) == 0) + return i; + } + + return -1; +} + +/* + * Report error with names of the missing local relation column(s), if any. + */ +static void +logicalrep_report_missing_attrs(LogicalRepRelation *remoterel, + Bitmapset *missingatts) +{ + if (!bms_is_empty(missingatts)) + { + StringInfoData missingattsbuf; + int missingattcnt = 0; + int i; + + initStringInfo(&missingattsbuf); + + i = -1; + while ((i = bms_next_member(missingatts, i)) >= 0) + { + missingattcnt++; + if (missingattcnt == 1) + appendStringInfo(&missingattsbuf, _("\"%s\""), + remoterel->attnames[i]); + else + appendStringInfo(&missingattsbuf, _(", \"%s\""), + remoterel->attnames[i]); + } + + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg_plural("logical replication target relation \"%s.%s\" is missing replicated column: %s", + "logical replication target relation \"%s.%s\" is missing replicated columns: %s", + missingattcnt, + remoterel->nspname, + remoterel->relname, + missingattsbuf.data))); + } +} + +/* + * Check if replica identity matches and mark the updatable flag. + * + * We allow for stricter replica identity (fewer columns) on subscriber as + * that will not stop us from finding unique tuple. IE, if publisher has + * identity (id,timestamp) and subscriber just (id) this will not be a + * problem, but in the opposite scenario it will. + * + * We just mark the relation entry as not updatable here if the local + * replica identity is found to be insufficient for applying + * updates/deletes (inserts don't care!) and leave it to + * check_relation_updatable() to throw the actual error if needed. + */ +static void +logicalrep_rel_mark_updatable(LogicalRepRelMapEntry *entry) +{ + Bitmapset *idkey; + LogicalRepRelation *remoterel = &entry->remoterel; + int i; + + entry->updatable = true; + + idkey = RelationGetIndexAttrBitmap(entry->localrel, + INDEX_ATTR_BITMAP_IDENTITY_KEY); + /* fallback to PK if no replica identity */ + if (idkey == NULL) + { + idkey = RelationGetIndexAttrBitmap(entry->localrel, + INDEX_ATTR_BITMAP_PRIMARY_KEY); + + /* + * If no replica identity index and no PK, the published table must + * have replica identity FULL. 
+ */ + if (idkey == NULL && remoterel->replident != REPLICA_IDENTITY_FULL) + entry->updatable = false; + } + + i = -1; + while ((i = bms_next_member(idkey, i)) >= 0) + { + int attnum = i + FirstLowInvalidHeapAttributeNumber; + + if (!AttrNumberIsForUserDefinedAttr(attnum)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication target relation \"%s.%s\" uses " + "system columns in REPLICA IDENTITY index", + remoterel->nspname, remoterel->relname))); + + attnum = AttrNumberGetAttrOffset(attnum); + + if (entry->attrmap->attnums[attnum] < 0 || + !bms_is_member(entry->attrmap->attnums[attnum], remoterel->attkeys)) + { + entry->updatable = false; + break; + } + } +} + +/* + * Open the local relation associated with the remote one. + * + * Rebuilds the Relcache mapping if it was invalidated by local DDL. + */ +LogicalRepRelMapEntry * +logicalrep_rel_open(LogicalRepRelId remoteid, LOCKMODE lockmode) +{ + LogicalRepRelMapEntry *entry; + bool found; + LogicalRepRelation *remoterel; + + if (LogicalRepRelMap == NULL) + logicalrep_relmap_init(); + + /* Search for existing entry. */ + entry = hash_search(LogicalRepRelMap, &remoteid, + HASH_FIND, &found); + + if (!found) + elog(ERROR, "no relation map entry for remote relation ID %u", + remoteid); + + remoterel = &entry->remoterel; + + /* Ensure we don't leak a relcache refcount. */ + if (entry->localrel) + elog(ERROR, "remote relation ID %u is already open", remoteid); + + /* + * When opening and locking a relation, pending invalidation messages are + * processed which can invalidate the relation. Hence, if the entry is + * currently considered valid, try to open the local relation by OID and + * see if invalidation ensues. + */ + if (entry->localrelvalid) + { + entry->localrel = try_table_open(entry->localreloid, lockmode); + if (!entry->localrel) + { + /* Table was renamed or dropped. */ + entry->localrelvalid = false; + } + else if (!entry->localrelvalid) + { + /* Note we release the no-longer-useful lock here. */ + table_close(entry->localrel, lockmode); + entry->localrel = NULL; + } + } + + /* + * If the entry has been marked invalid since we last had lock on it, + * re-open the local relation by name and rebuild all derived data. + */ + if (!entry->localrelvalid) + { + Oid relid; + TupleDesc desc; + MemoryContext oldctx; + int i; + Bitmapset *missingatts; + + /* Release the no-longer-useful attrmap, if any. */ + if (entry->attrmap) + { + free_attrmap(entry->attrmap); + entry->attrmap = NULL; + } + + /* Try to find and lock the relation by name. */ + relid = RangeVarGetRelid(makeRangeVar(remoterel->nspname, + remoterel->relname, -1), + lockmode, true); + if (!OidIsValid(relid)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication target relation \"%s.%s\" does not exist", + remoterel->nspname, remoterel->relname))); + entry->localrel = table_open(relid, NoLock); + entry->localreloid = relid; + + /* Check for supported relkind. */ + CheckSubscriptionRelkind(entry->localrel->rd_rel->relkind, + remoterel->nspname, remoterel->relname); + + /* + * Build the mapping of local attribute numbers to remote attribute + * numbers and validate that we don't miss any replicated columns as + * that would result in potentially unwanted data loss. 
+ */ + desc = RelationGetDescr(entry->localrel); + oldctx = MemoryContextSwitchTo(LogicalRepRelMapContext); + entry->attrmap = make_attrmap(desc->natts); + MemoryContextSwitchTo(oldctx); + + /* check and report missing attrs, if any */ + missingatts = bms_add_range(NULL, 0, remoterel->natts - 1); + for (i = 0; i < desc->natts; i++) + { + int attnum; + Form_pg_attribute attr = TupleDescAttr(desc, i); + + if (attr->attisdropped || attr->attgenerated) + { + entry->attrmap->attnums[i] = -1; + continue; + } + + attnum = logicalrep_rel_att_by_name(remoterel, + NameStr(attr->attname)); + + entry->attrmap->attnums[i] = attnum; + if (attnum >= 0) + missingatts = bms_del_member(missingatts, attnum); + } + + logicalrep_report_missing_attrs(remoterel, missingatts); + + /* be tidy */ + bms_free(missingatts); + + /* + * Set if the table's replica identity is enough to apply + * update/delete. + */ + logicalrep_rel_mark_updatable(entry); + + /* + * Finding a usable index is an infrequent task. It occurs when an + * operation is first performed on the relation, or after invalidation + * of the relation cache entry (such as ANALYZE or CREATE/DROP index + * on the relation). + */ + entry->localindexoid = FindLogicalRepLocalIndex(entry->localrel, remoterel, + entry->attrmap); + + entry->localrelvalid = true; + } + + if (entry->state != SUBREL_STATE_READY) + entry->state = GetSubscriptionRelState(MySubscription->oid, + entry->localreloid, + &entry->statelsn); + + return entry; +} + +/* + * Close the previously opened logical relation. + */ +void +logicalrep_rel_close(LogicalRepRelMapEntry *rel, LOCKMODE lockmode) +{ + table_close(rel->localrel, lockmode); + rel->localrel = NULL; +} + +/* + * Partition cache: look up partition LogicalRepRelMapEntry's + * + * Unlike relation map cache, this is keyed by partition OID, not remote + * relation OID, because we only have to use this cache in the case where + * partitions are not directly mapped to any remote relation, such as when + * replication is occurring with one of their ancestors as target. + */ + +/* + * Relcache invalidation callback + */ +static void +logicalrep_partmap_invalidate_cb(Datum arg, Oid reloid) +{ + LogicalRepPartMapEntry *entry; + + /* Just to be sure. */ + if (LogicalRepPartMap == NULL) + return; + + if (reloid != InvalidOid) + { + HASH_SEQ_STATUS status; + + hash_seq_init(&status, LogicalRepPartMap); + + /* TODO, use inverse lookup hashtable? */ + while ((entry = (LogicalRepPartMapEntry *) hash_seq_search(&status)) != NULL) + { + if (entry->relmapentry.localreloid == reloid) + { + entry->relmapentry.localrelvalid = false; + hash_seq_term(&status); + break; + } + } + } + else + { + /* invalidate all cache entries */ + HASH_SEQ_STATUS status; + + hash_seq_init(&status, LogicalRepPartMap); + + while ((entry = (LogicalRepPartMapEntry *) hash_seq_search(&status)) != NULL) + entry->relmapentry.localrelvalid = false; + } +} + +/* + * Reset the entries in the partition map that refer to remoterel. + * + * Called when new relation mapping is sent by the publisher to update our + * expected view of incoming data from said publisher. + * + * Note that we don't update the remoterel information in the entry here, + * we will update the information in logicalrep_partition_open to avoid + * unnecessary work. 
+ */ +void +logicalrep_partmap_reset_relmap(LogicalRepRelation *remoterel) +{ + HASH_SEQ_STATUS status; + LogicalRepPartMapEntry *part_entry; + LogicalRepRelMapEntry *entry; + + if (LogicalRepPartMap == NULL) + return; + + hash_seq_init(&status, LogicalRepPartMap); + while ((part_entry = (LogicalRepPartMapEntry *) hash_seq_search(&status)) != NULL) + { + entry = &part_entry->relmapentry; + + if (entry->remoterel.remoteid != remoterel->remoteid) + continue; + + logicalrep_relmap_free_entry(entry); + + memset(entry, 0, sizeof(LogicalRepRelMapEntry)); + } +} + +/* + * Initialize the partition map cache. + */ +static void +logicalrep_partmap_init(void) +{ + HASHCTL ctl; + + if (!LogicalRepPartMapContext) + LogicalRepPartMapContext = + AllocSetContextCreate(CacheMemoryContext, + "LogicalRepPartMapContext", + ALLOCSET_DEFAULT_SIZES); + + /* Initialize the relation hash table. */ + ctl.keysize = sizeof(Oid); /* partition OID */ + ctl.entrysize = sizeof(LogicalRepPartMapEntry); + ctl.hcxt = LogicalRepPartMapContext; + + LogicalRepPartMap = hash_create("logicalrep partition map cache", 64, &ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + /* Watch for invalidation events. */ + CacheRegisterRelcacheCallback(logicalrep_partmap_invalidate_cb, + (Datum) 0); +} + +/* + * logicalrep_partition_open + * + * Returned entry reuses most of the values of the root table's entry, save + * the attribute map, which can be different for the partition. However, + * we must physically copy all the data, in case the root table's entry + * gets freed/rebuilt. + * + * Note there's no logicalrep_partition_close, because the caller closes the + * component relation. + */ +LogicalRepRelMapEntry * +logicalrep_partition_open(LogicalRepRelMapEntry *root, + Relation partrel, AttrMap *map) +{ + LogicalRepRelMapEntry *entry; + LogicalRepPartMapEntry *part_entry; + LogicalRepRelation *remoterel = &root->remoterel; + Oid partOid = RelationGetRelid(partrel); + AttrMap *attrmap = root->attrmap; + bool found; + MemoryContext oldctx; + + if (LogicalRepPartMap == NULL) + logicalrep_partmap_init(); + + /* Search for existing entry. */ + part_entry = (LogicalRepPartMapEntry *) hash_search(LogicalRepPartMap, + &partOid, + HASH_ENTER, &found); + + entry = &part_entry->relmapentry; + + /* + * We must always overwrite entry->localrel with the latest partition + * Relation pointer, because the Relation pointed to by the old value may + * have been cleared after the caller would have closed the partition + * relation after the last use of this entry. Note that localrelvalid is + * only updated by the relcache invalidation callback, so it may still be + * true irrespective of whether the Relation pointed to by localrel has + * been cleared or not. + */ + if (found && entry->localrelvalid) + { + entry->localrel = partrel; + return entry; + } + + /* Switch to longer-lived context. */ + oldctx = MemoryContextSwitchTo(LogicalRepPartMapContext); + + if (!found) + { + memset(part_entry, 0, sizeof(LogicalRepPartMapEntry)); + part_entry->partoid = partOid; + } + + /* Release the no-longer-useful attrmap, if any. */ + if (entry->attrmap) + { + free_attrmap(entry->attrmap); + entry->attrmap = NULL; + } + + if (!entry->remoterel.remoteid) + { + int i; + + /* Remote relation is copied as-is from the root entry. 
*/ + entry = &part_entry->relmapentry; + entry->remoterel.remoteid = remoterel->remoteid; + entry->remoterel.nspname = pstrdup(remoterel->nspname); + entry->remoterel.relname = pstrdup(remoterel->relname); + entry->remoterel.natts = remoterel->natts; + entry->remoterel.attnames = palloc(remoterel->natts * sizeof(char *)); + entry->remoterel.atttyps = palloc(remoterel->natts * sizeof(Oid)); + for (i = 0; i < remoterel->natts; i++) + { + entry->remoterel.attnames[i] = pstrdup(remoterel->attnames[i]); + entry->remoterel.atttyps[i] = remoterel->atttyps[i]; + } + entry->remoterel.replident = remoterel->replident; + entry->remoterel.attkeys = bms_copy(remoterel->attkeys); + } + + entry->localrel = partrel; + entry->localreloid = partOid; + + /* + * If the partition's attributes don't match the root relation's, we'll + * need to make a new attrmap which maps partition attribute numbers to + * remoterel's, instead of the original which maps root relation's + * attribute numbers to remoterel's. + * + * Note that 'map' which comes from the tuple routing data structure + * contains 1-based attribute numbers (of the parent relation). However, + * the map in 'entry', a logical replication data structure, contains + * 0-based attribute numbers (of the remote relation). + */ + if (map) + { + AttrNumber attno; + + entry->attrmap = make_attrmap(map->maplen); + for (attno = 0; attno < entry->attrmap->maplen; attno++) + { + AttrNumber root_attno = map->attnums[attno]; + + /* 0 means it's a dropped attribute. See comments atop AttrMap. */ + if (root_attno == 0) + entry->attrmap->attnums[attno] = -1; + else + entry->attrmap->attnums[attno] = attrmap->attnums[root_attno - 1]; + } + } + else + { + /* Lacking copy_attmap, do this the hard way. */ + entry->attrmap = make_attrmap(attrmap->maplen); + memcpy(entry->attrmap->attnums, attrmap->attnums, + attrmap->maplen * sizeof(AttrNumber)); + } + + /* Set if the table's replica identity is enough to apply update/delete. */ + logicalrep_rel_mark_updatable(entry); + + /* state and statelsn are left set to 0. */ + MemoryContextSwitchTo(oldctx); + + /* + * Finding a usable index is an infrequent task. It occurs when an + * operation is first performed on the relation, or after invalidation of + * the relation cache entry (such as ANALYZE or CREATE/DROP index on the + * relation). + * + * We also prefer to run this code on the oldctx so that we do not leak + * anything in the LogicalRepPartMapContext (hence CacheMemoryContext). + */ + entry->localindexoid = FindLogicalRepLocalIndex(partrel, remoterel, + entry->attrmap); + + entry->localrelvalid = true; + + return entry; +} + +/* + * Returns the oid of an index that can be used by the apply worker to scan + * the relation. + * + * We expect to call this function when REPLICA IDENTITY FULL is defined for + * the remote relation. + * + * If no suitable index is found, returns InvalidOid. 
+ */ +static Oid +FindUsableIndexForReplicaIdentityFull(Relation localrel, AttrMap *attrmap) +{ + List *idxlist = RelationGetIndexList(localrel); + ListCell *lc; + + foreach(lc, idxlist) + { + Oid idxoid = lfirst_oid(lc); + bool isUsableIdx; + Relation idxRel; + IndexInfo *idxInfo; + + idxRel = index_open(idxoid, AccessShareLock); + idxInfo = BuildIndexInfo(idxRel); + isUsableIdx = IsIndexUsableForReplicaIdentityFull(idxInfo, attrmap); + index_close(idxRel, AccessShareLock); + + /* Return the first eligible index found */ + if (isUsableIdx) + return idxoid; + } + + return InvalidOid; +} + +/* + * Returns true if the index is usable for replica identity full. + * + * The index must be btree, non-partial, and the leftmost field must be a + * column (not an expression) that references the remote relation column. + * These limitations help to keep the index scan similar to PK/RI index + * scans. + * + * attrmap is a map of local attributes to remote ones. We can consult this + * map to check whether the local index attribute has a corresponding remote + * attribute. + * + * Note that the limitations of index scans for replica identity full only + * adheres to a subset of the limitations of PK/RI. For example, we support + * columns that are marked as [NULL] or we are not interested in the [NOT + * DEFERRABLE] aspect of constraints here. It works for us because we always + * compare the tuples for non-PK/RI index scans. See + * RelationFindReplTupleByIndex(). + * + * XXX: There are no fundamental problems for supporting non-btree indexes. + * We mostly need to relax the limitations in RelationFindReplTupleByIndex(). + * For partial indexes, the required changes are likely to be larger. If + * none of the tuples satisfy the expression for the index scan, we fall-back + * to sequential execution, which might not be a good idea in some cases. + */ +bool +IsIndexUsableForReplicaIdentityFull(IndexInfo *indexInfo, AttrMap *attrmap) +{ + AttrNumber keycol; + + /* The index must be a Btree index */ + if (indexInfo->ii_Am != BTREE_AM_OID) + return false; + + /* The index must not be a partial index */ + if (indexInfo->ii_Predicate != NIL) + return false; + + Assert(indexInfo->ii_NumIndexAttrs >= 1); + + /* The leftmost index field must not be an expression */ + keycol = indexInfo->ii_IndexAttrNumbers[0]; + if (!AttributeNumberIsValid(keycol)) + return false; + + /* + * And the leftmost index field must reference the remote relation column. + * This is because if it doesn't, the sequential scan is favorable over + * index scan in most cases. + */ + if (attrmap->maplen <= AttrNumberGetAttrOffset(keycol) || + attrmap->attnums[AttrNumberGetAttrOffset(keycol)] < 0) + return false; + + return true; +} + +/* + * Get replica identity index or if it is not defined a primary key. + * + * If neither is defined, returns InvalidOid + */ +Oid +GetRelationIdentityOrPK(Relation rel) +{ + Oid idxoid; + + idxoid = RelationGetReplicaIndex(rel); + + if (!OidIsValid(idxoid)) + idxoid = RelationGetPrimaryKeyIndex(rel); + + return idxoid; +} + +/* + * Returns the index oid if we can use an index for subscriber. Otherwise, + * returns InvalidOid. + */ +static Oid +FindLogicalRepLocalIndex(Relation localrel, LogicalRepRelation *remoterel, + AttrMap *attrMap) +{ + Oid idxoid; + + /* + * We never need index oid for partitioned tables, always rely on leaf + * partition's index. 
+ */ + if (localrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + return InvalidOid; + + /* + * Simple case, we already have a primary key or a replica identity index. + */ + idxoid = GetRelationIdentityOrPK(localrel); + if (OidIsValid(idxoid)) + return idxoid; + + if (remoterel->replident == REPLICA_IDENTITY_FULL) + { + /* + * We are looking for one more opportunity for using an index. If + * there are any indexes defined on the local relation, try to pick a + * suitable index. + * + * The index selection safely assumes that all the columns are going + * to be available for the index scan given that remote relation has + * replica identity full. + * + * Note that we are not using the planner to find the cheapest method + * to scan the relation as that would require us to either use lower + * level planner functions which would be a maintenance burden in the + * long run or use the full-fledged planner which could cause + * overhead. + */ + return FindUsableIndexForReplicaIdentityFull(localrel, attrMap); + } + + return InvalidOid; +} diff --git a/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/reorderbuffer.c b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/reorderbuffer.c new file mode 100644 index 00000000000..5d3df2c6f1a --- /dev/null +++ b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/reorderbuffer.c @@ -0,0 +1,5280 @@ +/*------------------------------------------------------------------------- + * + * reorderbuffer.c + * PostgreSQL logical replay/reorder buffer management + * + * + * Copyright (c) 2012-2023, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/replication/logical/reorderbuffer.c + * + * NOTES + * This module gets handed individual pieces of transactions in the order + * they are written to the WAL and is responsible to reassemble them into + * toplevel transaction sized pieces. When a transaction is completely + * reassembled - signaled by reading the transaction commit record - it + * will then call the output plugin (cf. ReorderBufferCommit()) with the + * individual changes. The output plugins rely on snapshots built by + * snapbuild.c which hands them to us. + * + * Transactions and subtransactions/savepoints in postgres are not + * immediately linked to each other from outside the performing + * backend. Only at commit/abort (or special xact_assignment records) they + * are linked together. Which means that we will have to splice together a + * toplevel transaction from its subtransactions. To do that efficiently we + * build a binary heap indexed by the smallest current lsn of the individual + * subtransactions' changestreams. As the individual streams are inherently + * ordered by LSN - since that is where we build them from - the transaction + * can easily be reassembled by always using the subtransaction with the + * smallest current LSN from the heap. + * + * In order to cope with large transactions - which can be several times as + * big as the available memory - this module supports spooling the contents + * of a large transactions to disk. When the transaction is replayed the + * contents of individual (sub-)transactions will be read from disk in + * chunks. + * + * This module also has to deal with reassembling toast records from the + * individual chunks stored in WAL. When a new (or initial) version of a + * tuple is stored in WAL it will always be preceded by the toast chunks + * emitted for the columns stored out of line. 
Within a single toplevel + * transaction there will be no other data carrying records between a row's + * toast chunks and the row data itself. See ReorderBufferToast* for + * details. + * + * ReorderBuffer uses two special memory context types - SlabContext for + * allocations of fixed-length structures (changes and transactions), and + * GenerationContext for the variable-length transaction data (allocated + * and freed in groups with similar lifespans). + * + * To limit the amount of memory used by decoded changes, we track memory + * used at the reorder buffer level (i.e. total amount of memory), and for + * each transaction. When the total amount of used memory exceeds the + * limit, the transaction consuming the most memory is then serialized to + * disk. + * + * Only decoded changes are evicted from memory (spilled to disk), not the + * transaction records. The number of toplevel transactions is limited, + * but a transaction with many subtransactions may still consume significant + * amounts of memory. However, the transaction records are fairly small and + * are not included in the memory limit. + * + * The current eviction algorithm is very simple - the transaction is + * picked merely by size, while it might be useful to also consider age + * (LSN) of the changes for example. With the new Generational memory + * allocator, evicting the oldest changes would make it more likely the + * memory gets actually freed. + * + * We still rely on max_changes_in_memory when loading serialized changes + * back into memory. At that point we can't use the memory limit directly + * as we load the subxacts independently. One option to deal with this + * would be to count the subxacts, and allow each to allocate 1/N of the + * memory limit. That however does not seem very appealing, because with + * many subtransactions it may easily cause thrashing (short cycles of + * deserializing and applying very few changes). We probably should give + * a bit more memory to the oldest subtransactions, because it's likely + * they are the source for the next sequence of changes. 
+ * + * ------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <unistd.h> +#include <sys/stat.h> + +#include "access/detoast.h" +#include "access/heapam.h" +#include "access/rewriteheap.h" +#include "access/transam.h" +#include "access/xact.h" +#include "access/xlog_internal.h" +#include "catalog/catalog.h" +#include "lib/binaryheap.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "replication/logical.h" +#include "replication/reorderbuffer.h" +#include "replication/slot.h" +#include "replication/snapbuild.h" /* just for SnapBuildSnapDecRefcount */ +#include "storage/bufmgr.h" +#include "storage/fd.h" +#include "storage/sinval.h" +#include "utils/builtins.h" +#include "utils/combocid.h" +#include "utils/memdebug.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/relfilenumbermap.h" + + +/* entry for a hash table we use to map from xid to our transaction state */ +typedef struct ReorderBufferTXNByIdEnt +{ + TransactionId xid; + ReorderBufferTXN *txn; +} ReorderBufferTXNByIdEnt; + +/* data structures for (relfilelocator, ctid) => (cmin, cmax) mapping */ +typedef struct ReorderBufferTupleCidKey +{ + RelFileLocator rlocator; + ItemPointerData tid; +} ReorderBufferTupleCidKey; + +typedef struct ReorderBufferTupleCidEnt +{ + ReorderBufferTupleCidKey key; + CommandId cmin; + CommandId cmax; + CommandId combocid; /* just for debugging */ +} ReorderBufferTupleCidEnt; + +/* Virtual file descriptor with file offset tracking */ +typedef struct TXNEntryFile +{ + File vfd; /* -1 when the file is closed */ + off_t curOffset; /* offset for next write or read. Reset to 0 + * when vfd is opened. */ +} TXNEntryFile; + +/* k-way in-order change iteration support structures */ +typedef struct ReorderBufferIterTXNEntry +{ + XLogRecPtr lsn; + ReorderBufferChange *change; + ReorderBufferTXN *txn; + TXNEntryFile file; + XLogSegNo segno; +} ReorderBufferIterTXNEntry; + +typedef struct ReorderBufferIterTXNState +{ + binaryheap *heap; + Size nr_txns; + dlist_head old_change; + ReorderBufferIterTXNEntry entries[FLEXIBLE_ARRAY_MEMBER]; +} ReorderBufferIterTXNState; + +/* toast datastructures */ +typedef struct ReorderBufferToastEnt +{ + Oid chunk_id; /* toast_table.chunk_id */ + int32 last_chunk_seq; /* toast_table.chunk_seq of the last chunk we + * have seen */ + Size num_chunks; /* number of chunks we've already seen */ + Size size; /* combined size of chunks seen */ + dlist_head chunks; /* linked list of chunks */ + struct varlena *reconstructed; /* reconstructed varlena now pointed to in + * main tup */ +} ReorderBufferToastEnt; + +/* Disk serialization support datastructures */ +typedef struct ReorderBufferDiskChange +{ + Size size; + ReorderBufferChange change; + /* data follows */ +} ReorderBufferDiskChange; + +#define IsSpecInsert(action) \ +( \ + ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT) \ +) +#define IsSpecConfirmOrAbort(action) \ +( \ + (((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM) || \ + ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT)) \ +) +#define IsInsertOrUpdate(action) \ +( \ + (((action) == REORDER_BUFFER_CHANGE_INSERT) || \ + ((action) == REORDER_BUFFER_CHANGE_UPDATE) || \ + ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)) \ +) + +/* + * Maximum number of changes kept in memory, per transaction. After that, + * changes are spooled to disk. 
+ * + * The current value should be sufficient to decode the entire transaction + * without hitting disk in OLTP workloads, while starting to spool to disk in + * other workloads reasonably fast. + * + * At some point in the future it probably makes sense to have a more elaborate + * resource management here, but it's not entirely clear what that would look + * like. + */ +__thread int logical_decoding_work_mem; +static const Size max_changes_in_memory = 4096; /* XXX for restore only */ + +/* GUC variable */ +__thread int debug_logical_replication_streaming = DEBUG_LOGICAL_REP_STREAMING_BUFFERED; + +/* --------------------------------------- + * primary reorderbuffer support routines + * --------------------------------------- + */ +static ReorderBufferTXN *ReorderBufferGetTXN(ReorderBuffer *rb); +static void ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn); +static ReorderBufferTXN *ReorderBufferTXNByXid(ReorderBuffer *rb, + TransactionId xid, bool create, bool *is_new, + XLogRecPtr lsn, bool create_as_top); +static void ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn, + ReorderBufferTXN *subtxn); + +static void AssertTXNLsnOrder(ReorderBuffer *rb); + +/* --------------------------------------- + * support functions for lsn-order iterating over the ->changes of a + * transaction and its subtransactions + * + * used for iteration over the k-way heap merge of a transaction and its + * subtransactions + * --------------------------------------- + */ +static void ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn, + ReorderBufferIterTXNState *volatile *iter_state); +static ReorderBufferChange *ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state); +static void ReorderBufferIterTXNFinish(ReorderBuffer *rb, + ReorderBufferIterTXNState *state); +static void ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs); + +/* + * --------------------------------------- + * Disk serialization support functions + * --------------------------------------- + */ +static void ReorderBufferCheckMemoryLimit(ReorderBuffer *rb); +static void ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn, + int fd, ReorderBufferChange *change); +static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn, + TXNEntryFile *file, XLogSegNo *segno); +static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn, + char *data); +static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, + bool txn_prepared); +static void ReorderBufferCleanupSerializedTXNs(const char *slotname); +static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot, + TransactionId xid, XLogSegNo segno); + +static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap); +static Snapshot ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap, + ReorderBufferTXN *txn, CommandId cid); + +/* + * --------------------------------------- + * Streaming support functions + * --------------------------------------- + */ +static inline bool ReorderBufferCanStream(ReorderBuffer *rb); +static inline bool ReorderBufferCanStartStreaming(ReorderBuffer *rb); +static void ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn); + +/* 
--------------------------------------- + * toast reassembly support + * --------------------------------------- + */ +static void ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change); +static void ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change); + +/* + * --------------------------------------- + * memory accounting + * --------------------------------------- + */ +static Size ReorderBufferChangeSize(ReorderBufferChange *change); +static void ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb, + ReorderBufferChange *change, + bool addition, Size sz); + +/* + * Allocate a new ReorderBuffer and clean out any old serialized state from + * prior ReorderBuffer instances for the same slot. + */ +ReorderBuffer * +ReorderBufferAllocate(void) +{ + ReorderBuffer *buffer; + HASHCTL hash_ctl; + MemoryContext new_ctx; + + Assert(MyReplicationSlot != NULL); + + /* allocate memory in own context, to have better accountability */ + new_ctx = AllocSetContextCreate(CurrentMemoryContext, + "ReorderBuffer", + ALLOCSET_DEFAULT_SIZES); + + buffer = + (ReorderBuffer *) MemoryContextAlloc(new_ctx, sizeof(ReorderBuffer)); + + memset(&hash_ctl, 0, sizeof(hash_ctl)); + + buffer->context = new_ctx; + + buffer->change_context = SlabContextCreate(new_ctx, + "Change", + SLAB_DEFAULT_BLOCK_SIZE, + sizeof(ReorderBufferChange)); + + buffer->txn_context = SlabContextCreate(new_ctx, + "TXN", + SLAB_DEFAULT_BLOCK_SIZE, + sizeof(ReorderBufferTXN)); + + /* + * XXX the allocation sizes used below pre-date generation context's block + * growing code. These values should likely be benchmarked and set to + * more suitable values. + */ + buffer->tup_context = GenerationContextCreate(new_ctx, + "Tuples", + SLAB_LARGE_BLOCK_SIZE, + SLAB_LARGE_BLOCK_SIZE, + SLAB_LARGE_BLOCK_SIZE); + + hash_ctl.keysize = sizeof(TransactionId); + hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt); + hash_ctl.hcxt = buffer->context; + + buffer->by_txn = hash_create("ReorderBufferByXid", 1000, &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + buffer->by_txn_last_xid = InvalidTransactionId; + buffer->by_txn_last_txn = NULL; + + buffer->outbuf = NULL; + buffer->outbufsize = 0; + buffer->size = 0; + + buffer->spillTxns = 0; + buffer->spillCount = 0; + buffer->spillBytes = 0; + buffer->streamTxns = 0; + buffer->streamCount = 0; + buffer->streamBytes = 0; + buffer->totalTxns = 0; + buffer->totalBytes = 0; + + buffer->current_restart_decoding_lsn = InvalidXLogRecPtr; + + dlist_init(&buffer->toplevel_by_lsn); + dlist_init(&buffer->txns_by_base_snapshot_lsn); + dclist_init(&buffer->catchange_txns); + + /* + * Ensure there's no stale data from prior uses of this slot, in case some + * prior exit avoided calling ReorderBufferFree. Failure to do this can + * produce duplicated txns, and it's very cheap if there's nothing there. + */ + ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name)); + + return buffer; +} + +/* + * Free a ReorderBuffer + */ +void +ReorderBufferFree(ReorderBuffer *rb) +{ + MemoryContext context = rb->context; + + /* + * We free separately allocated data by entirely scrapping reorderbuffer's + * memory context. 
+ */ + MemoryContextDelete(context); + + /* Free disk space used by unconsumed reorder buffers */ + ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name)); +} + +/* + * Get an unused, possibly preallocated, ReorderBufferTXN. + */ +static ReorderBufferTXN * +ReorderBufferGetTXN(ReorderBuffer *rb) +{ + ReorderBufferTXN *txn; + + txn = (ReorderBufferTXN *) + MemoryContextAlloc(rb->txn_context, sizeof(ReorderBufferTXN)); + + memset(txn, 0, sizeof(ReorderBufferTXN)); + + dlist_init(&txn->changes); + dlist_init(&txn->tuplecids); + dlist_init(&txn->subtxns); + + /* InvalidCommandId is not zero, so set it explicitly */ + txn->command_id = InvalidCommandId; + txn->output_plugin_private = NULL; + + return txn; +} + +/* + * Free a ReorderBufferTXN. + */ +static void +ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + /* clean the lookup cache if we were cached (quite likely) */ + if (rb->by_txn_last_xid == txn->xid) + { + rb->by_txn_last_xid = InvalidTransactionId; + rb->by_txn_last_txn = NULL; + } + + /* free data that's contained */ + + if (txn->gid != NULL) + { + pfree(txn->gid); + txn->gid = NULL; + } + + if (txn->tuplecid_hash != NULL) + { + hash_destroy(txn->tuplecid_hash); + txn->tuplecid_hash = NULL; + } + + if (txn->invalidations) + { + pfree(txn->invalidations); + txn->invalidations = NULL; + } + + /* Reset the toast hash */ + ReorderBufferToastReset(rb, txn); + + pfree(txn); +} + +/* + * Get a fresh ReorderBufferChange. + */ +ReorderBufferChange * +ReorderBufferGetChange(ReorderBuffer *rb) +{ + ReorderBufferChange *change; + + change = (ReorderBufferChange *) + MemoryContextAlloc(rb->change_context, sizeof(ReorderBufferChange)); + + memset(change, 0, sizeof(ReorderBufferChange)); + return change; +} + +/* + * Free a ReorderBufferChange and update memory accounting, if requested. 
+ */ +void +ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change, + bool upd_mem) +{ + /* update memory accounting info */ + if (upd_mem) + ReorderBufferChangeMemoryUpdate(rb, change, false, + ReorderBufferChangeSize(change)); + + /* free contained data */ + switch (change->action) + { + case REORDER_BUFFER_CHANGE_INSERT: + case REORDER_BUFFER_CHANGE_UPDATE: + case REORDER_BUFFER_CHANGE_DELETE: + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT: + if (change->data.tp.newtuple) + { + ReorderBufferReturnTupleBuf(rb, change->data.tp.newtuple); + change->data.tp.newtuple = NULL; + } + + if (change->data.tp.oldtuple) + { + ReorderBufferReturnTupleBuf(rb, change->data.tp.oldtuple); + change->data.tp.oldtuple = NULL; + } + break; + case REORDER_BUFFER_CHANGE_MESSAGE: + if (change->data.msg.prefix != NULL) + pfree(change->data.msg.prefix); + change->data.msg.prefix = NULL; + if (change->data.msg.message != NULL) + pfree(change->data.msg.message); + change->data.msg.message = NULL; + break; + case REORDER_BUFFER_CHANGE_INVALIDATION: + if (change->data.inval.invalidations) + pfree(change->data.inval.invalidations); + change->data.inval.invalidations = NULL; + break; + case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: + if (change->data.snapshot) + { + ReorderBufferFreeSnap(rb, change->data.snapshot); + change->data.snapshot = NULL; + } + break; + /* no data in addition to the struct itself */ + case REORDER_BUFFER_CHANGE_TRUNCATE: + if (change->data.truncate.relids != NULL) + { + ReorderBufferReturnRelids(rb, change->data.truncate.relids); + change->data.truncate.relids = NULL; + } + break; + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM: + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT: + case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID: + case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID: + break; + } + + pfree(change); +} + +/* + * Get a fresh ReorderBufferTupleBuf fitting at least a tuple of size + * tuple_len (excluding header overhead). + */ +ReorderBufferTupleBuf * +ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len) +{ + ReorderBufferTupleBuf *tuple; + Size alloc_len; + + alloc_len = tuple_len + SizeofHeapTupleHeader; + + tuple = (ReorderBufferTupleBuf *) + MemoryContextAlloc(rb->tup_context, + sizeof(ReorderBufferTupleBuf) + + MAXIMUM_ALIGNOF + alloc_len); + tuple->alloc_tuple_size = alloc_len; + tuple->tuple.t_data = ReorderBufferTupleBufData(tuple); + + return tuple; +} + +/* + * Free a ReorderBufferTupleBuf. + */ +void +ReorderBufferReturnTupleBuf(ReorderBuffer *rb, ReorderBufferTupleBuf *tuple) +{ + pfree(tuple); +} + +/* + * Get an array for relids of truncated relations. + * + * We use the global memory context (for the whole reorder buffer), because + * none of the existing ones seems like a good match (some are SLAB, so we + * can't use those, and tup_context is meant for tuple data, not relids). We + * could add yet another context, but it seems like an overkill - TRUNCATE is + * not particularly common operation, so it does not seem worth it. + */ +Oid * +ReorderBufferGetRelids(ReorderBuffer *rb, int nrelids) +{ + Oid *relids; + Size alloc_len; + + alloc_len = sizeof(Oid) * nrelids; + + relids = (Oid *) MemoryContextAlloc(rb->context, alloc_len); + + return relids; +} + +/* + * Free an array of relids. + */ +void +ReorderBufferReturnRelids(ReorderBuffer *rb, Oid *relids) +{ + pfree(relids); +} + +/* + * Return the ReorderBufferTXN from the given buffer, specified by Xid. 
+ * If create is true, and a transaction doesn't already exist, create it + * (with the given LSN, and as top transaction if that's specified); + * when this happens, is_new is set to true. + */ +static ReorderBufferTXN * +ReorderBufferTXNByXid(ReorderBuffer *rb, TransactionId xid, bool create, + bool *is_new, XLogRecPtr lsn, bool create_as_top) +{ + ReorderBufferTXN *txn; + ReorderBufferTXNByIdEnt *ent; + bool found; + + Assert(TransactionIdIsValid(xid)); + + /* + * Check the one-entry lookup cache first + */ + if (TransactionIdIsValid(rb->by_txn_last_xid) && + rb->by_txn_last_xid == xid) + { + txn = rb->by_txn_last_txn; + + if (txn != NULL) + { + /* found it, and it's valid */ + if (is_new) + *is_new = false; + return txn; + } + + /* + * cached as non-existent, and asked not to create? Then nothing else + * to do. + */ + if (!create) + return NULL; + /* otherwise fall through to create it */ + } + + /* + * If the cache wasn't hit or it yielded a "does-not-exist" and we want to + * create an entry. + */ + + /* search the lookup table */ + ent = (ReorderBufferTXNByIdEnt *) + hash_search(rb->by_txn, + &xid, + create ? HASH_ENTER : HASH_FIND, + &found); + if (found) + txn = ent->txn; + else if (create) + { + /* initialize the new entry, if creation was requested */ + Assert(ent != NULL); + Assert(lsn != InvalidXLogRecPtr); + + ent->txn = ReorderBufferGetTXN(rb); + ent->txn->xid = xid; + txn = ent->txn; + txn->first_lsn = lsn; + txn->restart_decoding_lsn = rb->current_restart_decoding_lsn; + + if (create_as_top) + { + dlist_push_tail(&rb->toplevel_by_lsn, &txn->node); + AssertTXNLsnOrder(rb); + } + } + else + txn = NULL; /* not found and not asked to create */ + + /* update cache */ + rb->by_txn_last_xid = xid; + rb->by_txn_last_txn = txn; + + if (is_new) + *is_new = !found; + + Assert(!create || txn != NULL); + return txn; +} + +/* + * Record the partial change for the streaming of in-progress transactions. We + * can stream only complete changes so if we have a partial change like toast + * table insert or speculative insert then we mark such a 'txn' so that it + * can't be streamed. We also ensure that if the changes in such a 'txn' can + * be streamed and are above logical_decoding_work_mem threshold then we stream + * them as soon as we have a complete change. + */ +static void +ReorderBufferProcessPartialChange(ReorderBuffer *rb, ReorderBufferTXN *txn, + ReorderBufferChange *change, + bool toast_insert) +{ + ReorderBufferTXN *toptxn; + + /* + * The partial changes need to be processed only while streaming + * in-progress transactions. + */ + if (!ReorderBufferCanStream(rb)) + return; + + /* Get the top transaction. */ + toptxn = rbtxn_get_toptxn(txn); + + /* + * Indicate a partial change for toast inserts. The change will be + * considered as complete once we get the insert or update on the main + * table and we are sure that the pending toast chunks are not required + * anymore. + * + * If we allow streaming when there are pending toast chunks then such + * chunks won't be released till the insert (multi_insert) is complete and + * we expect the txn to have streamed all changes after streaming. This + * restriction is mainly to ensure the correctness of streamed + * transactions and it doesn't seem worth uplifting such a restriction + * just to allow this case because anyway we will stream the transaction + * once such an insert is complete. 
+ */ + if (toast_insert) + toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE; + else if (rbtxn_has_partial_change(toptxn) && + IsInsertOrUpdate(change->action) && + change->data.tp.clear_toast_afterwards) + toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE; + + /* + * Indicate a partial change for speculative inserts. The change will be + * considered as complete once we get the speculative confirm or abort + * token. + */ + if (IsSpecInsert(change->action)) + toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE; + else if (rbtxn_has_partial_change(toptxn) && + IsSpecConfirmOrAbort(change->action)) + toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE; + + /* + * Stream the transaction if it is serialized before and the changes are + * now complete in the top-level transaction. + * + * The reason for doing the streaming of such a transaction as soon as we + * get the complete change for it is that previously it would have reached + * the memory threshold and wouldn't get streamed because of incomplete + * changes. Delaying such transactions would increase apply lag for them. + */ + if (ReorderBufferCanStartStreaming(rb) && + !(rbtxn_has_partial_change(toptxn)) && + rbtxn_is_serialized(txn) && + rbtxn_has_streamable_change(toptxn)) + ReorderBufferStreamTXN(rb, toptxn); +} + +/* + * Queue a change into a transaction so it can be replayed upon commit or will be + * streamed when we reach logical_decoding_work_mem threshold. + */ +void +ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, + ReorderBufferChange *change, bool toast_insert) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); + + /* + * While streaming the previous changes we have detected that the + * transaction is aborted. So there is no point in collecting further + * changes for it. + */ + if (txn->concurrent_abort) + { + /* + * We don't need to update memory accounting for this change as we + * have not added it to the queue yet. + */ + ReorderBufferReturnChange(rb, change, false); + return; + } + + /* + * The changes that are sent downstream are considered streamable. We + * remember such transactions so that only those will later be considered + * for streaming. + */ + if (change->action == REORDER_BUFFER_CHANGE_INSERT || + change->action == REORDER_BUFFER_CHANGE_UPDATE || + change->action == REORDER_BUFFER_CHANGE_DELETE || + change->action == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT || + change->action == REORDER_BUFFER_CHANGE_TRUNCATE || + change->action == REORDER_BUFFER_CHANGE_MESSAGE) + { + ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn); + + toptxn->txn_flags |= RBTXN_HAS_STREAMABLE_CHANGE; + } + + change->lsn = lsn; + change->txn = txn; + + Assert(InvalidXLogRecPtr != lsn); + dlist_push_tail(&txn->changes, &change->node); + txn->nentries++; + txn->nentries_mem++; + + /* update memory accounting information */ + ReorderBufferChangeMemoryUpdate(rb, change, true, + ReorderBufferChangeSize(change)); + + /* process partial change */ + ReorderBufferProcessPartialChange(rb, txn, change, toast_insert); + + /* check the memory limits and evict something if needed */ + ReorderBufferCheckMemoryLimit(rb); +} + +/* + * A transactional message is queued to be processed upon commit and a + * non-transactional message gets processed immediately. 
+ */ +void +ReorderBufferQueueMessage(ReorderBuffer *rb, TransactionId xid, + Snapshot snap, XLogRecPtr lsn, + bool transactional, const char *prefix, + Size message_size, const char *message) +{ + if (transactional) + { + MemoryContext oldcontext; + ReorderBufferChange *change; + + Assert(xid != InvalidTransactionId); + + /* + * We don't expect snapshots for transactional changes - we'll use the + * snapshot derived later during apply (unless the change gets + * skipped). + */ + Assert(!snap); + + oldcontext = MemoryContextSwitchTo(rb->context); + + change = ReorderBufferGetChange(rb); + change->action = REORDER_BUFFER_CHANGE_MESSAGE; + change->data.msg.prefix = pstrdup(prefix); + change->data.msg.message_size = message_size; + change->data.msg.message = palloc(message_size); + memcpy(change->data.msg.message, message, message_size); + + ReorderBufferQueueChange(rb, xid, lsn, change, false); + + MemoryContextSwitchTo(oldcontext); + } + else + { + ReorderBufferTXN *txn = NULL; + volatile Snapshot snapshot_now = snap; + + /* Non-transactional changes require a valid snapshot. */ + Assert(snapshot_now); + + if (xid != InvalidTransactionId) + txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); + + /* setup snapshot to allow catalog access */ + SetupHistoricSnapshot(snapshot_now, NULL); + PG_TRY(); + { + rb->message(rb, txn, lsn, false, prefix, message_size, message); + + TeardownHistoricSnapshot(false); + } + PG_CATCH(); + { + TeardownHistoricSnapshot(true); + PG_RE_THROW(); + } + PG_END_TRY(); + } +} + +/* + * AssertTXNLsnOrder + * Verify LSN ordering of transaction lists in the reorderbuffer + * + * Other LSN-related invariants are checked too. + * + * No-op if assertions are not in use. + */ +static void +AssertTXNLsnOrder(ReorderBuffer *rb) +{ +#ifdef USE_ASSERT_CHECKING + LogicalDecodingContext *ctx = rb->private_data; + dlist_iter iter; + XLogRecPtr prev_first_lsn = InvalidXLogRecPtr; + XLogRecPtr prev_base_snap_lsn = InvalidXLogRecPtr; + + /* + * Skip the verification if we don't reach the LSN at which we start + * decoding the contents of transactions yet because until we reach the + * LSN, we could have transactions that don't have the association between + * the top-level transaction and subtransaction yet and consequently have + * the same LSN. We don't guarantee this association until we try to + * decode the actual contents of transaction. The ordering of the records + * prior to the start_decoding_at LSN should have been checked before the + * restart. 
+ */ + if (SnapBuildXactNeedsSkip(ctx->snapshot_builder, ctx->reader->EndRecPtr)) + return; + + dlist_foreach(iter, &rb->toplevel_by_lsn) + { + ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN, node, + iter.cur); + + /* start LSN must be set */ + Assert(cur_txn->first_lsn != InvalidXLogRecPtr); + + /* If there is an end LSN, it must be higher than start LSN */ + if (cur_txn->end_lsn != InvalidXLogRecPtr) + Assert(cur_txn->first_lsn <= cur_txn->end_lsn); + + /* Current initial LSN must be strictly higher than previous */ + if (prev_first_lsn != InvalidXLogRecPtr) + Assert(prev_first_lsn < cur_txn->first_lsn); + + /* known-as-subtxn txns must not be listed */ + Assert(!rbtxn_is_known_subxact(cur_txn)); + + prev_first_lsn = cur_txn->first_lsn; + } + + dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn) + { + ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN, + base_snapshot_node, + iter.cur); + + /* base snapshot (and its LSN) must be set */ + Assert(cur_txn->base_snapshot != NULL); + Assert(cur_txn->base_snapshot_lsn != InvalidXLogRecPtr); + + /* current LSN must be strictly higher than previous */ + if (prev_base_snap_lsn != InvalidXLogRecPtr) + Assert(prev_base_snap_lsn < cur_txn->base_snapshot_lsn); + + /* known-as-subtxn txns must not be listed */ + Assert(!rbtxn_is_known_subxact(cur_txn)); + + prev_base_snap_lsn = cur_txn->base_snapshot_lsn; + } +#endif +} + +/* + * AssertChangeLsnOrder + * + * Check ordering of changes in the (sub)transaction. + */ +static void +AssertChangeLsnOrder(ReorderBufferTXN *txn) +{ +#ifdef USE_ASSERT_CHECKING + dlist_iter iter; + XLogRecPtr prev_lsn = txn->first_lsn; + + dlist_foreach(iter, &txn->changes) + { + ReorderBufferChange *cur_change; + + cur_change = dlist_container(ReorderBufferChange, node, iter.cur); + + Assert(txn->first_lsn != InvalidXLogRecPtr); + Assert(cur_change->lsn != InvalidXLogRecPtr); + Assert(txn->first_lsn <= cur_change->lsn); + + if (txn->end_lsn != InvalidXLogRecPtr) + Assert(cur_change->lsn <= txn->end_lsn); + + Assert(prev_lsn <= cur_change->lsn); + + prev_lsn = cur_change->lsn; + } +#endif +} + +/* + * ReorderBufferGetOldestTXN + * Return oldest transaction in reorderbuffer + */ +ReorderBufferTXN * +ReorderBufferGetOldestTXN(ReorderBuffer *rb) +{ + ReorderBufferTXN *txn; + + AssertTXNLsnOrder(rb); + + if (dlist_is_empty(&rb->toplevel_by_lsn)) + return NULL; + + txn = dlist_head_element(ReorderBufferTXN, node, &rb->toplevel_by_lsn); + + Assert(!rbtxn_is_known_subxact(txn)); + Assert(txn->first_lsn != InvalidXLogRecPtr); + return txn; +} + +/* + * ReorderBufferGetOldestXmin + * Return oldest Xmin in reorderbuffer + * + * Returns oldest possibly running Xid from the point of view of snapshots + * used in the transactions kept by reorderbuffer, or InvalidTransactionId if + * there are none. + * + * Since snapshots are assigned monotonically, this equals the Xmin of the + * base snapshot with minimal base_snapshot_lsn. 
+ */ +TransactionId +ReorderBufferGetOldestXmin(ReorderBuffer *rb) +{ + ReorderBufferTXN *txn; + + AssertTXNLsnOrder(rb); + + if (dlist_is_empty(&rb->txns_by_base_snapshot_lsn)) + return InvalidTransactionId; + + txn = dlist_head_element(ReorderBufferTXN, base_snapshot_node, + &rb->txns_by_base_snapshot_lsn); + return txn->base_snapshot->xmin; +} + +void +ReorderBufferSetRestartPoint(ReorderBuffer *rb, XLogRecPtr ptr) +{ + rb->current_restart_decoding_lsn = ptr; +} + +/* + * ReorderBufferAssignChild + * + * Make note that we know that subxid is a subtransaction of xid, seen as of + * the given lsn. + */ +void +ReorderBufferAssignChild(ReorderBuffer *rb, TransactionId xid, + TransactionId subxid, XLogRecPtr lsn) +{ + ReorderBufferTXN *txn; + ReorderBufferTXN *subtxn; + bool new_top; + bool new_sub; + + txn = ReorderBufferTXNByXid(rb, xid, true, &new_top, lsn, true); + subtxn = ReorderBufferTXNByXid(rb, subxid, true, &new_sub, lsn, false); + + if (!new_sub) + { + if (rbtxn_is_known_subxact(subtxn)) + { + /* already associated, nothing to do */ + return; + } + else + { + /* + * We already saw this transaction, but initially added it to the + * list of top-level txns. Now that we know it's not top-level, + * remove it from there. + */ + dlist_delete(&subtxn->node); + } + } + + subtxn->txn_flags |= RBTXN_IS_SUBXACT; + subtxn->toplevel_xid = xid; + Assert(subtxn->nsubtxns == 0); + + /* set the reference to top-level transaction */ + subtxn->toptxn = txn; + + /* add to subtransaction list */ + dlist_push_tail(&txn->subtxns, &subtxn->node); + txn->nsubtxns++; + + /* Possibly transfer the subtxn's snapshot to its top-level txn. */ + ReorderBufferTransferSnapToParent(txn, subtxn); + + /* Verify LSN-ordering invariant */ + AssertTXNLsnOrder(rb); +} + +/* + * ReorderBufferTransferSnapToParent + * Transfer base snapshot from subtxn to top-level txn, if needed + * + * This is done if the top-level txn doesn't have a base snapshot, or if the + * subtxn's base snapshot has an earlier LSN than the top-level txn's base + * snapshot's LSN. This can happen if there are no changes in the toplevel + * txn but there are some in the subtxn, or the first change in subtxn has + * earlier LSN than first change in the top-level txn and we learned about + * their kinship only now. + * + * The subtransaction's snapshot is cleared regardless of the transfer + * happening, since it's not needed anymore in either case. + * + * We do this as soon as we become aware of their kinship, to avoid queueing + * extra snapshots to txns known-as-subtxns -- only top-level txns will + * receive further snapshots. + */ +static void +ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn, + ReorderBufferTXN *subtxn) +{ + Assert(subtxn->toplevel_xid == txn->xid); + + if (subtxn->base_snapshot != NULL) + { + if (txn->base_snapshot == NULL || + subtxn->base_snapshot_lsn < txn->base_snapshot_lsn) + { + /* + * If the toplevel transaction already has a base snapshot but + * it's newer than the subxact's, purge it. + */ + if (txn->base_snapshot != NULL) + { + SnapBuildSnapDecRefcount(txn->base_snapshot); + dlist_delete(&txn->base_snapshot_node); + } + + /* + * The snapshot is now the top transaction's; transfer it, and + * adjust the list position of the top transaction in the list by + * moving it to where the subtransaction is. 
+ */ + txn->base_snapshot = subtxn->base_snapshot; + txn->base_snapshot_lsn = subtxn->base_snapshot_lsn; + dlist_insert_before(&subtxn->base_snapshot_node, + &txn->base_snapshot_node); + + /* + * The subtransaction doesn't have a snapshot anymore (so it + * mustn't be in the list.) + */ + subtxn->base_snapshot = NULL; + subtxn->base_snapshot_lsn = InvalidXLogRecPtr; + dlist_delete(&subtxn->base_snapshot_node); + } + else + { + /* Base snap of toplevel is fine, so subxact's is not needed */ + SnapBuildSnapDecRefcount(subtxn->base_snapshot); + dlist_delete(&subtxn->base_snapshot_node); + subtxn->base_snapshot = NULL; + subtxn->base_snapshot_lsn = InvalidXLogRecPtr; + } + } +} + +/* + * Associate a subtransaction with its toplevel transaction at commit + * time. There may be no further changes added after this. + */ +void +ReorderBufferCommitChild(ReorderBuffer *rb, TransactionId xid, + TransactionId subxid, XLogRecPtr commit_lsn, + XLogRecPtr end_lsn) +{ + ReorderBufferTXN *subtxn; + + subtxn = ReorderBufferTXNByXid(rb, subxid, false, NULL, + InvalidXLogRecPtr, false); + + /* + * No need to do anything if that subtxn didn't contain any changes + */ + if (!subtxn) + return; + + subtxn->final_lsn = commit_lsn; + subtxn->end_lsn = end_lsn; + + /* + * Assign this subxact as a child of the toplevel xact (no-op if already + * done.) + */ + ReorderBufferAssignChild(rb, xid, subxid, InvalidXLogRecPtr); +} + + +/* + * Support for efficiently iterating over a transaction's and its + * subtransactions' changes. + * + * We do by doing a k-way merge between transactions/subtransactions. For that + * we model the current heads of the different transactions as a binary heap + * so we easily know which (sub-)transaction has the change with the smallest + * lsn next. + * + * We assume the changes in individual transactions are already sorted by LSN. + */ + +/* + * Binary heap comparison function. + */ +static int +ReorderBufferIterCompare(Datum a, Datum b, void *arg) +{ + ReorderBufferIterTXNState *state = (ReorderBufferIterTXNState *) arg; + XLogRecPtr pos_a = state->entries[DatumGetInt32(a)].lsn; + XLogRecPtr pos_b = state->entries[DatumGetInt32(b)].lsn; + + if (pos_a < pos_b) + return 1; + else if (pos_a == pos_b) + return 0; + return -1; +} + +/* + * Allocate & initialize an iterator which iterates in lsn order over a + * transaction and all its subtransactions. + * + * Note: The iterator state is returned through iter_state parameter rather + * than the function's return value. This is because the state gets cleaned up + * in a PG_CATCH block in the caller, so we want to make sure the caller gets + * back the state even if this function throws an exception. + */ +static void +ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn, + ReorderBufferIterTXNState *volatile *iter_state) +{ + Size nr_txns = 0; + ReorderBufferIterTXNState *state; + dlist_iter cur_txn_i; + int32 off; + + *iter_state = NULL; + + /* Check ordering of changes in the toplevel transaction. */ + AssertChangeLsnOrder(txn); + + /* + * Calculate the size of our heap: one element for every transaction that + * contains changes. (Besides the transactions already in the reorder + * buffer, we count the one we were directly passed.) + */ + if (txn->nentries > 0) + nr_txns++; + + dlist_foreach(cur_txn_i, &txn->subtxns) + { + ReorderBufferTXN *cur_txn; + + cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur); + + /* Check ordering of changes in this subtransaction. 
*/ + AssertChangeLsnOrder(cur_txn); + + if (cur_txn->nentries > 0) + nr_txns++; + } + + /* allocate iteration state */ + state = (ReorderBufferIterTXNState *) + MemoryContextAllocZero(rb->context, + sizeof(ReorderBufferIterTXNState) + + sizeof(ReorderBufferIterTXNEntry) * nr_txns); + + state->nr_txns = nr_txns; + dlist_init(&state->old_change); + + for (off = 0; off < state->nr_txns; off++) + { + state->entries[off].file.vfd = -1; + state->entries[off].segno = 0; + } + + /* allocate heap */ + state->heap = binaryheap_allocate(state->nr_txns, + ReorderBufferIterCompare, + state); + + /* Now that the state fields are initialized, it is safe to return it. */ + *iter_state = state; + + /* + * Now insert items into the binary heap, in an unordered fashion. (We + * will run a heap assembly step at the end; this is more efficient.) + */ + + off = 0; + + /* add toplevel transaction if it contains changes */ + if (txn->nentries > 0) + { + ReorderBufferChange *cur_change; + + if (rbtxn_is_serialized(txn)) + { + /* serialize remaining changes */ + ReorderBufferSerializeTXN(rb, txn); + ReorderBufferRestoreChanges(rb, txn, &state->entries[off].file, + &state->entries[off].segno); + } + + cur_change = dlist_head_element(ReorderBufferChange, node, + &txn->changes); + + state->entries[off].lsn = cur_change->lsn; + state->entries[off].change = cur_change; + state->entries[off].txn = txn; + + binaryheap_add_unordered(state->heap, Int32GetDatum(off++)); + } + + /* add subtransactions if they contain changes */ + dlist_foreach(cur_txn_i, &txn->subtxns) + { + ReorderBufferTXN *cur_txn; + + cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur); + + if (cur_txn->nentries > 0) + { + ReorderBufferChange *cur_change; + + if (rbtxn_is_serialized(cur_txn)) + { + /* serialize remaining changes */ + ReorderBufferSerializeTXN(rb, cur_txn); + ReorderBufferRestoreChanges(rb, cur_txn, + &state->entries[off].file, + &state->entries[off].segno); + } + cur_change = dlist_head_element(ReorderBufferChange, node, + &cur_txn->changes); + + state->entries[off].lsn = cur_change->lsn; + state->entries[off].change = cur_change; + state->entries[off].txn = cur_txn; + + binaryheap_add_unordered(state->heap, Int32GetDatum(off++)); + } + } + + /* assemble a valid binary heap */ + binaryheap_build(state->heap); +} + +/* + * Return the next change when iterating over a transaction and its + * subtransactions. + * + * Returns NULL when no further changes exist. 
+ */ +static ReorderBufferChange * +ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state) +{ + ReorderBufferChange *change; + ReorderBufferIterTXNEntry *entry; + int32 off; + + /* nothing there anymore */ + if (state->heap->bh_size == 0) + return NULL; + + off = DatumGetInt32(binaryheap_first(state->heap)); + entry = &state->entries[off]; + + /* free memory we might have "leaked" in the previous *Next call */ + if (!dlist_is_empty(&state->old_change)) + { + change = dlist_container(ReorderBufferChange, node, + dlist_pop_head_node(&state->old_change)); + ReorderBufferReturnChange(rb, change, true); + Assert(dlist_is_empty(&state->old_change)); + } + + change = entry->change; + + /* + * update heap with information about which transaction has the next + * relevant change in LSN order + */ + + /* there are in-memory changes */ + if (dlist_has_next(&entry->txn->changes, &entry->change->node)) + { + dlist_node *next = dlist_next_node(&entry->txn->changes, &change->node); + ReorderBufferChange *next_change = + dlist_container(ReorderBufferChange, node, next); + + /* txn stays the same */ + state->entries[off].lsn = next_change->lsn; + state->entries[off].change = next_change; + + binaryheap_replace_first(state->heap, Int32GetDatum(off)); + return change; + } + + /* try to load changes from disk */ + if (entry->txn->nentries != entry->txn->nentries_mem) + { + /* + * Ugly: restoring changes will reuse *Change records, thus delete the + * current one from the per-tx list and only free in the next call. + */ + dlist_delete(&change->node); + dlist_push_tail(&state->old_change, &change->node); + + /* + * Update the total bytes processed by the txn for which we are + * releasing the current set of changes and restoring the new set of + * changes. + */ + rb->totalBytes += entry->txn->size; + if (ReorderBufferRestoreChanges(rb, entry->txn, &entry->file, + &state->entries[off].segno)) + { + /* successfully restored changes from disk */ + ReorderBufferChange *next_change = + dlist_head_element(ReorderBufferChange, node, + &entry->txn->changes); + + elog(DEBUG2, "restored %u/%u changes from disk", + (uint32) entry->txn->nentries_mem, + (uint32) entry->txn->nentries); + + Assert(entry->txn->nentries_mem); + /* txn stays the same */ + state->entries[off].lsn = next_change->lsn; + state->entries[off].change = next_change; + binaryheap_replace_first(state->heap, Int32GetDatum(off)); + + return change; + } + } + + /* ok, no changes there anymore, remove */ + binaryheap_remove_first(state->heap); + + return change; +} + +/* + * Deallocate the iterator + */ +static void +ReorderBufferIterTXNFinish(ReorderBuffer *rb, + ReorderBufferIterTXNState *state) +{ + int32 off; + + for (off = 0; off < state->nr_txns; off++) + { + if (state->entries[off].file.vfd != -1) + FileClose(state->entries[off].file.vfd); + } + + /* free memory we might have "leaked" in the last *Next call */ + if (!dlist_is_empty(&state->old_change)) + { + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, + dlist_pop_head_node(&state->old_change)); + ReorderBufferReturnChange(rb, change, true); + Assert(dlist_is_empty(&state->old_change)); + } + + binaryheap_free(state->heap); + pfree(state); +} + +/* + * Cleanup the contents of a transaction, usually after the transaction + * committed or aborted. 
+ */ +static void +ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + bool found; + dlist_mutable_iter iter; + + /* cleanup subtransactions & their changes */ + dlist_foreach_modify(iter, &txn->subtxns) + { + ReorderBufferTXN *subtxn; + + subtxn = dlist_container(ReorderBufferTXN, node, iter.cur); + + /* + * Subtransactions are always associated to the toplevel TXN, even if + * they originally were happening inside another subtxn, so we won't + * ever recurse more than one level deep here. + */ + Assert(rbtxn_is_known_subxact(subtxn)); + Assert(subtxn->nsubtxns == 0); + + ReorderBufferCleanupTXN(rb, subtxn); + } + + /* cleanup changes in the txn */ + dlist_foreach_modify(iter, &txn->changes) + { + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, iter.cur); + + /* Check we're not mixing changes from different transactions. */ + Assert(change->txn == txn); + + ReorderBufferReturnChange(rb, change, true); + } + + /* + * Cleanup the tuplecids we stored for decoding catalog snapshot access. + * They are always stored in the toplevel transaction. + */ + dlist_foreach_modify(iter, &txn->tuplecids) + { + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, iter.cur); + + /* Check we're not mixing changes from different transactions. */ + Assert(change->txn == txn); + Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID); + + ReorderBufferReturnChange(rb, change, true); + } + + /* + * Cleanup the base snapshot, if set. + */ + if (txn->base_snapshot != NULL) + { + SnapBuildSnapDecRefcount(txn->base_snapshot); + dlist_delete(&txn->base_snapshot_node); + } + + /* + * Cleanup the snapshot for the last streamed run. + */ + if (txn->snapshot_now != NULL) + { + Assert(rbtxn_is_streamed(txn)); + ReorderBufferFreeSnap(rb, txn->snapshot_now); + } + + /* + * Remove TXN from its containing lists. + * + * Note: if txn is known as subxact, we are deleting the TXN from its + * parent's list of known subxacts; this leaves the parent's nsubxacts + * count too high, but we don't care. Otherwise, we are deleting the TXN + * from the LSN-ordered list of toplevel TXNs. We remove the TXN from the + * list of catalog modifying transactions as well. + */ + dlist_delete(&txn->node); + if (rbtxn_has_catalog_changes(txn)) + dclist_delete_from(&rb->catchange_txns, &txn->catchange_node); + + /* now remove reference from buffer */ + hash_search(rb->by_txn, &txn->xid, HASH_REMOVE, &found); + Assert(found); + + /* remove entries spilled to disk */ + if (rbtxn_is_serialized(txn)) + ReorderBufferRestoreCleanup(rb, txn); + + /* deallocate */ + ReorderBufferReturnTXN(rb, txn); +} + +/* + * Discard changes from a transaction (and subtransactions), either after + * streaming or decoding them at PREPARE. Keep the remaining info - + * transactions, tuplecids, invalidations and snapshots. + * + * We additionally remove tuplecids after decoding the transaction at prepare + * time as we only need to perform invalidation at rollback or commit prepared. + * + * 'txn_prepared' indicates that we have decoded the transaction at prepare + * time. 
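+ *
+ * Contrast this with ReorderBufferCleanupTXN above: truncation frees the
+ * changes (and, for prepared transactions, the tuplecids) but keeps the
+ * ReorderBufferTXN itself alive, together with its invalidations and
+ * snapshots, so the same transaction can be streamed again later or be
+ * finished at COMMIT/ROLLBACK PREPARED.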
+ */
+static void
+ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, bool txn_prepared)
+{
+ dlist_mutable_iter iter;
+
+ /* cleanup subtransactions & their changes */
+ dlist_foreach_modify(iter, &txn->subtxns)
+ {
+ ReorderBufferTXN *subtxn;
+
+ subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
+
+ /*
+ * Subtransactions are always associated to the toplevel TXN, even if
+ * they originally were happening inside another subtxn, so we won't
+ * ever recurse more than one level deep here.
+ */
+ Assert(rbtxn_is_known_subxact(subtxn));
+ Assert(subtxn->nsubtxns == 0);
+
+ ReorderBufferTruncateTXN(rb, subtxn, txn_prepared);
+ }
+
+ /* cleanup changes in the txn */
+ dlist_foreach_modify(iter, &txn->changes)
+ {
+ ReorderBufferChange *change;
+
+ change = dlist_container(ReorderBufferChange, node, iter.cur);
+
+ /* Check we're not mixing changes from different transactions. */
+ Assert(change->txn == txn);
+
+ /* remove the change from its containing list */
+ dlist_delete(&change->node);
+
+ ReorderBufferReturnChange(rb, change, true);
+ }
+
+ /*
+ * Mark the transaction as streamed.
+ *
+ * The top-level transaction is always marked as streamed, even if it
+ * does not contain any changes (that is, when all the changes are in
+ * subtransactions).
+ *
+ * For subtransactions, we only mark them as streamed when there are
+ * changes in them.
+ *
+ * We do it this way because of aborts - we don't want to send aborts for
+ * XIDs the downstream is not aware of. And of course, it always knows
+ * about the toplevel xact (we send the XID in all messages), but we never
+ * stream XIDs of empty subxacts.
+ */
+ if ((!txn_prepared) && (rbtxn_is_toptxn(txn) || (txn->nentries_mem != 0)))
+ txn->txn_flags |= RBTXN_IS_STREAMED;
+
+ if (txn_prepared)
+ {
+ /*
+ * If this is a prepared txn, cleanup the tuplecids we stored for
+ * decoding catalog snapshot access. They are always stored in the
+ * toplevel transaction.
+ */
+ dlist_foreach_modify(iter, &txn->tuplecids)
+ {
+ ReorderBufferChange *change;
+
+ change = dlist_container(ReorderBufferChange, node, iter.cur);
+
+ /* Check we're not mixing changes from different transactions. */
+ Assert(change->txn == txn);
+ Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
+
+ /* Remove the change from its containing list. */
+ dlist_delete(&change->node);
+
+ ReorderBufferReturnChange(rb, change, true);
+ }
+ }
+
+ /*
+ * Destroy the (relfilelocator, ctid) hashtable, so that we don't leak any
+ * memory. We could also keep the hash table and update it with new ctid
+ * values, but this seems simpler and good enough for now.
+ */
+ if (txn->tuplecid_hash != NULL)
+ {
+ hash_destroy(txn->tuplecid_hash);
+ txn->tuplecid_hash = NULL;
+ }
+
+ /* If this txn is serialized then clean the disk space. */
+ if (rbtxn_is_serialized(txn))
+ {
+ ReorderBufferRestoreCleanup(rb, txn);
+ txn->txn_flags &= ~RBTXN_IS_SERIALIZED;
+
+ /*
+ * We set this flag to indicate whether the transaction was ever
+ * serialized. We need this to accurately update the stats as
+ * otherwise the same transaction can be counted as serialized
+ * multiple times.
+ */
+ txn->txn_flags |= RBTXN_IS_SERIALIZED_CLEAR;
+ }
+
+ /* also reset the number of entries in the transaction */
+ txn->nentries_mem = 0;
+ txn->nentries = 0;
+}
+
+/*
+ * Build a hash with a (relfilelocator, ctid) -> (cmin, cmax) mapping for use by
+ * HeapTupleSatisfiesHistoricMVCC.
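+ *
+ * The hash key is the raw bytes of a (relfilelocator, ctid) struct, hashed
+ * with HASH_BLOBS, which is why both the build loop below and any lookup
+ * must memset() the key first: uninitialized padding bytes would otherwise
+ * make equal keys compare as different. A lookup sketch ('locator' and
+ * 'tid' are placeholders for illustration):
+ *
+ *     ReorderBufferTupleCidKey key;
+ *     ReorderBufferTupleCidEnt *ent;
+ *
+ *     memset(&key, 0, sizeof(key));
+ *     key.rlocator = locator;
+ *     ItemPointerCopy(&tid, &key.tid);
+ *     ent = hash_search(txn->tuplecid_hash, &key, HASH_FIND, NULL);
+ *     if (ent != NULL)
+ *         ... consult ent->cmin / ent->cmax ...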
+ */ +static void +ReorderBufferBuildTupleCidHash(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + dlist_iter iter; + HASHCTL hash_ctl; + + if (!rbtxn_has_catalog_changes(txn) || dlist_is_empty(&txn->tuplecids)) + return; + + hash_ctl.keysize = sizeof(ReorderBufferTupleCidKey); + hash_ctl.entrysize = sizeof(ReorderBufferTupleCidEnt); + hash_ctl.hcxt = rb->context; + + /* + * create the hash with the exact number of to-be-stored tuplecids from + * the start + */ + txn->tuplecid_hash = + hash_create("ReorderBufferTupleCid", txn->ntuplecids, &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + dlist_foreach(iter, &txn->tuplecids) + { + ReorderBufferTupleCidKey key; + ReorderBufferTupleCidEnt *ent; + bool found; + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, iter.cur); + + Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID); + + /* be careful about padding */ + memset(&key, 0, sizeof(ReorderBufferTupleCidKey)); + + key.rlocator = change->data.tuplecid.locator; + + ItemPointerCopy(&change->data.tuplecid.tid, + &key.tid); + + ent = (ReorderBufferTupleCidEnt *) + hash_search(txn->tuplecid_hash, &key, HASH_ENTER, &found); + if (!found) + { + ent->cmin = change->data.tuplecid.cmin; + ent->cmax = change->data.tuplecid.cmax; + ent->combocid = change->data.tuplecid.combocid; + } + else + { + /* + * Maybe we already saw this tuple before in this transaction, but + * if so it must have the same cmin. + */ + Assert(ent->cmin == change->data.tuplecid.cmin); + + /* + * cmax may be initially invalid, but once set it can only grow, + * and never become invalid again. + */ + Assert((ent->cmax == InvalidCommandId) || + ((change->data.tuplecid.cmax != InvalidCommandId) && + (change->data.tuplecid.cmax > ent->cmax))); + ent->cmax = change->data.tuplecid.cmax; + } + } +} + +/* + * Copy a provided snapshot so we can modify it privately. This is needed so + * that catalog modifying transactions can look into intermediate catalog + * states. + */ +static Snapshot +ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap, + ReorderBufferTXN *txn, CommandId cid) +{ + Snapshot snap; + dlist_iter iter; + int i = 0; + Size size; + + size = sizeof(SnapshotData) + + sizeof(TransactionId) * orig_snap->xcnt + + sizeof(TransactionId) * (txn->nsubtxns + 1); + + snap = MemoryContextAllocZero(rb->context, size); + memcpy(snap, orig_snap, sizeof(SnapshotData)); + + snap->copied = true; + snap->active_count = 1; /* mark as active so nobody frees it */ + snap->regd_count = 0; + snap->xip = (TransactionId *) (snap + 1); + + memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt); + + /* + * snap->subxip contains all txids that belong to our transaction which we + * need to check via cmin/cmax. That's why we store the toplevel + * transaction in there as well. + */ + snap->subxip = snap->xip + snap->xcnt; + snap->subxip[i++] = txn->xid; + + /* + * subxcnt isn't decreased when subtransactions abort, so count manually. + * Since it's an upper boundary it is safe to use it for the allocation + * above. 
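+ *
+ * To make the layout concrete: the copy is a single allocation of
+ *
+ *     [SnapshotData][xip: xcnt xids][subxip: nsubtxns + 1 xids]
+ *
+ * which is why snap->xip points just past the struct and snap->subxip
+ * just past the xip array (see the assignments above).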
+ */
+ snap->subxcnt = 1;
+
+ dlist_foreach(iter, &txn->subtxns)
+ {
+ ReorderBufferTXN *sub_txn;
+
+ sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur);
+ snap->subxip[i++] = sub_txn->xid;
+ snap->subxcnt++;
+ }
+
+ /* sort so we can bsearch() later */
+ qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator);
+
+ /* store the specified current CommandId */
+ snap->curcid = cid;
+
+ return snap;
+}
+
+/*
+ * Free a previously ReorderBufferCopySnap'ed snapshot
+ */
+static void
+ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap)
+{
+ if (snap->copied)
+ pfree(snap);
+ else
+ SnapBuildSnapDecRefcount(snap);
+}
+
+/*
+ * If the transaction was (partially) streamed, we need to prepare or commit
+ * it in a 'streamed' way. That is, we first stream the remaining part of the
+ * transaction, and then invoke the stream_prepare or stream_commit callback,
+ * as appropriate.
+ */
+static void
+ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+ /* we should only call this for previously streamed transactions */
+ Assert(rbtxn_is_streamed(txn));
+
+ ReorderBufferStreamTXN(rb, txn);
+
+ if (rbtxn_prepared(txn))
+ {
+ /*
+ * Note, we send stream prepare even if a concurrent abort is
+ * detected. See DecodePrepare for more information.
+ */
+ rb->stream_prepare(rb, txn, txn->final_lsn);
+
+ /*
+ * This is a PREPARED transaction, part of a two-phase commit. The
+ * full cleanup will happen as part of the COMMIT PREPAREDs, so now
+ * just truncate txn by removing changes and tuplecids.
+ */
+ ReorderBufferTruncateTXN(rb, txn, true);
+ /* Reset the CheckXidAlive */
+ CheckXidAlive = InvalidTransactionId;
+ }
+ else
+ {
+ rb->stream_commit(rb, txn, txn->final_lsn);
+ ReorderBufferCleanupTXN(rb, txn);
+ }
+}
+
+/*
+ * Set xid to detect concurrent aborts.
+ *
+ * While streaming an in-progress transaction or decoding a prepared
+ * transaction there is a possibility that the (sub)transaction might get
+ * aborted concurrently. In such a case, if the (sub)transaction has a
+ * catalog update, we might decode the tuple using the wrong catalog version.
+ * For example, suppose there is one catalog tuple with (xmin: 500, xmax: 0).
+ * Now, transaction 501 updates the catalog tuple and after that we will have
+ * two tuples (xmin: 500, xmax: 501) and (xmin: 501, xmax: 0). Now, if 501 is
+ * aborted and some other transaction, say 502, updates the same catalog
+ * tuple, then the first tuple will be changed to (xmin: 500, xmax: 502). So,
+ * the problem is that when we try to decode the tuple inserted/updated in 501
+ * after the catalog update, we will see the catalog tuple with (xmin: 500,
+ * xmax: 502) as visible because it will consider that the tuple is deleted by
+ * xid 502 which is not visible to our snapshot. And when we try to decode
+ * with that catalog tuple, it can lead to a wrong result or a crash. So, it
+ * is necessary to detect concurrent aborts to allow streaming of in-progress
+ * transactions or decoding of prepared transactions.
+ *
+ * To detect a concurrent abort we set CheckXidAlive to the xid of the current
+ * (sub)transaction to which this change belongs. Then, during a catalog
+ * scan, we check the status of that xid; if it is aborted, we report a
+ * specific error so that we can stop streaming the current transaction and
+ * discard the already streamed changes on such an error.
We might have + * already streamed some of the changes for the aborted (sub)transaction, but + * that is fine because when we decode the abort we will stream abort message + * to truncate the changes in the subscriber. Similarly, for prepared + * transactions, we stop decoding if concurrent abort is detected and then + * rollback the changes when rollback prepared is encountered. See + * DecodePrepare. + */ +static inline void +SetupCheckXidLive(TransactionId xid) +{ + /* + * If the input transaction id is already set as a CheckXidAlive then + * nothing to do. + */ + if (TransactionIdEquals(CheckXidAlive, xid)) + return; + + /* + * setup CheckXidAlive if it's not committed yet. We don't check if the + * xid is aborted. That will happen during catalog access. + */ + if (!TransactionIdDidCommit(xid)) + CheckXidAlive = xid; + else + CheckXidAlive = InvalidTransactionId; +} + +/* + * Helper function for ReorderBufferProcessTXN for applying change. + */ +static inline void +ReorderBufferApplyChange(ReorderBuffer *rb, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change, + bool streaming) +{ + if (streaming) + rb->stream_change(rb, txn, relation, change); + else + rb->apply_change(rb, txn, relation, change); +} + +/* + * Helper function for ReorderBufferProcessTXN for applying the truncate. + */ +static inline void +ReorderBufferApplyTruncate(ReorderBuffer *rb, ReorderBufferTXN *txn, + int nrelations, Relation *relations, + ReorderBufferChange *change, bool streaming) +{ + if (streaming) + rb->stream_truncate(rb, txn, nrelations, relations, change); + else + rb->apply_truncate(rb, txn, nrelations, relations, change); +} + +/* + * Helper function for ReorderBufferProcessTXN for applying the message. + */ +static inline void +ReorderBufferApplyMessage(ReorderBuffer *rb, ReorderBufferTXN *txn, + ReorderBufferChange *change, bool streaming) +{ + if (streaming) + rb->stream_message(rb, txn, change->lsn, true, + change->data.msg.prefix, + change->data.msg.message_size, + change->data.msg.message); + else + rb->message(rb, txn, change->lsn, true, + change->data.msg.prefix, + change->data.msg.message_size, + change->data.msg.message); +} + +/* + * Function to store the command id and snapshot at the end of the current + * stream so that we can reuse the same while sending the next stream. + */ +static inline void +ReorderBufferSaveTXNSnapshot(ReorderBuffer *rb, ReorderBufferTXN *txn, + Snapshot snapshot_now, CommandId command_id) +{ + txn->command_id = command_id; + + /* Avoid copying if it's already copied. */ + if (snapshot_now->copied) + txn->snapshot_now = snapshot_now; + else + txn->snapshot_now = ReorderBufferCopySnap(rb, snapshot_now, + txn, command_id); +} + +/* + * Helper function for ReorderBufferProcessTXN to handle the concurrent + * abort of the streaming transaction. This resets the TXN such that it + * can be used to stream the remaining data of transaction being processed. + * This can happen when the subtransaction is aborted and we still want to + * continue processing the main or other subtransactions data. 
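+ *
+ * Concretely, this is invoked from the PG_CATCH block of
+ * ReorderBufferProcessTXN when applying a change fails with
+ * ERRCODE_TRANSACTION_ROLLBACK: the changes streamed so far are truncated,
+ * the toast reconstruction state is reset, and the stream is stopped, after
+ * which the still-live remainder of the transaction can be processed.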
+ */ +static void +ReorderBufferResetTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, + Snapshot snapshot_now, + CommandId command_id, + XLogRecPtr last_lsn, + ReorderBufferChange *specinsert) +{ + /* Discard the changes that we just streamed */ + ReorderBufferTruncateTXN(rb, txn, rbtxn_prepared(txn)); + + /* Free all resources allocated for toast reconstruction */ + ReorderBufferToastReset(rb, txn); + + /* Return the spec insert change if it is not NULL */ + if (specinsert != NULL) + { + ReorderBufferReturnChange(rb, specinsert, true); + specinsert = NULL; + } + + /* + * For the streaming case, stop the stream and remember the command ID and + * snapshot for the streaming run. + */ + if (rbtxn_is_streamed(txn)) + { + rb->stream_stop(rb, txn, last_lsn); + ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id); + } +} + +/* + * Helper function for ReorderBufferReplay and ReorderBufferStreamTXN. + * + * Send data of a transaction (and its subtransactions) to the + * output plugin. We iterate over the top and subtransactions (using a k-way + * merge) and replay the changes in lsn order. + * + * If streaming is true then data will be sent using stream API. + * + * Note: "volatile" markers on some parameters are to avoid trouble with + * PG_TRY inside the function. + */ +static void +ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn, + volatile Snapshot snapshot_now, + volatile CommandId command_id, + bool streaming) +{ + bool using_subtxn; + MemoryContext ccxt = CurrentMemoryContext; + ReorderBufferIterTXNState *volatile iterstate = NULL; + volatile XLogRecPtr prev_lsn = InvalidXLogRecPtr; + ReorderBufferChange *volatile specinsert = NULL; + volatile bool stream_started = false; + ReorderBufferTXN *volatile curtxn = NULL; + + /* build data to be able to lookup the CommandIds of catalog tuples */ + ReorderBufferBuildTupleCidHash(rb, txn); + + /* setup the initial snapshot */ + SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash); + + /* + * Decoding needs access to syscaches et al., which in turn use + * heavyweight locks and such. Thus we need to have enough state around to + * keep track of those. The easiest way is to simply use a transaction + * internally. That also allows us to easily enforce that nothing writes + * to the database by checking for xid assignments. + * + * When we're called via the SQL SRF there's already a transaction + * started, so start an explicit subtransaction there. + */ + using_subtxn = IsTransactionOrTransactionBlock(); + + PG_TRY(); + { + ReorderBufferChange *change; + int changes_count = 0; /* used to accumulate the number of + * changes */ + + if (using_subtxn) + BeginInternalSubTransaction(streaming ? "stream" : "replay"); + else + StartTransactionCommand(); + + /* + * We only need to send begin/begin-prepare for non-streamed + * transactions. + */ + if (!streaming) + { + if (rbtxn_prepared(txn)) + rb->begin_prepare(rb, txn); + else + rb->begin(rb, txn); + } + + ReorderBufferIterTXNInit(rb, txn, &iterstate); + while ((change = ReorderBufferIterTXNNext(rb, iterstate)) != NULL) + { + Relation relation = NULL; + Oid reloid; + + CHECK_FOR_INTERRUPTS(); + + /* + * We can't call start stream callback before processing first + * change. + */ + if (prev_lsn == InvalidXLogRecPtr) + { + if (streaming) + { + txn->origin_id = change->origin_id; + rb->stream_start(rb, txn, change->lsn); + stream_started = true; + } + } + + /* + * Enforce correct ordering of changes, merged from multiple + * subtransactions. 
The changes may have the same LSN due to + * MULTI_INSERT xlog records. + */ + Assert(prev_lsn == InvalidXLogRecPtr || prev_lsn <= change->lsn); + + prev_lsn = change->lsn; + + /* + * Set the current xid to detect concurrent aborts. This is + * required for the cases when we decode the changes before the + * COMMIT record is processed. + */ + if (streaming || rbtxn_prepared(change->txn)) + { + curtxn = change->txn; + SetupCheckXidLive(curtxn->xid); + } + + switch (change->action) + { + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM: + + /* + * Confirmation for speculative insertion arrived. Simply + * use as a normal record. It'll be cleaned up at the end + * of INSERT processing. + */ + if (specinsert == NULL) + elog(ERROR, "invalid ordering of speculative insertion changes"); + Assert(specinsert->data.tp.oldtuple == NULL); + change = specinsert; + change->action = REORDER_BUFFER_CHANGE_INSERT; + + /* intentionally fall through */ + case REORDER_BUFFER_CHANGE_INSERT: + case REORDER_BUFFER_CHANGE_UPDATE: + case REORDER_BUFFER_CHANGE_DELETE: + Assert(snapshot_now); + + reloid = RelidByRelfilenumber(change->data.tp.rlocator.spcOid, + change->data.tp.rlocator.relNumber); + + /* + * Mapped catalog tuple without data, emitted while + * catalog table was in the process of being rewritten. We + * can fail to look up the relfilenumber, because the + * relmapper has no "historic" view, in contrast to the + * normal catalog during decoding. Thus repeated rewrites + * can cause a lookup failure. That's OK because we do not + * decode catalog changes anyway. Normally such tuples + * would be skipped over below, but we can't identify + * whether the table should be logically logged without + * mapping the relfilenumber to the oid. + */ + if (reloid == InvalidOid && + change->data.tp.newtuple == NULL && + change->data.tp.oldtuple == NULL) + goto change_done; + else if (reloid == InvalidOid) + elog(ERROR, "could not map filenumber \"%s\" to relation OID", + relpathperm(change->data.tp.rlocator, + MAIN_FORKNUM)); + + relation = RelationIdGetRelation(reloid); + + if (!RelationIsValid(relation)) + elog(ERROR, "could not open relation with OID %u (for filenumber \"%s\")", + reloid, + relpathperm(change->data.tp.rlocator, + MAIN_FORKNUM)); + + if (!RelationIsLogicallyLogged(relation)) + goto change_done; + + /* + * Ignore temporary heaps created during DDL unless the + * plugin has asked for them. + */ + if (relation->rd_rel->relrewrite && !rb->output_rewrites) + goto change_done; + + /* + * For now ignore sequence changes entirely. Most of the + * time they don't log changes using records we + * understand, so it doesn't make sense to handle the few + * cases we do. + */ + if (relation->rd_rel->relkind == RELKIND_SEQUENCE) + goto change_done; + + /* user-triggered change */ + if (!IsToastRelation(relation)) + { + ReorderBufferToastReplace(rb, txn, relation, change); + ReorderBufferApplyChange(rb, txn, relation, change, + streaming); + + /* + * Only clear reassembled toast chunks if we're sure + * they're not required anymore. The creator of the + * tuple tells us. + */ + if (change->data.tp.clear_toast_afterwards) + ReorderBufferToastReset(rb, txn); + } + /* we're not interested in toast deletions */ + else if (change->action == REORDER_BUFFER_CHANGE_INSERT) + { + /* + * Need to reassemble the full toasted Datum in + * memory, to ensure the chunks don't get reused till + * we're done remove it from the list of this + * transaction's changes. 
Otherwise it will get + * freed/reused while restoring spooled data from + * disk. + */ + Assert(change->data.tp.newtuple != NULL); + + dlist_delete(&change->node); + ReorderBufferToastAppendChunk(rb, txn, relation, + change); + } + + change_done: + + /* + * If speculative insertion was confirmed, the record + * isn't needed anymore. + */ + if (specinsert != NULL) + { + ReorderBufferReturnChange(rb, specinsert, true); + specinsert = NULL; + } + + if (RelationIsValid(relation)) + { + RelationClose(relation); + relation = NULL; + } + break; + + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT: + + /* + * Speculative insertions are dealt with by delaying the + * processing of the insert until the confirmation record + * arrives. For that we simply unlink the record from the + * chain, so it does not get freed/reused while restoring + * spooled data from disk. + * + * This is safe in the face of concurrent catalog changes + * because the relevant relation can't be changed between + * speculative insertion and confirmation due to + * CheckTableNotInUse() and locking. + */ + + /* clear out a pending (and thus failed) speculation */ + if (specinsert != NULL) + { + ReorderBufferReturnChange(rb, specinsert, true); + specinsert = NULL; + } + + /* and memorize the pending insertion */ + dlist_delete(&change->node); + specinsert = change; + break; + + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT: + + /* + * Abort for speculative insertion arrived. So cleanup the + * specinsert tuple and toast hash. + * + * Note that we get the spec abort change for each toast + * entry but we need to perform the cleanup only the first + * time we get it for the main table. + */ + if (specinsert != NULL) + { + /* + * We must clean the toast hash before processing a + * completely new tuple to avoid confusion about the + * previous tuple's toast chunks. + */ + Assert(change->data.tp.clear_toast_afterwards); + ReorderBufferToastReset(rb, txn); + + /* We don't need this record anymore. */ + ReorderBufferReturnChange(rb, specinsert, true); + specinsert = NULL; + } + break; + + case REORDER_BUFFER_CHANGE_TRUNCATE: + { + int i; + int nrelids = change->data.truncate.nrelids; + int nrelations = 0; + Relation *relations; + + relations = palloc0(nrelids * sizeof(Relation)); + for (i = 0; i < nrelids; i++) + { + Oid relid = change->data.truncate.relids[i]; + Relation rel; + + rel = RelationIdGetRelation(relid); + + if (!RelationIsValid(rel)) + elog(ERROR, "could not open relation with OID %u", relid); + + if (!RelationIsLogicallyLogged(rel)) + continue; + + relations[nrelations++] = rel; + } + + /* Apply the truncate. */ + ReorderBufferApplyTruncate(rb, txn, nrelations, + relations, change, + streaming); + + for (i = 0; i < nrelations; i++) + RelationClose(relations[i]); + + break; + } + + case REORDER_BUFFER_CHANGE_MESSAGE: + ReorderBufferApplyMessage(rb, txn, change, streaming); + break; + + case REORDER_BUFFER_CHANGE_INVALIDATION: + /* Execute the invalidation messages locally */ + ReorderBufferExecuteInvalidations(change->data.inval.ninvalidations, + change->data.inval.invalidations); + break; + + case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: + /* get rid of the old */ + TeardownHistoricSnapshot(false); + + if (snapshot_now->copied) + { + ReorderBufferFreeSnap(rb, snapshot_now); + snapshot_now = + ReorderBufferCopySnap(rb, change->data.snapshot, + txn, command_id); + } + + /* + * Restored from disk, need to be careful not to double + * free. 
We could introduce refcounting for that, but for + * now this seems infrequent enough not to care. + */ + else if (change->data.snapshot->copied) + { + snapshot_now = + ReorderBufferCopySnap(rb, change->data.snapshot, + txn, command_id); + } + else + { + snapshot_now = change->data.snapshot; + } + + /* and continue with the new one */ + SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash); + break; + + case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID: + Assert(change->data.command_id != InvalidCommandId); + + if (command_id < change->data.command_id) + { + command_id = change->data.command_id; + + if (!snapshot_now->copied) + { + /* we don't use the global one anymore */ + snapshot_now = ReorderBufferCopySnap(rb, snapshot_now, + txn, command_id); + } + + snapshot_now->curcid = command_id; + + TeardownHistoricSnapshot(false); + SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash); + } + + break; + + case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID: + elog(ERROR, "tuplecid value in changequeue"); + break; + } + + /* + * It is possible that the data is not sent to downstream for a + * long time either because the output plugin filtered it or there + * is a DDL that generates a lot of data that is not processed by + * the plugin. So, in such cases, the downstream can timeout. To + * avoid that we try to send a keepalive message if required. + * Trying to send a keepalive message after every change has some + * overhead, but testing showed there is no noticeable overhead if + * we do it after every ~100 changes. + */ +#define CHANGES_THRESHOLD 100 + + if (++changes_count >= CHANGES_THRESHOLD) + { + rb->update_progress_txn(rb, txn, change->lsn); + changes_count = 0; + } + } + + /* speculative insertion record must be freed by now */ + Assert(!specinsert); + + /* clean up the iterator */ + ReorderBufferIterTXNFinish(rb, iterstate); + iterstate = NULL; + + /* + * Update total transaction count and total bytes processed by the + * transaction and its subtransactions. Ensure to not count the + * streamed transaction multiple times. + * + * Note that the statistics computation has to be done after + * ReorderBufferIterTXNFinish as it releases the serialized change + * which we have already accounted in ReorderBufferIterTXNNext. + */ + if (!rbtxn_is_streamed(txn)) + rb->totalTxns++; + + rb->totalBytes += txn->total_size; + + /* + * Done with current changes, send the last message for this set of + * changes depending upon streaming mode. + */ + if (streaming) + { + if (stream_started) + { + rb->stream_stop(rb, txn, prev_lsn); + stream_started = false; + } + } + else + { + /* + * Call either PREPARE (for two-phase transactions) or COMMIT (for + * regular ones). + */ + if (rbtxn_prepared(txn)) + rb->prepare(rb, txn, commit_lsn); + else + rb->commit(rb, txn, commit_lsn); + } + + /* this is just a sanity check against bad output plugin behaviour */ + if (GetCurrentTransactionIdIfAny() != InvalidTransactionId) + elog(ERROR, "output plugin used XID %u", + GetCurrentTransactionId()); + + /* + * Remember the command ID and snapshot for the next set of changes in + * streaming mode. + */ + if (streaming) + ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id); + else if (snapshot_now->copied) + ReorderBufferFreeSnap(rb, snapshot_now); + + /* cleanup */ + TeardownHistoricSnapshot(false); + + /* + * Aborting the current (sub-)transaction as a whole has the right + * semantics. 
We want all locks acquired in here to be released, not + * reassigned to the parent and we do not want any database access + * have persistent effects. + */ + AbortCurrentTransaction(); + + /* make sure there's no cache pollution */ + ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations); + + if (using_subtxn) + RollbackAndReleaseCurrentSubTransaction(); + + /* + * We are here due to one of the four reasons: 1. Decoding an + * in-progress txn. 2. Decoding a prepared txn. 3. Decoding of a + * prepared txn that was (partially) streamed. 4. Decoding a committed + * txn. + * + * For 1, we allow truncation of txn data by removing the changes + * already streamed but still keeping other things like invalidations, + * snapshot, and tuplecids. For 2 and 3, we indicate + * ReorderBufferTruncateTXN to do more elaborate truncation of txn + * data as the entire transaction has been decoded except for commit. + * For 4, as the entire txn has been decoded, we can fully clean up + * the TXN reorder buffer. + */ + if (streaming || rbtxn_prepared(txn)) + { + ReorderBufferTruncateTXN(rb, txn, rbtxn_prepared(txn)); + /* Reset the CheckXidAlive */ + CheckXidAlive = InvalidTransactionId; + } + else + ReorderBufferCleanupTXN(rb, txn); + } + PG_CATCH(); + { + MemoryContext ecxt = MemoryContextSwitchTo(ccxt); + ErrorData *errdata = CopyErrorData(); + + /* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */ + if (iterstate) + ReorderBufferIterTXNFinish(rb, iterstate); + + TeardownHistoricSnapshot(true); + + /* + * Force cache invalidation to happen outside of a valid transaction + * to prevent catalog access as we just caught an error. + */ + AbortCurrentTransaction(); + + /* make sure there's no cache pollution */ + ReorderBufferExecuteInvalidations(txn->ninvalidations, + txn->invalidations); + + if (using_subtxn) + RollbackAndReleaseCurrentSubTransaction(); + + /* + * The error code ERRCODE_TRANSACTION_ROLLBACK indicates a concurrent + * abort of the (sub)transaction we are streaming or preparing. We + * need to do the cleanup and return gracefully on this error, see + * SetupCheckXidLive. + * + * This error code can be thrown by one of the callbacks we call + * during decoding so we need to ensure that we return gracefully only + * when we are sending the data in streaming mode and the streaming is + * not finished yet or when we are sending the data out on a PREPARE + * during a two-phase commit. + */ + if (errdata->sqlerrcode == ERRCODE_TRANSACTION_ROLLBACK && + (stream_started || rbtxn_prepared(txn))) + { + /* curtxn must be set for streaming or prepared transactions */ + Assert(curtxn); + + /* Cleanup the temporary error state. */ + FlushErrorState(); + FreeErrorData(errdata); + errdata = NULL; + curtxn->concurrent_abort = true; + + /* Reset the TXN so that it is allowed to stream remaining data. */ + ReorderBufferResetTXN(rb, txn, snapshot_now, + command_id, prev_lsn, + specinsert); + } + else + { + ReorderBufferCleanupTXN(rb, txn); + MemoryContextSwitchTo(ecxt); + PG_RE_THROW(); + } + } + PG_END_TRY(); +} + +/* + * Perform the replay of a transaction and its non-aborted subtransactions. + * + * Subtransactions previously have to be processed by + * ReorderBufferCommitChild(), even if previously assigned to the toplevel + * transaction with ReorderBufferAssignChild. + * + * This interface is called once a prepare or toplevel commit is read for both + * streamed as well as non-streamed transactions. 
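+ *
+ * For orientation, the (simplified) call path for a plain commit is:
+ *
+ *     DecodeCommit()
+ *         -> ReorderBufferCommit()
+ *             -> ReorderBufferReplay()
+ *                 -> ReorderBufferProcessTXN()   sends begin, changes, commit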
+ */ +static void +ReorderBufferReplay(ReorderBufferTXN *txn, + ReorderBuffer *rb, TransactionId xid, + XLogRecPtr commit_lsn, XLogRecPtr end_lsn, + TimestampTz commit_time, + RepOriginId origin_id, XLogRecPtr origin_lsn) +{ + Snapshot snapshot_now; + CommandId command_id = FirstCommandId; + + txn->final_lsn = commit_lsn; + txn->end_lsn = end_lsn; + txn->xact_time.commit_time = commit_time; + txn->origin_id = origin_id; + txn->origin_lsn = origin_lsn; + + /* + * If the transaction was (partially) streamed, we need to commit it in a + * 'streamed' way. That is, we first stream the remaining part of the + * transaction, and then invoke stream_commit message. + * + * Called after everything (origin ID, LSN, ...) is stored in the + * transaction to avoid passing that information directly. + */ + if (rbtxn_is_streamed(txn)) + { + ReorderBufferStreamCommit(rb, txn); + return; + } + + /* + * If this transaction has no snapshot, it didn't make any changes to the + * database, so there's nothing to decode. Note that + * ReorderBufferCommitChild will have transferred any snapshots from + * subtransactions if there were any. + */ + if (txn->base_snapshot == NULL) + { + Assert(txn->ninvalidations == 0); + + /* + * Removing this txn before a commit might result in the computation + * of an incorrect restart_lsn. See SnapBuildProcessRunningXacts. + */ + if (!rbtxn_prepared(txn)) + ReorderBufferCleanupTXN(rb, txn); + return; + } + + snapshot_now = txn->base_snapshot; + + /* Process and send the changes to output plugin. */ + ReorderBufferProcessTXN(rb, txn, commit_lsn, snapshot_now, + command_id, false); +} + +/* + * Commit a transaction. + * + * See comments for ReorderBufferReplay(). + */ +void +ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr commit_lsn, XLogRecPtr end_lsn, + TimestampTz commit_time, + RepOriginId origin_id, XLogRecPtr origin_lsn) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, + false); + + /* unknown transaction, nothing to replay */ + if (txn == NULL) + return; + + ReorderBufferReplay(txn, rb, xid, commit_lsn, end_lsn, commit_time, + origin_id, origin_lsn); +} + +/* + * Record the prepare information for a transaction. + */ +bool +ReorderBufferRememberPrepareInfo(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr prepare_lsn, XLogRecPtr end_lsn, + TimestampTz prepare_time, + RepOriginId origin_id, XLogRecPtr origin_lsn) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false); + + /* unknown transaction, nothing to do */ + if (txn == NULL) + return false; + + /* + * Remember the prepare information to be later used by commit prepared in + * case we skip doing prepare. + */ + txn->final_lsn = prepare_lsn; + txn->end_lsn = end_lsn; + txn->xact_time.prepare_time = prepare_time; + txn->origin_id = origin_id; + txn->origin_lsn = origin_lsn; + + return true; +} + +/* Remember that we have skipped prepare */ +void +ReorderBufferSkipPrepare(ReorderBuffer *rb, TransactionId xid) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false); + + /* unknown transaction, nothing to do */ + if (txn == NULL) + return; + + txn->txn_flags |= RBTXN_SKIPPED_PREPARE; +} + +/* + * Prepare a two-phase transaction. + * + * See comments for ReorderBufferReplay(). 
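+ *
+ * In callback terms, the output plugin sees a decoded two-phase transaction
+ * roughly as:
+ *
+ *     begin_prepare, change ..., prepare        when PREPARE is decoded
+ *     commit_prepared or rollback_prepared      when the xact is finished
+ *
+ * with the changes themselves replayed only once, at prepare time.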
+ */ +void +ReorderBufferPrepare(ReorderBuffer *rb, TransactionId xid, + char *gid) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, + false); + + /* unknown transaction, nothing to replay */ + if (txn == NULL) + return; + + txn->txn_flags |= RBTXN_PREPARE; + txn->gid = pstrdup(gid); + + /* The prepare info must have been updated in txn by now. */ + Assert(txn->final_lsn != InvalidXLogRecPtr); + + ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn, + txn->xact_time.prepare_time, txn->origin_id, txn->origin_lsn); + + /* + * We send the prepare for the concurrently aborted xacts so that later + * when rollback prepared is decoded and sent, the downstream should be + * able to rollback such a xact. See comments atop DecodePrepare. + * + * Note, for the concurrent_abort + streaming case a stream_prepare was + * already sent within the ReorderBufferReplay call above. + */ + if (txn->concurrent_abort && !rbtxn_is_streamed(txn)) + rb->prepare(rb, txn, txn->final_lsn); +} + +/* + * This is used to handle COMMIT/ROLLBACK PREPARED. + */ +void +ReorderBufferFinishPrepared(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr commit_lsn, XLogRecPtr end_lsn, + XLogRecPtr two_phase_at, + TimestampTz commit_time, RepOriginId origin_id, + XLogRecPtr origin_lsn, char *gid, bool is_commit) +{ + ReorderBufferTXN *txn; + XLogRecPtr prepare_end_lsn; + TimestampTz prepare_time; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, commit_lsn, false); + + /* unknown transaction, nothing to do */ + if (txn == NULL) + return; + + /* + * By this time the txn has the prepare record information, remember it to + * be later used for rollback. + */ + prepare_end_lsn = txn->end_lsn; + prepare_time = txn->xact_time.prepare_time; + + /* add the gid in the txn */ + txn->gid = pstrdup(gid); + + /* + * It is possible that this transaction is not decoded at prepare time + * either because by that time we didn't have a consistent snapshot, or + * two_phase was not enabled, or it was decoded earlier but we have + * restarted. We only need to send the prepare if it was not decoded + * earlier. We don't need to decode the xact for aborts if it is not done + * already. + */ + if ((txn->final_lsn < two_phase_at) && is_commit) + { + txn->txn_flags |= RBTXN_PREPARE; + + /* + * The prepare info must have been updated in txn even if we skip + * prepare. + */ + Assert(txn->final_lsn != InvalidXLogRecPtr); + + /* + * By this time the txn has the prepare record information and it is + * important to use that so that downstream gets the accurate + * information. If instead, we have passed commit information here + * then downstream can behave as it has already replayed commit + * prepared after the restart. + */ + ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn, + txn->xact_time.prepare_time, txn->origin_id, txn->origin_lsn); + } + + txn->final_lsn = commit_lsn; + txn->end_lsn = end_lsn; + txn->xact_time.commit_time = commit_time; + txn->origin_id = origin_id; + txn->origin_lsn = origin_lsn; + + if (is_commit) + rb->commit_prepared(rb, txn, commit_lsn); + else + rb->rollback_prepared(rb, txn, prepare_end_lsn, prepare_time); + + /* cleanup: make sure there's no cache pollution */ + ReorderBufferExecuteInvalidations(txn->ninvalidations, + txn->invalidations); + ReorderBufferCleanupTXN(rb, txn); +} + +/* + * Abort a transaction that possibly has previous changes. Needs to be first + * called for subtransactions and then for the toplevel xid. 
+ * + * NB: Transactions handled here have to have actively aborted (i.e. have + * produced an abort record). Implicitly aborted transactions are handled via + * ReorderBufferAbortOld(); transactions we're just not interested in, but + * which have committed are handled in ReorderBufferForget(). + * + * This function purges this transaction and its contents from memory and + * disk. + */ +void +ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, + TimestampTz abort_time) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, + false); + + /* unknown, nothing to remove */ + if (txn == NULL) + return; + + txn->xact_time.abort_time = abort_time; + + /* For streamed transactions notify the remote node about the abort. */ + if (rbtxn_is_streamed(txn)) + { + rb->stream_abort(rb, txn, lsn); + + /* + * We might have decoded changes for this transaction that could load + * the cache as per the current transaction's view (consider DDL's + * happened in this transaction). We don't want the decoding of future + * transactions to use those cache entries so execute invalidations. + */ + if (txn->ninvalidations > 0) + ReorderBufferImmediateInvalidation(rb, txn->ninvalidations, + txn->invalidations); + } + + /* cosmetic... */ + txn->final_lsn = lsn; + + /* remove potential on-disk data, and deallocate */ + ReorderBufferCleanupTXN(rb, txn); +} + +/* + * Abort all transactions that aren't actually running anymore because the + * server restarted. + * + * NB: These really have to be transactions that have aborted due to a server + * crash/immediate restart, as we don't deal with invalidations here. + */ +void +ReorderBufferAbortOld(ReorderBuffer *rb, TransactionId oldestRunningXid) +{ + dlist_mutable_iter it; + + /* + * Iterate through all (potential) toplevel TXNs and abort all that are + * older than what possibly can be running. Once we've found the first + * that is alive we stop, there might be some that acquired an xid earlier + * but started writing later, but it's unlikely and they will be cleaned + * up in a later call to this function. + */ + dlist_foreach_modify(it, &rb->toplevel_by_lsn) + { + ReorderBufferTXN *txn; + + txn = dlist_container(ReorderBufferTXN, node, it.cur); + + if (TransactionIdPrecedes(txn->xid, oldestRunningXid)) + { + elog(DEBUG2, "aborting old transaction %u", txn->xid); + + /* Notify the remote node about the crash/immediate restart. */ + if (rbtxn_is_streamed(txn)) + rb->stream_abort(rb, txn, InvalidXLogRecPtr); + + /* remove potential on-disk data, and deallocate this tx */ + ReorderBufferCleanupTXN(rb, txn); + } + else + return; + } +} + +/* + * Forget the contents of a transaction if we aren't interested in its + * contents. Needs to be first called for subtransactions and then for the + * toplevel xid. + * + * This is significantly different to ReorderBufferAbort() because + * transactions that have committed need to be treated differently from aborted + * ones since they may have modified the catalog. + * + * Note that this is only allowed to be called in the moment a transaction + * commit has just been read, not earlier; otherwise later records referring + * to this xid might re-create the transaction incompletely. 
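+ *
+ * A typical caller is the commit path for a transaction the output plugin
+ * (or, say, an origin filter) turns out not to be interested in: its
+ * contents were necessarily queued before that decision could be made, so
+ * they are dropped here while any catalog invalidations are still honored.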
+ */
+void
+ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
+{
+ ReorderBufferTXN *txn;
+
+ txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
+ false);
+
+ /* unknown, nothing to forget */
+ if (txn == NULL)
+ return;
+
+ /* this transaction mustn't be streamed */
+ Assert(!rbtxn_is_streamed(txn));
+
+ /* cosmetic... */
+ txn->final_lsn = lsn;
+
+ /*
+ * Process cache invalidation messages if there are any. Even if we're not
+ * interested in the transaction's contents, it could have manipulated the
+ * catalog and we need to update the caches according to that.
+ */
+ if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
+ ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
+ txn->invalidations);
+ else
+ Assert(txn->ninvalidations == 0);
+
+ /* remove potential on-disk data, and deallocate */
+ ReorderBufferCleanupTXN(rb, txn);
+}
+
+/*
+ * Invalidate cache for those transactions that need to be skipped just in case
+ * catalogs were manipulated as part of the transaction.
+ *
+ * Note that this is a special-purpose function for prepared transactions where
+ * we don't want to clean up the TXN even when we decide to skip it. See
+ * DecodePrepare.
+ */
+void
+ReorderBufferInvalidate(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
+{
+ ReorderBufferTXN *txn;
+
+ txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
+ false);
+
+ /* unknown, nothing to do */
+ if (txn == NULL)
+ return;
+
+ /*
+ * Process cache invalidation messages if there are any. Even if we're not
+ * interested in the transaction's contents, it could have manipulated the
+ * catalog and we need to update the caches according to that.
+ */
+ if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
+ ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
+ txn->invalidations);
+ else
+ Assert(txn->ninvalidations == 0);
+}
+
+
+/*
+ * Execute invalidations happening outside the context of a decoded
+ * transaction. That currently happens either for xid-less commits
+ * (cf. RecordTransactionCommit()) or for invalidations in uninteresting
+ * transactions (via ReorderBufferForget()).
+ */
+void
+ReorderBufferImmediateInvalidation(ReorderBuffer *rb, uint32 ninvalidations,
+ SharedInvalidationMessage *invalidations)
+{
+ bool use_subtxn = IsTransactionOrTransactionBlock();
+ int i;
+
+ if (use_subtxn)
+ BeginInternalSubTransaction("replay");
+
+ /*
+ * Force invalidations to happen outside of a valid transaction - that way
+ * entries will just be marked as invalid without accessing the catalog.
+ * That's advantageous because we don't need to set up the full state
+ * necessary for catalog access.
+ */
+ if (use_subtxn)
+ AbortCurrentTransaction();
+
+ for (i = 0; i < ninvalidations; i++)
+ LocalExecuteInvalidationMessage(&invalidations[i]);
+
+ if (use_subtxn)
+ RollbackAndReleaseCurrentSubTransaction();
+}
+
+/*
+ * Tell reorderbuffer about an xid seen in the WAL stream. Has to be called at
+ * least once for every xid in XLogRecord->xl_xid (other places in records
+ * may, but do not have to be passed through here).
+ *
+ * Reorderbuffer keeps some data structures about transactions in LSN order,
+ * for efficiency. To do that it has to know when transactions are first seen
+ * in the WAL. As many types of records are not actually interesting for
+ * logical decoding, they do not necessarily pass through here.
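+ *
+ * A record handler therefore typically begins with something like (sketch;
+ * 'record' stands for the XLogReaderState at hand):
+ *
+ *     ReorderBufferProcessXid(rb, XLogRecGetXid(record), record->ReadRecPtr);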
+ */
+void
+ReorderBufferProcessXid(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
+{
+ /* many records won't have an xid assigned, centralize check here */
+ if (xid != InvalidTransactionId)
+ ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
+}
+
+/*
+ * Add a new snapshot to this transaction that may only be used after lsn
+ * 'lsn' because the previous snapshot doesn't describe the catalog correctly
+ * for following rows.
+ */
+void
+ReorderBufferAddSnapshot(ReorderBuffer *rb, TransactionId xid,
+ XLogRecPtr lsn, Snapshot snap)
+{
+ ReorderBufferChange *change = ReorderBufferGetChange(rb);
+
+ change->data.snapshot = snap;
+ change->action = REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT;
+
+ ReorderBufferQueueChange(rb, xid, lsn, change, false);
+}
+
+/*
+ * Set up the transaction's base snapshot.
+ *
+ * If we know that xid is a subtransaction, set the base snapshot on the
+ * top-level transaction instead.
+ */
+void
+ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid,
+ XLogRecPtr lsn, Snapshot snap)
+{
+ ReorderBufferTXN *txn;
+ bool is_new;
+
+ Assert(snap != NULL);
+
+ /*
+ * Fetch the transaction to operate on. If we know it's a subtransaction,
+ * operate on its top-level transaction instead.
+ */
+ txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true);
+ if (rbtxn_is_known_subxact(txn))
+ txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
+ NULL, InvalidXLogRecPtr, false);
+ Assert(txn->base_snapshot == NULL);
+
+ txn->base_snapshot = snap;
+ txn->base_snapshot_lsn = lsn;
+ dlist_push_tail(&rb->txns_by_base_snapshot_lsn, &txn->base_snapshot_node);
+
+ AssertTXNLsnOrder(rb);
+}
+
+/*
+ * Access the catalog with this CommandId at this point in the changestream.
+ *
+ * May only be called for command ids > 1
+ */
+void
+ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid,
+ XLogRecPtr lsn, CommandId cid)
+{
+ ReorderBufferChange *change = ReorderBufferGetChange(rb);
+
+ change->data.command_id = cid;
+ change->action = REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID;
+
+ ReorderBufferQueueChange(rb, xid, lsn, change, false);
+}
+
+/*
+ * Update memory counters to account for the new or removed change.
+ *
+ * We update two counters - in the reorder buffer, and in the transaction
+ * containing the change. The reorder buffer counter allows us to quickly
+ * decide if we reached the memory limit, while the transaction counter
+ * allows us to quickly pick the largest transaction for eviction.
+ *
+ * When streaming is enabled, we need to update the toplevel transaction
+ * counters instead - we don't really care about subtransactions as we
+ * can't stream them individually anyway, and we only pick toplevel
+ * transactions for eviction. So only toplevel transactions matter.
+ */
+static void
+ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
+ ReorderBufferChange *change,
+ bool addition, Size sz)
+{
+ ReorderBufferTXN *txn;
+ ReorderBufferTXN *toptxn;
+
+ Assert(change->txn);
+
+ /*
+ * Ignore tuple CID changes, because those are not evicted when reaching
+ * the memory limit. So we just don't count them, because it might easily
+ * trigger a pointless attempt to spill.
+ */
+ if (change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID)
+ return;
+
+ txn = change->txn;
+
+ /*
+ * Update the total size in top level as well. This is later used to
+ * compute the decoding stats.
+ */
+ toptxn = rbtxn_get_toptxn(txn);
+
+ if (addition)
+ {
+ txn->size += sz;
+ rb->size += sz;
+
+ /* Update the total size in the top transaction.
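+ * This top-level total (rather than the per-(sub)txn size) is what the
+ * streaming eviction path compares in
+ * ReorderBufferLargestStreamableTopTXN, since subtransactions are never
+ * streamed on their own.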
*/ + toptxn->total_size += sz; + } + else + { + Assert((rb->size >= sz) && (txn->size >= sz)); + txn->size -= sz; + rb->size -= sz; + + /* Update the total size in the top transaction. */ + toptxn->total_size -= sz; + } + + Assert(txn->size <= rb->size); +} + +/* + * Add new (relfilelocator, tid) -> (cmin, cmax) mappings. + * + * We do not include this change type in memory accounting, because we + * keep CIDs in a separate list and do not evict them when reaching + * the memory limit. + */ +void +ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, RelFileLocator locator, + ItemPointerData tid, CommandId cmin, + CommandId cmax, CommandId combocid) +{ + ReorderBufferChange *change = ReorderBufferGetChange(rb); + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); + + change->data.tuplecid.locator = locator; + change->data.tuplecid.tid = tid; + change->data.tuplecid.cmin = cmin; + change->data.tuplecid.cmax = cmax; + change->data.tuplecid.combocid = combocid; + change->lsn = lsn; + change->txn = txn; + change->action = REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID; + + dlist_push_tail(&txn->tuplecids, &change->node); + txn->ntuplecids++; +} + +/* + * Accumulate the invalidations for executing them later. + * + * This needs to be called for each XLOG_XACT_INVALIDATIONS message and + * accumulates all the invalidation messages in the toplevel transaction, if + * available, otherwise in the current transaction, as well as in the form of + * change in reorder buffer. We require to record it in form of the change + * so that we can execute only the required invalidations instead of executing + * all the invalidations on each CommandId increment. We also need to + * accumulate these in the txn buffer because in some cases where we skip + * processing the transaction (see ReorderBufferForget), we need to execute + * all the invalidations together. + */ +void +ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, Size nmsgs, + SharedInvalidationMessage *msgs) +{ + ReorderBufferTXN *txn; + MemoryContext oldcontext; + ReorderBufferChange *change; + + txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); + + oldcontext = MemoryContextSwitchTo(rb->context); + + /* + * Collect all the invalidations under the top transaction, if available, + * so that we can execute them all together. See comments atop this + * function. + */ + txn = rbtxn_get_toptxn(txn); + + Assert(nmsgs > 0); + + /* Accumulate invalidations. 
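+ * For the first set of messages in this transaction we allocate a fresh
+ * array; for later sets we grow the existing array with repalloc and
+ * append. The array lives in rb->context (hence the memory context switch
+ * above), so it survives until the transaction is cleaned up.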
*/ + if (txn->ninvalidations == 0) + { + txn->ninvalidations = nmsgs; + txn->invalidations = (SharedInvalidationMessage *) + palloc(sizeof(SharedInvalidationMessage) * nmsgs); + memcpy(txn->invalidations, msgs, + sizeof(SharedInvalidationMessage) * nmsgs); + } + else + { + txn->invalidations = (SharedInvalidationMessage *) + repalloc(txn->invalidations, sizeof(SharedInvalidationMessage) * + (txn->ninvalidations + nmsgs)); + + memcpy(txn->invalidations + txn->ninvalidations, msgs, + nmsgs * sizeof(SharedInvalidationMessage)); + txn->ninvalidations += nmsgs; + } + + change = ReorderBufferGetChange(rb); + change->action = REORDER_BUFFER_CHANGE_INVALIDATION; + change->data.inval.ninvalidations = nmsgs; + change->data.inval.invalidations = (SharedInvalidationMessage *) + palloc(sizeof(SharedInvalidationMessage) * nmsgs); + memcpy(change->data.inval.invalidations, msgs, + sizeof(SharedInvalidationMessage) * nmsgs); + + ReorderBufferQueueChange(rb, xid, lsn, change, false); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Apply all invalidations we know. Possibly we only need parts at this point + * in the changestream but we don't know which those are. + */ +static void +ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs) +{ + int i; + + for (i = 0; i < nmsgs; i++) + LocalExecuteInvalidationMessage(&msgs[i]); +} + +/* + * Mark a transaction as containing catalog changes + */ +void +ReorderBufferXidSetCatalogChanges(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); + + if (!rbtxn_has_catalog_changes(txn)) + { + txn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES; + dclist_push_tail(&rb->catchange_txns, &txn->catchange_node); + } + + /* + * Mark top-level transaction as having catalog changes too if one of its + * children has so that the ReorderBufferBuildTupleCidHash can + * conveniently check just top-level transaction and decide whether to + * build the hash table or not. + */ + if (rbtxn_is_subtxn(txn)) + { + ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn); + + if (!rbtxn_has_catalog_changes(toptxn)) + { + toptxn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES; + dclist_push_tail(&rb->catchange_txns, &toptxn->catchange_node); + } + } +} + +/* + * Return palloc'ed array of the transactions that have changed catalogs. + * The returned array is sorted in xidComparator order. + * + * The caller must free the returned array when done with it. + */ +TransactionId * +ReorderBufferGetCatalogChangesXacts(ReorderBuffer *rb) +{ + dlist_iter iter; + TransactionId *xids = NULL; + size_t xcnt = 0; + + /* Quick return if the list is empty */ + if (dclist_count(&rb->catchange_txns) == 0) + return NULL; + + /* Initialize XID array */ + xids = (TransactionId *) palloc(sizeof(TransactionId) * + dclist_count(&rb->catchange_txns)); + dclist_foreach(iter, &rb->catchange_txns) + { + ReorderBufferTXN *txn = dclist_container(ReorderBufferTXN, + catchange_node, + iter.cur); + + Assert(rbtxn_has_catalog_changes(txn)); + + xids[xcnt++] = txn->xid; + } + + qsort(xids, xcnt, sizeof(TransactionId), xidComparator); + + Assert(xcnt == dclist_count(&rb->catchange_txns)); + return xids; +} + +/* + * Query whether a transaction is already *known* to contain catalog + * changes. This can be wrong until directly before the commit! 
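+ *
+ * ("Wrong" here means a false negative: the records that mark a transaction
+ * as catalog-modifying may be decoded at any point during the transaction,
+ * so the flag can still appear later, up until the commit record itself.)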
+ */
+bool
+ReorderBufferXidHasCatalogChanges(ReorderBuffer *rb, TransactionId xid)
+{
+    ReorderBufferTXN *txn;
+
+    txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
+                                false);
+    if (txn == NULL)
+        return false;
+
+    return rbtxn_has_catalog_changes(txn);
+}
+
+/*
+ * ReorderBufferXidHasBaseSnapshot
+ *      Have we already set the base snapshot for the given txn/subtxn?
+ */
+bool
+ReorderBufferXidHasBaseSnapshot(ReorderBuffer *rb, TransactionId xid)
+{
+    ReorderBufferTXN *txn;
+
+    txn = ReorderBufferTXNByXid(rb, xid, false,
+                                NULL, InvalidXLogRecPtr, false);
+
+    /* transaction isn't known yet, ergo no snapshot */
+    if (txn == NULL)
+        return false;
+
+    /* a known subtxn? operate on top-level txn instead */
+    if (rbtxn_is_known_subxact(txn))
+        txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
+                                    NULL, InvalidXLogRecPtr, false);
+
+    return txn->base_snapshot != NULL;
+}
+
+
+/*
+ * ---------------------------------------
+ * Disk serialization support
+ * ---------------------------------------
+ */
+
+/*
+ * Ensure the IO buffer is >= sz.
+ */
+static void
+ReorderBufferSerializeReserve(ReorderBuffer *rb, Size sz)
+{
+    if (!rb->outbufsize)
+    {
+        rb->outbuf = MemoryContextAlloc(rb->context, sz);
+        rb->outbufsize = sz;
+    }
+    else if (rb->outbufsize < sz)
+    {
+        rb->outbuf = repalloc(rb->outbuf, sz);
+        rb->outbufsize = sz;
+    }
+}
+
+/*
+ * Find the largest transaction (toplevel or subxact) to evict (spill to disk).
+ *
+ * XXX With many subtransactions this might be quite slow, because we'll have
+ * to walk through all of them. There are several options for improving that:
+ * (a) maintain a secondary structure with transactions sorted by the amount
+ * of changes, (b) don't look for the single largest transaction, but e.g. for
+ * a transaction using at least some fraction of the memory limit, and
+ * (c) evict multiple transactions at once, e.g. to free a given portion of
+ * the memory limit (e.g. 50%).
+ */
+static ReorderBufferTXN *
+ReorderBufferLargestTXN(ReorderBuffer *rb)
+{
+    HASH_SEQ_STATUS hash_seq;
+    ReorderBufferTXNByIdEnt *ent;
+    ReorderBufferTXN *largest = NULL;
+
+    hash_seq_init(&hash_seq, rb->by_txn);
+    while ((ent = hash_seq_search(&hash_seq)) != NULL)
+    {
+        ReorderBufferTXN *txn = ent->txn;
+
+        /* if the current transaction is larger, remember it */
+        if ((!largest) || (txn->size > largest->size))
+            largest = txn;
+    }
+
+    Assert(largest);
+    Assert(largest->size > 0);
+    Assert(largest->size <= rb->size);
+
+    return largest;
+}
+
+/*
+ * Find the largest streamable toplevel transaction to evict (by streaming).
+ *
+ * This can be seen as an optimized version of ReorderBufferLargestTXN, which
+ * should give us the same transaction (because we don't update the memory
+ * accounting for subtransactions when streaming, so it's always 0). But here
+ * we can simply iterate over the limited number of toplevel transactions that
+ * have a base snapshot. There is no point in selecting a transaction that
+ * doesn't have a base snapshot, because we don't decode such transactions.
+ * Also, we do not select a transaction which doesn't have any streamable
+ * change.
+ *
+ * Note that we skip transactions that contain incomplete changes. There is
+ * room for optimization here: we could select the largest transaction which
+ * has incomplete changes. But that would make the code and design quite
+ * complex, and it might not be worth the benefit. If we ever plan to stream
+ * transactions that contain incomplete changes, then we need to find a way to
+ * partially stream/truncate the transaction changes in-memory and build a
+ * mechanism to partially truncate the spilled files. Additionally, whenever
+ * we partially stream a transaction, we need to remember the last streamed
+ * LSN, and next time restore from that segment and offset in the WAL. As we
+ * stream the changes from the top transaction and restore them
+ * subtransaction-wise, we would even need to remember the subxact from which
+ * we streamed the last change.
+ */
+static ReorderBufferTXN *
+ReorderBufferLargestStreamableTopTXN(ReorderBuffer *rb)
+{
+    dlist_iter  iter;
+    Size        largest_size = 0;
+    ReorderBufferTXN *largest = NULL;
+
+    /* Find the largest top-level transaction having a base snapshot. */
+    dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
+    {
+        ReorderBufferTXN *txn;
+
+        txn = dlist_container(ReorderBufferTXN, base_snapshot_node, iter.cur);
+
+        /* must not be a subtxn */
+        Assert(!rbtxn_is_known_subxact(txn));
+        /* base_snapshot must be set */
+        Assert(txn->base_snapshot != NULL);
+
+        if ((largest == NULL || txn->total_size > largest_size) &&
+            (txn->total_size > 0) && !(rbtxn_has_partial_change(txn)) &&
+            rbtxn_has_streamable_change(txn))
+        {
+            largest = txn;
+            largest_size = txn->total_size;
+        }
+    }
+
+    return largest;
+}
+
+/*
+ * Check whether the logical_decoding_work_mem limit was reached, and if so,
+ * pick the largest (sub)transaction, one at a time, to evict and spill its
+ * changes to disk or send to the output plugin, until we are back under the
+ * memory limit.
+ *
+ * If debug_logical_replication_streaming is set to "immediate", stream or
+ * serialize the changes immediately.
+ *
+ * XXX At this point we select transactions until we are back under the
+ * memory limit, but we might also adopt a more elaborate eviction strategy -
+ * for example, evicting enough transactions to free a certain fraction
+ * (e.g. 50%) of the memory limit.
+ */
+static void
+ReorderBufferCheckMemoryLimit(ReorderBuffer *rb)
+{
+    ReorderBufferTXN *txn;
+
+    /*
+     * Bail out if debug_logical_replication_streaming is buffered and we
+     * haven't exceeded the memory limit.
+     */
+    if (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_BUFFERED &&
+        rb->size < logical_decoding_work_mem * 1024L)
+        return;
+
+    /*
+     * If debug_logical_replication_streaming is immediate, loop until there's
+     * no change. Otherwise, loop until we are back under the memory limit.
+     * One might think that just by evicting the largest (sub)transaction we
+     * will come back under the memory limit, on the assumption that the
+     * selected transaction is at least as large as the most recent change
+     * (the one that pushed us over the limit). However, that is not true,
+     * because the user can reduce logical_decoding_work_mem to a smaller
+     * value before the most recent change arrives.
+     */
+    while (rb->size >= logical_decoding_work_mem * 1024L ||
+           (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_IMMEDIATE &&
+            rb->size > 0))
+    {
+        /*
+         * Pick the largest transaction and evict it from memory by
+         * streaming, if possible. Otherwise, spill to disk.
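+         * (Streaming requires output plugin support and a consistent
+         * snapshot; see ReorderBufferCanStartStreaming(). When either
+         * condition is not met, we fall back to serializing the transaction
+         * to disk. Note also that logical_decoding_work_mem is in kilobytes,
+         * hence the multiplication by 1024 when comparing against rb->size,
+         * which is counted in bytes.)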
+ */ + if (ReorderBufferCanStartStreaming(rb) && + (txn = ReorderBufferLargestStreamableTopTXN(rb)) != NULL) + { + /* we know there has to be one, because the size is not zero */ + Assert(txn && rbtxn_is_toptxn(txn)); + Assert(txn->total_size > 0); + Assert(rb->size >= txn->total_size); + + ReorderBufferStreamTXN(rb, txn); + } + else + { + /* + * Pick the largest transaction (or subtransaction) and evict it + * from memory by serializing it to disk. + */ + txn = ReorderBufferLargestTXN(rb); + + /* we know there has to be one, because the size is not zero */ + Assert(txn); + Assert(txn->size > 0); + Assert(rb->size >= txn->size); + + ReorderBufferSerializeTXN(rb, txn); + } + + /* + * After eviction, the transaction should have no entries in memory, + * and should use 0 bytes for changes. + */ + Assert(txn->size == 0); + Assert(txn->nentries_mem == 0); + } + + /* We must be under the memory limit now. */ + Assert(rb->size < logical_decoding_work_mem * 1024L); +} + +/* + * Spill data of a large transaction (and its subtransactions) to disk. + */ +static void +ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + dlist_iter subtxn_i; + dlist_mutable_iter change_i; + int fd = -1; + XLogSegNo curOpenSegNo = 0; + Size spilled = 0; + Size size = txn->size; + + elog(DEBUG2, "spill %u changes in XID %u to disk", + (uint32) txn->nentries_mem, txn->xid); + + /* do the same to all child TXs */ + dlist_foreach(subtxn_i, &txn->subtxns) + { + ReorderBufferTXN *subtxn; + + subtxn = dlist_container(ReorderBufferTXN, node, subtxn_i.cur); + ReorderBufferSerializeTXN(rb, subtxn); + } + + /* serialize changestream */ + dlist_foreach_modify(change_i, &txn->changes) + { + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, change_i.cur); + + /* + * store in segment in which it belongs by start lsn, don't split over + * multiple segments tho + */ + if (fd == -1 || + !XLByteInSeg(change->lsn, curOpenSegNo, wal_segment_size)) + { + char path[MAXPGPATH]; + + if (fd != -1) + CloseTransientFile(fd); + + XLByteToSeg(change->lsn, curOpenSegNo, wal_segment_size); + + /* + * No need to care about TLIs here, only used during a single run, + * so each LSN only maps to a specific WAL record. + */ + ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid, + curOpenSegNo); + + /* open segment, create it if necessary */ + fd = OpenTransientFile(path, + O_CREAT | O_WRONLY | O_APPEND | PG_BINARY); + + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + } + + ReorderBufferSerializeChange(rb, txn, fd, change); + dlist_delete(&change->node); + ReorderBufferReturnChange(rb, change, true); + + spilled++; + } + + /* update the statistics iff we have spilled anything */ + if (spilled) + { + rb->spillCount += 1; + rb->spillBytes += size; + + /* don't consider already serialized transactions */ + rb->spillTxns += (rbtxn_is_serialized(txn) || rbtxn_is_serialized_clear(txn)) ? 0 : 1; + + /* update the decoding stats */ + UpdateDecodingStats((LogicalDecodingContext *) rb->private_data); + } + + Assert(spilled == txn->nentries_mem); + Assert(dlist_is_empty(&txn->changes)); + txn->nentries_mem = 0; + txn->txn_flags |= RBTXN_IS_SERIALIZED; + + if (fd != -1) + CloseTransientFile(fd); +} + +/* + * Serialize individual change to disk. 
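+ *
+ * On-disk format (sketch): each change is written as a fixed-size
+ * ReorderBufferDiskChange header (the total record size plus a verbatim
+ * copy of the ReorderBufferChange struct) followed by an action-specific
+ * variable-length payload. For an INSERT/UPDATE/DELETE this is:
+ *
+ *   [ReorderBufferDiskChange][HeapTupleData + old tuple data, if any]
+ *                            [HeapTupleData + new tuple data, if any]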
+ */ +static void +ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn, + int fd, ReorderBufferChange *change) +{ + ReorderBufferDiskChange *ondisk; + Size sz = sizeof(ReorderBufferDiskChange); + + ReorderBufferSerializeReserve(rb, sz); + + ondisk = (ReorderBufferDiskChange *) rb->outbuf; + memcpy(&ondisk->change, change, sizeof(ReorderBufferChange)); + + switch (change->action) + { + /* fall through these, they're all similar enough */ + case REORDER_BUFFER_CHANGE_INSERT: + case REORDER_BUFFER_CHANGE_UPDATE: + case REORDER_BUFFER_CHANGE_DELETE: + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT: + { + char *data; + ReorderBufferTupleBuf *oldtup, + *newtup; + Size oldlen = 0; + Size newlen = 0; + + oldtup = change->data.tp.oldtuple; + newtup = change->data.tp.newtuple; + + if (oldtup) + { + sz += sizeof(HeapTupleData); + oldlen = oldtup->tuple.t_len; + sz += oldlen; + } + + if (newtup) + { + sz += sizeof(HeapTupleData); + newlen = newtup->tuple.t_len; + sz += newlen; + } + + /* make sure we have enough space */ + ReorderBufferSerializeReserve(rb, sz); + + data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange); + /* might have been reallocated above */ + ondisk = (ReorderBufferDiskChange *) rb->outbuf; + + if (oldlen) + { + memcpy(data, &oldtup->tuple, sizeof(HeapTupleData)); + data += sizeof(HeapTupleData); + + memcpy(data, oldtup->tuple.t_data, oldlen); + data += oldlen; + } + + if (newlen) + { + memcpy(data, &newtup->tuple, sizeof(HeapTupleData)); + data += sizeof(HeapTupleData); + + memcpy(data, newtup->tuple.t_data, newlen); + data += newlen; + } + break; + } + case REORDER_BUFFER_CHANGE_MESSAGE: + { + char *data; + Size prefix_size = strlen(change->data.msg.prefix) + 1; + + sz += prefix_size + change->data.msg.message_size + + sizeof(Size) + sizeof(Size); + ReorderBufferSerializeReserve(rb, sz); + + data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange); + + /* might have been reallocated above */ + ondisk = (ReorderBufferDiskChange *) rb->outbuf; + + /* write the prefix including the size */ + memcpy(data, &prefix_size, sizeof(Size)); + data += sizeof(Size); + memcpy(data, change->data.msg.prefix, + prefix_size); + data += prefix_size; + + /* write the message including the size */ + memcpy(data, &change->data.msg.message_size, sizeof(Size)); + data += sizeof(Size); + memcpy(data, change->data.msg.message, + change->data.msg.message_size); + data += change->data.msg.message_size; + + break; + } + case REORDER_BUFFER_CHANGE_INVALIDATION: + { + char *data; + Size inval_size = sizeof(SharedInvalidationMessage) * + change->data.inval.ninvalidations; + + sz += inval_size; + + ReorderBufferSerializeReserve(rb, sz); + data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange); + + /* might have been reallocated above */ + ondisk = (ReorderBufferDiskChange *) rb->outbuf; + memcpy(data, change->data.inval.invalidations, inval_size); + data += inval_size; + + break; + } + case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: + { + Snapshot snap; + char *data; + + snap = change->data.snapshot; + + sz += sizeof(SnapshotData) + + sizeof(TransactionId) * snap->xcnt + + sizeof(TransactionId) * snap->subxcnt; + + /* make sure we have enough space */ + ReorderBufferSerializeReserve(rb, sz); + data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange); + /* might have been reallocated above */ + ondisk = (ReorderBufferDiskChange *) rb->outbuf; + + memcpy(data, snap, sizeof(SnapshotData)); + data += sizeof(SnapshotData); + + if (snap->xcnt) + { + memcpy(data, snap->xip, 
+ sizeof(TransactionId) * snap->xcnt); + data += sizeof(TransactionId) * snap->xcnt; + } + + if (snap->subxcnt) + { + memcpy(data, snap->subxip, + sizeof(TransactionId) * snap->subxcnt); + data += sizeof(TransactionId) * snap->subxcnt; + } + break; + } + case REORDER_BUFFER_CHANGE_TRUNCATE: + { + Size size; + char *data; + + /* account for the OIDs of truncated relations */ + size = sizeof(Oid) * change->data.truncate.nrelids; + sz += size; + + /* make sure we have enough space */ + ReorderBufferSerializeReserve(rb, sz); + + data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange); + /* might have been reallocated above */ + ondisk = (ReorderBufferDiskChange *) rb->outbuf; + + memcpy(data, change->data.truncate.relids, size); + data += size; + + break; + } + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM: + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT: + case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID: + case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID: + /* ReorderBufferChange contains everything important */ + break; + } + + ondisk->size = sz; + + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_REORDER_BUFFER_WRITE); + if (write(fd, rb->outbuf, ondisk->size) != ondisk->size) + { + int save_errno = errno; + + CloseTransientFile(fd); + + /* if write didn't set errno, assume problem is no disk space */ + errno = save_errno ? save_errno : ENOSPC; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to data file for XID %u: %m", + txn->xid))); + } + pgstat_report_wait_end(); + + /* + * Keep the transaction's final_lsn up to date with each change we send to + * disk, so that ReorderBufferRestoreCleanup works correctly. (We used to + * only do this on commit and abort records, but that doesn't work if a + * system crash leaves a transaction without its abort record). + * + * Make sure not to move it backwards. + */ + if (txn->final_lsn < change->lsn) + txn->final_lsn = change->lsn; + + Assert(ondisk->change.action == change->action); +} + +/* Returns true, if the output plugin supports streaming, false, otherwise. */ +static inline bool +ReorderBufferCanStream(ReorderBuffer *rb) +{ + LogicalDecodingContext *ctx = rb->private_data; + + return ctx->streaming; +} + +/* Returns true, if the streaming can be started now, false, otherwise. */ +static inline bool +ReorderBufferCanStartStreaming(ReorderBuffer *rb) +{ + LogicalDecodingContext *ctx = rb->private_data; + SnapBuild *builder = ctx->snapshot_builder; + + /* We can't start streaming unless a consistent state is reached. */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT) + return false; + + /* + * We can't start streaming immediately even if the streaming is enabled + * because we previously decoded this transaction and now just are + * restarting. + */ + if (ReorderBufferCanStream(rb) && + !SnapBuildXactNeedsSkip(builder, ctx->reader->ReadRecPtr)) + return true; + + return false; +} + +/* + * Send data of a large transaction (and its subtransactions) to the + * output plugin, but using the stream API. + */ +static void +ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + Snapshot snapshot_now; + CommandId command_id; + Size stream_bytes; + bool txn_is_streamed; + + /* We can never reach here for a subtransaction. */ + Assert(rbtxn_is_toptxn(txn)); + + /* + * We can't make any assumptions about base snapshot here, similar to what + * ReorderBufferCommit() does. 
That relies on base_snapshot getting + * transferred from subxact in ReorderBufferCommitChild(), but that was + * not yet called as the transaction is in-progress. + * + * So just walk the subxacts and use the same logic here. But we only need + * to do that once, when the transaction is streamed for the first time. + * After that we need to reuse the snapshot from the previous run. + * + * Unlike DecodeCommit which adds xids of all the subtransactions in + * snapshot's xip array via SnapBuildCommitTxn, we can't do that here but + * we do add them to subxip array instead via ReorderBufferCopySnap. This + * allows the catalog changes made in subtransactions decoded till now to + * be visible. + */ + if (txn->snapshot_now == NULL) + { + dlist_iter subxact_i; + + /* make sure this transaction is streamed for the first time */ + Assert(!rbtxn_is_streamed(txn)); + + /* at the beginning we should have invalid command ID */ + Assert(txn->command_id == InvalidCommandId); + + dlist_foreach(subxact_i, &txn->subtxns) + { + ReorderBufferTXN *subtxn; + + subtxn = dlist_container(ReorderBufferTXN, node, subxact_i.cur); + ReorderBufferTransferSnapToParent(txn, subtxn); + } + + /* + * If this transaction has no snapshot, it didn't make any changes to + * the database till now, so there's nothing to decode. + */ + if (txn->base_snapshot == NULL) + { + Assert(txn->ninvalidations == 0); + return; + } + + command_id = FirstCommandId; + snapshot_now = ReorderBufferCopySnap(rb, txn->base_snapshot, + txn, command_id); + } + else + { + /* the transaction must have been already streamed */ + Assert(rbtxn_is_streamed(txn)); + + /* + * Nah, we already have snapshot from the previous streaming run. We + * assume new subxacts can't move the LSN backwards, and so can't beat + * the LSN condition in the previous branch (so no need to walk + * through subxacts again). In fact, we must not do that as we may be + * using snapshot half-way through the subxact. + */ + command_id = txn->command_id; + + /* + * We can't use txn->snapshot_now directly because after the last + * streaming run, we might have got some new sub-transactions. So we + * need to add them to the snapshot. + */ + snapshot_now = ReorderBufferCopySnap(rb, txn->snapshot_now, + txn, command_id); + + /* Free the previously copied snapshot. */ + Assert(txn->snapshot_now->copied); + ReorderBufferFreeSnap(rb, txn->snapshot_now); + txn->snapshot_now = NULL; + } + + /* + * Remember this information to be used later to update stats. We can't + * update the stats here as an error while processing the changes would + * lead to the accumulation of stats even though we haven't streamed all + * the changes. + */ + txn_is_streamed = rbtxn_is_streamed(txn); + stream_bytes = txn->total_size; + + /* Process and send the changes to output plugin. */ + ReorderBufferProcessTXN(rb, txn, InvalidXLogRecPtr, snapshot_now, + command_id, true); + + rb->streamCount += 1; + rb->streamBytes += stream_bytes; + + /* Don't consider already streamed transaction. */ + rb->streamTxns += (txn_is_streamed) ? 0 : 1; + + /* update the decoding stats */ + UpdateDecodingStats((LogicalDecodingContext *) rb->private_data); + + Assert(dlist_is_empty(&txn->changes)); + Assert(txn->nentries == 0); + Assert(txn->nentries_mem == 0); +} + +/* + * Size of a change in memory. 
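+ *
+ * The per-action arithmetic below intentionally parallels
+ * ReorderBufferSerializeChange(), with sizeof(ReorderBufferChange) taking
+ * the place of the on-disk header, so the memory accounting stays in step
+ * with what serialization would write out.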
+ */ +static Size +ReorderBufferChangeSize(ReorderBufferChange *change) +{ + Size sz = sizeof(ReorderBufferChange); + + switch (change->action) + { + /* fall through these, they're all similar enough */ + case REORDER_BUFFER_CHANGE_INSERT: + case REORDER_BUFFER_CHANGE_UPDATE: + case REORDER_BUFFER_CHANGE_DELETE: + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT: + { + ReorderBufferTupleBuf *oldtup, + *newtup; + Size oldlen = 0; + Size newlen = 0; + + oldtup = change->data.tp.oldtuple; + newtup = change->data.tp.newtuple; + + if (oldtup) + { + sz += sizeof(HeapTupleData); + oldlen = oldtup->tuple.t_len; + sz += oldlen; + } + + if (newtup) + { + sz += sizeof(HeapTupleData); + newlen = newtup->tuple.t_len; + sz += newlen; + } + + break; + } + case REORDER_BUFFER_CHANGE_MESSAGE: + { + Size prefix_size = strlen(change->data.msg.prefix) + 1; + + sz += prefix_size + change->data.msg.message_size + + sizeof(Size) + sizeof(Size); + + break; + } + case REORDER_BUFFER_CHANGE_INVALIDATION: + { + sz += sizeof(SharedInvalidationMessage) * + change->data.inval.ninvalidations; + break; + } + case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: + { + Snapshot snap; + + snap = change->data.snapshot; + + sz += sizeof(SnapshotData) + + sizeof(TransactionId) * snap->xcnt + + sizeof(TransactionId) * snap->subxcnt; + + break; + } + case REORDER_BUFFER_CHANGE_TRUNCATE: + { + sz += sizeof(Oid) * change->data.truncate.nrelids; + + break; + } + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM: + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT: + case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID: + case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID: + /* ReorderBufferChange contains everything important */ + break; + } + + return sz; +} + + +/* + * Restore a number of changes spilled to disk back into memory. + */ +static Size +ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn, + TXNEntryFile *file, XLogSegNo *segno) +{ + Size restored = 0; + XLogSegNo last_segno; + dlist_mutable_iter cleanup_iter; + File *fd = &file->vfd; + + Assert(txn->first_lsn != InvalidXLogRecPtr); + Assert(txn->final_lsn != InvalidXLogRecPtr); + + /* free current entries, so we have memory for more */ + dlist_foreach_modify(cleanup_iter, &txn->changes) + { + ReorderBufferChange *cleanup = + dlist_container(ReorderBufferChange, node, cleanup_iter.cur); + + dlist_delete(&cleanup->node); + ReorderBufferReturnChange(rb, cleanup, true); + } + txn->nentries_mem = 0; + Assert(dlist_is_empty(&txn->changes)); + + XLByteToSeg(txn->final_lsn, last_segno, wal_segment_size); + + while (restored < max_changes_in_memory && *segno <= last_segno) + { + int readBytes; + ReorderBufferDiskChange *ondisk; + + CHECK_FOR_INTERRUPTS(); + + if (*fd == -1) + { + char path[MAXPGPATH]; + + /* first time in */ + if (*segno == 0) + XLByteToSeg(txn->first_lsn, *segno, wal_segment_size); + + Assert(*segno != 0 || dlist_is_empty(&txn->changes)); + + /* + * No need to care about TLIs here, only used during a single run, + * so each LSN only maps to a specific WAL record. 
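+ *
+ * (The file opened below is named
+ * pg_replslot/<slot>/xid-<xid>-lsn-<hi>-<lo>.spill, one file per
+ * wal_segment_size worth of LSN space; see ReorderBufferSerializedPath().)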
+ */ + ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid, + *segno); + + *fd = PathNameOpenFile(path, O_RDONLY | PG_BINARY); + + /* No harm in resetting the offset even in case of failure */ + file->curOffset = 0; + + if (*fd < 0 && errno == ENOENT) + { + *fd = -1; + (*segno)++; + continue; + } + else if (*fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + path))); + } + + /* + * Read the statically sized part of a change which has information + * about the total size. If we couldn't read a record, we're at the + * end of this file. + */ + ReorderBufferSerializeReserve(rb, sizeof(ReorderBufferDiskChange)); + readBytes = FileRead(file->vfd, rb->outbuf, + sizeof(ReorderBufferDiskChange), + file->curOffset, WAIT_EVENT_REORDER_BUFFER_READ); + + /* eof */ + if (readBytes == 0) + { + FileClose(*fd); + *fd = -1; + (*segno)++; + continue; + } + else if (readBytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from reorderbuffer spill file: %m"))); + else if (readBytes != sizeof(ReorderBufferDiskChange)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes", + readBytes, + (uint32) sizeof(ReorderBufferDiskChange)))); + + file->curOffset += readBytes; + + ondisk = (ReorderBufferDiskChange *) rb->outbuf; + + ReorderBufferSerializeReserve(rb, + sizeof(ReorderBufferDiskChange) + ondisk->size); + ondisk = (ReorderBufferDiskChange *) rb->outbuf; + + readBytes = FileRead(file->vfd, + rb->outbuf + sizeof(ReorderBufferDiskChange), + ondisk->size - sizeof(ReorderBufferDiskChange), + file->curOffset, + WAIT_EVENT_REORDER_BUFFER_READ); + + if (readBytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from reorderbuffer spill file: %m"))); + else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes", + readBytes, + (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange))))); + + file->curOffset += readBytes; + + /* + * ok, read a full change from disk, now restore it into proper + * in-memory format + */ + ReorderBufferRestoreChange(rb, txn, rb->outbuf); + restored++; + } + + return restored; +} + +/* + * Convert change from its on-disk format to in-memory format and queue it onto + * the TXN's ->changes list. + * + * Note: although "data" is declared char*, at entry it points to a + * maxalign'd buffer, making it safe in most of this function to assume + * that the pointed-to data is suitably aligned for direct access. 
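+ *
+ * (The exception is data that follows a variable-length part: e.g. the new
+ * tuple's header may start at an unaligned offset once the old tuple's
+ * data has been consumed, which is why its t_len is fetched with memcpy()
+ * below instead of being dereferenced directly.)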
+ */ +static void +ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn, + char *data) +{ + ReorderBufferDiskChange *ondisk; + ReorderBufferChange *change; + + ondisk = (ReorderBufferDiskChange *) data; + + change = ReorderBufferGetChange(rb); + + /* copy static part */ + memcpy(change, &ondisk->change, sizeof(ReorderBufferChange)); + + data += sizeof(ReorderBufferDiskChange); + + /* restore individual stuff */ + switch (change->action) + { + /* fall through these, they're all similar enough */ + case REORDER_BUFFER_CHANGE_INSERT: + case REORDER_BUFFER_CHANGE_UPDATE: + case REORDER_BUFFER_CHANGE_DELETE: + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT: + if (change->data.tp.oldtuple) + { + uint32 tuplelen = ((HeapTuple) data)->t_len; + + change->data.tp.oldtuple = + ReorderBufferGetTupleBuf(rb, tuplelen - SizeofHeapTupleHeader); + + /* restore ->tuple */ + memcpy(&change->data.tp.oldtuple->tuple, data, + sizeof(HeapTupleData)); + data += sizeof(HeapTupleData); + + /* reset t_data pointer into the new tuplebuf */ + change->data.tp.oldtuple->tuple.t_data = + ReorderBufferTupleBufData(change->data.tp.oldtuple); + + /* restore tuple data itself */ + memcpy(change->data.tp.oldtuple->tuple.t_data, data, tuplelen); + data += tuplelen; + } + + if (change->data.tp.newtuple) + { + /* here, data might not be suitably aligned! */ + uint32 tuplelen; + + memcpy(&tuplelen, data + offsetof(HeapTupleData, t_len), + sizeof(uint32)); + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(rb, tuplelen - SizeofHeapTupleHeader); + + /* restore ->tuple */ + memcpy(&change->data.tp.newtuple->tuple, data, + sizeof(HeapTupleData)); + data += sizeof(HeapTupleData); + + /* reset t_data pointer into the new tuplebuf */ + change->data.tp.newtuple->tuple.t_data = + ReorderBufferTupleBufData(change->data.tp.newtuple); + + /* restore tuple data itself */ + memcpy(change->data.tp.newtuple->tuple.t_data, data, tuplelen); + data += tuplelen; + } + + break; + case REORDER_BUFFER_CHANGE_MESSAGE: + { + Size prefix_size; + + /* read prefix */ + memcpy(&prefix_size, data, sizeof(Size)); + data += sizeof(Size); + change->data.msg.prefix = MemoryContextAlloc(rb->context, + prefix_size); + memcpy(change->data.msg.prefix, data, prefix_size); + Assert(change->data.msg.prefix[prefix_size - 1] == '\0'); + data += prefix_size; + + /* read the message */ + memcpy(&change->data.msg.message_size, data, sizeof(Size)); + data += sizeof(Size); + change->data.msg.message = MemoryContextAlloc(rb->context, + change->data.msg.message_size); + memcpy(change->data.msg.message, data, + change->data.msg.message_size); + data += change->data.msg.message_size; + + break; + } + case REORDER_BUFFER_CHANGE_INVALIDATION: + { + Size inval_size = sizeof(SharedInvalidationMessage) * + change->data.inval.ninvalidations; + + change->data.inval.invalidations = + MemoryContextAlloc(rb->context, inval_size); + + /* read the message */ + memcpy(change->data.inval.invalidations, data, inval_size); + + break; + } + case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: + { + Snapshot oldsnap; + Snapshot newsnap; + Size size; + + oldsnap = (Snapshot) data; + + size = sizeof(SnapshotData) + + sizeof(TransactionId) * oldsnap->xcnt + + sizeof(TransactionId) * (oldsnap->subxcnt + 0); + + change->data.snapshot = MemoryContextAllocZero(rb->context, size); + + newsnap = change->data.snapshot; + + memcpy(newsnap, data, size); + newsnap->xip = (TransactionId *) + (((char *) newsnap) + sizeof(SnapshotData)); + newsnap->subxip = newsnap->xip + newsnap->xcnt; + 
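+            /*
+             * The restored snapshot is one contiguous allocation:
+             * SnapshotData, then xcnt xids (xip), then subxcnt xids
+             * (subxip), matching the order in which
+             * ReorderBufferSerializeChange() wrote it out.
+             */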
newsnap->copied = true; + break; + } + /* the base struct contains all the data, easy peasy */ + case REORDER_BUFFER_CHANGE_TRUNCATE: + { + Oid *relids; + + relids = ReorderBufferGetRelids(rb, + change->data.truncate.nrelids); + memcpy(relids, data, change->data.truncate.nrelids * sizeof(Oid)); + change->data.truncate.relids = relids; + + break; + } + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM: + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT: + case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID: + case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID: + break; + } + + dlist_push_tail(&txn->changes, &change->node); + txn->nentries_mem++; + + /* + * Update memory accounting for the restored change. We need to do this + * although we don't check the memory limit when restoring the changes in + * this branch (we only do that when initially queueing the changes after + * decoding), because we will release the changes later, and that will + * update the accounting too (subtracting the size from the counters). And + * we don't want to underflow there. + */ + ReorderBufferChangeMemoryUpdate(rb, change, true, + ReorderBufferChangeSize(change)); +} + +/* + * Remove all on-disk stored for the passed in transaction. + */ +static void +ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + XLogSegNo first; + XLogSegNo cur; + XLogSegNo last; + + Assert(txn->first_lsn != InvalidXLogRecPtr); + Assert(txn->final_lsn != InvalidXLogRecPtr); + + XLByteToSeg(txn->first_lsn, first, wal_segment_size); + XLByteToSeg(txn->final_lsn, last, wal_segment_size); + + /* iterate over all possible filenames, and delete them */ + for (cur = first; cur <= last; cur++) + { + char path[MAXPGPATH]; + + ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid, cur); + if (unlink(path) != 0 && errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", path))); + } +} + +/* + * Remove any leftover serialized reorder buffers from a slot directory after a + * prior crash or decoding session exit. + */ +static void +ReorderBufferCleanupSerializedTXNs(const char *slotname) +{ + DIR *spill_dir; + struct dirent *spill_de; + struct stat statbuf; + char path[MAXPGPATH * 2 + 12]; + + sprintf(path, "pg_replslot/%s", slotname); + + /* we're only handling directories here, skip if it's not ours */ + if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode)) + return; + + spill_dir = AllocateDir(path); + while ((spill_de = ReadDirExtended(spill_dir, path, INFO)) != NULL) + { + /* only look at names that can be ours */ + if (strncmp(spill_de->d_name, "xid", 3) == 0) + { + snprintf(path, sizeof(path), + "pg_replslot/%s/%s", slotname, + spill_de->d_name); + + if (unlink(path) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\" during removal of pg_replslot/%s/xid*: %m", + path, slotname))); + } + } + FreeDir(spill_dir); +} + +/* + * Given a replication slot, transaction ID and segment number, fill in the + * corresponding spill file into 'path', which is a caller-owned buffer of size + * at least MAXPGPATH. + */ +static void +ReorderBufferSerializedPath(char *path, ReplicationSlot *slot, TransactionId xid, + XLogSegNo segno) +{ + XLogRecPtr recptr; + + XLogSegNoOffsetToRecPtr(segno, 0, wal_segment_size, recptr); + + snprintf(path, MAXPGPATH, "pg_replslot/%s/xid-%u-lsn-%X-%X.spill", + NameStr(MyReplicationSlot->data.name), + xid, LSN_FORMAT_ARGS(recptr)); +} + +/* + * Delete all data spilled to disk after we've restarted/crashed. 
+ * It will be recreated when the respective slots are reused.
+ */
+void
+StartupReorderBuffer(void)
+{
+    DIR        *logical_dir;
+    struct dirent *logical_de;
+
+    logical_dir = AllocateDir("pg_replslot");
+    while ((logical_de = ReadDir(logical_dir, "pg_replslot")) != NULL)
+    {
+        if (strcmp(logical_de->d_name, ".") == 0 ||
+            strcmp(logical_de->d_name, "..") == 0)
+            continue;
+
+        /* if it cannot be a slot, skip the directory */
+        if (!ReplicationSlotValidateName(logical_de->d_name, DEBUG2))
+            continue;
+
+        /*
+         * ok, has to be a surviving logical slot, iterate and delete
+         * everything starting with xid-*
+         */
+        ReorderBufferCleanupSerializedTXNs(logical_de->d_name);
+    }
+    FreeDir(logical_dir);
+}
+
+/* ---------------------------------------
+ * toast reassembly support
+ * ---------------------------------------
+ */
+
+/*
+ * Initialize per-tuple toast reconstruction support.
+ */
+static void
+ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+    HASHCTL     hash_ctl;
+
+    Assert(txn->toast_hash == NULL);
+
+    hash_ctl.keysize = sizeof(Oid);
+    hash_ctl.entrysize = sizeof(ReorderBufferToastEnt);
+    hash_ctl.hcxt = rb->context;
+    txn->toast_hash = hash_create("ReorderBufferToastHash", 5, &hash_ctl,
+                                  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+}
+
+/*
+ * Per toast-chunk handling for toast reconstruction
+ *
+ * Appends a toast chunk so we can reconstruct it when the tuple "owning" the
+ * toasted Datum comes along.
+ */
+static void
+ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
+                              Relation relation, ReorderBufferChange *change)
+{
+    ReorderBufferToastEnt *ent;
+    ReorderBufferTupleBuf *newtup;
+    bool        found;
+    int32       chunksize;
+    bool        isnull;
+    Pointer     chunk;
+    TupleDesc   desc = RelationGetDescr(relation);
+    Oid         chunk_id;
+    int32       chunk_seq;
+
+    if (txn->toast_hash == NULL)
+        ReorderBufferToastInitHash(rb, txn);
+
+    Assert(IsToastRelation(relation));
+
+    newtup = change->data.tp.newtuple;
+    chunk_id = DatumGetObjectId(fastgetattr(&newtup->tuple, 1, desc, &isnull));
+    Assert(!isnull);
+    chunk_seq = DatumGetInt32(fastgetattr(&newtup->tuple, 2, desc, &isnull));
+    Assert(!isnull);
+
+    ent = (ReorderBufferToastEnt *)
+        hash_search(txn->toast_hash, &chunk_id, HASH_ENTER, &found);
+
+    if (!found)
+    {
+        Assert(ent->chunk_id == chunk_id);
+        ent->num_chunks = 0;
+        ent->last_chunk_seq = 0;
+        ent->size = 0;
+        ent->reconstructed = NULL;
+        dlist_init(&ent->chunks);
+
+        if (chunk_seq != 0)
+            elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq 0",
+                 chunk_seq, chunk_id);
+    }
+    else if (found && chunk_seq != ent->last_chunk_seq + 1)
+        elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq %d",
+             chunk_seq, chunk_id, ent->last_chunk_seq + 1);
+
+    chunk = DatumGetPointer(fastgetattr(&newtup->tuple, 3, desc, &isnull));
+    Assert(!isnull);
+
+    /* calculate size so we can allocate the right size at once later */
+    if (!VARATT_IS_EXTENDED(chunk))
+        chunksize = VARSIZE(chunk) - VARHDRSZ;
+    else if (VARATT_IS_SHORT(chunk))
+        /* could happen due to heap_form_tuple doing its thing */
+        chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
+    else
+        elog(ERROR, "unexpected type of toast chunk");
+
+    ent->size += chunksize;
+    ent->last_chunk_seq = chunk_seq;
+    ent->num_chunks++;
+    dlist_push_tail(&ent->chunks, &change->node);
+}
+
+/*
+ * Rejigger change->newtuple to point to in-memory toast tuples instead of
+ * on-disk toast tuples that may no longer exist (think DROP TABLE or VACUUM).
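+ *
+ * For example, a multi-megabyte datum arrives as a series of toast chunks
+ * (each carrying at most TOAST_MAX_CHUNK_SIZE bytes of payload) collected
+ * by ReorderBufferToastAppendChunk(); below, those chunks are stitched
+ * back into a single palloc'd varlena and handed to the output plugin
+ * through an indirect toast pointer.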
+ *
+ * We cannot replace unchanged toast tuples though, so those will still point
+ * to on-disk toast data.
+ *
+ * While updating the existing change with detoasted tuple data, we need to
+ * update the memory accounting info, because the change size will differ.
+ * Otherwise the accounting may get out of sync, triggering serialization
+ * at unexpected times.
+ *
+ * We simply subtract the size of the change before rejiggering the tuple,
+ * and then add the new size back. This makes it look like the change was
+ * removed and then added back, except it only tweaks the accounting info.
+ *
+ * In particular, this can't trigger serialization, which would be pointless
+ * anyway, as it happens during commit processing right before handing
+ * the change to the output plugin.
+ */
+static void
+ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
+                          Relation relation, ReorderBufferChange *change)
+{
+    TupleDesc   desc;
+    int         natt;
+    Datum      *attrs;
+    bool       *isnull;
+    bool       *free;
+    HeapTuple   tmphtup;
+    Relation    toast_rel;
+    TupleDesc   toast_desc;
+    MemoryContext oldcontext;
+    ReorderBufferTupleBuf *newtup;
+    Size        old_size;
+
+    /* no toast tuples changed */
+    if (txn->toast_hash == NULL)
+        return;
+
+    /*
+     * We're going to modify the size of the change. So, to make sure the
+     * accounting is correct we record the current change size and then after
+     * re-computing the change we'll subtract the recorded size and then
+     * re-add the new change size at the end. We don't immediately subtract
+     * the old size because if there is any error before we add the new size,
+     * we will release the changes and that will update the accounting info
+     * (subtracting the size from the counters). And we don't want to
+     * underflow there.
+     */
+    old_size = ReorderBufferChangeSize(change);
+
+    oldcontext = MemoryContextSwitchTo(rb->context);
+
+    /* we should only have toast tuples in an INSERT or UPDATE */
+    Assert(change->data.tp.newtuple);
+
+    desc = RelationGetDescr(relation);
+
+    toast_rel = RelationIdGetRelation(relation->rd_rel->reltoastrelid);
+    if (!RelationIsValid(toast_rel))
+        elog(ERROR, "could not open toast relation with OID %u (base relation \"%s\")",
+             relation->rd_rel->reltoastrelid, RelationGetRelationName(relation));
+
+    toast_desc = RelationGetDescr(toast_rel);
+
+    /* should we allocate from stack instead?
*/ + attrs = palloc0(sizeof(Datum) * desc->natts); + isnull = palloc0(sizeof(bool) * desc->natts); + free = palloc0(sizeof(bool) * desc->natts); + + newtup = change->data.tp.newtuple; + + heap_deform_tuple(&newtup->tuple, desc, attrs, isnull); + + for (natt = 0; natt < desc->natts; natt++) + { + Form_pg_attribute attr = TupleDescAttr(desc, natt); + ReorderBufferToastEnt *ent; + struct varlena *varlena; + + /* va_rawsize is the size of the original datum -- including header */ + struct varatt_external toast_pointer; + struct varatt_indirect redirect_pointer; + struct varlena *new_datum = NULL; + struct varlena *reconstructed; + dlist_iter it; + Size data_done = 0; + + /* system columns aren't toasted */ + if (attr->attnum < 0) + continue; + + if (attr->attisdropped) + continue; + + /* not a varlena datatype */ + if (attr->attlen != -1) + continue; + + /* no data */ + if (isnull[natt]) + continue; + + /* ok, we know we have a toast datum */ + varlena = (struct varlena *) DatumGetPointer(attrs[natt]); + + /* no need to do anything if the tuple isn't external */ + if (!VARATT_IS_EXTERNAL(varlena)) + continue; + + VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena); + + /* + * Check whether the toast tuple changed, replace if so. + */ + ent = (ReorderBufferToastEnt *) + hash_search(txn->toast_hash, + &toast_pointer.va_valueid, + HASH_FIND, + NULL); + if (ent == NULL) + continue; + + new_datum = + (struct varlena *) palloc0(INDIRECT_POINTER_SIZE); + + free[natt] = true; + + reconstructed = palloc0(toast_pointer.va_rawsize); + + ent->reconstructed = reconstructed; + + /* stitch toast tuple back together from its parts */ + dlist_foreach(it, &ent->chunks) + { + bool isnull; + ReorderBufferChange *cchange; + ReorderBufferTupleBuf *ctup; + Pointer chunk; + + cchange = dlist_container(ReorderBufferChange, node, it.cur); + ctup = cchange->data.tp.newtuple; + chunk = DatumGetPointer(fastgetattr(&ctup->tuple, 3, toast_desc, &isnull)); + + Assert(!isnull); + Assert(!VARATT_IS_EXTERNAL(chunk)); + Assert(!VARATT_IS_SHORT(chunk)); + + memcpy(VARDATA(reconstructed) + data_done, + VARDATA(chunk), + VARSIZE(chunk) - VARHDRSZ); + data_done += VARSIZE(chunk) - VARHDRSZ; + } + Assert(data_done == VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer)); + + /* make sure its marked as compressed or not */ + if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer)) + SET_VARSIZE_COMPRESSED(reconstructed, data_done + VARHDRSZ); + else + SET_VARSIZE(reconstructed, data_done + VARHDRSZ); + + memset(&redirect_pointer, 0, sizeof(redirect_pointer)); + redirect_pointer.pointer = reconstructed; + + SET_VARTAG_EXTERNAL(new_datum, VARTAG_INDIRECT); + memcpy(VARDATA_EXTERNAL(new_datum), &redirect_pointer, + sizeof(redirect_pointer)); + + attrs[natt] = PointerGetDatum(new_datum); + } + + /* + * Build tuple in separate memory & copy tuple back into the tuplebuf + * passed to the output plugin. We can't directly heap_fill_tuple() into + * the tuplebuf because attrs[] will point back into the current content. + */ + tmphtup = heap_form_tuple(desc, attrs, isnull); + Assert(newtup->tuple.t_len <= MaxHeapTupleSize); + Assert(ReorderBufferTupleBufData(newtup) == newtup->tuple.t_data); + + memcpy(newtup->tuple.t_data, tmphtup->t_data, tmphtup->t_len); + newtup->tuple.t_len = tmphtup->t_len; + + /* + * free resources we won't further need, more persistent stuff will be + * free'd in ReorderBufferToastReset(). 
+ */ + RelationClose(toast_rel); + pfree(tmphtup); + for (natt = 0; natt < desc->natts; natt++) + { + if (free[natt]) + pfree(DatumGetPointer(attrs[natt])); + } + pfree(attrs); + pfree(free); + pfree(isnull); + + MemoryContextSwitchTo(oldcontext); + + /* subtract the old change size */ + ReorderBufferChangeMemoryUpdate(rb, change, false, old_size); + /* now add the change back, with the correct size */ + ReorderBufferChangeMemoryUpdate(rb, change, true, + ReorderBufferChangeSize(change)); +} + +/* + * Free all resources allocated for toast reconstruction. + */ +static void +ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + HASH_SEQ_STATUS hstat; + ReorderBufferToastEnt *ent; + + if (txn->toast_hash == NULL) + return; + + /* sequentially walk over the hash and free everything */ + hash_seq_init(&hstat, txn->toast_hash); + while ((ent = (ReorderBufferToastEnt *) hash_seq_search(&hstat)) != NULL) + { + dlist_mutable_iter it; + + if (ent->reconstructed != NULL) + pfree(ent->reconstructed); + + dlist_foreach_modify(it, &ent->chunks) + { + ReorderBufferChange *change = + dlist_container(ReorderBufferChange, node, it.cur); + + dlist_delete(&change->node); + ReorderBufferReturnChange(rb, change, true); + } + } + + hash_destroy(txn->toast_hash); + txn->toast_hash = NULL; +} + + +/* --------------------------------------- + * Visibility support for logical decoding + * + * + * Lookup actual cmin/cmax values when using decoding snapshot. We can't + * always rely on stored cmin/cmax values because of two scenarios: + * + * * A tuple got changed multiple times during a single transaction and thus + * has got a combo CID. Combo CIDs are only valid for the duration of a + * single transaction. + * * A tuple with a cmin but no cmax (and thus no combo CID) got + * deleted/updated in another transaction than the one which created it + * which we are looking at right now. As only one of cmin, cmax or combo CID + * is actually stored in the heap we don't have access to the value we + * need anymore. + * + * To resolve those problems we have a per-transaction hash of (cmin, + * cmax) tuples keyed by (relfilelocator, ctid) which contains the actual + * (cmin, cmax) values. That also takes care of combo CIDs by simply + * not caring about them at all. As we have the real cmin/cmax values + * combo CIDs aren't interesting. + * + * As we only care about catalog tuples here the overhead of this + * hashtable should be acceptable. + * + * Heap rewrites complicate this a bit, check rewriteheap.c for + * details. + * ------------------------------------------------------------------------- + */ + +/* struct for sorting mapping files by LSN efficiently */ +typedef struct RewriteMappingFile +{ + XLogRecPtr lsn; + char fname[MAXPGPATH]; +} RewriteMappingFile; + +#ifdef NOT_USED +static void +DisplayMapping(HTAB *tuplecid_data) +{ + HASH_SEQ_STATUS hstat; + ReorderBufferTupleCidEnt *ent; + + hash_seq_init(&hstat, tuplecid_data); + while ((ent = (ReorderBufferTupleCidEnt *) hash_seq_search(&hstat)) != NULL) + { + elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u", + ent->key.rlocator.dbOid, + ent->key.rlocator.spcOid, + ent->key.rlocator.relNumber, + ItemPointerGetBlockNumber(&ent->key.tid), + ItemPointerGetOffsetNumber(&ent->key.tid), + ent->cmin, + ent->cmax + ); + } +} +#endif + +/* + * Apply a single mapping file to tuplecid_data. + * + * The mapping file has to have been verified to be a) committed b) for our + * transaction c) applied in LSN order. 
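+ *
+ * Each record in the file is one fixed-size LogicalRewriteMappingData
+ * entry, mapping an (old relfilelocator, old tid) pair to its
+ * post-rewrite (new relfilelocator, new tid) location; this is what lets
+ * the cmin/cmax lookups keep working across heap rewrites.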
+ */ +static void +ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname) +{ + char path[MAXPGPATH]; + int fd; + int readBytes; + LogicalRewriteMappingData map; + + sprintf(path, "pg_logical/mappings/%s", fname); + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + + while (true) + { + ReorderBufferTupleCidKey key; + ReorderBufferTupleCidEnt *ent; + ReorderBufferTupleCidEnt *new_ent; + bool found; + + /* be careful about padding */ + memset(&key, 0, sizeof(ReorderBufferTupleCidKey)); + + /* read all mappings till the end of the file */ + pgstat_report_wait_start(WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ); + readBytes = read(fd, &map, sizeof(LogicalRewriteMappingData)); + pgstat_report_wait_end(); + + if (readBytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + path))); + else if (readBytes == 0) /* EOF */ + break; + else if (readBytes != sizeof(LogicalRewriteMappingData)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from file \"%s\": read %d instead of %d bytes", + path, readBytes, + (int32) sizeof(LogicalRewriteMappingData)))); + + key.rlocator = map.old_locator; + ItemPointerCopy(&map.old_tid, + &key.tid); + + + ent = (ReorderBufferTupleCidEnt *) + hash_search(tuplecid_data, &key, HASH_FIND, NULL); + + /* no existing mapping, no need to update */ + if (!ent) + continue; + + key.rlocator = map.new_locator; + ItemPointerCopy(&map.new_tid, + &key.tid); + + new_ent = (ReorderBufferTupleCidEnt *) + hash_search(tuplecid_data, &key, HASH_ENTER, &found); + + if (found) + { + /* + * Make sure the existing mapping makes sense. We sometime update + * old records that did not yet have a cmax (e.g. pg_class' own + * entry while rewriting it) during rewrites, so allow that. + */ + Assert(ent->cmin == InvalidCommandId || ent->cmin == new_ent->cmin); + Assert(ent->cmax == InvalidCommandId || ent->cmax == new_ent->cmax); + } + else + { + /* update mapping */ + new_ent->cmin = ent->cmin; + new_ent->cmax = ent->cmax; + new_ent->combocid = ent->combocid; + } + } + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); +} + + +/* + * Check whether the TransactionId 'xid' is in the pre-sorted array 'xip'. + */ +static bool +TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num) +{ + return bsearch(&xid, xip, num, + sizeof(TransactionId), xidComparator) != NULL; +} + +/* + * list_sort() comparator for sorting RewriteMappingFiles in LSN order. + */ +static int +file_sort_by_lsn(const ListCell *a_p, const ListCell *b_p) +{ + RewriteMappingFile *a = (RewriteMappingFile *) lfirst(a_p); + RewriteMappingFile *b = (RewriteMappingFile *) lfirst(b_p); + + if (a->lsn < b->lsn) + return -1; + else if (a->lsn > b->lsn) + return 1; + return 0; +} + +/* + * Apply any existing logical remapping files if there are any targeted at our + * transaction for relid. + */ +static void +UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot) +{ + DIR *mapping_dir; + struct dirent *mapping_de; + List *files = NIL; + ListCell *file; + Oid dboid = IsSharedRelation(relid) ? 
InvalidOid : MyDatabaseId; + + mapping_dir = AllocateDir("pg_logical/mappings"); + while ((mapping_de = ReadDir(mapping_dir, "pg_logical/mappings")) != NULL) + { + Oid f_dboid; + Oid f_relid; + TransactionId f_mapped_xid; + TransactionId f_create_xid; + XLogRecPtr f_lsn; + uint32 f_hi, + f_lo; + RewriteMappingFile *f; + + if (strcmp(mapping_de->d_name, ".") == 0 || + strcmp(mapping_de->d_name, "..") == 0) + continue; + + /* Ignore files that aren't ours */ + if (strncmp(mapping_de->d_name, "map-", 4) != 0) + continue; + + if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT, + &f_dboid, &f_relid, &f_hi, &f_lo, + &f_mapped_xid, &f_create_xid) != 6) + elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name); + + f_lsn = ((uint64) f_hi) << 32 | f_lo; + + /* mapping for another database */ + if (f_dboid != dboid) + continue; + + /* mapping for another relation */ + if (f_relid != relid) + continue; + + /* did the creating transaction abort? */ + if (!TransactionIdDidCommit(f_create_xid)) + continue; + + /* not for our transaction */ + if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt)) + continue; + + /* ok, relevant, queue for apply */ + f = palloc(sizeof(RewriteMappingFile)); + f->lsn = f_lsn; + strcpy(f->fname, mapping_de->d_name); + files = lappend(files, f); + } + FreeDir(mapping_dir); + + /* sort files so we apply them in LSN order */ + list_sort(files, file_sort_by_lsn); + + foreach(file, files) + { + RewriteMappingFile *f = (RewriteMappingFile *) lfirst(file); + + elog(DEBUG1, "applying mapping: \"%s\" in %u", f->fname, + snapshot->subxip[0]); + ApplyLogicalMappingFile(tuplecid_data, relid, f->fname); + pfree(f); + } +} + +/* + * Lookup cmin/cmax of a tuple, during logical decoding where we can't rely on + * combo CIDs. + */ +bool +ResolveCminCmaxDuringDecoding(HTAB *tuplecid_data, + Snapshot snapshot, + HeapTuple htup, Buffer buffer, + CommandId *cmin, CommandId *cmax) +{ + ReorderBufferTupleCidKey key; + ReorderBufferTupleCidEnt *ent; + ForkNumber forkno; + BlockNumber blockno; + bool updated_mapping = false; + + /* + * Return unresolved if tuplecid_data is not valid. That's because when + * streaming in-progress transactions we may run into tuples with the CID + * before actually decoding them. Think e.g. about INSERT followed by + * TRUNCATE, where the TRUNCATE may not be decoded yet when applying the + * INSERT. So in such cases, we assume the CID is from the future + * command. + */ + if (tuplecid_data == NULL) + return false; + + /* be careful about padding */ + memset(&key, 0, sizeof(key)); + + Assert(!BufferIsLocal(buffer)); + + /* + * get relfilelocator from the buffer, no convenient way to access it + * other than that. + */ + BufferGetTag(buffer, &key.rlocator, &forkno, &blockno); + + /* tuples can only be in the main fork */ + Assert(forkno == MAIN_FORKNUM); + Assert(blockno == ItemPointerGetBlockNumber(&htup->t_self)); + + ItemPointerCopy(&htup->t_self, + &key.tid); + +restart: + ent = (ReorderBufferTupleCidEnt *) + hash_search(tuplecid_data, &key, HASH_FIND, NULL); + + /* + * failed to find a mapping, check whether the table was rewritten and + * apply mapping if so, but only do that once - there can be no new + * mappings while we are in here since we have to hold a lock on the + * relation. 
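+ *
+ * (Hence the single retry below: after UpdateLogicalMappings() has been
+ * applied once, either the remapped key is found on the second lookup or
+ * the tuple is genuinely unresolvable and we return false.)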
+ */ + if (ent == NULL && !updated_mapping) + { + UpdateLogicalMappings(tuplecid_data, htup->t_tableOid, snapshot); + /* now check but don't update for a mapping again */ + updated_mapping = true; + goto restart; + } + else if (ent == NULL) + return false; + + if (cmin) + *cmin = ent->cmin; + if (cmax) + *cmax = ent->cmax; + return true; +} diff --git a/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/snapbuild.c b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/snapbuild.c new file mode 100644 index 00000000000..bc1d5acde26 --- /dev/null +++ b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/snapbuild.c @@ -0,0 +1,2132 @@ +/*------------------------------------------------------------------------- + * + * snapbuild.c + * + * Infrastructure for building historic catalog snapshots based on contents + * of the WAL, for the purpose of decoding heapam.c style values in the + * WAL. + * + * NOTES: + * + * We build snapshots which can *only* be used to read catalog contents and we + * do so by reading and interpreting the WAL stream. The aim is to build a + * snapshot that behaves the same as a freshly taken MVCC snapshot would have + * at the time the XLogRecord was generated. + * + * To build the snapshots we reuse the infrastructure built for Hot + * Standby. The in-memory snapshots we build look different than HS' because + * we have different needs. To successfully decode data from the WAL we only + * need to access catalog tables and (sys|rel|cat)cache, not the actual user + * tables since the data we decode is wholly contained in the WAL + * records. Also, our snapshots need to be different in comparison to normal + * MVCC ones because in contrast to those we cannot fully rely on the clog and + * pg_subtrans for information about committed transactions because they might + * commit in the future from the POV of the WAL entry we're currently + * decoding. This definition has the advantage that we only need to prevent + * removal of catalog rows, while normal table's rows can still be + * removed. This is achieved by using the replication slot mechanism. + * + * As the percentage of transactions modifying the catalog normally is fairly + * small in comparisons to ones only manipulating user data, we keep track of + * the committed catalog modifying ones inside [xmin, xmax) instead of keeping + * track of all running transactions like it's done in a normal snapshot. Note + * that we're generally only looking at transactions that have acquired an + * xid. That is we keep a list of transactions between snapshot->(xmin, xmax) + * that we consider committed, everything else is considered aborted/in + * progress. That also allows us not to care about subtransactions before they + * have committed which means this module, in contrast to HS, doesn't have to + * care about suboverflowed subtransactions and similar. + * + * One complexity of doing this is that to e.g. handle mixed DDL/DML + * transactions we need Snapshots that see intermediate versions of the + * catalog in a transaction. During normal operation this is achieved by using + * CommandIds/cmin/cmax. The problem with that however is that for space + * efficiency reasons only one value of that is stored + * (cf. combocid.c). Since combo CIDs are only available in memory we log + * additional information which allows us to get the original (cmin, cmax) + * pair during visibility checks. 
Check the reorderbuffer.c's comment above + * ResolveCminCmaxDuringDecoding() for details. + * + * To facilitate all this we need our own visibility routine, as the normal + * ones are optimized for different usecases. + * + * To replace the normal catalog snapshots with decoding ones use the + * SetupHistoricSnapshot() and TeardownHistoricSnapshot() functions. + * + * + * + * The snapbuild machinery is starting up in several stages, as illustrated + * by the following graph describing the SnapBuild->state transitions: + * + * +-------------------------+ + * +----| START |-------------+ + * | +-------------------------+ | + * | | | + * | | | + * | running_xacts #1 | + * | | | + * | | | + * | v | + * | +-------------------------+ v + * | | BUILDING_SNAPSHOT |------------>| + * | +-------------------------+ | + * | | | + * | | | + * | running_xacts #2, xacts from #1 finished | + * | | | + * | | | + * | v | + * | +-------------------------+ v + * | | FULL_SNAPSHOT |------------>| + * | +-------------------------+ | + * | | | + * running_xacts | saved snapshot + * with zero xacts | at running_xacts's lsn + * | | | + * | running_xacts with xacts from #2 finished | + * | | | + * | v | + * | +-------------------------+ | + * +--->|SNAPBUILD_CONSISTENT |<------------+ + * +-------------------------+ + * + * Initially the machinery is in the START stage. When an xl_running_xacts + * record is read that is sufficiently new (above the safe xmin horizon), + * there's a state transition. If there were no running xacts when the + * xl_running_xacts record was generated, we'll directly go into CONSISTENT + * state, otherwise we'll switch to the BUILDING_SNAPSHOT state. Having a full + * snapshot means that all transactions that start henceforth can be decoded + * in their entirety, but transactions that started previously can't. In + * FULL_SNAPSHOT we'll switch into CONSISTENT once all those previously + * running transactions have committed or aborted. + * + * Only transactions that commit after CONSISTENT state has been reached will + * be replayed, even though they might have started while still in + * FULL_SNAPSHOT. That ensures that we'll reach a point where no previous + * changes has been exported, but all the following ones will be. That point + * is a convenient point to initialize replication from, which is why we + * export a snapshot at that point, which *can* be used to read normal data. + * + * Copyright (c) 2012-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/snapbuild.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <sys/stat.h> +#include <unistd.h> + +#include "access/heapam_xlog.h" +#include "access/transam.h" +#include "access/xact.h" +#include "common/file_utils.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "replication/logical.h" +#include "replication/reorderbuffer.h" +#include "replication/snapbuild.h" +#include "storage/block.h" /* debugging output */ +#include "storage/fd.h" +#include "storage/lmgr.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/standby.h" +#include "utils/builtins.h" +#include "utils/memutils.h" +#include "utils/snapmgr.h" +#include "utils/snapshot.h" + +/* + * This struct contains the current state of the snapshot building + * machinery. Besides a forward declaration in the header, it is not exposed + * to the public, so we can easily change its contents. 
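+ *
+ * (The state machine illustrated above is tracked by the "state" field
+ * below; "next_phase_at" records the xid at which the next state
+ * transition can happen while the initial snapshot is being built.)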
+ */ +struct SnapBuild +{ + /* how far are we along building our first full snapshot */ + SnapBuildState state; + + /* private memory context used to allocate memory for this module. */ + MemoryContext context; + + /* all transactions < than this have committed/aborted */ + TransactionId xmin; + + /* all transactions >= than this are uncommitted */ + TransactionId xmax; + + /* + * Don't replay commits from an LSN < this LSN. This can be set externally + * but it will also be advanced (never retreat) from within snapbuild.c. + */ + XLogRecPtr start_decoding_at; + + /* + * LSN at which two-phase decoding was enabled or LSN at which we found a + * consistent point at the time of slot creation. + * + * The prepared transactions, that were skipped because previously + * two-phase was not enabled or are not covered by initial snapshot, need + * to be sent later along with commit prepared and they must be before + * this point. + */ + XLogRecPtr two_phase_at; + + /* + * Don't start decoding WAL until the "xl_running_xacts" information + * indicates there are no running xids with an xid smaller than this. + */ + TransactionId initial_xmin_horizon; + + /* Indicates if we are building full snapshot or just catalog one. */ + bool building_full_snapshot; + + /* + * Snapshot that's valid to see the catalog state seen at this moment. + */ + Snapshot snapshot; + + /* + * LSN of the last location we are sure a snapshot has been serialized to. + */ + XLogRecPtr last_serialized_snapshot; + + /* + * The reorderbuffer we need to update with usable snapshots et al. + */ + ReorderBuffer *reorder; + + /* + * TransactionId at which the next phase of initial snapshot building will + * happen. InvalidTransactionId if not known (i.e. SNAPBUILD_START), or + * when no next phase necessary (SNAPBUILD_CONSISTENT). + */ + TransactionId next_phase_at; + + /* + * Array of transactions which could have catalog changes that committed + * between xmin and xmax. + */ + struct + { + /* number of committed transactions */ + size_t xcnt; + + /* available space for committed transactions */ + size_t xcnt_space; + + /* + * Until we reach a CONSISTENT state, we record commits of all + * transactions, not just the catalog changing ones. Record when that + * changes so we know we cannot export a snapshot safely anymore. + */ + bool includes_all_transactions; + + /* + * Array of committed transactions that have modified the catalog. + * + * As this array is frequently modified we do *not* keep it in + * xidComparator order. Instead we sort the array when building & + * distributing a snapshot. + * + * TODO: It's unclear whether that reasoning has much merit. Every + * time we add something here after becoming consistent will also + * require distributing a snapshot. Storing them sorted would + * potentially also make it easier to purge (but more complicated wrt + * wraparound?). Should be improved if sorting while building the + * snapshot shows up in profiles. + */ + TransactionId *xip; + } committed; + + /* + * Array of transactions and subtransactions that had modified catalogs + * and were running when the snapshot was serialized. + * + * We normally rely on some WAL record types such as HEAP2_NEW_CID to know + * if the transaction has changed the catalog. But it could happen that + * the logical decoding decodes only the commit record of the transaction + * after restoring the previously serialized snapshot in which case we + * will miss adding the xid to the snapshot and end up looking at the + * catalogs with the wrong snapshot. 
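+ *
+ * (Think of a transaction that created a table and was still running when
+ * the snapshot was serialized: after a restart we might decode nothing of
+ * it but its commit record, so the WAL alone gives us no hint that it
+ * changed the catalog.)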
+ * + * Now to avoid the above problem, we serialize the transactions that had + * modified the catalogs and are still running at the time of snapshot + * serialization. We fill this array while restoring the snapshot and then + * refer it while decoding commit to ensure if the xact has modified the + * catalog. We discard this array when all the xids in the list become old + * enough to matter. See SnapBuildPurgeOlderTxn for details. + */ + struct + { + /* number of transactions */ + size_t xcnt; + + /* This array must be sorted in xidComparator order */ + TransactionId *xip; + } catchange; +}; + +/* + * Starting a transaction -- which we need to do while exporting a snapshot -- + * removes knowledge about the previously used resowner, so we save it here. + */ +static __thread ResourceOwner SavedResourceOwnerDuringExport = NULL; +static __thread bool ExportInProgress = false; + +/* ->committed and ->catchange manipulation */ +static void SnapBuildPurgeOlderTxn(SnapBuild *builder); + +/* snapshot building/manipulation/distribution functions */ +static Snapshot SnapBuildBuildSnapshot(SnapBuild *builder); + +static void SnapBuildFreeSnapshot(Snapshot snap); + +static void SnapBuildSnapIncRefcount(Snapshot snap); + +static void SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn); + +static inline bool SnapBuildXidHasCatalogChanges(SnapBuild *builder, TransactionId xid, + uint32 xinfo); + +/* xlog reading helper functions for SnapBuildProcessRunningXacts */ +static bool SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running); +static void SnapBuildWaitSnapshot(xl_running_xacts *running, TransactionId cutoff); + +/* serialization functions */ +static void SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn); +static bool SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn); +static void SnapBuildRestoreContents(int fd, char *dest, Size size, const char *path); + +/* + * Allocate a new snapshot builder. + * + * xmin_horizon is the xid >= which we can be sure no catalog rows have been + * removed, start_lsn is the LSN >= we want to replay commits. + */ +SnapBuild * +AllocateSnapshotBuilder(ReorderBuffer *reorder, + TransactionId xmin_horizon, + XLogRecPtr start_lsn, + bool need_full_snapshot, + XLogRecPtr two_phase_at) +{ + MemoryContext context; + MemoryContext oldcontext; + SnapBuild *builder; + + /* allocate memory in own context, to have better accountability */ + context = AllocSetContextCreate(CurrentMemoryContext, + "snapshot builder context", + ALLOCSET_DEFAULT_SIZES); + oldcontext = MemoryContextSwitchTo(context); + + builder = palloc0(sizeof(SnapBuild)); + + builder->state = SNAPBUILD_START; + builder->context = context; + builder->reorder = reorder; + /* Other struct members initialized by zeroing via palloc0 above */ + + builder->committed.xcnt = 0; + builder->committed.xcnt_space = 128; /* arbitrary number */ + builder->committed.xip = + palloc0(builder->committed.xcnt_space * sizeof(TransactionId)); + builder->committed.includes_all_transactions = true; + + builder->catchange.xcnt = 0; + builder->catchange.xip = NULL; + + builder->initial_xmin_horizon = xmin_horizon; + builder->start_decoding_at = start_lsn; + builder->building_full_snapshot = need_full_snapshot; + builder->two_phase_at = two_phase_at; + + MemoryContextSwitchTo(oldcontext); + + return builder; +} + +/* + * Free a snapshot builder. 
+ */ +void +FreeSnapshotBuilder(SnapBuild *builder) +{ + MemoryContext context = builder->context; + + /* free snapshot explicitly, that contains some error checking */ + if (builder->snapshot != NULL) + { + SnapBuildSnapDecRefcount(builder->snapshot); + builder->snapshot = NULL; + } + + /* other resources are deallocated via memory context reset */ + MemoryContextDelete(context); +} + +/* + * Free an unreferenced snapshot that has previously been built by us. + */ +static void +SnapBuildFreeSnapshot(Snapshot snap) +{ + /* make sure we don't get passed an external snapshot */ + Assert(snap->snapshot_type == SNAPSHOT_HISTORIC_MVCC); + + /* make sure nobody modified our snapshot */ + Assert(snap->curcid == FirstCommandId); + Assert(!snap->suboverflowed); + Assert(!snap->takenDuringRecovery); + Assert(snap->regd_count == 0); + + /* slightly more likely, so it's checked even without c-asserts */ + if (snap->copied) + elog(ERROR, "cannot free a copied snapshot"); + + if (snap->active_count) + elog(ERROR, "cannot free an active snapshot"); + + pfree(snap); +} + +/* + * In which state of snapshot building are we? + */ +SnapBuildState +SnapBuildCurrentState(SnapBuild *builder) +{ + return builder->state; +} + +/* + * Return the LSN at which the two-phase decoding was first enabled. + */ +XLogRecPtr +SnapBuildGetTwoPhaseAt(SnapBuild *builder) +{ + return builder->two_phase_at; +} + +/* + * Set the LSN at which two-phase decoding is enabled. + */ +void +SnapBuildSetTwoPhaseAt(SnapBuild *builder, XLogRecPtr ptr) +{ + builder->two_phase_at = ptr; +} + +/* + * Should the contents of transaction ending at 'ptr' be decoded? + */ +bool +SnapBuildXactNeedsSkip(SnapBuild *builder, XLogRecPtr ptr) +{ + return ptr < builder->start_decoding_at; +} + +/* + * Increase refcount of a snapshot. + * + * This is used when handing out a snapshot to some external resource or when + * adding a Snapshot as builder->snapshot. + */ +static void +SnapBuildSnapIncRefcount(Snapshot snap) +{ + snap->active_count++; +} + +/* + * Decrease refcount of a snapshot and free if the refcount reaches zero. + * + * Externally visible, so that external resources that have been handed an + * IncRef'ed Snapshot can adjust its refcount easily. + */ +void +SnapBuildSnapDecRefcount(Snapshot snap) +{ + /* make sure we don't get passed an external snapshot */ + Assert(snap->snapshot_type == SNAPSHOT_HISTORIC_MVCC); + + /* make sure nobody modified our snapshot */ + Assert(snap->curcid == FirstCommandId); + Assert(!snap->suboverflowed); + Assert(!snap->takenDuringRecovery); + + Assert(snap->regd_count == 0); + + Assert(snap->active_count > 0); + + /* slightly more likely, so it's checked even without casserts */ + if (snap->copied) + elog(ERROR, "cannot free a copied snapshot"); + + snap->active_count--; + if (snap->active_count == 0) + SnapBuildFreeSnapshot(snap); +} + +/* + * Build a new snapshot, based on currently committed catalog-modifying + * transactions. + * + * In-progress transactions with catalog access are *not* allowed to modify + * these snapshots; they have to copy them and fill in appropriate ->curcid + * and ->subxip/subxcnt values. 
+ */ +static Snapshot +SnapBuildBuildSnapshot(SnapBuild *builder) +{ + Snapshot snapshot; + Size ssize; + + Assert(builder->state >= SNAPBUILD_FULL_SNAPSHOT); + + ssize = sizeof(SnapshotData) + + sizeof(TransactionId) * builder->committed.xcnt + + sizeof(TransactionId) * 1 /* toplevel xid */ ; + + snapshot = MemoryContextAllocZero(builder->context, ssize); + + snapshot->snapshot_type = SNAPSHOT_HISTORIC_MVCC; + + /* + * We misuse the original meaning of SnapshotData's xip and subxip fields + * to make the more fitting for our needs. + * + * In the 'xip' array we store transactions that have to be treated as + * committed. Since we will only ever look at tuples from transactions + * that have modified the catalog it's more efficient to store those few + * that exist between xmin and xmax (frequently there are none). + * + * Snapshots that are used in transactions that have modified the catalog + * also use the 'subxip' array to store their toplevel xid and all the + * subtransaction xids so we can recognize when we need to treat rows as + * visible that are not in xip but still need to be visible. Subxip only + * gets filled when the transaction is copied into the context of a + * catalog modifying transaction since we otherwise share a snapshot + * between transactions. As long as a txn hasn't modified the catalog it + * doesn't need to treat any uncommitted rows as visible, so there is no + * need for those xids. + * + * Both arrays are qsort'ed so that we can use bsearch() on them. + */ + Assert(TransactionIdIsNormal(builder->xmin)); + Assert(TransactionIdIsNormal(builder->xmax)); + + snapshot->xmin = builder->xmin; + snapshot->xmax = builder->xmax; + + /* store all transactions to be treated as committed by this snapshot */ + snapshot->xip = + (TransactionId *) ((char *) snapshot + sizeof(SnapshotData)); + snapshot->xcnt = builder->committed.xcnt; + memcpy(snapshot->xip, + builder->committed.xip, + builder->committed.xcnt * sizeof(TransactionId)); + + /* sort so we can bsearch() */ + qsort(snapshot->xip, snapshot->xcnt, sizeof(TransactionId), xidComparator); + + /* + * Initially, subxip is empty, i.e. it's a snapshot to be used by + * transactions that don't modify the catalog. Will be filled by + * ReorderBufferCopySnap() if necessary. + */ + snapshot->subxcnt = 0; + snapshot->subxip = NULL; + + snapshot->suboverflowed = false; + snapshot->takenDuringRecovery = false; + snapshot->copied = false; + snapshot->curcid = FirstCommandId; + snapshot->active_count = 0; + snapshot->regd_count = 0; + snapshot->snapXactCompletionCount = 0; + + return snapshot; +} + +/* + * Build the initial slot snapshot and convert it to a normal snapshot that + * is understood by HeapTupleSatisfiesMVCC. + * + * The snapshot will be usable directly in current transaction or exported + * for loading in different transaction. 
+ */ +Snapshot +SnapBuildInitialSnapshot(SnapBuild *builder) +{ + Snapshot snap; + TransactionId xid; + TransactionId safeXid; + TransactionId *newxip; + int newxcnt = 0; + + Assert(XactIsoLevel == XACT_REPEATABLE_READ); + Assert(builder->building_full_snapshot); + + /* don't allow older snapshots */ + InvalidateCatalogSnapshot(); /* about to overwrite MyProc->xmin */ + if (HaveRegisteredOrActiveSnapshot()) + elog(ERROR, "cannot build an initial slot snapshot when snapshots exist"); + Assert(!HistoricSnapshotActive()); + + if (builder->state != SNAPBUILD_CONSISTENT) + elog(ERROR, "cannot build an initial slot snapshot before reaching a consistent state"); + + if (!builder->committed.includes_all_transactions) + elog(ERROR, "cannot build an initial slot snapshot, not all transactions are monitored anymore"); + + /* so we don't overwrite the existing value */ + if (TransactionIdIsValid(MyProc->xmin)) + elog(ERROR, "cannot build an initial slot snapshot when MyProc->xmin already is valid"); + + snap = SnapBuildBuildSnapshot(builder); + + /* + * We know that snap->xmin is alive, enforced by the logical xmin + * mechanism. Due to that we can do this without locks, we're only + * changing our own value. + * + * Building an initial snapshot is expensive and an unenforced xmin + * horizon would have bad consequences, therefore always double-check that + * the horizon is enforced. + */ + LWLockAcquire(ProcArrayLock, LW_SHARED); + safeXid = GetOldestSafeDecodingTransactionId(false); + LWLockRelease(ProcArrayLock); + + if (TransactionIdFollows(safeXid, snap->xmin)) + elog(ERROR, "cannot build an initial slot snapshot as oldest safe xid %u follows snapshot's xmin %u", + safeXid, snap->xmin); + + MyProc->xmin = snap->xmin; + + /* allocate in transaction context */ + newxip = (TransactionId *) + palloc(sizeof(TransactionId) * GetMaxSnapshotXidCount()); + + /* + * snapbuild.c builds transactions in an "inverted" manner, which means it + * stores committed transactions in ->xip, not ones in progress. Build a + * classical snapshot by marking all non-committed transactions as + * in-progress. This can be expensive. + */ + for (xid = snap->xmin; NormalTransactionIdPrecedes(xid, snap->xmax);) + { + void *test; + + /* + * Check whether transaction committed using the decoding snapshot + * meaning of ->xip. + */ + test = bsearch(&xid, snap->xip, snap->xcnt, + sizeof(TransactionId), xidComparator); + + if (test == NULL) + { + if (newxcnt >= GetMaxSnapshotXidCount()) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("initial slot snapshot too large"))); + + newxip[newxcnt++] = xid; + } + + TransactionIdAdvance(xid); + } + + /* adjust remaining snapshot fields as needed */ + snap->snapshot_type = SNAPSHOT_MVCC; + snap->xcnt = newxcnt; + snap->xip = newxip; + + return snap; +} + +/* + * Export a snapshot so it can be set in another session with SET TRANSACTION + * SNAPSHOT. + * + * For that we need to start a transaction in the current backend as the + * importing side checks whether the source transaction is still open to make + * sure the xmin horizon hasn't advanced since then. 
+ */ +const char * +SnapBuildExportSnapshot(SnapBuild *builder) +{ + Snapshot snap; + char *snapname; + + if (IsTransactionOrTransactionBlock()) + elog(ERROR, "cannot export a snapshot from within a transaction"); + + if (SavedResourceOwnerDuringExport) + elog(ERROR, "can only export one snapshot at a time"); + + SavedResourceOwnerDuringExport = CurrentResourceOwner; + ExportInProgress = true; + + StartTransactionCommand(); + + /* There doesn't seem to a nice API to set these */ + XactIsoLevel = XACT_REPEATABLE_READ; + XactReadOnly = true; + + snap = SnapBuildInitialSnapshot(builder); + + /* + * now that we've built a plain snapshot, make it active and use the + * normal mechanisms for exporting it + */ + snapname = ExportSnapshot(snap); + + ereport(LOG, + (errmsg_plural("exported logical decoding snapshot: \"%s\" with %u transaction ID", + "exported logical decoding snapshot: \"%s\" with %u transaction IDs", + snap->xcnt, + snapname, snap->xcnt))); + return snapname; +} + +/* + * Ensure there is a snapshot and if not build one for current transaction. + */ +Snapshot +SnapBuildGetOrBuildSnapshot(SnapBuild *builder) +{ + Assert(builder->state == SNAPBUILD_CONSISTENT); + + /* only build a new snapshot if we don't have a prebuilt one */ + if (builder->snapshot == NULL) + { + builder->snapshot = SnapBuildBuildSnapshot(builder); + /* increase refcount for the snapshot builder */ + SnapBuildSnapIncRefcount(builder->snapshot); + } + + return builder->snapshot; +} + +/* + * Reset a previously SnapBuildExportSnapshot()'ed snapshot if there is + * any. Aborts the previously started transaction and resets the resource + * owner back to its original value. + */ +void +SnapBuildClearExportedSnapshot(void) +{ + ResourceOwner tmpResOwner; + + /* nothing exported, that is the usual case */ + if (!ExportInProgress) + return; + + if (!IsTransactionState()) + elog(ERROR, "clearing exported snapshot in wrong transaction state"); + + /* + * AbortCurrentTransaction() takes care of resetting the snapshot state, + * so remember SavedResourceOwnerDuringExport. + */ + tmpResOwner = SavedResourceOwnerDuringExport; + + /* make sure nothing could have ever happened */ + AbortCurrentTransaction(); + + CurrentResourceOwner = tmpResOwner; +} + +/* + * Clear snapshot export state during transaction abort. + */ +void +SnapBuildResetExportedSnapshotState(void) +{ + SavedResourceOwnerDuringExport = NULL; + ExportInProgress = false; +} + +/* + * Handle the effects of a single heap change, appropriate to the current state + * of the snapshot builder and returns whether changes made at (xid, lsn) can + * be decoded. + */ +bool +SnapBuildProcessChange(SnapBuild *builder, TransactionId xid, XLogRecPtr lsn) +{ + /* + * We can't handle data in transactions if we haven't built a snapshot + * yet, so don't store them. + */ + if (builder->state < SNAPBUILD_FULL_SNAPSHOT) + return false; + + /* + * No point in keeping track of changes in transactions that we don't have + * enough information about to decode. This means that they started before + * we got into the SNAPBUILD_FULL_SNAPSHOT state. + */ + if (builder->state < SNAPBUILD_CONSISTENT && + TransactionIdPrecedes(xid, builder->next_phase_at)) + return false; + + /* + * If the reorderbuffer doesn't yet have a snapshot, add one now, it will + * be needed to decode the change we're currently processing. 
+ */ + if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, xid)) + { + /* only build a new snapshot if we don't have a prebuilt one */ + if (builder->snapshot == NULL) + { + builder->snapshot = SnapBuildBuildSnapshot(builder); + /* increase refcount for the snapshot builder */ + SnapBuildSnapIncRefcount(builder->snapshot); + } + + /* + * Increase refcount for the transaction we're handing the snapshot + * out to. + */ + SnapBuildSnapIncRefcount(builder->snapshot); + ReorderBufferSetBaseSnapshot(builder->reorder, xid, lsn, + builder->snapshot); + } + + return true; +} + +/* + * Do CommandId/combo CID handling after reading an xl_heap_new_cid record. + * This implies that a transaction has done some form of write to system + * catalogs. + */ +void +SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid, + XLogRecPtr lsn, xl_heap_new_cid *xlrec) +{ + CommandId cid; + + /* + * we only log new_cid's if a catalog tuple was modified, so mark the + * transaction as containing catalog modifications + */ + ReorderBufferXidSetCatalogChanges(builder->reorder, xid, lsn); + + ReorderBufferAddNewTupleCids(builder->reorder, xlrec->top_xid, lsn, + xlrec->target_locator, xlrec->target_tid, + xlrec->cmin, xlrec->cmax, + xlrec->combocid); + + /* figure out new command id */ + if (xlrec->cmin != InvalidCommandId && + xlrec->cmax != InvalidCommandId) + cid = Max(xlrec->cmin, xlrec->cmax); + else if (xlrec->cmax != InvalidCommandId) + cid = xlrec->cmax; + else if (xlrec->cmin != InvalidCommandId) + cid = xlrec->cmin; + else + { + cid = InvalidCommandId; /* silence compiler */ + elog(ERROR, "xl_heap_new_cid record without a valid CommandId"); + } + + ReorderBufferAddNewCommandId(builder->reorder, xid, lsn, cid + 1); +} + +/* + * Add a new Snapshot to all transactions we're decoding that currently are + * in-progress so they can see new catalog contents made by the transaction + * that just committed. This is necessary because those in-progress + * transactions will use the new catalog's contents from here on (at the very + * least everything they do needs to be compatible with newer catalog + * contents). + */ +static void +SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn) +{ + dlist_iter txn_i; + ReorderBufferTXN *txn; + + /* + * Iterate through all toplevel transactions. This can include + * subtransactions which we just don't yet know to be that, but that's + * fine, they will just get an unnecessary snapshot queued. + */ + dlist_foreach(txn_i, &builder->reorder->toplevel_by_lsn) + { + txn = dlist_container(ReorderBufferTXN, node, txn_i.cur); + + Assert(TransactionIdIsValid(txn->xid)); + + /* + * If we don't have a base snapshot yet, there are no changes in this + * transaction which in turn implies we don't yet need a snapshot at + * all. We'll add a snapshot when the first change gets queued. + * + * NB: This works correctly even for subtransactions because + * ReorderBufferAssignChild() takes care to transfer the base snapshot + * to the top-level transaction, and while iterating the changequeue + * we'll get the change from the subtxn. + */ + if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, txn->xid)) + continue; + + /* + * We don't need to add snapshot to prepared transactions as they + * should not see the new catalog contents. 
+ */ + if (rbtxn_prepared(txn) || rbtxn_skip_prepared(txn)) + continue; + + elog(DEBUG2, "adding a new snapshot to %u at %X/%X", + txn->xid, LSN_FORMAT_ARGS(lsn)); + + /* + * increase the snapshot's refcount for the transaction we are handing + * it out to + */ + SnapBuildSnapIncRefcount(builder->snapshot); + ReorderBufferAddSnapshot(builder->reorder, txn->xid, lsn, + builder->snapshot); + } +} + +/* + * Keep track of a new catalog changing transaction that has committed. + */ +static void +SnapBuildAddCommittedTxn(SnapBuild *builder, TransactionId xid) +{ + Assert(TransactionIdIsValid(xid)); + + if (builder->committed.xcnt == builder->committed.xcnt_space) + { + builder->committed.xcnt_space = builder->committed.xcnt_space * 2 + 1; + + elog(DEBUG1, "increasing space for committed transactions to %u", + (uint32) builder->committed.xcnt_space); + + builder->committed.xip = repalloc(builder->committed.xip, + builder->committed.xcnt_space * sizeof(TransactionId)); + } + + /* + * TODO: It might make sense to keep the array sorted here instead of + * doing it every time we build a new snapshot. On the other hand this + * gets called repeatedly when a transaction with subtransactions commits. + */ + builder->committed.xip[builder->committed.xcnt++] = xid; +} + +/* + * Remove knowledge about transactions we treat as committed or containing catalog + * changes that are smaller than ->xmin. Those won't ever get checked via + * the ->committed or ->catchange array, respectively. The committed xids will + * get checked via the clog machinery. + * + * We can ideally remove the transaction from catchange array once it is + * finished (committed/aborted) but that could be costly as we need to maintain + * the xids order in the array. + */ +static void +SnapBuildPurgeOlderTxn(SnapBuild *builder) +{ + int off; + TransactionId *workspace; + int surviving_xids = 0; + + /* not ready yet */ + if (!TransactionIdIsNormal(builder->xmin)) + return; + + /* TODO: Neater algorithm than just copying and iterating? */ + workspace = + MemoryContextAlloc(builder->context, + builder->committed.xcnt * sizeof(TransactionId)); + + /* copy xids that still are interesting to workspace */ + for (off = 0; off < builder->committed.xcnt; off++) + { + if (NormalTransactionIdPrecedes(builder->committed.xip[off], + builder->xmin)) + ; /* remove */ + else + workspace[surviving_xids++] = builder->committed.xip[off]; + } + + /* copy workspace back to persistent state */ + memcpy(builder->committed.xip, workspace, + surviving_xids * sizeof(TransactionId)); + + elog(DEBUG3, "purged committed transactions from %u to %u, xmin: %u, xmax: %u", + (uint32) builder->committed.xcnt, (uint32) surviving_xids, + builder->xmin, builder->xmax); + builder->committed.xcnt = surviving_xids; + + pfree(workspace); + + /* + * Purge xids in ->catchange as well. The purged array must also be sorted + * in xidComparator order. + */ + if (builder->catchange.xcnt > 0) + { + /* + * Since catchange.xip is sorted, we find the lower bound of xids that + * are still interesting. 
+ */ + for (off = 0; off < builder->catchange.xcnt; off++) + { + if (TransactionIdFollowsOrEquals(builder->catchange.xip[off], + builder->xmin)) + break; + } + + surviving_xids = builder->catchange.xcnt - off; + + if (surviving_xids > 0) + { + memmove(builder->catchange.xip, &(builder->catchange.xip[off]), + surviving_xids * sizeof(TransactionId)); + } + else + { + pfree(builder->catchange.xip); + builder->catchange.xip = NULL; + } + + elog(DEBUG3, "purged catalog modifying transactions from %u to %u, xmin: %u, xmax: %u", + (uint32) builder->catchange.xcnt, (uint32) surviving_xids, + builder->xmin, builder->xmax); + builder->catchange.xcnt = surviving_xids; + } +} + +/* + * Handle everything that needs to be done when a transaction commits + */ +void +SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid, + int nsubxacts, TransactionId *subxacts, uint32 xinfo) +{ + int nxact; + + bool needs_snapshot = false; + bool needs_timetravel = false; + bool sub_needs_timetravel = false; + + TransactionId xmax = xid; + + /* + * Transactions preceding BUILDING_SNAPSHOT will neither be decoded, nor + * will they be part of a snapshot. So we don't need to record anything. + */ + if (builder->state == SNAPBUILD_START || + (builder->state == SNAPBUILD_BUILDING_SNAPSHOT && + TransactionIdPrecedes(xid, builder->next_phase_at))) + { + /* ensure that only commits after this are getting replayed */ + if (builder->start_decoding_at <= lsn) + builder->start_decoding_at = lsn + 1; + return; + } + + if (builder->state < SNAPBUILD_CONSISTENT) + { + /* ensure that only commits after this are getting replayed */ + if (builder->start_decoding_at <= lsn) + builder->start_decoding_at = lsn + 1; + + /* + * If building an exportable snapshot, force xid to be tracked, even + * if the transaction didn't modify the catalog. + */ + if (builder->building_full_snapshot) + { + needs_timetravel = true; + } + } + + for (nxact = 0; nxact < nsubxacts; nxact++) + { + TransactionId subxid = subxacts[nxact]; + + /* + * Add subtransaction to base snapshot if catalog modifying, we don't + * distinguish to toplevel transactions there. + */ + if (SnapBuildXidHasCatalogChanges(builder, subxid, xinfo)) + { + sub_needs_timetravel = true; + needs_snapshot = true; + + elog(DEBUG1, "found subtransaction %u:%u with catalog changes", + xid, subxid); + + SnapBuildAddCommittedTxn(builder, subxid); + + if (NormalTransactionIdFollows(subxid, xmax)) + xmax = subxid; + } + + /* + * If we're forcing timetravel we also need visibility information + * about subtransaction, so keep track of subtransaction's state, even + * if not catalog modifying. Don't need to distribute a snapshot in + * that case. 
+ */ + else if (needs_timetravel) + { + SnapBuildAddCommittedTxn(builder, subxid); + if (NormalTransactionIdFollows(subxid, xmax)) + xmax = subxid; + } + } + + /* if top-level modified catalog, it'll need a snapshot */ + if (SnapBuildXidHasCatalogChanges(builder, xid, xinfo)) + { + elog(DEBUG2, "found top level transaction %u, with catalog changes", + xid); + needs_snapshot = true; + needs_timetravel = true; + SnapBuildAddCommittedTxn(builder, xid); + } + else if (sub_needs_timetravel) + { + /* track toplevel txn as well, subxact alone isn't meaningful */ + elog(DEBUG2, "forced transaction %u to do timetravel due to one of its subtransactions", + xid); + needs_timetravel = true; + SnapBuildAddCommittedTxn(builder, xid); + } + else if (needs_timetravel) + { + elog(DEBUG2, "forced transaction %u to do timetravel", xid); + + SnapBuildAddCommittedTxn(builder, xid); + } + + if (!needs_timetravel) + { + /* record that we cannot export a general snapshot anymore */ + builder->committed.includes_all_transactions = false; + } + + Assert(!needs_snapshot || needs_timetravel); + + /* + * Adjust xmax of the snapshot builder, we only do that for committed, + * catalog modifying, transactions, everything else isn't interesting for + * us since we'll never look at the respective rows. + */ + if (needs_timetravel && + (!TransactionIdIsValid(builder->xmax) || + TransactionIdFollowsOrEquals(xmax, builder->xmax))) + { + builder->xmax = xmax; + TransactionIdAdvance(builder->xmax); + } + + /* if there's any reason to build a historic snapshot, do so now */ + if (needs_snapshot) + { + /* + * If we haven't built a complete snapshot yet there's no need to hand + * it out, it wouldn't (and couldn't) be used anyway. + */ + if (builder->state < SNAPBUILD_FULL_SNAPSHOT) + return; + + /* + * Decrease the snapshot builder's refcount of the old snapshot, note + * that it still will be used if it has been handed out to the + * reorderbuffer earlier. + */ + if (builder->snapshot) + SnapBuildSnapDecRefcount(builder->snapshot); + + builder->snapshot = SnapBuildBuildSnapshot(builder); + + /* we might need to execute invalidations, add snapshot */ + if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, xid)) + { + SnapBuildSnapIncRefcount(builder->snapshot); + ReorderBufferSetBaseSnapshot(builder->reorder, xid, lsn, + builder->snapshot); + } + + /* refcount of the snapshot builder for the new snapshot */ + SnapBuildSnapIncRefcount(builder->snapshot); + + /* add a new catalog snapshot to all currently running transactions */ + SnapBuildDistributeNewCatalogSnapshot(builder, lsn); + } +} + +/* + * Check the reorder buffer and the snapshot to see if the given transaction has + * modified catalogs. + */ +static inline bool +SnapBuildXidHasCatalogChanges(SnapBuild *builder, TransactionId xid, + uint32 xinfo) +{ + if (ReorderBufferXidHasCatalogChanges(builder->reorder, xid)) + return true; + + /* + * The transactions that have changed catalogs must have invalidation + * info. 
+ */ + if (!(xinfo & XACT_XINFO_HAS_INVALS)) + return false; + + /* Check the catchange XID array */ + return ((builder->catchange.xcnt > 0) && + (bsearch(&xid, builder->catchange.xip, builder->catchange.xcnt, + sizeof(TransactionId), xidComparator) != NULL)); +} + +/* ----------------------------------- + * Snapshot building functions dealing with xlog records + * ----------------------------------- + */ + +/* + * Process a running xacts record, and use its information to first build a + * historic snapshot and later to release resources that aren't needed + * anymore. + */ +void +SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running) +{ + ReorderBufferTXN *txn; + TransactionId xmin; + + /* + * If we're not consistent yet, inspect the record to see whether it + * allows to get closer to being consistent. If we are consistent, dump + * our snapshot so others or we, after a restart, can use it. + */ + if (builder->state < SNAPBUILD_CONSISTENT) + { + /* returns false if there's no point in performing cleanup just yet */ + if (!SnapBuildFindSnapshot(builder, lsn, running)) + return; + } + else + SnapBuildSerialize(builder, lsn); + + /* + * Update range of interesting xids based on the running xacts + * information. We don't increase ->xmax using it, because once we are in + * a consistent state we can do that ourselves and much more efficiently + * so, because we only need to do it for catalog transactions since we + * only ever look at those. + * + * NB: We only increase xmax when a catalog modifying transaction commits + * (see SnapBuildCommitTxn). Because of this, xmax can be lower than + * xmin, which looks odd but is correct and actually more efficient, since + * we hit fast paths in heapam_visibility.c. + */ + builder->xmin = running->oldestRunningXid; + + /* Remove transactions we don't need to keep track off anymore */ + SnapBuildPurgeOlderTxn(builder); + + /* + * Advance the xmin limit for the current replication slot, to allow + * vacuum to clean up the tuples this slot has been protecting. + * + * The reorderbuffer might have an xmin among the currently running + * snapshots; use it if so. If not, we need only consider the snapshots + * we'll produce later, which can't be less than the oldest running xid in + * the record we're reading now. + */ + xmin = ReorderBufferGetOldestXmin(builder->reorder); + if (xmin == InvalidTransactionId) + xmin = running->oldestRunningXid; + elog(DEBUG3, "xmin: %u, xmax: %u, oldest running: %u, oldest xmin: %u", + builder->xmin, builder->xmax, running->oldestRunningXid, xmin); + LogicalIncreaseXminForSlot(lsn, xmin); + + /* + * Also tell the slot where we can restart decoding from. We don't want to + * do that after every commit because changing that implies an fsync of + * the logical slot's state file, so we only do it every time we see a + * running xacts record. + * + * Do so by looking for the oldest in progress transaction (determined by + * the first LSN of any of its relevant records). Every transaction + * remembers the last location we stored the snapshot to disk before its + * beginning. That point is where we can restart from. + */ + + /* + * Can't know about a serialized snapshot's location if we're not + * consistent. + */ + if (builder->state < SNAPBUILD_CONSISTENT) + return; + + txn = ReorderBufferGetOldestTXN(builder->reorder); + + /* + * oldest ongoing txn might have started when we didn't yet serialize + * anything because we hadn't reached a consistent state yet. 
+ */ + if (txn != NULL && txn->restart_decoding_lsn != InvalidXLogRecPtr) + LogicalIncreaseRestartDecodingForSlot(lsn, txn->restart_decoding_lsn); + + /* + * No in-progress transaction, can reuse the last serialized snapshot if + * we have one. + */ + else if (txn == NULL && + builder->reorder->current_restart_decoding_lsn != InvalidXLogRecPtr && + builder->last_serialized_snapshot != InvalidXLogRecPtr) + LogicalIncreaseRestartDecodingForSlot(lsn, + builder->last_serialized_snapshot); +} + + +/* + * Build the start of a snapshot that's capable of decoding the catalog. + * + * Helper function for SnapBuildProcessRunningXacts() while we're not yet + * consistent. + * + * Returns true if there is a point in performing internal maintenance/cleanup + * using the xl_running_xacts record. + */ +static bool +SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running) +{ + /* --- + * Build catalog decoding snapshot incrementally using information about + * the currently running transactions. There are several ways to do that: + * + * a) There were no running transactions when the xl_running_xacts record + * was inserted, jump to CONSISTENT immediately. We might find such a + * state while waiting on c)'s sub-states. + * + * b) This (in a previous run) or another decoding slot serialized a + * snapshot to disk that we can use. Can't use this method for the + * initial snapshot when slot is being created and needs full snapshot + * for export or direct use, as that snapshot will only contain catalog + * modifying transactions. + * + * c) First incrementally build a snapshot for catalog tuples + * (BUILDING_SNAPSHOT), that requires all, already in-progress, + * transactions to finish. Every transaction starting after that + * (FULL_SNAPSHOT state), has enough information to be decoded. But + * for older running transactions no viable snapshot exists yet, so + * CONSISTENT will only be reached once all of those have finished. + * --- + */ + + /* + * xl_running_xacts record is older than what we can use, we might not + * have all necessary catalog rows anymore. + */ + if (TransactionIdIsNormal(builder->initial_xmin_horizon) && + NormalTransactionIdPrecedes(running->oldestRunningXid, + builder->initial_xmin_horizon)) + { + ereport(DEBUG1, + (errmsg_internal("skipping snapshot at %X/%X while building logical decoding snapshot, xmin horizon too low", + LSN_FORMAT_ARGS(lsn)), + errdetail_internal("initial xmin horizon of %u vs the snapshot's %u", + builder->initial_xmin_horizon, running->oldestRunningXid))); + + + SnapBuildWaitSnapshot(running, builder->initial_xmin_horizon); + + return true; + } + + /* + * a) No transaction were running, we can jump to consistent. + * + * This is not affected by races around xl_running_xacts, because we can + * miss transaction commits, but currently not transactions starting. + * + * NB: We might have already started to incrementally assemble a snapshot, + * so we need to be careful to deal with that. + */ + if (running->oldestRunningXid == running->nextXid) + { + if (builder->start_decoding_at == InvalidXLogRecPtr || + builder->start_decoding_at <= lsn) + /* can decode everything after this */ + builder->start_decoding_at = lsn + 1; + + /* As no transactions were running xmin/xmax can be trivially set. 
*/ + builder->xmin = running->nextXid; /* < are finished */ + builder->xmax = running->nextXid; /* >= are running */ + + /* so we can safely use the faster comparisons */ + Assert(TransactionIdIsNormal(builder->xmin)); + Assert(TransactionIdIsNormal(builder->xmax)); + + builder->state = SNAPBUILD_CONSISTENT; + builder->next_phase_at = InvalidTransactionId; + + ereport(LOG, + (errmsg("logical decoding found consistent point at %X/%X", + LSN_FORMAT_ARGS(lsn)), + errdetail("There are no running transactions."))); + + return false; + } + /* b) valid on disk state and not building full snapshot */ + else if (!builder->building_full_snapshot && + SnapBuildRestore(builder, lsn)) + { + /* there won't be any state to cleanup */ + return false; + } + + /* + * c) transition from START to BUILDING_SNAPSHOT. + * + * In START state, and a xl_running_xacts record with running xacts is + * encountered. In that case, switch to BUILDING_SNAPSHOT state, and + * record xl_running_xacts->nextXid. Once all running xacts have finished + * (i.e. they're all >= nextXid), we have a complete catalog snapshot. It + * might look that we could use xl_running_xacts's ->xids information to + * get there quicker, but that is problematic because transactions marked + * as running, might already have inserted their commit record - it's + * infeasible to change that with locking. + */ + else if (builder->state == SNAPBUILD_START) + { + builder->state = SNAPBUILD_BUILDING_SNAPSHOT; + builder->next_phase_at = running->nextXid; + + /* + * Start with an xmin/xmax that's correct for future, when all the + * currently running transactions have finished. We'll update both + * while waiting for the pending transactions to finish. + */ + builder->xmin = running->nextXid; /* < are finished */ + builder->xmax = running->nextXid; /* >= are running */ + + /* so we can safely use the faster comparisons */ + Assert(TransactionIdIsNormal(builder->xmin)); + Assert(TransactionIdIsNormal(builder->xmax)); + + ereport(LOG, + (errmsg("logical decoding found initial starting point at %X/%X", + LSN_FORMAT_ARGS(lsn)), + errdetail("Waiting for transactions (approximately %d) older than %u to end.", + running->xcnt, running->nextXid))); + + SnapBuildWaitSnapshot(running, running->nextXid); + } + + /* + * c) transition from BUILDING_SNAPSHOT to FULL_SNAPSHOT. + * + * In BUILDING_SNAPSHOT state, and this xl_running_xacts' oldestRunningXid + * is >= than nextXid from when we switched to BUILDING_SNAPSHOT. This + * means all transactions starting afterwards have enough information to + * be decoded. Switch to FULL_SNAPSHOT. + */ + else if (builder->state == SNAPBUILD_BUILDING_SNAPSHOT && + TransactionIdPrecedesOrEquals(builder->next_phase_at, + running->oldestRunningXid)) + { + builder->state = SNAPBUILD_FULL_SNAPSHOT; + builder->next_phase_at = running->nextXid; + + ereport(LOG, + (errmsg("logical decoding found initial consistent point at %X/%X", + LSN_FORMAT_ARGS(lsn)), + errdetail("Waiting for transactions (approximately %d) older than %u to end.", + running->xcnt, running->nextXid))); + + SnapBuildWaitSnapshot(running, running->nextXid); + } + + /* + * c) transition from FULL_SNAPSHOT to CONSISTENT. + * + * In FULL_SNAPSHOT state, and this xl_running_xacts' oldestRunningXid is + * >= than nextXid from when we switched to FULL_SNAPSHOT. This means all + * transactions that are currently in progress have a catalog snapshot, + * and all their changes have been collected. Switch to CONSISTENT. 
+ */ + else if (builder->state == SNAPBUILD_FULL_SNAPSHOT && + TransactionIdPrecedesOrEquals(builder->next_phase_at, + running->oldestRunningXid)) + { + builder->state = SNAPBUILD_CONSISTENT; + builder->next_phase_at = InvalidTransactionId; + + ereport(LOG, + (errmsg("logical decoding found consistent point at %X/%X", + LSN_FORMAT_ARGS(lsn)), + errdetail("There are no old transactions anymore."))); + } + + /* + * We already started to track running xacts and need to wait for all + * in-progress ones to finish. We fall through to the normal processing of + * records so incremental cleanup can be performed. + */ + return true; +} + +/* --- + * Iterate through xids in record, wait for all older than the cutoff to + * finish. Then, if possible, log a new xl_running_xacts record. + * + * This isn't required for the correctness of decoding, but to: + * a) allow isolationtester to notice that we're currently waiting for + * something. + * b) log a new xl_running_xacts record where it'd be helpful, without having + * to wait for bgwriter or checkpointer. + * --- + */ +static void +SnapBuildWaitSnapshot(xl_running_xacts *running, TransactionId cutoff) +{ + int off; + + for (off = 0; off < running->xcnt; off++) + { + TransactionId xid = running->xids[off]; + + /* + * Upper layers should prevent that we ever need to wait on ourselves. + * Check anyway, since failing to do so would either result in an + * endless wait or an Assert() failure. + */ + if (TransactionIdIsCurrentTransactionId(xid)) + elog(ERROR, "waiting for ourselves"); + + if (TransactionIdFollows(xid, cutoff)) + continue; + + XactLockTableWait(xid, NULL, NULL, XLTW_None); + } + + /* + * All transactions we needed to finish finished - try to ensure there is + * another xl_running_xacts record in a timely manner, without having to + * wait for bgwriter or checkpointer to log one. During recovery we can't + * enforce that, so we'll have to wait. + */ + if (!RecoveryInProgress()) + { + LogStandbySnapshot(); + } +} + +/* ----------------------------------- + * Snapshot serialization support + * ----------------------------------- + */ + +/* + * We store current state of struct SnapBuild on disk in the following manner: + * + * struct SnapBuildOnDisk; + * TransactionId * committed.xcnt; (*not xcnt_space*) + * TransactionId * catchange.xcnt; + * + */ +typedef struct SnapBuildOnDisk +{ + /* first part of this struct needs to be version independent */ + + /* data not covered by checksum */ + uint32 magic; + pg_crc32c checksum; + + /* data covered by checksum */ + + /* version, in case we want to support pg_upgrade */ + uint32 version; + /* how large is the on disk data, excluding the constant sized part */ + uint32 length; + + /* version dependent part */ + SnapBuild builder; + + /* variable amount of TransactionIds follows */ +} SnapBuildOnDisk; + +#define SnapBuildOnDiskConstantSize \ + offsetof(SnapBuildOnDisk, builder) +#define SnapBuildOnDiskNotChecksummedSize \ + offsetof(SnapBuildOnDisk, version) + +#define SNAPBUILD_MAGIC 0x51A1E001 +#define SNAPBUILD_VERSION 5 + +/* + * Store/Load a snapshot from disk, depending on the snapshot builder's state. + * + * Supposed to be used by external (i.e. not snapbuild.c) code that just read + * a record that's a potential location for a serialized snapshot. 
+ */ +void +SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn) +{ + if (builder->state < SNAPBUILD_CONSISTENT) + SnapBuildRestore(builder, lsn); + else + SnapBuildSerialize(builder, lsn); +} + +/* + * Serialize the snapshot 'builder' at the location 'lsn' if it hasn't already + * been done by another decoding process. + */ +static void +SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn) +{ + Size needed_length; + SnapBuildOnDisk *ondisk = NULL; + TransactionId *catchange_xip = NULL; + MemoryContext old_ctx; + size_t catchange_xcnt; + char *ondisk_c; + int fd; + char tmppath[MAXPGPATH]; + char path[MAXPGPATH]; + int ret; + struct stat stat_buf; + Size sz; + + Assert(lsn != InvalidXLogRecPtr); + Assert(builder->last_serialized_snapshot == InvalidXLogRecPtr || + builder->last_serialized_snapshot <= lsn); + + /* + * no point in serializing if we cannot continue to work immediately after + * restoring the snapshot + */ + if (builder->state < SNAPBUILD_CONSISTENT) + return; + + /* consistent snapshots have no next phase */ + Assert(builder->next_phase_at == InvalidTransactionId); + + /* + * We identify snapshots by the LSN they are valid for. We don't need to + * include timelines in the name as each LSN maps to exactly one timeline + * unless the user used pg_resetwal or similar. If a user did so, there's + * no hope continuing to decode anyway. + */ + sprintf(path, "pg_logical/snapshots/%X-%X.snap", + LSN_FORMAT_ARGS(lsn)); + + /* + * first check whether some other backend already has written the snapshot + * for this LSN. It's perfectly fine if there's none, so we accept ENOENT + * as a valid state. Everything else is an unexpected error. + */ + ret = stat(path, &stat_buf); + + if (ret != 0 && errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", path))); + + else if (ret == 0) + { + /* + * somebody else has already serialized to this point, don't overwrite + * but remember location, so we don't need to read old data again. + * + * To be sure it has been synced to disk after the rename() from the + * tempfile filename to the real filename, we just repeat the fsync. + * That ought to be cheap because in most scenarios it should already + * be safely on disk. + */ + fsync_fname(path, false); + fsync_fname("pg_logical/snapshots", true); + + builder->last_serialized_snapshot = lsn; + goto out; + } + + /* + * there is an obvious race condition here between the time we stat(2) the + * file and us writing the file. But we rename the file into place + * atomically and all files created need to contain the same data anyway, + * so this is perfectly fine, although a bit of a resource waste. Locking + * seems like pointless complication. + */ + elog(DEBUG1, "serializing snapshot to %s", path); + + /* to make sure only we will write to this tempfile, include pid */ + sprintf(tmppath, "pg_logical/snapshots/%X-%X.snap.%d.tmp", + LSN_FORMAT_ARGS(lsn), MyProcPid); + + /* + * Unlink temporary file if it already exists, needs to have been before a + * crash/error since we won't enter this function twice from within a + * single decoding slot/backend and the temporary file contains the pid of + * the current process. 
+ */ + if (unlink(tmppath) != 0 && errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", tmppath))); + + old_ctx = MemoryContextSwitchTo(builder->context); + + /* Get the catalog modifying transactions that are yet not committed */ + catchange_xip = ReorderBufferGetCatalogChangesXacts(builder->reorder); + catchange_xcnt = dclist_count(&builder->reorder->catchange_txns); + + needed_length = sizeof(SnapBuildOnDisk) + + sizeof(TransactionId) * (builder->committed.xcnt + catchange_xcnt); + + ondisk_c = palloc0(needed_length); + ondisk = (SnapBuildOnDisk *) ondisk_c; + ondisk->magic = SNAPBUILD_MAGIC; + ondisk->version = SNAPBUILD_VERSION; + ondisk->length = needed_length; + INIT_CRC32C(ondisk->checksum); + COMP_CRC32C(ondisk->checksum, + ((char *) ondisk) + SnapBuildOnDiskNotChecksummedSize, + SnapBuildOnDiskConstantSize - SnapBuildOnDiskNotChecksummedSize); + ondisk_c += sizeof(SnapBuildOnDisk); + + memcpy(&ondisk->builder, builder, sizeof(SnapBuild)); + /* NULL-ify memory-only data */ + ondisk->builder.context = NULL; + ondisk->builder.snapshot = NULL; + ondisk->builder.reorder = NULL; + ondisk->builder.committed.xip = NULL; + ondisk->builder.catchange.xip = NULL; + /* update catchange only on disk data */ + ondisk->builder.catchange.xcnt = catchange_xcnt; + + COMP_CRC32C(ondisk->checksum, + &ondisk->builder, + sizeof(SnapBuild)); + + /* copy committed xacts */ + if (builder->committed.xcnt > 0) + { + sz = sizeof(TransactionId) * builder->committed.xcnt; + memcpy(ondisk_c, builder->committed.xip, sz); + COMP_CRC32C(ondisk->checksum, ondisk_c, sz); + ondisk_c += sz; + } + + /* copy catalog modifying xacts */ + if (catchange_xcnt > 0) + { + sz = sizeof(TransactionId) * catchange_xcnt; + memcpy(ondisk_c, catchange_xip, sz); + COMP_CRC32C(ondisk->checksum, ondisk_c, sz); + ondisk_c += sz; + } + + FIN_CRC32C(ondisk->checksum); + + /* we have valid data now, open tempfile and write it there */ + fd = OpenTransientFile(tmppath, + O_CREAT | O_EXCL | O_WRONLY | PG_BINARY); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", tmppath))); + + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_WRITE); + if ((write(fd, ondisk, needed_length)) != needed_length) + { + int save_errno = errno; + + CloseTransientFile(fd); + + /* if write didn't set errno, assume problem is no disk space */ + errno = save_errno ? save_errno : ENOSPC; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", tmppath))); + } + pgstat_report_wait_end(); + + /* + * fsync the file before renaming so that even if we crash after this we + * have either a fully valid file or nothing. + * + * It's safe to just ERROR on fsync() here because we'll retry the whole + * operation including the writes. + * + * TODO: Do the fsync() via checkpoints/restartpoints, doing it here has + * some noticeable overhead since it's performed synchronously during + * decoding? 
+ */ + pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_SYNC); + if (pg_fsync(fd) != 0) + { + int save_errno = errno; + + CloseTransientFile(fd); + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", tmppath))); + } + pgstat_report_wait_end(); + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", tmppath))); + + fsync_fname("pg_logical/snapshots", true); + + /* + * We may overwrite the work from some other backend, but that's ok, our + * snapshot is valid as well, we'll just have done some superfluous work. + */ + if (rename(tmppath, path) != 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not rename file \"%s\" to \"%s\": %m", + tmppath, path))); + } + + /* make sure we persist */ + fsync_fname(path, false); + fsync_fname("pg_logical/snapshots", true); + + /* + * Now there's no way we can lose the dumped state anymore, remember this + * as a serialization point. + */ + builder->last_serialized_snapshot = lsn; + + MemoryContextSwitchTo(old_ctx); + +out: + ReorderBufferSetRestartPoint(builder->reorder, + builder->last_serialized_snapshot); + /* be tidy */ + if (ondisk) + pfree(ondisk); + if (catchange_xip) + pfree(catchange_xip); +} + +/* + * Restore a snapshot into 'builder' if previously one has been stored at the + * location indicated by 'lsn'. Returns true if successful, false otherwise. + */ +static bool +SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn) +{ + SnapBuildOnDisk ondisk; + int fd; + char path[MAXPGPATH]; + Size sz; + pg_crc32c checksum; + + /* no point in loading a snapshot if we're already there */ + if (builder->state == SNAPBUILD_CONSISTENT) + return false; + + sprintf(path, "pg_logical/snapshots/%X-%X.snap", + LSN_FORMAT_ARGS(lsn)); + + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + + if (fd < 0 && errno == ENOENT) + return false; + else if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + + /* ---- + * Make sure the snapshot had been stored safely to disk, that's normally + * cheap. + * Note that we do not need PANIC here, nobody will be able to use the + * slot without fsyncing, and saving it won't succeed without an fsync() + * either... 
+ * ---- + */ + fsync_fname(path, false); + fsync_fname("pg_logical/snapshots", true); + + + /* read statically sized portion of snapshot */ + SnapBuildRestoreContents(fd, (char *) &ondisk, SnapBuildOnDiskConstantSize, path); + + if (ondisk.magic != SNAPBUILD_MAGIC) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("snapbuild state file \"%s\" has wrong magic number: %u instead of %u", + path, ondisk.magic, SNAPBUILD_MAGIC))); + + if (ondisk.version != SNAPBUILD_VERSION) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("snapbuild state file \"%s\" has unsupported version: %u instead of %u", + path, ondisk.version, SNAPBUILD_VERSION))); + + INIT_CRC32C(checksum); + COMP_CRC32C(checksum, + ((char *) &ondisk) + SnapBuildOnDiskNotChecksummedSize, + SnapBuildOnDiskConstantSize - SnapBuildOnDiskNotChecksummedSize); + + /* read SnapBuild */ + SnapBuildRestoreContents(fd, (char *) &ondisk.builder, sizeof(SnapBuild), path); + COMP_CRC32C(checksum, &ondisk.builder, sizeof(SnapBuild)); + + /* restore committed xacts information */ + if (ondisk.builder.committed.xcnt > 0) + { + sz = sizeof(TransactionId) * ondisk.builder.committed.xcnt; + ondisk.builder.committed.xip = MemoryContextAllocZero(builder->context, sz); + SnapBuildRestoreContents(fd, (char *) ondisk.builder.committed.xip, sz, path); + COMP_CRC32C(checksum, ondisk.builder.committed.xip, sz); + } + + /* restore catalog modifying xacts information */ + if (ondisk.builder.catchange.xcnt > 0) + { + sz = sizeof(TransactionId) * ondisk.builder.catchange.xcnt; + ondisk.builder.catchange.xip = MemoryContextAllocZero(builder->context, sz); + SnapBuildRestoreContents(fd, (char *) ondisk.builder.catchange.xip, sz, path); + COMP_CRC32C(checksum, ondisk.builder.catchange.xip, sz); + } + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); + + FIN_CRC32C(checksum); + + /* verify checksum of what we've read */ + if (!EQ_CRC32C(checksum, ondisk.checksum)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("checksum mismatch for snapbuild state file \"%s\": is %u, should be %u", + path, checksum, ondisk.checksum))); + + /* + * ok, we now have a sensible snapshot here, figure out if it has more + * information than we have. + */ + + /* + * We are only interested in consistent snapshots for now, comparing + * whether one incomplete snapshot is more "advanced" seems to be + * unnecessarily complex. + */ + if (ondisk.builder.state < SNAPBUILD_CONSISTENT) + goto snapshot_not_interesting; + + /* + * Don't use a snapshot that requires an xmin that we cannot guarantee to + * be available. + */ + if (TransactionIdPrecedes(ondisk.builder.xmin, builder->initial_xmin_horizon)) + goto snapshot_not_interesting; + + /* + * Consistent snapshots have no next phase. Reset next_phase_at as it is + * possible that an old value may remain. + */ + Assert(ondisk.builder.next_phase_at == InvalidTransactionId); + builder->next_phase_at = InvalidTransactionId; + + /* ok, we think the snapshot is sensible, copy over everything important */ + builder->xmin = ondisk.builder.xmin; + builder->xmax = ondisk.builder.xmax; + builder->state = ondisk.builder.state; + + builder->committed.xcnt = ondisk.builder.committed.xcnt; + /* We only allocated/stored xcnt, not xcnt_space xids ! 
*/ + /* don't overwrite preallocated xip, if we don't have anything here */ + if (builder->committed.xcnt > 0) + { + pfree(builder->committed.xip); + builder->committed.xcnt_space = ondisk.builder.committed.xcnt; + builder->committed.xip = ondisk.builder.committed.xip; + } + ondisk.builder.committed.xip = NULL; + + /* set catalog modifying transactions */ + if (builder->catchange.xip) + pfree(builder->catchange.xip); + builder->catchange.xcnt = ondisk.builder.catchange.xcnt; + builder->catchange.xip = ondisk.builder.catchange.xip; + ondisk.builder.catchange.xip = NULL; + + /* our snapshot is not interesting anymore, build a new one */ + if (builder->snapshot != NULL) + { + SnapBuildSnapDecRefcount(builder->snapshot); + } + builder->snapshot = SnapBuildBuildSnapshot(builder); + SnapBuildSnapIncRefcount(builder->snapshot); + + ReorderBufferSetRestartPoint(builder->reorder, lsn); + + Assert(builder->state == SNAPBUILD_CONSISTENT); + + ereport(LOG, + (errmsg("logical decoding found consistent point at %X/%X", + LSN_FORMAT_ARGS(lsn)), + errdetail("Logical decoding will begin using saved snapshot."))); + return true; + +snapshot_not_interesting: + if (ondisk.builder.committed.xip != NULL) + pfree(ondisk.builder.committed.xip); + if (ondisk.builder.catchange.xip != NULL) + pfree(ondisk.builder.catchange.xip); + return false; +} + +/* + * Read the contents of the serialized snapshot to 'dest'. + */ +static void +SnapBuildRestoreContents(int fd, char *dest, Size size, const char *path) +{ + int readBytes; + + pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_READ); + readBytes = read(fd, dest, size); + pgstat_report_wait_end(); + if (readBytes != size) + { + int save_errno = errno; + + CloseTransientFile(fd); + + if (readBytes < 0) + { + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", path))); + } + else + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read file \"%s\": read %d of %zu", + path, readBytes, size))); + } +} + +/* + * Remove all serialized snapshots that are not required anymore because no + * slot can need them. This doesn't actually have to run during a checkpoint, + * but it's a convenient point to schedule this. + * + * NB: We run this during checkpoints even if logical decoding is disabled so + * we cleanup old slots at some point after it got disabled. + */ +void +CheckPointSnapBuild(void) +{ + XLogRecPtr cutoff; + XLogRecPtr redo; + DIR *snap_dir; + struct dirent *snap_de; + char path[MAXPGPATH + 21]; + + /* + * We start off with a minimum of the last redo pointer. No new + * replication slot will start before that, so that's a safe upper bound + * for removal. 
+ */ + redo = GetRedoRecPtr(); + + /* now check for the restart ptrs from existing slots */ + cutoff = ReplicationSlotsComputeLogicalRestartLSN(); + + /* don't start earlier than the restart lsn */ + if (redo < cutoff) + cutoff = redo; + + snap_dir = AllocateDir("pg_logical/snapshots"); + while ((snap_de = ReadDir(snap_dir, "pg_logical/snapshots")) != NULL) + { + uint32 hi; + uint32 lo; + XLogRecPtr lsn; + PGFileType de_type; + + if (strcmp(snap_de->d_name, ".") == 0 || + strcmp(snap_de->d_name, "..") == 0) + continue; + + snprintf(path, sizeof(path), "pg_logical/snapshots/%s", snap_de->d_name); + de_type = get_dirent_type(path, snap_de, false, DEBUG1); + + if (de_type != PGFILETYPE_ERROR && de_type != PGFILETYPE_REG) + { + elog(DEBUG1, "only regular files expected: %s", path); + continue; + } + + /* + * temporary filenames from SnapBuildSerialize() include the LSN and + * everything but are postfixed by .$pid.tmp. We can just remove them + * the same as other files because there can be none that are + * currently being written that are older than cutoff. + * + * We just log a message if a file doesn't fit the pattern, it's + * probably some editors lock/state file or similar... + */ + if (sscanf(snap_de->d_name, "%X-%X.snap", &hi, &lo) != 2) + { + ereport(LOG, + (errmsg("could not parse file name \"%s\"", path))); + continue; + } + + lsn = ((uint64) hi) << 32 | lo; + + /* check whether we still need it */ + if (lsn < cutoff || cutoff == InvalidXLogRecPtr) + { + elog(DEBUG1, "removing snapbuild snapshot %s", path); + + /* + * It's not particularly harmful, though strange, if we can't + * remove the file here. Don't prevent the checkpoint from + * completing, that'd be a cure worse than the disease. + */ + if (unlink(path) < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", + path))); + continue; + } + } + } + FreeDir(snap_dir); +} diff --git a/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/tablesync.c b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/tablesync.c new file mode 100644 index 00000000000..6281537058c --- /dev/null +++ b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/tablesync.c @@ -0,0 +1,1690 @@ +/*------------------------------------------------------------------------- + * tablesync.c + * PostgreSQL logical replication: initial table data synchronization + * + * Copyright (c) 2012-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/tablesync.c + * + * NOTES + * This file contains code for initial table data synchronization for + * logical replication. + * + * The initial data synchronization is done separately for each table, + * in a separate apply worker that only fetches the initial snapshot data + * from the publisher and then synchronizes the position in the stream with + * the leader apply worker. + * + * There are several reasons for doing the synchronization this way: + * - It allows us to parallelize the initial data synchronization + * which lowers the time needed for it to happen. + * - The initial synchronization does not have to hold the xid and LSN + * for the time it takes to copy data of all tables, causing less + * bloat and lower disk consumption compared to doing the + * synchronization in a single process for the whole database. + * - It allows us to synchronize any tables added after the initial + * synchronization has finished. 
+ * + * The stream position synchronization works in multiple steps: + * - Apply worker requests a tablesync worker to start, setting the new + * table state to INIT. + * - Tablesync worker starts; changes table state from INIT to DATASYNC while + * copying. + * - Tablesync worker does initial table copy; there is a FINISHEDCOPY (sync + * worker specific) state to indicate when the copy phase has completed, so + * if the worker crashes with this (non-memory) state then the copy will not + * be re-attempted. + * - Tablesync worker then sets table state to SYNCWAIT; waits for state change. + * - Apply worker periodically checks for tables in SYNCWAIT state. When + * any appear, it sets the table state to CATCHUP and starts loop-waiting + * until either the table state is set to SYNCDONE or the sync worker + * exits. + * - After the sync worker has seen the state change to CATCHUP, it will + * read the stream and apply changes (acting like an apply worker) until + * it catches up to the specified stream position. Then it sets the + * state to SYNCDONE. There might be zero changes applied between + * CATCHUP and SYNCDONE, because the sync worker might be ahead of the + * apply worker. + * - Once the state is set to SYNCDONE, the apply will continue tracking + * the table until it reaches the SYNCDONE stream position, at which + * point it sets state to READY and stops tracking. Again, there might + * be zero changes in between. + * + * So the state progression is always: INIT -> DATASYNC -> FINISHEDCOPY + * -> SYNCWAIT -> CATCHUP -> SYNCDONE -> READY. + * + * The catalog pg_subscription_rel is used to keep information about + * subscribed tables and their state. The catalog holds all states + * except SYNCWAIT and CATCHUP which are only in shared memory. 
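+ *
+ * As an illustration (not something the workers themselves do), the persisted
+ * per-table state can be inspected with a query such as:
+ *     SELECT srrelid::regclass, srsubstate, srsublsn FROM pg_subscription_rel;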
+ * + * Example flows look like this: + * - Apply is in front: + * sync:8 + * -> set in catalog FINISHEDCOPY + * -> set in memory SYNCWAIT + * apply:10 + * -> set in memory CATCHUP + * -> enter wait-loop + * sync:10 + * -> set in catalog SYNCDONE + * -> exit + * apply:10 + * -> exit wait-loop + * -> continue rep + * apply:11 + * -> set in catalog READY + * + * - Sync is in front: + * sync:10 + * -> set in catalog FINISHEDCOPY + * -> set in memory SYNCWAIT + * apply:8 + * -> set in memory CATCHUP + * -> continue per-table filtering + * sync:10 + * -> set in catalog SYNCDONE + * -> exit + * apply:10 + * -> set in catalog READY + * -> stop per-table filtering + * -> continue rep + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/table.h" +#include "access/xact.h" +#include "catalog/indexing.h" +#include "catalog/pg_subscription_rel.h" +#include "catalog/pg_type.h" +#include "commands/copy.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "parser/parse_relation.h" +#include "pgstat.h" +#include "replication/logicallauncher.h" +#include "replication/logicalrelation.h" +#include "replication/walreceiver.h" +#include "replication/worker_internal.h" +#include "replication/slot.h" +#include "replication/origin.h" +#include "storage/ipc.h" +#include "storage/lmgr.h" +#include "utils/acl.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/rls.h" +#include "utils/snapmgr.h" +#include "utils/syscache.h" +#include "utils/usercontext.h" + +typedef enum +{ + SYNC_TABLE_STATE_NEEDS_REBUILD, + SYNC_TABLE_STATE_REBUILD_STARTED, + SYNC_TABLE_STATE_VALID, +} SyncingTablesState; + +static __thread SyncingTablesState table_states_validity = SYNC_TABLE_STATE_NEEDS_REBUILD; +static __thread List *table_states_not_ready = NIL; +static bool FetchTableStates(bool *started_tx); + +static __thread StringInfo copybuf = NULL; + +/* + * Exit routine for synchronization worker. + */ +static void +pg_attribute_noreturn() +finish_sync_worker(void) +{ + /* + * Commit any outstanding transaction. This is the usual case, unless + * there was nothing to do for the table. + */ + if (IsTransactionState()) + { + CommitTransactionCommand(); + pgstat_report_stat(true); + } + + /* And flush all writes. */ + XLogFlush(GetXLogWriteRecPtr()); + + StartTransactionCommand(); + ereport(LOG, + (errmsg("logical replication table synchronization worker for subscription \"%s\", table \"%s\" has finished", + MySubscription->name, + get_rel_name(MyLogicalRepWorker->relid)))); + CommitTransactionCommand(); + + /* Find the leader apply worker and signal it. */ + logicalrep_worker_wakeup(MyLogicalRepWorker->subid, InvalidOid); + + /* Stop gracefully */ + proc_exit(0); +} + +/* + * Wait until the relation sync state is set in the catalog to the expected + * one; return true when it happens. + * + * Returns false if the table sync worker or the table itself have + * disappeared, or the table state has been reset. + * + * Currently, this is used in the apply worker when transitioning from + * CATCHUP state to SYNCDONE. 
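+ *
+ * For example, after telling a sync worker to catch up, the apply worker
+ * below calls wait_for_relation_state_change(rstate->relid,
+ * SUBREL_STATE_SYNCDONE), polling the catalog roughly once per second until
+ * the state reaches SYNCDONE or the sync worker disappears.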
+ */ +static bool +wait_for_relation_state_change(Oid relid, char expected_state) +{ + char state; + + for (;;) + { + LogicalRepWorker *worker; + XLogRecPtr statelsn; + + CHECK_FOR_INTERRUPTS(); + + InvalidateCatalogSnapshot(); + state = GetSubscriptionRelState(MyLogicalRepWorker->subid, + relid, &statelsn); + + if (state == SUBREL_STATE_UNKNOWN) + break; + + if (state == expected_state) + return true; + + /* Check if the sync worker is still running and bail if not. */ + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + worker = logicalrep_worker_find(MyLogicalRepWorker->subid, relid, + false); + LWLockRelease(LogicalRepWorkerLock); + if (!worker) + break; + + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 1000L, WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE); + + ResetLatch(MyLatch); + } + + return false; +} + +/* + * Wait until the apply worker changes the state of our synchronization + * worker to the expected one. + * + * Used when transitioning from SYNCWAIT state to CATCHUP. + * + * Returns false if the apply worker has disappeared. + */ +static bool +wait_for_worker_state_change(char expected_state) +{ + int rc; + + for (;;) + { + LogicalRepWorker *worker; + + CHECK_FOR_INTERRUPTS(); + + /* + * Done if already in correct state. (We assume this fetch is atomic + * enough to not give a misleading answer if we do it with no lock.) + */ + if (MyLogicalRepWorker->relstate == expected_state) + return true; + + /* + * Bail out if the apply worker has died, else signal it we're + * waiting. + */ + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + worker = logicalrep_worker_find(MyLogicalRepWorker->subid, + InvalidOid, false); + if (worker && worker->proc) + logicalrep_worker_wakeup_ptr(worker); + LWLockRelease(LogicalRepWorkerLock); + if (!worker) + break; + + /* + * Wait. We expect to get a latch signal back from the apply worker, + * but use a timeout in case it dies without sending one. + */ + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 1000L, WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE); + + if (rc & WL_LATCH_SET) + ResetLatch(MyLatch); + } + + return false; +} + +/* + * Callback from syscache invalidation. + */ +void +invalidate_syncing_table_states(Datum arg, int cacheid, uint32 hashvalue) +{ + table_states_validity = SYNC_TABLE_STATE_NEEDS_REBUILD; +} + +/* + * Handle table synchronization cooperation from the synchronization + * worker. + * + * If the sync worker is in CATCHUP state and reached (or passed) the + * predetermined synchronization point in the WAL stream, mark the table as + * SYNCDONE and finish. + */ +static void +process_syncing_tables_for_sync(XLogRecPtr current_lsn) +{ + SpinLockAcquire(&MyLogicalRepWorker->relmutex); + + if (MyLogicalRepWorker->relstate == SUBREL_STATE_CATCHUP && + current_lsn >= MyLogicalRepWorker->relstate_lsn) + { + TimeLineID tli; + char syncslotname[NAMEDATALEN] = {0}; + char originname[NAMEDATALEN] = {0}; + + MyLogicalRepWorker->relstate = SUBREL_STATE_SYNCDONE; + MyLogicalRepWorker->relstate_lsn = current_lsn; + + SpinLockRelease(&MyLogicalRepWorker->relmutex); + + /* + * UpdateSubscriptionRelState must be called within a transaction. + */ + if (!IsTransactionState()) + StartTransactionCommand(); + + UpdateSubscriptionRelState(MyLogicalRepWorker->subid, + MyLogicalRepWorker->relid, + MyLogicalRepWorker->relstate, + MyLogicalRepWorker->relstate_lsn); + + /* + * End streaming so that LogRepWorkerWalRcvConn can be used to drop + * the slot. 
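+		 * (walrcv_endstreaming() also reports the next timeline ID, but a
+		 * tablesync worker has no use for it beyond the local tli variable.)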
+		 */
+		walrcv_endstreaming(LogRepWorkerWalRcvConn, &tli);
+
+		/*
+		 * Clean up the tablesync slot.
+		 *
+		 * This has to be done after updating the state because otherwise, if
+		 * there is an error while doing the database operations, we won't be
+		 * able to roll back the dropped slot.
+		 */
+		ReplicationSlotNameForTablesync(MyLogicalRepWorker->subid,
+										MyLogicalRepWorker->relid,
+										syncslotname,
+										sizeof(syncslotname));
+
+		/*
+		 * It is important to give an error if we are unable to drop the slot,
+		 * otherwise it won't be dropped till the corresponding subscription
+		 * is dropped. So passing missing_ok = false.
+		 */
+		ReplicationSlotDropAtPubNode(LogRepWorkerWalRcvConn, syncslotname, false);
+
+		CommitTransactionCommand();
+		pgstat_report_stat(false);
+
+		/*
+		 * Start a new transaction to clean up the tablesync origin tracking.
+		 * This transaction will be ended within finish_sync_worker(). Even if
+		 * we fail to remove the origin here, the apply worker will clean it
+		 * up afterward.
+		 *
+		 * We need to do this after the table state is set to SYNCDONE.
+		 * Otherwise, if an error occurs while performing the database
+		 * operation, the worker will be restarted and the in-memory state of
+		 * replication progress (remote_lsn), which would have been cleared
+		 * before the restart, won't be rolled back. The restarted worker
+		 * would then use an invalid replication progress state, resulting in
+		 * replay of transactions that have already been applied.
+		 */
+		StartTransactionCommand();
+
+		ReplicationOriginNameForLogicalRep(MyLogicalRepWorker->subid,
+										   MyLogicalRepWorker->relid,
+										   originname,
+										   sizeof(originname));
+
+		/*
+		 * Resetting the origin session removes the ownership of the slot.
+		 * This is needed to allow the origin to be dropped.
+		 */
+		replorigin_session_reset();
+		replorigin_session_origin = InvalidRepOriginId;
+		replorigin_session_origin_lsn = InvalidXLogRecPtr;
+		replorigin_session_origin_timestamp = 0;
+
+		/*
+		 * Drop the tablesync's origin tracking if it exists.
+		 *
+		 * There is a chance that the user is concurrently performing a
+		 * refresh of the subscription (which removes the table state and its
+		 * origin), or that the apply worker has already removed this origin.
+		 * So passing missing_ok = true.
+		 */
+		replorigin_drop_by_name(originname, true, false);
+
+		finish_sync_worker();
+	}
+	else
+		SpinLockRelease(&MyLogicalRepWorker->relmutex);
+}
+
+/*
+ * Handle table synchronization cooperation from the apply worker.
+ *
+ * Walk over all subscription tables that are individually tracked by the
+ * apply process (currently, all that have a state other than
+ * SUBREL_STATE_READY) and manage synchronization for them.
+ *
+ * If there are tables that need synchronizing and are not being synchronized
+ * yet, start sync workers for them (if there are free slots for sync
+ * workers). To prevent starting a sync worker for the same relation at a
+ * high frequency after a failure, we store its last start time with each sync
+ * state info. We start the sync worker for the same relation only after
+ * waiting at least wal_retrieve_retry_interval.
+ *
+ * For tables that are being synchronized already, check if sync workers
+ * either need action from the apply worker or have finished. This is the
+ * SYNCWAIT to CATCHUP transition.
+ *
+ * If the synchronization position is reached (SYNCDONE), then the table can
+ * be marked as READY and is no longer tracked.
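+ *
+ * See the "Example flows" diagram in the header comment of this file for how
+ * these transitions interleave between the apply and sync workers.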
+ */ +static void +process_syncing_tables_for_apply(XLogRecPtr current_lsn) +{ + struct tablesync_start_time_mapping + { + Oid relid; + TimestampTz last_start_time; + }; + static __thread HTAB *last_start_times = NULL; + ListCell *lc; + bool started_tx = false; + bool should_exit = false; + + Assert(!IsTransactionState()); + + /* We need up-to-date sync state info for subscription tables here. */ + FetchTableStates(&started_tx); + + /* + * Prepare a hash table for tracking last start times of workers, to avoid + * immediate restarts. We don't need it if there are no tables that need + * syncing. + */ + if (table_states_not_ready != NIL && !last_start_times) + { + HASHCTL ctl; + + ctl.keysize = sizeof(Oid); + ctl.entrysize = sizeof(struct tablesync_start_time_mapping); + last_start_times = hash_create("Logical replication table sync worker start times", + 256, &ctl, HASH_ELEM | HASH_BLOBS); + } + + /* + * Clean up the hash table when we're done with all tables (just to + * release the bit of memory). + */ + else if (table_states_not_ready == NIL && last_start_times) + { + hash_destroy(last_start_times); + last_start_times = NULL; + } + + /* + * Process all tables that are being synchronized. + */ + foreach(lc, table_states_not_ready) + { + SubscriptionRelState *rstate = (SubscriptionRelState *) lfirst(lc); + + if (rstate->state == SUBREL_STATE_SYNCDONE) + { + /* + * Apply has caught up to the position where the table sync has + * finished. Mark the table as ready so that the apply will just + * continue to replicate it normally. + */ + if (current_lsn >= rstate->lsn) + { + char originname[NAMEDATALEN]; + + rstate->state = SUBREL_STATE_READY; + rstate->lsn = current_lsn; + if (!started_tx) + { + StartTransactionCommand(); + started_tx = true; + } + + /* + * Remove the tablesync origin tracking if exists. + * + * There is a chance that the user is concurrently performing + * refresh for the subscription where we remove the table + * state and its origin or the tablesync worker would have + * already removed this origin. We can't rely on tablesync + * worker to remove the origin tracking as if there is any + * error while dropping we won't restart it to drop the + * origin. So passing missing_ok = true. + */ + ReplicationOriginNameForLogicalRep(MyLogicalRepWorker->subid, + rstate->relid, + originname, + sizeof(originname)); + replorigin_drop_by_name(originname, true, false); + + /* + * Update the state to READY only after the origin cleanup. + */ + UpdateSubscriptionRelState(MyLogicalRepWorker->subid, + rstate->relid, rstate->state, + rstate->lsn); + } + } + else + { + LogicalRepWorker *syncworker; + + /* + * Look for a sync worker for this relation. + */ + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + + syncworker = logicalrep_worker_find(MyLogicalRepWorker->subid, + rstate->relid, false); + + if (syncworker) + { + /* Found one, update our copy of its state */ + SpinLockAcquire(&syncworker->relmutex); + rstate->state = syncworker->relstate; + rstate->lsn = syncworker->relstate_lsn; + if (rstate->state == SUBREL_STATE_SYNCWAIT) + { + /* + * Sync worker is waiting for apply. Tell sync worker it + * can catchup now. + */ + syncworker->relstate = SUBREL_STATE_CATCHUP; + syncworker->relstate_lsn = + Max(syncworker->relstate_lsn, current_lsn); + } + SpinLockRelease(&syncworker->relmutex); + + /* If we told worker to catch up, wait for it. */ + if (rstate->state == SUBREL_STATE_SYNCWAIT) + { + /* Signal the sync worker, as it may be waiting for us. 
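+					 * (On its side, wait_for_worker_state_change() sleeps on
+					 * its latch with a one-second timeout, so this wakeup
+					 * merely shortens the wait; it is not strictly required
+					 * for progress.)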
*/ + if (syncworker->proc) + logicalrep_worker_wakeup_ptr(syncworker); + + /* Now safe to release the LWLock */ + LWLockRelease(LogicalRepWorkerLock); + + if (started_tx) + { + /* + * We must commit the existing transaction to release + * the existing locks before entering a busy loop. + * This is required to avoid any undetected deadlocks + * due to any existing lock as deadlock detector won't + * be able to detect the waits on the latch. + */ + CommitTransactionCommand(); + pgstat_report_stat(false); + } + + /* + * Enter busy loop and wait for synchronization worker to + * reach expected state (or die trying). + */ + StartTransactionCommand(); + started_tx = true; + + wait_for_relation_state_change(rstate->relid, + SUBREL_STATE_SYNCDONE); + } + else + LWLockRelease(LogicalRepWorkerLock); + } + else + { + /* + * If there is no sync worker for this table yet, count + * running sync workers for this subscription, while we have + * the lock. + */ + int nsyncworkers = + logicalrep_sync_worker_count(MyLogicalRepWorker->subid); + + /* Now safe to release the LWLock */ + LWLockRelease(LogicalRepWorkerLock); + + /* + * If there are free sync worker slot(s), start a new sync + * worker for the table. + */ + if (nsyncworkers < max_sync_workers_per_subscription) + { + TimestampTz now = GetCurrentTimestamp(); + struct tablesync_start_time_mapping *hentry; + bool found; + + hentry = hash_search(last_start_times, &rstate->relid, + HASH_ENTER, &found); + + if (!found || + TimestampDifferenceExceeds(hentry->last_start_time, now, + wal_retrieve_retry_interval)) + { + logicalrep_worker_launch(MyLogicalRepWorker->dbid, + MySubscription->oid, + MySubscription->name, + MyLogicalRepWorker->userid, + rstate->relid, + DSM_HANDLE_INVALID); + hentry->last_start_time = now; + } + } + } + } + } + + if (started_tx) + { + /* + * Even when the two_phase mode is requested by the user, it remains + * as 'pending' until all tablesyncs have reached READY state. + * + * When this happens, we restart the apply worker and (if the + * conditions are still ok) then the two_phase tri-state will become + * 'enabled' at that time. + * + * Note: If the subscription has no tables then leave the state as + * PENDING, which allows ALTER SUBSCRIPTION ... REFRESH PUBLICATION to + * work. + */ + if (MySubscription->twophasestate == LOGICALREP_TWOPHASE_STATE_PENDING) + { + CommandCounterIncrement(); /* make updates visible */ + if (AllTablesyncsReady()) + { + ereport(LOG, + (errmsg("logical replication apply worker for subscription \"%s\" will restart so that two_phase can be enabled", + MySubscription->name))); + should_exit = true; + } + } + + CommitTransactionCommand(); + pgstat_report_stat(true); + } + + if (should_exit) + { + /* + * Reset the last-start time for this worker so that the launcher will + * restart it without waiting for wal_retrieve_retry_interval. + */ + ApplyLauncherForgetWorkerStartTime(MySubscription->oid); + + proc_exit(0); + } +} + +/* + * Process possible state change(s) of tables that are being synchronized. + */ +void +process_syncing_tables(XLogRecPtr current_lsn) +{ + /* + * Skip for parallel apply workers because they only operate on tables + * that are in a READY state. See pa_can_start() and + * should_apply_changes_for_rel(). + */ + if (am_parallel_apply_worker()) + return; + + if (am_tablesync_worker()) + process_syncing_tables_for_sync(current_lsn); + else + process_syncing_tables_for_apply(current_lsn); +} + +/* + * Create list of columns for COPY based on logical relation mapping. 
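+ *
+ * For a remote relation with columns (id, val), for instance, the result is
+ * a list of String nodes equivalent to list_make2(makeString("id"),
+ * makeString("val")), which copy_table() later passes to BeginCopyFrom() as
+ * its attnamelist argument.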
+ */ +static List * +make_copy_attnamelist(LogicalRepRelMapEntry *rel) +{ + List *attnamelist = NIL; + int i; + + for (i = 0; i < rel->remoterel.natts; i++) + { + attnamelist = lappend(attnamelist, + makeString(rel->remoterel.attnames[i])); + } + + + return attnamelist; +} + +/* + * Data source callback for the COPY FROM, which reads from the remote + * connection and passes the data back to our local COPY. + */ +static int +copy_read_data(void *outbuf, int minread, int maxread) +{ + int bytesread = 0; + int avail; + + /* If there are some leftover data from previous read, use it. */ + avail = copybuf->len - copybuf->cursor; + if (avail) + { + if (avail > maxread) + avail = maxread; + memcpy(outbuf, ©buf->data[copybuf->cursor], avail); + copybuf->cursor += avail; + maxread -= avail; + bytesread += avail; + } + + while (maxread > 0 && bytesread < minread) + { + pgsocket fd = PGINVALID_SOCKET; + int len; + char *buf = NULL; + + for (;;) + { + /* Try read the data. */ + len = walrcv_receive(LogRepWorkerWalRcvConn, &buf, &fd); + + CHECK_FOR_INTERRUPTS(); + + if (len == 0) + break; + else if (len < 0) + return bytesread; + else + { + /* Process the data */ + copybuf->data = buf; + copybuf->len = len; + copybuf->cursor = 0; + + avail = copybuf->len - copybuf->cursor; + if (avail > maxread) + avail = maxread; + memcpy(outbuf, ©buf->data[copybuf->cursor], avail); + outbuf = (void *) ((char *) outbuf + avail); + copybuf->cursor += avail; + maxread -= avail; + bytesread += avail; + } + + if (maxread <= 0 || bytesread >= minread) + return bytesread; + } + + /* + * Wait for more data or latch. + */ + (void) WaitLatchOrSocket(MyLatch, + WL_SOCKET_READABLE | WL_LATCH_SET | + WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + fd, 1000L, WAIT_EVENT_LOGICAL_SYNC_DATA); + + ResetLatch(MyLatch); + } + + return bytesread; +} + + +/* + * Get information about remote relation in similar fashion the RELATION + * message provides during replication. This function also returns the relation + * qualifications to be used in the COPY command. + */ +static void +fetch_remote_table_info(char *nspname, char *relname, + LogicalRepRelation *lrel, List **qual) +{ + WalRcvExecResult *res; + StringInfoData cmd; + TupleTableSlot *slot; + Oid tableRow[] = {OIDOID, CHAROID, CHAROID}; + Oid attrRow[] = {INT2OID, TEXTOID, OIDOID, BOOLOID}; + Oid qualRow[] = {TEXTOID}; + bool isnull; + int natt; + ListCell *lc; + Bitmapset *included_cols = NULL; + + lrel->nspname = nspname; + lrel->relname = relname; + + /* First fetch Oid and replica identity. 
*/ + initStringInfo(&cmd); + appendStringInfo(&cmd, "SELECT c.oid, c.relreplident, c.relkind" + " FROM pg_catalog.pg_class c" + " INNER JOIN pg_catalog.pg_namespace n" + " ON (c.relnamespace = n.oid)" + " WHERE n.nspname = %s" + " AND c.relname = %s", + quote_literal_cstr(nspname), + quote_literal_cstr(relname)); + res = walrcv_exec(LogRepWorkerWalRcvConn, cmd.data, + lengthof(tableRow), tableRow); + + if (res->status != WALRCV_OK_TUPLES) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not fetch table info for table \"%s.%s\" from publisher: %s", + nspname, relname, res->err))); + + slot = MakeSingleTupleTableSlot(res->tupledesc, &TTSOpsMinimalTuple); + if (!tuplestore_gettupleslot(res->tuplestore, true, false, slot)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("table \"%s.%s\" not found on publisher", + nspname, relname))); + + lrel->remoteid = DatumGetObjectId(slot_getattr(slot, 1, &isnull)); + Assert(!isnull); + lrel->replident = DatumGetChar(slot_getattr(slot, 2, &isnull)); + Assert(!isnull); + lrel->relkind = DatumGetChar(slot_getattr(slot, 3, &isnull)); + Assert(!isnull); + + ExecDropSingleTupleTableSlot(slot); + walrcv_clear_result(res); + + + /* + * Get column lists for each relation. + * + * We need to do this before fetching info about column names and types, + * so that we can skip columns that should not be replicated. + */ + if (walrcv_server_version(LogRepWorkerWalRcvConn) >= 150000) + { + WalRcvExecResult *pubres; + TupleTableSlot *tslot; + Oid attrsRow[] = {INT2VECTOROID}; + StringInfoData pub_names; + + initStringInfo(&pub_names); + foreach(lc, MySubscription->publications) + { + if (foreach_current_index(lc) > 0) + appendStringInfoString(&pub_names, ", "); + appendStringInfoString(&pub_names, quote_literal_cstr(strVal(lfirst(lc)))); + } + + /* + * Fetch info about column lists for the relation (from all the + * publications). + */ + resetStringInfo(&cmd); + appendStringInfo(&cmd, + "SELECT DISTINCT" + " (CASE WHEN (array_length(gpt.attrs, 1) = c.relnatts)" + " THEN NULL ELSE gpt.attrs END)" + " FROM pg_publication p," + " LATERAL pg_get_publication_tables(p.pubname) gpt," + " pg_class c" + " WHERE gpt.relid = %u AND c.oid = gpt.relid" + " AND p.pubname IN ( %s )", + lrel->remoteid, + pub_names.data); + + pubres = walrcv_exec(LogRepWorkerWalRcvConn, cmd.data, + lengthof(attrsRow), attrsRow); + + if (pubres->status != WALRCV_OK_TUPLES) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not fetch column list info for table \"%s.%s\" from publisher: %s", + nspname, relname, pubres->err))); + + /* + * We don't support the case where the column list is different for + * the same table when combining publications. See comments atop + * fetch_table_list. So there should be only one row returned. + * Although we already checked this when creating the subscription, we + * still need to check here in case the column list was changed after + * creating the subscription and before the sync worker is started. + */ + if (tuplestore_tuple_count(pubres->tuplestore) > 1) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot use different column lists for table \"%s.%s\" in different publications", + nspname, relname)); + + /* + * Get the column list and build a single bitmap with the attnums. + * + * If we find a NULL value, it means all the columns should be + * replicated. 
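+		 *
+		 * For example, a column list (a, c) on a three-column table arrives
+		 * here as the int2vector '1 3', producing included_cols = {1, 3};
+		 * attnums not in that set are skipped when the attribute info is
+		 * assembled below.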
+ */ + tslot = MakeSingleTupleTableSlot(pubres->tupledesc, &TTSOpsMinimalTuple); + if (tuplestore_gettupleslot(pubres->tuplestore, true, false, tslot)) + { + Datum cfval = slot_getattr(tslot, 1, &isnull); + + if (!isnull) + { + ArrayType *arr; + int nelems; + int16 *elems; + + arr = DatumGetArrayTypeP(cfval); + nelems = ARR_DIMS(arr)[0]; + elems = (int16 *) ARR_DATA_PTR(arr); + + for (natt = 0; natt < nelems; natt++) + included_cols = bms_add_member(included_cols, elems[natt]); + } + + ExecClearTuple(tslot); + } + ExecDropSingleTupleTableSlot(tslot); + + walrcv_clear_result(pubres); + + pfree(pub_names.data); + } + + /* + * Now fetch column names and types. + */ + resetStringInfo(&cmd); + appendStringInfo(&cmd, + "SELECT a.attnum," + " a.attname," + " a.atttypid," + " a.attnum = ANY(i.indkey)" + " FROM pg_catalog.pg_attribute a" + " LEFT JOIN pg_catalog.pg_index i" + " ON (i.indexrelid = pg_get_replica_identity_index(%u))" + " WHERE a.attnum > 0::pg_catalog.int2" + " AND NOT a.attisdropped %s" + " AND a.attrelid = %u" + " ORDER BY a.attnum", + lrel->remoteid, + (walrcv_server_version(LogRepWorkerWalRcvConn) >= 120000 ? + "AND a.attgenerated = ''" : ""), + lrel->remoteid); + res = walrcv_exec(LogRepWorkerWalRcvConn, cmd.data, + lengthof(attrRow), attrRow); + + if (res->status != WALRCV_OK_TUPLES) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not fetch table info for table \"%s.%s\" from publisher: %s", + nspname, relname, res->err))); + + /* We don't know the number of rows coming, so allocate enough space. */ + lrel->attnames = palloc0(MaxTupleAttributeNumber * sizeof(char *)); + lrel->atttyps = palloc0(MaxTupleAttributeNumber * sizeof(Oid)); + lrel->attkeys = NULL; + + /* + * Store the columns as a list of names. Ignore those that are not + * present in the column list, if there is one. + */ + natt = 0; + slot = MakeSingleTupleTableSlot(res->tupledesc, &TTSOpsMinimalTuple); + while (tuplestore_gettupleslot(res->tuplestore, true, false, slot)) + { + char *rel_colname; + AttrNumber attnum; + + attnum = DatumGetInt16(slot_getattr(slot, 1, &isnull)); + Assert(!isnull); + + /* If the column is not in the column list, skip it. */ + if (included_cols != NULL && !bms_is_member(attnum, included_cols)) + { + ExecClearTuple(slot); + continue; + } + + rel_colname = TextDatumGetCString(slot_getattr(slot, 2, &isnull)); + Assert(!isnull); + + lrel->attnames[natt] = rel_colname; + lrel->atttyps[natt] = DatumGetObjectId(slot_getattr(slot, 3, &isnull)); + Assert(!isnull); + + if (DatumGetBool(slot_getattr(slot, 4, &isnull))) + lrel->attkeys = bms_add_member(lrel->attkeys, natt); + + /* Should never happen. */ + if (++natt >= MaxTupleAttributeNumber) + elog(ERROR, "too many columns in remote table \"%s.%s\"", + nspname, relname); + + ExecClearTuple(slot); + } + ExecDropSingleTupleTableSlot(slot); + + lrel->natts = natt; + + walrcv_clear_result(res); + + /* + * Get relation's row filter expressions. DISTINCT avoids the same + * expression of a table in multiple publications from being included + * multiple times in the final expression. + * + * We need to copy the row even if it matches just one of the + * publications, so we later combine all the quals with OR. 
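+	 *
+	 * For instance, row filters (a > 0) and (b = 5) coming from two different
+	 * publications are OR'ed into a single WHERE clause of the COPY (SELECT
+	 * ...) statement that copy_table() builds from the returned qual list.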
+ * + * For initial synchronization, row filtering can be ignored in following + * cases: + * + * 1) one of the subscribed publications for the table hasn't specified + * any row filter + * + * 2) one of the subscribed publications has puballtables set to true + * + * 3) one of the subscribed publications is declared as TABLES IN SCHEMA + * that includes this relation + */ + if (walrcv_server_version(LogRepWorkerWalRcvConn) >= 150000) + { + StringInfoData pub_names; + + /* Build the pubname list. */ + initStringInfo(&pub_names); + foreach(lc, MySubscription->publications) + { + char *pubname = strVal(lfirst(lc)); + + if (foreach_current_index(lc) > 0) + appendStringInfoString(&pub_names, ", "); + + appendStringInfoString(&pub_names, quote_literal_cstr(pubname)); + } + + /* Check for row filters. */ + resetStringInfo(&cmd); + appendStringInfo(&cmd, + "SELECT DISTINCT pg_get_expr(gpt.qual, gpt.relid)" + " FROM pg_publication p," + " LATERAL pg_get_publication_tables(p.pubname) gpt" + " WHERE gpt.relid = %u" + " AND p.pubname IN ( %s )", + lrel->remoteid, + pub_names.data); + + res = walrcv_exec(LogRepWorkerWalRcvConn, cmd.data, 1, qualRow); + + if (res->status != WALRCV_OK_TUPLES) + ereport(ERROR, + (errmsg("could not fetch table WHERE clause info for table \"%s.%s\" from publisher: %s", + nspname, relname, res->err))); + + /* + * Multiple row filter expressions for the same table will be combined + * by COPY using OR. If any of the filter expressions for this table + * are null, it means the whole table will be copied. In this case it + * is not necessary to construct a unified row filter expression at + * all. + */ + slot = MakeSingleTupleTableSlot(res->tupledesc, &TTSOpsMinimalTuple); + while (tuplestore_gettupleslot(res->tuplestore, true, false, slot)) + { + Datum rf = slot_getattr(slot, 1, &isnull); + + if (!isnull) + *qual = lappend(*qual, makeString(TextDatumGetCString(rf))); + else + { + /* Ignore filters and cleanup as necessary. */ + if (*qual) + { + list_free_deep(*qual); + *qual = NIL; + } + break; + } + + ExecClearTuple(slot); + } + ExecDropSingleTupleTableSlot(slot); + + walrcv_clear_result(res); + } + + pfree(cmd.data); +} + +/* + * Copy existing data of a table from publisher. + * + * Caller is responsible for locking the local relation. + */ +static void +copy_table(Relation rel) +{ + LogicalRepRelMapEntry *relmapentry; + LogicalRepRelation lrel; + List *qual = NIL; + WalRcvExecResult *res; + StringInfoData cmd; + CopyFromState cstate; + List *attnamelist; + ParseState *pstate; + List *options = NIL; + + /* Get the publisher relation info. */ + fetch_remote_table_info(get_namespace_name(RelationGetNamespace(rel)), + RelationGetRelationName(rel), &lrel, &qual); + + /* Put the relation into relmap. */ + logicalrep_relmap_update(&lrel); + + /* Map the publisher relation to local one. */ + relmapentry = logicalrep_rel_open(lrel.remoteid, NoLock); + Assert(rel == relmapentry->localrel); + + /* Start copy on the publisher. */ + initStringInfo(&cmd); + + /* Regular table with no row filter */ + if (lrel.relkind == RELKIND_RELATION && qual == NIL) + { + appendStringInfo(&cmd, "COPY %s", + quote_qualified_identifier(lrel.nspname, lrel.relname)); + + /* If the table has columns, then specify the columns */ + if (lrel.natts) + { + appendStringInfoString(&cmd, " ("); + + /* + * XXX Do we need to list the columns in all cases? Maybe we're + * replicating all columns? 
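+			 *
+			 * The command built here ends up looking like, e.g.:
+			 *     COPY public.tab (a, b) TO STDOUT
+			 * (quote_identifier adds quotes only where required).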
+ */ + for (int i = 0; i < lrel.natts; i++) + { + if (i > 0) + appendStringInfoString(&cmd, ", "); + + appendStringInfoString(&cmd, quote_identifier(lrel.attnames[i])); + } + + appendStringInfoString(&cmd, ")"); + } + + appendStringInfoString(&cmd, " TO STDOUT"); + } + else + { + /* + * For non-tables and tables with row filters, we need to do COPY + * (SELECT ...), but we can't just do SELECT * because we need to not + * copy generated columns. For tables with any row filters, build a + * SELECT query with OR'ed row filters for COPY. + */ + appendStringInfoString(&cmd, "COPY (SELECT "); + for (int i = 0; i < lrel.natts; i++) + { + appendStringInfoString(&cmd, quote_identifier(lrel.attnames[i])); + if (i < lrel.natts - 1) + appendStringInfoString(&cmd, ", "); + } + + appendStringInfoString(&cmd, " FROM "); + + /* + * For regular tables, make sure we don't copy data from a child that + * inherits the named table as those will be copied separately. + */ + if (lrel.relkind == RELKIND_RELATION) + appendStringInfoString(&cmd, "ONLY "); + + appendStringInfoString(&cmd, quote_qualified_identifier(lrel.nspname, lrel.relname)); + /* list of OR'ed filters */ + if (qual != NIL) + { + ListCell *lc; + char *q = strVal(linitial(qual)); + + appendStringInfo(&cmd, " WHERE %s", q); + for_each_from(lc, qual, 1) + { + q = strVal(lfirst(lc)); + appendStringInfo(&cmd, " OR %s", q); + } + list_free_deep(qual); + } + + appendStringInfoString(&cmd, ") TO STDOUT"); + } + + /* + * Prior to v16, initial table synchronization will use text format even + * if the binary option is enabled for a subscription. + */ + if (walrcv_server_version(LogRepWorkerWalRcvConn) >= 160000 && + MySubscription->binary) + { + appendStringInfoString(&cmd, " WITH (FORMAT binary)"); + options = list_make1(makeDefElem("format", + (Node *) makeString("binary"), -1)); + } + + res = walrcv_exec(LogRepWorkerWalRcvConn, cmd.data, 0, NULL); + pfree(cmd.data); + if (res->status != WALRCV_OK_COPY_OUT) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not start initial contents copy for table \"%s.%s\": %s", + lrel.nspname, lrel.relname, res->err))); + walrcv_clear_result(res); + + copybuf = makeStringInfo(); + + pstate = make_parsestate(NULL); + (void) addRangeTableEntryForRelation(pstate, rel, AccessShareLock, + NULL, false, false); + + attnamelist = make_copy_attnamelist(relmapentry); + cstate = BeginCopyFrom(pstate, rel, NULL, NULL, false, copy_read_data, attnamelist, options); + + /* Do the copy */ + (void) CopyFrom(cstate); + + logicalrep_rel_close(relmapentry, NoLock); +} + +/* + * Determine the tablesync slot name. + * + * The name must not exceed NAMEDATALEN - 1 because of remote node constraints + * on slot name length. We append system_identifier to avoid slot_name + * collision with subscriptions in other clusters. With the current scheme + * pg_%u_sync_%u_UINT64_FORMAT (3 + 10 + 6 + 10 + 20 + '\0'), the maximum + * length of slot_name will be 50. + * + * The returned slot name is stored in the supplied buffer (syncslotname) with + * the given size. + * + * Note: We don't use the subscription slot name as part of tablesync slot name + * because we are responsible for cleaning up these slots and it could become + * impossible to recalculate what name to cleanup if the subscription slot name + * had changed. 
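+ *
+ * For instance, suboid 16394 and relid 16385 on a cluster whose system
+ * identifier is 7301551625177979003 yield the slot name
+ * "pg_16394_sync_16385_7301551625177979003" (values illustrative only).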
+ */ +void +ReplicationSlotNameForTablesync(Oid suboid, Oid relid, + char *syncslotname, Size szslot) +{ + snprintf(syncslotname, szslot, "pg_%u_sync_%u_" UINT64_FORMAT, suboid, + relid, GetSystemIdentifier()); +} + +/* + * Start syncing the table in the sync worker. + * + * If nothing needs to be done to sync the table, we exit the worker without + * any further action. + * + * The returned slot name is palloc'ed in current memory context. + */ +char * +LogicalRepSyncTableStart(XLogRecPtr *origin_startpos) +{ + char *slotname; + char *err; + char relstate; + XLogRecPtr relstate_lsn; + Relation rel; + AclResult aclresult; + WalRcvExecResult *res; + char originname[NAMEDATALEN]; + RepOriginId originid; + UserContext ucxt; + bool must_use_password; + bool run_as_owner; + + /* Check the state of the table synchronization. */ + StartTransactionCommand(); + relstate = GetSubscriptionRelState(MyLogicalRepWorker->subid, + MyLogicalRepWorker->relid, + &relstate_lsn); + + /* Is the use of a password mandatory? */ + must_use_password = MySubscription->passwordrequired && + !superuser_arg(MySubscription->owner); + + /* Note that the superuser_arg call can access the DB */ + CommitTransactionCommand(); + + SpinLockAcquire(&MyLogicalRepWorker->relmutex); + MyLogicalRepWorker->relstate = relstate; + MyLogicalRepWorker->relstate_lsn = relstate_lsn; + SpinLockRelease(&MyLogicalRepWorker->relmutex); + + /* + * If synchronization is already done or no longer necessary, exit now + * that we've updated shared memory state. + */ + switch (relstate) + { + case SUBREL_STATE_SYNCDONE: + case SUBREL_STATE_READY: + case SUBREL_STATE_UNKNOWN: + finish_sync_worker(); /* doesn't return */ + } + + /* Calculate the name of the tablesync slot. */ + slotname = (char *) palloc(NAMEDATALEN); + ReplicationSlotNameForTablesync(MySubscription->oid, + MyLogicalRepWorker->relid, + slotname, + NAMEDATALEN); + + /* + * Here we use the slot name instead of the subscription name as the + * application_name, so that it is different from the leader apply worker, + * so that synchronous replication can distinguish them. + */ + LogRepWorkerWalRcvConn = + walrcv_connect(MySubscription->conninfo, true, + must_use_password, + slotname, &err); + if (LogRepWorkerWalRcvConn == NULL) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not connect to the publisher: %s", err))); + + Assert(MyLogicalRepWorker->relstate == SUBREL_STATE_INIT || + MyLogicalRepWorker->relstate == SUBREL_STATE_DATASYNC || + MyLogicalRepWorker->relstate == SUBREL_STATE_FINISHEDCOPY); + + /* Assign the origin tracking record name. */ + ReplicationOriginNameForLogicalRep(MySubscription->oid, + MyLogicalRepWorker->relid, + originname, + sizeof(originname)); + + if (MyLogicalRepWorker->relstate == SUBREL_STATE_DATASYNC) + { + /* + * We have previously errored out before finishing the copy so the + * replication slot might exist. We want to remove the slot if it + * already exists and proceed. + * + * XXX We could also instead try to drop the slot, last time we failed + * but for that, we might need to clean up the copy state as it might + * be in the middle of fetching the rows. Also, if there is a network + * breakdown then it wouldn't have succeeded so trying it next time + * seems like a better bet. 
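+		 *
+		 * (Hence missing_ok = true in the drop below: the slot may or may not
+		 * exist at the publisher, depending on how far the previous attempt
+		 * got.)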
+ */ + ReplicationSlotDropAtPubNode(LogRepWorkerWalRcvConn, slotname, true); + } + else if (MyLogicalRepWorker->relstate == SUBREL_STATE_FINISHEDCOPY) + { + /* + * The COPY phase was previously done, but tablesync then crashed + * before it was able to finish normally. + */ + StartTransactionCommand(); + + /* + * The origin tracking name must already exist. It was created first + * time this tablesync was launched. + */ + originid = replorigin_by_name(originname, false); + replorigin_session_setup(originid, 0); + replorigin_session_origin = originid; + *origin_startpos = replorigin_session_get_progress(false); + + CommitTransactionCommand(); + + goto copy_table_done; + } + + SpinLockAcquire(&MyLogicalRepWorker->relmutex); + MyLogicalRepWorker->relstate = SUBREL_STATE_DATASYNC; + MyLogicalRepWorker->relstate_lsn = InvalidXLogRecPtr; + SpinLockRelease(&MyLogicalRepWorker->relmutex); + + /* Update the state and make it visible to others. */ + StartTransactionCommand(); + UpdateSubscriptionRelState(MyLogicalRepWorker->subid, + MyLogicalRepWorker->relid, + MyLogicalRepWorker->relstate, + MyLogicalRepWorker->relstate_lsn); + CommitTransactionCommand(); + pgstat_report_stat(true); + + StartTransactionCommand(); + + /* + * Use a standard write lock here. It might be better to disallow access + * to the table while it's being synchronized. But we don't want to block + * the main apply process from working and it has to open the relation in + * RowExclusiveLock when remapping remote relation id to local one. + */ + rel = table_open(MyLogicalRepWorker->relid, RowExclusiveLock); + + /* + * Start a transaction in the remote node in REPEATABLE READ mode. This + * ensures that both the replication slot we create (see below) and the + * COPY are consistent with each other. + */ + res = walrcv_exec(LogRepWorkerWalRcvConn, + "BEGIN READ ONLY ISOLATION LEVEL REPEATABLE READ", + 0, NULL); + if (res->status != WALRCV_OK_COMMAND) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("table copy could not start transaction on publisher: %s", + res->err))); + walrcv_clear_result(res); + + /* + * Create a new permanent logical decoding slot. This slot will be used + * for the catchup phase after COPY is done, so tell it to use the + * snapshot to make the final data consistent. + */ + walrcv_create_slot(LogRepWorkerWalRcvConn, + slotname, false /* permanent */ , false /* two_phase */ , + CRS_USE_SNAPSHOT, origin_startpos); + + /* + * Setup replication origin tracking. The purpose of doing this before the + * copy is to avoid doing the copy again due to any error in setting up + * origin tracking. + */ + originid = replorigin_by_name(originname, true); + if (!OidIsValid(originid)) + { + /* + * Origin tracking does not exist, so create it now. + * + * Then advance to the LSN got from walrcv_create_slot. This is WAL + * logged for the purpose of recovery. Locks are to prevent the + * replication origin from vanishing while advancing. 
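+		 *
+		 * For example, if walrcv_create_slot() reported a consistent point of
+		 * 0/1573218 (an illustrative LSN), the origin is advanced to exactly
+		 * that LSN, so the later catchup phase resumes from the position the
+		 * COPY snapshot corresponds to.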
+ */ + originid = replorigin_create(originname); + + LockRelationOid(ReplicationOriginRelationId, RowExclusiveLock); + replorigin_advance(originid, *origin_startpos, InvalidXLogRecPtr, + true /* go backward */ , true /* WAL log */ ); + UnlockRelationOid(ReplicationOriginRelationId, RowExclusiveLock); + + replorigin_session_setup(originid, 0); + replorigin_session_origin = originid; + } + else + { + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_OBJECT), + errmsg("replication origin \"%s\" already exists", + originname))); + } + + /* + * Make sure that the copy command runs as the table owner, unless the + * user has opted out of that behaviour. + */ + run_as_owner = MySubscription->runasowner; + if (!run_as_owner) + SwitchToUntrustedUser(rel->rd_rel->relowner, &ucxt); + + /* + * Check that our table sync worker has permission to insert into the + * target table. + */ + aclresult = pg_class_aclcheck(RelationGetRelid(rel), GetUserId(), + ACL_INSERT); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, + get_relkind_objtype(rel->rd_rel->relkind), + RelationGetRelationName(rel)); + + /* + * COPY FROM does not honor RLS policies. That is not a problem for + * subscriptions owned by roles with BYPASSRLS privilege (or superuser, + * who has it implicitly), but other roles should not be able to + * circumvent RLS. Disallow logical replication into RLS enabled + * relations for such roles. + */ + if (check_enable_rls(RelationGetRelid(rel), InvalidOid, false) == RLS_ENABLED) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("user \"%s\" cannot replicate into relation with row-level security enabled: \"%s\"", + GetUserNameFromId(GetUserId(), true), + RelationGetRelationName(rel)))); + + /* Now do the initial data copy */ + PushActiveSnapshot(GetTransactionSnapshot()); + copy_table(rel); + PopActiveSnapshot(); + + res = walrcv_exec(LogRepWorkerWalRcvConn, "COMMIT", 0, NULL); + if (res->status != WALRCV_OK_COMMAND) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("table copy could not finish transaction on publisher: %s", + res->err))); + walrcv_clear_result(res); + + if (!run_as_owner) + RestoreUserContext(&ucxt); + + table_close(rel, NoLock); + + /* Make the copy visible. */ + CommandCounterIncrement(); + + /* + * Update the persisted state to indicate the COPY phase is done; make it + * visible to others. + */ + UpdateSubscriptionRelState(MyLogicalRepWorker->subid, + MyLogicalRepWorker->relid, + SUBREL_STATE_FINISHEDCOPY, + MyLogicalRepWorker->relstate_lsn); + + CommitTransactionCommand(); + +copy_table_done: + + elog(DEBUG1, + "LogicalRepSyncTableStart: '%s' origin_startpos lsn %X/%X", + originname, LSN_FORMAT_ARGS(*origin_startpos)); + + /* + * We are done with the initial data synchronization, update the state. + */ + SpinLockAcquire(&MyLogicalRepWorker->relmutex); + MyLogicalRepWorker->relstate = SUBREL_STATE_SYNCWAIT; + MyLogicalRepWorker->relstate_lsn = *origin_startpos; + SpinLockRelease(&MyLogicalRepWorker->relmutex); + + /* + * Finally, wait until the leader apply worker tells us to catch up and + * then return to let LogicalRepApplyLoop do it. + */ + wait_for_worker_state_change(SUBREL_STATE_CATCHUP); + return slotname; +} + +/* + * Common code to fetch the up-to-date sync state info into the static lists. + * + * Returns true if subscription has 1 or more tables, else false. + * + * Note: If this function started the transaction (indicated by the parameter) + * then it is the caller's responsibility to commit it. 
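+ *
+ * Typical usage, as in AllTablesyncsReady() below:
+ *
+ *     bool started_tx;
+ *
+ *     has_subrels = FetchTableStates(&started_tx);
+ *     if (started_tx)
+ *         CommitTransactionCommand();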
+ */ +static bool +FetchTableStates(bool *started_tx) +{ + static __thread bool has_subrels = false; + + *started_tx = false; + + if (table_states_validity != SYNC_TABLE_STATE_VALID) + { + MemoryContext oldctx; + List *rstates; + ListCell *lc; + SubscriptionRelState *rstate; + + table_states_validity = SYNC_TABLE_STATE_REBUILD_STARTED; + + /* Clean the old lists. */ + list_free_deep(table_states_not_ready); + table_states_not_ready = NIL; + + if (!IsTransactionState()) + { + StartTransactionCommand(); + *started_tx = true; + } + + /* Fetch all non-ready tables. */ + rstates = GetSubscriptionRelations(MySubscription->oid, true); + + /* Allocate the tracking info in a permanent memory context. */ + oldctx = MemoryContextSwitchTo(CacheMemoryContext); + foreach(lc, rstates) + { + rstate = palloc(sizeof(SubscriptionRelState)); + memcpy(rstate, lfirst(lc), sizeof(SubscriptionRelState)); + table_states_not_ready = lappend(table_states_not_ready, rstate); + } + MemoryContextSwitchTo(oldctx); + + /* + * Does the subscription have tables? + * + * If there were not-READY relations found then we know it does. But + * if table_states_not_ready was empty we still need to check again to + * see if there are 0 tables. + */ + has_subrels = (table_states_not_ready != NIL) || + HasSubscriptionRelations(MySubscription->oid); + + /* + * If the subscription relation cache has been invalidated since we + * entered this routine, we still use and return the relations we just + * finished constructing, to avoid infinite loops, but we leave the + * table states marked as stale so that we'll rebuild it again on next + * access. Otherwise, we mark the table states as valid. + */ + if (table_states_validity == SYNC_TABLE_STATE_REBUILD_STARTED) + table_states_validity = SYNC_TABLE_STATE_VALID; + } + + return has_subrels; +} + +/* + * If the subscription has no tables then return false. + * + * Otherwise, are all tablesyncs READY? + * + * Note: This function is not suitable to be called from outside of apply or + * tablesync workers because MySubscription needs to be already initialized. + */ +bool +AllTablesyncsReady(void) +{ + bool started_tx = false; + bool has_subrels = false; + + /* We need up-to-date sync state info for subscription tables here. */ + has_subrels = FetchTableStates(&started_tx); + + if (started_tx) + { + CommitTransactionCommand(); + pgstat_report_stat(true); + } + + /* + * Return false when there are no tables in subscription or not all tables + * are in ready state; true otherwise. + */ + return has_subrels && (table_states_not_ready == NIL); +} + +/* + * Update the two_phase state of the specified subscription in pg_subscription. + */ +void +UpdateTwoPhaseState(Oid suboid, char new_state) +{ + Relation rel; + HeapTuple tup; + bool nulls[Natts_pg_subscription]; + bool replaces[Natts_pg_subscription]; + Datum values[Natts_pg_subscription]; + + Assert(new_state == LOGICALREP_TWOPHASE_STATE_DISABLED || + new_state == LOGICALREP_TWOPHASE_STATE_PENDING || + new_state == LOGICALREP_TWOPHASE_STATE_ENABLED); + + rel = table_open(SubscriptionRelationId, RowExclusiveLock); + tup = SearchSysCacheCopy1(SUBSCRIPTIONOID, ObjectIdGetDatum(suboid)); + if (!HeapTupleIsValid(tup)) + elog(ERROR, + "cache lookup failed for subscription oid %u", + suboid); + + /* Form a new tuple. 
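+	 * Only subtwophasestate is flagged in replaces[], so every other column
+	 * of the pg_subscription row keeps its current value.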
*/
+	memset(values, 0, sizeof(values));
+	memset(nulls, false, sizeof(nulls));
+	memset(replaces, false, sizeof(replaces));
+
+	/* And update/set two_phase state */
+	values[Anum_pg_subscription_subtwophasestate - 1] = CharGetDatum(new_state);
+	replaces[Anum_pg_subscription_subtwophasestate - 1] = true;
+
+	tup = heap_modify_tuple(tup, RelationGetDescr(rel),
+							values, nulls, replaces);
+	CatalogTupleUpdate(rel, &tup->t_self, tup);
+
+	heap_freetuple(tup);
+	table_close(rel, RowExclusiveLock);
+}
diff --git a/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/worker.c b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/worker.c
new file mode 100644
index 00000000000..0b6ad19733d
--- /dev/null
+++ b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/replication/logical/worker.c
@@ -0,0 +1,5113 @@
+/*-------------------------------------------------------------------------
+ * worker.c
+ *	   PostgreSQL logical replication worker (apply)
+ *
+ * Copyright (c) 2016-2023, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/replication/logical/worker.c
+ *
+ * NOTES
+ *	  This file contains the worker which applies logical changes as they come
+ *	  from the remote logical replication stream.
+ *
+ *	  The main worker (apply) is started by the logical replication worker
+ *	  launcher for every enabled subscription in a database. It uses the
+ *	  walsender protocol to communicate with the publisher.
+ *
+ *	  This module includes server-facing code and shares the libpqwalreceiver
+ *	  module with walreceiver for providing the libpq-specific functionality.
+ *
+ *
+ * STREAMED TRANSACTIONS
+ * ---------------------
+ * Streamed transactions (large transactions exceeding a memory limit on the
+ * upstream) are applied using one of two approaches:
+ *
+ * 1) Write to temporary files and apply when the final commit arrives
+ *
+ * This approach is used when the user has set the subscription's streaming
+ * option as on.
+ *
+ * Unlike the regular (non-streamed) case, handling streamed transactions has
+ * to handle aborts of both the toplevel transaction and subtransactions. This
+ * is achieved by tracking offsets for subtransactions, which is then used
+ * to truncate the file with serialized changes.
+ *
+ * The files are placed in the temporary-file directory by default, and the
+ * filenames include both the XID of the toplevel transaction and the OID of
+ * the subscription. This is necessary so that different workers processing a
+ * remote transaction with the same XID don't interfere with each other.
+ *
+ * We use BufFiles instead of normal temporary files because (a) the BufFile
+ * infrastructure supports temporary files that exceed the OS file size limit,
+ * (b) it provides a way for automatic cleanup on error, and (c) it provides a
+ * way for these files to survive across local transactions, allowing them to
+ * be opened at stream start and closed at stream stop. We decided to use the
+ * FileSet infrastructure because without it the files are deleted as soon as
+ * the BufFile is closed, and keeping the stream files open across start/stop
+ * streams would consume a lot of memory (more than 8K for each BufFile, and
+ * there could be multiple such BufFiles as the subscriber could receive
+ * multiple start/stop streams for different transactions before getting the
+ * commit).
Moreover, without FileSet we would also need to invent
+ * a new way to pass filenames to the BufFile APIs so that we could open the
+ * desired file across multiple stream-open calls for the same transaction.
+ *
+ * 2) Parallel apply workers.
+ *
+ * This approach is used when the user has set the subscription's streaming
+ * option as parallel. See logical/applyparallelworker.c for information about
+ * this approach.
+ *
+ * TWO_PHASE TRANSACTIONS
+ * ----------------------
+ * Two-phase transactions are replayed at prepare and then committed or rolled
+ * back at commit prepared and rollback prepared respectively. It is possible
+ * for a prepared transaction to arrive at the apply worker while the
+ * tablesync is busy doing the initial copy. In this case, the apply worker
+ * skips all the prepared operations [e.g. inserts] while the tablesync is
+ * still busy (see the condition of should_apply_changes_for_rel). The
+ * tablesync worker might not get such a prepared transaction because, say, it
+ * was prior to the initial consistent point but might have got some later
+ * commits. Now, the tablesync worker will exit without doing anything for the
+ * prepared transaction skipped by the apply worker, as the sync location for
+ * it will already be ahead of the apply worker's current location. This would
+ * lead to an "empty prepare", because later when the apply worker does the
+ * commit prepare, there is nothing in it (the inserts were skipped earlier).
+ *
+ * To avoid this and similar confusions around prepares, the subscription's
+ * two_phase commit is enabled only after the initial sync is over. The
+ * two_phase option has been implemented as a tri-state with values DISABLED,
+ * PENDING, and ENABLED.
+ *
+ * Even if the user specifies they want a subscription with two_phase = on,
+ * internally it will start with a tri-state of PENDING, which only becomes
+ * ENABLED after all tablesync initializations are completed - i.e. when all
+ * tablesync workers have reached their READY state. In other words, the value
+ * PENDING is only a temporary state for subscription start-up.
+ *
+ * Until two_phase is properly available (ENABLED) the subscription will
+ * behave as if two_phase = off. When the apply worker detects that all
+ * tablesyncs have become READY (while the tri-state was PENDING) it will
+ * restart the apply worker process. This happens in
+ * process_syncing_tables_for_apply.
+ *
+ * When the (re-started) apply worker finds that all tablesyncs are READY for
+ * a two_phase tri-state of PENDING, it starts streaming messages with the
+ * two_phase option, which in turn enables the decoding of two-phase commits
+ * at the publisher. Then, it updates the tri-state value from PENDING to
+ * ENABLED. Now, it is possible that during the time two_phase was not
+ * enabled, the publisher (replication server) would have skipped some
+ * prepares, but we ensure that such prepares are sent along with commit
+ * prepare; see ReorderBufferFinishPrepared.
+ *
+ * If the subscription has no tables then a two_phase tri-state of PENDING is
+ * left unchanged. This lets the user still do an ALTER SUBSCRIPTION REFRESH
+ * PUBLICATION which might otherwise be disallowed (see below).
+ *
+ * If ever a user needs to be aware of the tri-state value, they can fetch it
+ * from the pg_subscription catalog (see column subtwophasestate).
+ *
+ * We don't allow toggling the two_phase option of a subscription because it
+ * can lead to an inconsistent replica.
Consider: initially it is on and we
+ * receive some prepare; then we turn it off, and at commit time the server
+ * will send the entire transaction data along with the commit. With some more
+ * analysis we could allow changing this option from off to on, but it is not
+ * clear whether that alone would be useful.
+ *
+ * Finally, to avoid the problems mentioned in the previous paragraphs with
+ * any subsequent (not-READY) tablesyncs (which would require toggling the
+ * two_phase option from 'on' to 'off' and then back to 'on'), there is a
+ * restriction on ALTER SUBSCRIPTION REFRESH PUBLICATION. This command is not
+ * permitted when the two_phase tri-state is ENABLED, except when
+ * copy_data = false.
+ *
+ * We can get a prepare for the same GID more than once in the genuine case
+ * where we have defined multiple subscriptions for publications on the same
+ * server and the prepared transaction has operations on tables subscribed to
+ * by those subscriptions. For such cases, if we used the GID sent by the
+ * publisher, one of the prepares would succeed and the others would fail, in
+ * which case the server will send them again. Now, this can lead to a
+ * deadlock if the user has set synchronous_standby_names for all the
+ * subscriptions on the subscriber. To avoid such deadlocks, we generate a
+ * unique GID (consisting of the subscription oid and the xid of the prepared
+ * transaction) for each prepare transaction on the subscriber.
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "access/genam.h"
+#include "access/table.h"
+#include "access/tableam.h"
+#include "access/twophase.h"
+#include "access/xact.h"
+#include "access/xlog_internal.h"
+#include "catalog/catalog.h"
+#include "catalog/indexing.h"
+#include "catalog/namespace.h"
+#include "catalog/partition.h"
+#include "catalog/pg_inherits.h"
+#include "catalog/pg_subscription.h"
+#include "catalog/pg_subscription_rel.h"
+#include "catalog/pg_tablespace.h"
+#include "commands/tablecmds.h"
+#include "commands/tablespace.h"
+#include "commands/trigger.h"
+#include "executor/executor.h"
+#include "executor/execPartition.h"
+#include "executor/nodeModifyTable.h"
+#include "funcapi.h"
+#include "libpq/pqformat.h"
+#include "libpq/pqsignal.h"
+#include "mb/pg_wchar.h"
+#include "miscadmin.h"
+#include "nodes/makefuncs.h"
+#include "optimizer/optimizer.h"
+#include "parser/parse_relation.h"
+#include "pgstat.h"
+#include "postmaster/bgworker.h"
+#include "postmaster/interrupt.h"
+#include "postmaster/postmaster.h"
+#include "postmaster/walwriter.h"
+#include "replication/decode.h"
+#include "replication/logical.h"
+#include "replication/logicallauncher.h"
+#include "replication/logicalproto.h"
+#include "replication/logicalrelation.h"
+#include "replication/logicalworker.h"
+#include "replication/origin.h"
+#include "replication/reorderbuffer.h"
+#include "replication/snapbuild.h"
+#include "replication/walreceiver.h"
+#include "replication/worker_internal.h"
+#include "rewrite/rewriteHandler.h"
+#include "storage/buffile.h"
+#include "storage/bufmgr.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/lmgr.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "tcop/tcopprot.h"
+#include "utils/acl.h"
+#include "utils/builtins.h"
+#include "utils/catcache.h"
+#include "utils/dynahash.h"
+#include "utils/datum.h"
+#include "utils/fmgroids.h"
+#include "utils/guc.h"
+#include "utils/inval.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h" +#include "utils/pg_lsn.h" +#include "utils/rel.h" +#include "utils/rls.h" +#include "utils/syscache.h" +#include "utils/timeout.h" +#include "utils/usercontext.h" + +#define NAPTIME_PER_CYCLE 1000 /* max sleep time between cycles (1s) */ + +typedef struct FlushPosition +{ + dlist_node node; + XLogRecPtr local_end; + XLogRecPtr remote_end; +} FlushPosition; + +static __thread dlist_head lsn_mapping ;void lsn_mapping_init(void) { dlist_init(&lsn_mapping); } + +typedef struct ApplyExecutionData +{ + EState *estate; /* executor state, used to track resources */ + + LogicalRepRelMapEntry *targetRel; /* replication target rel */ + ResultRelInfo *targetRelInfo; /* ResultRelInfo for same */ + + /* These fields are used when the target relation is partitioned: */ + ModifyTableState *mtstate; /* dummy ModifyTable state */ + PartitionTupleRouting *proute; /* partition routing info */ +} ApplyExecutionData; + +/* Struct for saving and restoring apply errcontext information */ +typedef struct ApplyErrorCallbackArg +{ + LogicalRepMsgType command; /* 0 if invalid */ + LogicalRepRelMapEntry *rel; + + /* Remote node information */ + int remote_attnum; /* -1 if invalid */ + TransactionId remote_xid; + XLogRecPtr finish_lsn; + char *origin_name; +} ApplyErrorCallbackArg; + +/* + * The action to be taken for the changes in the transaction. + * + * TRANS_LEADER_APPLY: + * This action means that we are in the leader apply worker or table sync + * worker. The changes of the transaction are either directly applied or + * are read from temporary files (for streaming transactions) and then + * applied by the worker. + * + * TRANS_LEADER_SERIALIZE: + * This action means that we are in the leader apply worker or table sync + * worker. Changes are written to temporary files and then applied when the + * final commit arrives. + * + * TRANS_LEADER_SEND_TO_PARALLEL: + * This action means that we are in the leader apply worker and need to send + * the changes to the parallel apply worker. + * + * TRANS_LEADER_PARTIAL_SERIALIZE: + * This action means that we are in the leader apply worker and have sent some + * changes directly to the parallel apply worker and the remaining changes are + * serialized to a file, due to timeout while sending data. The parallel apply + * worker will apply these serialized changes when the final commit arrives. + * + * We can't use TRANS_LEADER_SERIALIZE for this case because, in addition to + * serializing changes, the leader worker also needs to serialize the + * STREAM_XXX message to a file, and wait for the parallel apply worker to + * finish the transaction when processing the transaction finish command. So + * this new action was introduced to keep the code and logic clear. + * + * TRANS_PARALLEL_APPLY: + * This action means that we are in the parallel apply worker and changes of + * the transaction are applied directly by the worker. + */ +typedef enum +{ + /* The action for non-streaming transactions. */ + TRANS_LEADER_APPLY, + + /* Actions for streaming transactions. 
*/ + TRANS_LEADER_SERIALIZE, + TRANS_LEADER_SEND_TO_PARALLEL, + TRANS_LEADER_PARTIAL_SERIALIZE, + TRANS_PARALLEL_APPLY +} TransApplyAction; + +/* errcontext tracker */ +__thread ApplyErrorCallbackArg apply_error_callback_arg = +{ + .command = 0, + .rel = NULL, + .remote_attnum = -1, + .remote_xid = InvalidTransactionId, + .finish_lsn = InvalidXLogRecPtr, + .origin_name = NULL, +}; + +__thread ErrorContextCallback *apply_error_context_stack = NULL; + +__thread MemoryContext ApplyMessageContext = NULL; +__thread MemoryContext ApplyContext = NULL; + +/* per stream context for streaming transactions */ +static __thread MemoryContext LogicalStreamingContext = NULL; + +__thread WalReceiverConn *LogRepWorkerWalRcvConn = NULL; + +__thread Subscription *MySubscription = NULL; +static __thread bool MySubscriptionValid = false; + +static __thread List *on_commit_wakeup_workers_subids = NIL; + +__thread bool in_remote_transaction = false; +static __thread XLogRecPtr remote_final_lsn = InvalidXLogRecPtr; + +/* fields valid only when processing streamed transaction */ +static __thread bool in_streamed_transaction = false; + +static __thread TransactionId stream_xid = InvalidTransactionId; + +/* + * The number of changes applied by parallel apply worker during one streaming + * block. + */ +static __thread uint32 parallel_stream_nchanges = 0; + +/* Are we initializing a apply worker? */ +__thread bool InitializingApplyWorker = false; + +/* + * We enable skipping all data modification changes (INSERT, UPDATE, etc.) for + * the subscription if the remote transaction's finish LSN matches the subskiplsn. + * Once we start skipping changes, we don't stop it until we skip all changes of + * the transaction even if pg_subscription is updated and MySubscription->skiplsn + * gets changed or reset during that. Also, in streaming transaction cases (streaming = on), + * we don't skip receiving and spooling the changes since we decide whether or not + * to skip applying the changes when starting to apply changes. The subskiplsn is + * cleared after successfully skipping the transaction or applying non-empty + * transaction. The latter prevents the mistakenly specified subskiplsn from + * being left. Note that we cannot skip the streaming transactions when using + * parallel apply workers because we cannot get the finish LSN before applying + * the changes. So, we don't start parallel apply worker when finish LSN is set + * by the user. 
+ */ +static __thread XLogRecPtr skip_xact_finish_lsn = InvalidXLogRecPtr; +#define is_skipping_changes() (unlikely(!XLogRecPtrIsInvalid(skip_xact_finish_lsn))) + +/* BufFile handle of the current streaming file */ +static __thread BufFile *stream_fd = NULL; + +typedef struct SubXactInfo +{ + TransactionId xid; /* XID of the subxact */ + int fileno; /* file number in the buffile */ + off_t offset; /* offset in the file */ +} SubXactInfo; + +/* Sub-transaction data for the current streaming transaction */ +typedef struct ApplySubXactData +{ + uint32 nsubxacts; /* number of sub-transactions */ + uint32 nsubxacts_max; /* current capacity of subxacts */ + TransactionId subxact_last; /* xid of the last sub-transaction */ + SubXactInfo *subxacts; /* sub-xact offset in changes file */ +} ApplySubXactData; + +static __thread ApplySubXactData subxact_data = {0, 0, InvalidTransactionId, NULL}; + +static inline void subxact_filename(char *path, Oid subid, TransactionId xid); +static inline void changes_filename(char *path, Oid subid, TransactionId xid); + +/* + * Information about subtransactions of a given toplevel transaction. + */ +static void subxact_info_write(Oid subid, TransactionId xid); +static void subxact_info_read(Oid subid, TransactionId xid); +static void subxact_info_add(TransactionId xid); +static inline void cleanup_subxact_info(void); + +/* + * Serialize and deserialize changes for a toplevel transaction. + */ +static void stream_open_file(Oid subid, TransactionId xid, + bool first_segment); +static void stream_write_change(char action, StringInfo s); +static void stream_open_and_write_change(TransactionId xid, char action, StringInfo s); +static void stream_close_file(void); + +static void send_feedback(XLogRecPtr recvpos, bool force, bool requestReply); + +static void DisableSubscriptionAndExit(void); + +static void apply_handle_commit_internal(LogicalRepCommitData *commit_data); +static void apply_handle_insert_internal(ApplyExecutionData *edata, + ResultRelInfo *relinfo, + TupleTableSlot *remoteslot); +static void apply_handle_update_internal(ApplyExecutionData *edata, + ResultRelInfo *relinfo, + TupleTableSlot *remoteslot, + LogicalRepTupleData *newtup, + Oid localindexoid); +static void apply_handle_delete_internal(ApplyExecutionData *edata, + ResultRelInfo *relinfo, + TupleTableSlot *remoteslot, + Oid localindexoid); +static bool FindReplTupleInLocalRel(ApplyExecutionData *edata, Relation localrel, + LogicalRepRelation *remoterel, + Oid localidxoid, + TupleTableSlot *remoteslot, + TupleTableSlot **localslot); +static void apply_handle_tuple_routing(ApplyExecutionData *edata, + TupleTableSlot *remoteslot, + LogicalRepTupleData *newtup, + CmdType operation); + +/* Compute GID for two_phase transactions */ +static void TwoPhaseTransactionGid(Oid subid, TransactionId xid, char *gid, int szgid); + +/* Functions for skipping changes */ +static void maybe_start_skipping_changes(XLogRecPtr finish_lsn); +static void stop_skipping_changes(void); +static void clear_subscription_skip_lsn(XLogRecPtr finish_lsn); + +/* Functions for apply error callback */ +static inline void set_apply_error_context_xact(TransactionId xid, XLogRecPtr lsn); +static inline void reset_apply_error_context_info(void); + +static TransApplyAction get_transaction_apply_action(TransactionId xid, + ParallelApplyWorkerInfo **winfo); + +/* + * Form the origin name for the subscription. + * + * This is a common function for tablesync and other workers. Tablesync workers + * must pass a valid relid. 
Other callers must pass relid = InvalidOid. + * + * Return the name in the supplied buffer. + */ +void +ReplicationOriginNameForLogicalRep(Oid suboid, Oid relid, + char *originname, Size szoriginname) +{ + if (OidIsValid(relid)) + { + /* Replication origin name for tablesync workers. */ + snprintf(originname, szoriginname, "pg_%u_%u", suboid, relid); + } + else + { + /* Replication origin name for non-tablesync workers. */ + snprintf(originname, szoriginname, "pg_%u", suboid); + } +} + +/* + * Should this worker apply changes for given relation. + * + * This is mainly needed for initial relation data sync as that runs in + * separate worker process running in parallel and we need some way to skip + * changes coming to the leader apply worker during the sync of a table. + * + * Note we need to do smaller or equals comparison for SYNCDONE state because + * it might hold position of end of initial slot consistent point WAL + * record + 1 (ie start of next record) and next record can be COMMIT of + * transaction we are now processing (which is what we set remote_final_lsn + * to in apply_handle_begin). + * + * Note that for streaming transactions that are being applied in the parallel + * apply worker, we disallow applying changes if the target table in the + * subscription is not in the READY state, because we cannot decide whether to + * apply the change as we won't know remote_final_lsn by that time. + * + * We already checked this in pa_can_start() before assigning the + * streaming transaction to the parallel worker, but it also needs to be + * checked here because if the user executes ALTER SUBSCRIPTION ... REFRESH + * PUBLICATION in parallel, the new table can be added to pg_subscription_rel + * while applying this transaction. + */ +static bool +should_apply_changes_for_rel(LogicalRepRelMapEntry *rel) +{ + if (am_tablesync_worker()) + return MyLogicalRepWorker->relid == rel->localreloid; + else if (am_parallel_apply_worker()) + { + /* We don't synchronize rel's that are in unknown state. */ + if (rel->state != SUBREL_STATE_READY && + rel->state != SUBREL_STATE_UNKNOWN) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication parallel apply worker for subscription \"%s\" will stop", + MySubscription->name), + errdetail("Cannot handle streamed replication transactions using parallel apply workers until all tables have been synchronized."))); + + return rel->state == SUBREL_STATE_READY; + } + else + return (rel->state == SUBREL_STATE_READY || + (rel->state == SUBREL_STATE_SYNCDONE && + rel->statelsn <= remote_final_lsn)); +} + +/* + * Begin one step (one INSERT, UPDATE, etc) of a replication transaction. + * + * Start a transaction, if this is the first step (else we keep using the + * existing transaction). + * Also provide a global snapshot and ensure we run in ApplyMessageContext. + */ +static void +begin_replication_step(void) +{ + SetCurrentStatementStartTimestamp(); + + if (!IsTransactionState()) + { + StartTransactionCommand(); + maybe_reread_subscription(); + } + + PushActiveSnapshot(GetTransactionSnapshot()); + + MemoryContextSwitchTo(ApplyMessageContext); +} + +/* + * Finish up one step of a replication transaction. + * Callers of begin_replication_step() must also call this. + * + * We don't close out the transaction here, but we should increment + * the command counter to make the effects of this step visible. 
+ */ +static void +end_replication_step(void) +{ + PopActiveSnapshot(); + + CommandCounterIncrement(); +} + +/* + * Handle streamed transactions for both the leader apply worker and the + * parallel apply workers. + * + * In the streaming case (receiving a block of the streamed transaction), for + * serialize mode, simply redirect it to a file for the proper toplevel + * transaction, and for parallel mode, the leader apply worker will send the + * changes to parallel apply workers and the parallel apply worker will define + * savepoints if needed. (LOGICAL_REP_MSG_RELATION or LOGICAL_REP_MSG_TYPE + * messages will be applied by both leader apply worker and parallel apply + * workers). + * + * Returns true for streamed transactions (when the change is either serialized + * to file or sent to parallel apply worker), false otherwise (regular mode or + * needs to be processed by parallel apply worker). + * + * Exception: If the message being processed is LOGICAL_REP_MSG_RELATION + * or LOGICAL_REP_MSG_TYPE, return false even if the message needs to be sent + * to a parallel apply worker. + */ +static bool +handle_streamed_transaction(LogicalRepMsgType action, StringInfo s) +{ + TransactionId current_xid; + ParallelApplyWorkerInfo *winfo; + TransApplyAction apply_action; + StringInfoData original_msg; + + apply_action = get_transaction_apply_action(stream_xid, &winfo); + + /* not in streaming mode */ + if (apply_action == TRANS_LEADER_APPLY) + return false; + + Assert(TransactionIdIsValid(stream_xid)); + + /* + * The parallel apply worker needs the xid in this message to decide + * whether to define a savepoint, so save the original message that has + * not moved the cursor after the xid. We will serialize this message to a + * file in PARTIAL_SERIALIZE mode. + */ + original_msg = *s; + + /* + * We should have received XID of the subxact as the first part of the + * message, so extract it. + */ + current_xid = pq_getmsgint(s, 4); + + if (!TransactionIdIsValid(current_xid)) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("invalid transaction ID in streamed replication transaction"))); + + switch (apply_action) + { + case TRANS_LEADER_SERIALIZE: + Assert(stream_fd); + + /* Add the new subxact to the array (unless already there). */ + subxact_info_add(current_xid); + + /* Write the change to the current file */ + stream_write_change(action, s); + return true; + + case TRANS_LEADER_SEND_TO_PARALLEL: + Assert(winfo); + + /* + * XXX The publisher side doesn't always send relation/type update + * messages after the streaming transaction, so also update the + * relation/type in leader apply worker. See function + * cleanup_rel_sync_cache. + */ + if (pa_send_data(winfo, s->len, s->data)) + return (action != LOGICAL_REP_MSG_RELATION && + action != LOGICAL_REP_MSG_TYPE); + + /* + * Switch to serialize mode when we are not able to send the + * change to parallel apply worker. + */ + pa_switch_to_partial_serialize(winfo, false); + + /* fall through */ + case TRANS_LEADER_PARTIAL_SERIALIZE: + stream_write_change(action, &original_msg); + + /* Same reason as TRANS_LEADER_SEND_TO_PARALLEL case. */ + return (action != LOGICAL_REP_MSG_RELATION && + action != LOGICAL_REP_MSG_TYPE); + + case TRANS_PARALLEL_APPLY: + parallel_stream_nchanges += 1; + + /* Define a savepoint for a subxact if needed. 
*/ + pa_start_subtrans(current_xid, stream_xid); + return false; + + default: + elog(ERROR, "unexpected apply action: %d", (int) apply_action); + return false; /* silence compiler warning */ + } +} + +/* + * Executor state preparation for evaluation of constraint expressions, + * indexes and triggers for the specified relation. + * + * Note that the caller must open and close any indexes to be updated. + */ +static ApplyExecutionData * +create_edata_for_relation(LogicalRepRelMapEntry *rel) +{ + ApplyExecutionData *edata; + EState *estate; + RangeTblEntry *rte; + List *perminfos = NIL; + ResultRelInfo *resultRelInfo; + + edata = (ApplyExecutionData *) palloc0(sizeof(ApplyExecutionData)); + edata->targetRel = rel; + + edata->estate = estate = CreateExecutorState(); + + rte = makeNode(RangeTblEntry); + rte->rtekind = RTE_RELATION; + rte->relid = RelationGetRelid(rel->localrel); + rte->relkind = rel->localrel->rd_rel->relkind; + rte->rellockmode = AccessShareLock; + + addRTEPermissionInfo(&perminfos, rte); + + ExecInitRangeTable(estate, list_make1(rte), perminfos); + + edata->targetRelInfo = resultRelInfo = makeNode(ResultRelInfo); + + /* + * Use Relation opened by logicalrep_rel_open() instead of opening it + * again. + */ + InitResultRelInfo(resultRelInfo, rel->localrel, 1, NULL, 0); + + /* + * We put the ResultRelInfo in the es_opened_result_relations list, even + * though we don't populate the es_result_relations array. That's a bit + * bogus, but it's enough to make ExecGetTriggerResultRel() find them. + * + * ExecOpenIndices() is not called here either, each execution path doing + * an apply operation being responsible for that. + */ + estate->es_opened_result_relations = + lappend(estate->es_opened_result_relations, resultRelInfo); + + estate->es_output_cid = GetCurrentCommandId(true); + + /* Prepare to catch AFTER triggers. */ + AfterTriggerBeginQuery(); + + /* other fields of edata remain NULL for now */ + + return edata; +} + +/* + * Finish any operations related to the executor state created by + * create_edata_for_relation(). + */ +static void +finish_edata(ApplyExecutionData *edata) +{ + EState *estate = edata->estate; + + /* Handle any queued AFTER triggers. */ + AfterTriggerEndQuery(estate); + + /* Shut down tuple routing, if any was done. */ + if (edata->proute) + ExecCleanupTupleRouting(edata->mtstate, edata->proute); + + /* + * Cleanup. It might seem that we should call ExecCloseResultRelations() + * here, but we intentionally don't. It would close the rel we added to + * es_opened_result_relations above, which is wrong because we took no + * corresponding refcount. We rely on ExecCleanupTupleRouting() to close + * any other relations opened during execution. + */ + ExecResetTupleTable(estate->es_tupleTable, false); + FreeExecutorState(estate); + pfree(edata); +} + +/* + * Executes default values for columns for which we can't map to remote + * relation columns. + * + * This allows us to support tables which have more columns on the downstream + * than on the upstream. + */ +static void +slot_fill_defaults(LogicalRepRelMapEntry *rel, EState *estate, + TupleTableSlot *slot) +{ + TupleDesc desc = RelationGetDescr(rel->localrel); + int num_phys_attrs = desc->natts; + int i; + int attnum, + num_defaults = 0; + int *defmap; + ExprState **defexprs; + ExprContext *econtext; + + econtext = GetPerTupleExprContext(estate); + + /* We got all the data via replication, no need to evaluate anything. 
*/ + if (num_phys_attrs == rel->remoterel.natts) + return; + + defmap = (int *) palloc(num_phys_attrs * sizeof(int)); + defexprs = (ExprState **) palloc(num_phys_attrs * sizeof(ExprState *)); + + Assert(rel->attrmap->maplen == num_phys_attrs); + for (attnum = 0; attnum < num_phys_attrs; attnum++) + { + Expr *defexpr; + + if (TupleDescAttr(desc, attnum)->attisdropped || TupleDescAttr(desc, attnum)->attgenerated) + continue; + + if (rel->attrmap->attnums[attnum] >= 0) + continue; + + defexpr = (Expr *) build_column_default(rel->localrel, attnum + 1); + + if (defexpr != NULL) + { + /* Run the expression through planner */ + defexpr = expression_planner(defexpr); + + /* Initialize executable expression in copycontext */ + defexprs[num_defaults] = ExecInitExpr(defexpr, NULL); + defmap[num_defaults] = attnum; + num_defaults++; + } + } + + for (i = 0; i < num_defaults; i++) + slot->tts_values[defmap[i]] = + ExecEvalExpr(defexprs[i], econtext, &slot->tts_isnull[defmap[i]]); +} + +/* + * Store tuple data into slot. + * + * Incoming data can be either text or binary format. + */ +static void +slot_store_data(TupleTableSlot *slot, LogicalRepRelMapEntry *rel, + LogicalRepTupleData *tupleData) +{ + int natts = slot->tts_tupleDescriptor->natts; + int i; + + ExecClearTuple(slot); + + /* Call the "in" function for each non-dropped, non-null attribute */ + Assert(natts == rel->attrmap->maplen); + for (i = 0; i < natts; i++) + { + Form_pg_attribute att = TupleDescAttr(slot->tts_tupleDescriptor, i); + int remoteattnum = rel->attrmap->attnums[i]; + + if (!att->attisdropped && remoteattnum >= 0) + { + StringInfo colvalue = &tupleData->colvalues[remoteattnum]; + + Assert(remoteattnum < tupleData->ncols); + + /* Set attnum for error callback */ + apply_error_callback_arg.remote_attnum = remoteattnum; + + if (tupleData->colstatus[remoteattnum] == LOGICALREP_COLUMN_TEXT) + { + Oid typinput; + Oid typioparam; + + getTypeInputInfo(att->atttypid, &typinput, &typioparam); + slot->tts_values[i] = + OidInputFunctionCall(typinput, colvalue->data, + typioparam, att->atttypmod); + slot->tts_isnull[i] = false; + } + else if (tupleData->colstatus[remoteattnum] == LOGICALREP_COLUMN_BINARY) + { + Oid typreceive; + Oid typioparam; + + /* + * In some code paths we may be asked to re-parse the same + * tuple data. Reset the StringInfo's cursor so that works. + */ + colvalue->cursor = 0; + + getTypeBinaryInputInfo(att->atttypid, &typreceive, &typioparam); + slot->tts_values[i] = + OidReceiveFunctionCall(typreceive, colvalue, + typioparam, att->atttypmod); + + /* Trouble if it didn't eat the whole buffer */ + if (colvalue->cursor != colvalue->len) + ereport(ERROR, + (errcode(ERRCODE_INVALID_BINARY_REPRESENTATION), + errmsg("incorrect binary data format in logical replication column %d", + remoteattnum + 1))); + slot->tts_isnull[i] = false; + } + else + { + /* + * NULL value from remote. (We don't expect to see + * LOGICALREP_COLUMN_UNCHANGED here, but if we do, treat it as + * NULL.) + */ + slot->tts_values[i] = (Datum) 0; + slot->tts_isnull[i] = true; + } + + /* Reset attnum for error callback */ + apply_error_callback_arg.remote_attnum = -1; + } + else + { + /* + * We assign NULL to dropped attributes and missing values + * (missing values should be later filled using + * slot_fill_defaults). + */ + slot->tts_values[i] = (Datum) 0; + slot->tts_isnull[i] = true; + } + } + + ExecStoreVirtualTuple(slot); +} + +/* + * Replace updated columns with data from the LogicalRepTupleData struct. 
+ * This is somewhat similar to heap_modify_tuple but also calls the type + * input functions on the user data. + * + * "slot" is filled with a copy of the tuple in "srcslot", replacing + * columns provided in "tupleData" and leaving others as-is. + * + * Caution: unreplaced pass-by-ref columns in "slot" will point into the + * storage for "srcslot". This is OK for current usage, but someday we may + * need to materialize "slot" at the end to make it independent of "srcslot". + */ +static void +slot_modify_data(TupleTableSlot *slot, TupleTableSlot *srcslot, + LogicalRepRelMapEntry *rel, + LogicalRepTupleData *tupleData) +{ + int natts = slot->tts_tupleDescriptor->natts; + int i; + + /* We'll fill "slot" with a virtual tuple, so we must start with ... */ + ExecClearTuple(slot); + + /* + * Copy all the column data from srcslot, so that we'll have valid values + * for unreplaced columns. + */ + Assert(natts == srcslot->tts_tupleDescriptor->natts); + slot_getallattrs(srcslot); + memcpy(slot->tts_values, srcslot->tts_values, natts * sizeof(Datum)); + memcpy(slot->tts_isnull, srcslot->tts_isnull, natts * sizeof(bool)); + + /* Call the "in" function for each replaced attribute */ + Assert(natts == rel->attrmap->maplen); + for (i = 0; i < natts; i++) + { + Form_pg_attribute att = TupleDescAttr(slot->tts_tupleDescriptor, i); + int remoteattnum = rel->attrmap->attnums[i]; + + if (remoteattnum < 0) + continue; + + Assert(remoteattnum < tupleData->ncols); + + if (tupleData->colstatus[remoteattnum] != LOGICALREP_COLUMN_UNCHANGED) + { + StringInfo colvalue = &tupleData->colvalues[remoteattnum]; + + /* Set attnum for error callback */ + apply_error_callback_arg.remote_attnum = remoteattnum; + + if (tupleData->colstatus[remoteattnum] == LOGICALREP_COLUMN_TEXT) + { + Oid typinput; + Oid typioparam; + + getTypeInputInfo(att->atttypid, &typinput, &typioparam); + slot->tts_values[i] = + OidInputFunctionCall(typinput, colvalue->data, + typioparam, att->atttypmod); + slot->tts_isnull[i] = false; + } + else if (tupleData->colstatus[remoteattnum] == LOGICALREP_COLUMN_BINARY) + { + Oid typreceive; + Oid typioparam; + + /* + * In some code paths we may be asked to re-parse the same + * tuple data. Reset the StringInfo's cursor so that works. + */ + colvalue->cursor = 0; + + getTypeBinaryInputInfo(att->atttypid, &typreceive, &typioparam); + slot->tts_values[i] = + OidReceiveFunctionCall(typreceive, colvalue, + typioparam, att->atttypmod); + + /* Trouble if it didn't eat the whole buffer */ + if (colvalue->cursor != colvalue->len) + ereport(ERROR, + (errcode(ERRCODE_INVALID_BINARY_REPRESENTATION), + errmsg("incorrect binary data format in logical replication column %d", + remoteattnum + 1))); + slot->tts_isnull[i] = false; + } + else + { + /* must be LOGICALREP_COLUMN_NULL */ + slot->tts_values[i] = (Datum) 0; + slot->tts_isnull[i] = true; + } + + /* Reset attnum for error callback */ + apply_error_callback_arg.remote_attnum = -1; + } + } + + /* And finally, declare that "slot" contains a valid virtual tuple */ + ExecStoreVirtualTuple(slot); +} + +/* + * Handle BEGIN message. + */ +static void +apply_handle_begin(StringInfo s) +{ + LogicalRepBeginData begin_data; + + /* There must not be an active streaming transaction. 
*/ + Assert(!TransactionIdIsValid(stream_xid)); + + logicalrep_read_begin(s, &begin_data); + set_apply_error_context_xact(begin_data.xid, begin_data.final_lsn); + + remote_final_lsn = begin_data.final_lsn; + + maybe_start_skipping_changes(begin_data.final_lsn); + + in_remote_transaction = true; + + pgstat_report_activity(STATE_RUNNING, NULL); +} + +/* + * Handle COMMIT message. + * + * TODO, support tracking of multiple origins + */ +static void +apply_handle_commit(StringInfo s) +{ + LogicalRepCommitData commit_data; + + logicalrep_read_commit(s, &commit_data); + + if (commit_data.commit_lsn != remote_final_lsn) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("incorrect commit LSN %X/%X in commit message (expected %X/%X)", + LSN_FORMAT_ARGS(commit_data.commit_lsn), + LSN_FORMAT_ARGS(remote_final_lsn)))); + + apply_handle_commit_internal(&commit_data); + + /* Process any tables that are being synchronized in parallel. */ + process_syncing_tables(commit_data.end_lsn); + + pgstat_report_activity(STATE_IDLE, NULL); + reset_apply_error_context_info(); +} + +/* + * Handle BEGIN PREPARE message. + */ +static void +apply_handle_begin_prepare(StringInfo s) +{ + LogicalRepPreparedTxnData begin_data; + + /* Tablesync should never receive prepare. */ + if (am_tablesync_worker()) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("tablesync worker received a BEGIN PREPARE message"))); + + /* There must not be an active streaming transaction. */ + Assert(!TransactionIdIsValid(stream_xid)); + + logicalrep_read_begin_prepare(s, &begin_data); + set_apply_error_context_xact(begin_data.xid, begin_data.prepare_lsn); + + remote_final_lsn = begin_data.prepare_lsn; + + maybe_start_skipping_changes(begin_data.prepare_lsn); + + in_remote_transaction = true; + + pgstat_report_activity(STATE_RUNNING, NULL); +} + +/* + * Common function to prepare the GID. + */ +static void +apply_handle_prepare_internal(LogicalRepPreparedTxnData *prepare_data) +{ + char gid[GIDSIZE]; + + /* + * Compute unique GID for two_phase transactions. We don't use GID of + * prepared transaction sent by server as that can lead to deadlock when + * we have multiple subscriptions from same node point to publications on + * the same node. See comments atop worker.c + */ + TwoPhaseTransactionGid(MySubscription->oid, prepare_data->xid, + gid, sizeof(gid)); + + /* + * BeginTransactionBlock is necessary to balance the EndTransactionBlock + * called within the PrepareTransactionBlock below. + */ + if (!IsTransactionBlock()) + { + BeginTransactionBlock(); + CommitTransactionCommand(); /* Completes the preceding Begin command. */ + } + + /* + * Update origin state so we can restart streaming from correct position + * in case of crash. + */ + replorigin_session_origin_lsn = prepare_data->end_lsn; + replorigin_session_origin_timestamp = prepare_data->prepare_time; + + PrepareTransactionBlock(gid); +} + +/* + * Handle PREPARE message. + */ +static void +apply_handle_prepare(StringInfo s) +{ + LogicalRepPreparedTxnData prepare_data; + + logicalrep_read_prepare(s, &prepare_data); + + if (prepare_data.prepare_lsn != remote_final_lsn) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("incorrect prepare LSN %X/%X in prepare message (expected %X/%X)", + LSN_FORMAT_ARGS(prepare_data.prepare_lsn), + LSN_FORMAT_ARGS(remote_final_lsn)))); + + /* + * Unlike commit, here, we always prepare the transaction even though no + * change has happened in this transaction or all changes are skipped. 
It + * is done this way because at commit prepared time, we won't know whether + * we have skipped preparing a transaction because of those reasons. + * + * XXX, We can optimize such that at commit prepared time, we first check + * whether we have prepared the transaction or not but that doesn't seem + * worthwhile because such cases shouldn't be common. + */ + begin_replication_step(); + + apply_handle_prepare_internal(&prepare_data); + + end_replication_step(); + CommitTransactionCommand(); + pgstat_report_stat(false); + + store_flush_position(prepare_data.end_lsn, XactLastCommitEnd); + + in_remote_transaction = false; + + /* Process any tables that are being synchronized in parallel. */ + process_syncing_tables(prepare_data.end_lsn); + + /* + * Since we have already prepared the transaction, in a case where the + * server crashes before clearing the subskiplsn, it will be left but the + * transaction won't be resent. But that's okay because it's a rare case + * and the subskiplsn will be cleared when finishing the next transaction. + */ + stop_skipping_changes(); + clear_subscription_skip_lsn(prepare_data.prepare_lsn); + + pgstat_report_activity(STATE_IDLE, NULL); + reset_apply_error_context_info(); +} + +/* + * Handle a COMMIT PREPARED of a previously PREPARED transaction. + * + * Note that we don't need to wait here if the transaction was prepared in a + * parallel apply worker. In that case, we have already waited for the prepare + * to finish in apply_handle_stream_prepare() which will ensure all the + * operations in that transaction have happened in the subscriber, so no + * concurrent transaction can cause deadlock or transaction dependency issues. + */ +static void +apply_handle_commit_prepared(StringInfo s) +{ + LogicalRepCommitPreparedTxnData prepare_data; + char gid[GIDSIZE]; + + logicalrep_read_commit_prepared(s, &prepare_data); + set_apply_error_context_xact(prepare_data.xid, prepare_data.commit_lsn); + + /* Compute GID for two_phase transactions. */ + TwoPhaseTransactionGid(MySubscription->oid, prepare_data.xid, + gid, sizeof(gid)); + + /* There is no transaction when COMMIT PREPARED is called */ + begin_replication_step(); + + /* + * Update origin state so we can restart streaming from correct position + * in case of crash. + */ + replorigin_session_origin_lsn = prepare_data.end_lsn; + replorigin_session_origin_timestamp = prepare_data.commit_time; + + FinishPreparedTransaction(gid, true); + end_replication_step(); + CommitTransactionCommand(); + pgstat_report_stat(false); + + store_flush_position(prepare_data.end_lsn, XactLastCommitEnd); + in_remote_transaction = false; + + /* Process any tables that are being synchronized in parallel. */ + process_syncing_tables(prepare_data.end_lsn); + + clear_subscription_skip_lsn(prepare_data.end_lsn); + + pgstat_report_activity(STATE_IDLE, NULL); + reset_apply_error_context_info(); +} + +/* + * Handle a ROLLBACK PREPARED of a previously PREPARED TRANSACTION. + * + * Note that we don't need to wait here if the transaction was prepared in a + * parallel apply worker. In that case, we have already waited for the prepare + * to finish in apply_handle_stream_prepare() which will ensure all the + * operations in that transaction have happened in the subscriber, so no + * concurrent transaction can cause deadlock or transaction dependency issues. 
+ */ +static void +apply_handle_rollback_prepared(StringInfo s) +{ + LogicalRepRollbackPreparedTxnData rollback_data; + char gid[GIDSIZE]; + + logicalrep_read_rollback_prepared(s, &rollback_data); + set_apply_error_context_xact(rollback_data.xid, rollback_data.rollback_end_lsn); + + /* Compute GID for two_phase transactions. */ + TwoPhaseTransactionGid(MySubscription->oid, rollback_data.xid, + gid, sizeof(gid)); + + /* + * It is possible that we haven't received prepare because it occurred + * before walsender reached a consistent point or the two_phase was still + * not enabled by that time, so in such cases, we need to skip rollback + * prepared. + */ + if (LookupGXact(gid, rollback_data.prepare_end_lsn, + rollback_data.prepare_time)) + { + /* + * Update origin state so we can restart streaming from correct + * position in case of crash. + */ + replorigin_session_origin_lsn = rollback_data.rollback_end_lsn; + replorigin_session_origin_timestamp = rollback_data.rollback_time; + + /* There is no transaction when ABORT/ROLLBACK PREPARED is called */ + begin_replication_step(); + FinishPreparedTransaction(gid, false); + end_replication_step(); + CommitTransactionCommand(); + + clear_subscription_skip_lsn(rollback_data.rollback_end_lsn); + } + + pgstat_report_stat(false); + + store_flush_position(rollback_data.rollback_end_lsn, XactLastCommitEnd); + in_remote_transaction = false; + + /* Process any tables that are being synchronized in parallel. */ + process_syncing_tables(rollback_data.rollback_end_lsn); + + pgstat_report_activity(STATE_IDLE, NULL); + reset_apply_error_context_info(); +} + +/* + * Handle STREAM PREPARE. + */ +static void +apply_handle_stream_prepare(StringInfo s) +{ + LogicalRepPreparedTxnData prepare_data; + ParallelApplyWorkerInfo *winfo; + TransApplyAction apply_action; + + /* Save the message before it is consumed. */ + StringInfoData original_msg = *s; + + if (in_streamed_transaction) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("STREAM PREPARE message without STREAM STOP"))); + + /* Tablesync should never receive prepare. */ + if (am_tablesync_worker()) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("tablesync worker received a STREAM PREPARE message"))); + + logicalrep_read_stream_prepare(s, &prepare_data); + set_apply_error_context_xact(prepare_data.xid, prepare_data.prepare_lsn); + + apply_action = get_transaction_apply_action(prepare_data.xid, &winfo); + + switch (apply_action) + { + case TRANS_LEADER_APPLY: + + /* + * The transaction has been serialized to file, so replay all the + * spooled operations. + */ + apply_spooled_messages(MyLogicalRepWorker->stream_fileset, + prepare_data.xid, prepare_data.prepare_lsn); + + /* Mark the transaction as prepared. */ + apply_handle_prepare_internal(&prepare_data); + + CommitTransactionCommand(); + + store_flush_position(prepare_data.end_lsn, XactLastCommitEnd); + + in_remote_transaction = false; + + /* Unlink the files with serialized changes and subxact info. */ + stream_cleanup_files(MyLogicalRepWorker->subid, prepare_data.xid); + + elog(DEBUG1, "finished processing the STREAM PREPARE command"); + break; + + case TRANS_LEADER_SEND_TO_PARALLEL: + Assert(winfo); + + if (pa_send_data(winfo, s->len, s->data)) + { + /* Finish processing the streaming transaction. */ + pa_xact_finish(winfo, prepare_data.end_lsn); + break; + } + + /* + * Switch to serialize mode when we are not able to send the + * change to parallel apply worker. 
+ */ + pa_switch_to_partial_serialize(winfo, true); + + /* fall through */ + case TRANS_LEADER_PARTIAL_SERIALIZE: + Assert(winfo); + + stream_open_and_write_change(prepare_data.xid, + LOGICAL_REP_MSG_STREAM_PREPARE, + &original_msg); + + pa_set_fileset_state(winfo->shared, FS_SERIALIZE_DONE); + + /* Finish processing the streaming transaction. */ + pa_xact_finish(winfo, prepare_data.end_lsn); + break; + + case TRANS_PARALLEL_APPLY: + + /* + * If the parallel apply worker is applying spooled messages then + * close the file before preparing. + */ + if (stream_fd) + stream_close_file(); + + begin_replication_step(); + + /* Mark the transaction as prepared. */ + apply_handle_prepare_internal(&prepare_data); + + end_replication_step(); + + CommitTransactionCommand(); + + MyParallelShared->last_commit_end = XactLastCommitEnd; + + pa_set_xact_state(MyParallelShared, PARALLEL_TRANS_FINISHED); + pa_unlock_transaction(MyParallelShared->xid, AccessExclusiveLock); + + pa_reset_subtrans(); + + elog(DEBUG1, "finished processing the STREAM PREPARE command"); + break; + + default: + elog(ERROR, "unexpected apply action: %d", (int) apply_action); + break; + } + + pgstat_report_stat(false); + + /* Process any tables that are being synchronized in parallel. */ + process_syncing_tables(prepare_data.end_lsn); + + /* + * Similar to prepare case, the subskiplsn could be left in a case of + * server crash but it's okay. See the comments in apply_handle_prepare(). + */ + stop_skipping_changes(); + clear_subscription_skip_lsn(prepare_data.prepare_lsn); + + pgstat_report_activity(STATE_IDLE, NULL); + + reset_apply_error_context_info(); +} + +/* + * Handle ORIGIN message. + * + * TODO, support tracking of multiple origins + */ +static void +apply_handle_origin(StringInfo s) +{ + /* + * ORIGIN message can only come inside streaming transaction or inside + * remote transaction and before any actual writes. + */ + if (!in_streamed_transaction && + (!in_remote_transaction || + (IsTransactionState() && !am_tablesync_worker()))) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("ORIGIN message sent out of order"))); +} + +/* + * Initialize fileset (if not already done). + * + * Create a new file when first_segment is true, otherwise open the existing + * file. + */ +void +stream_start_internal(TransactionId xid, bool first_segment) +{ + begin_replication_step(); + + /* + * Initialize the worker's stream_fileset if we haven't yet. This will be + * used for the entire duration of the worker so create it in a permanent + * context. We create this on the very first streaming message from any + * transaction and then use it for this and other streaming transactions. + * Now, we could create a fileset at the start of the worker as well but + * then we won't be sure that it will ever be used. + */ + if (!MyLogicalRepWorker->stream_fileset) + { + MemoryContext oldctx; + + oldctx = MemoryContextSwitchTo(ApplyContext); + + MyLogicalRepWorker->stream_fileset = palloc(sizeof(FileSet)); + FileSetInit(MyLogicalRepWorker->stream_fileset); + + MemoryContextSwitchTo(oldctx); + } + + /* Open the spool file for this transaction. */ + stream_open_file(MyLogicalRepWorker->subid, xid, first_segment); + + /* If this is not the first segment, open existing subxact file. */ + if (!first_segment) + subxact_info_read(MyLogicalRepWorker->subid, xid); + + end_replication_step(); +} + +/* + * Handle STREAM START message. 
+ */ +static void +apply_handle_stream_start(StringInfo s) +{ + bool first_segment; + ParallelApplyWorkerInfo *winfo; + TransApplyAction apply_action; + + /* Save the message before it is consumed. */ + StringInfoData original_msg = *s; + + if (in_streamed_transaction) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("duplicate STREAM START message"))); + + /* There must not be an active streaming transaction. */ + Assert(!TransactionIdIsValid(stream_xid)); + + /* notify handle methods we're processing a remote transaction */ + in_streamed_transaction = true; + + /* extract XID of the top-level transaction */ + stream_xid = logicalrep_read_stream_start(s, &first_segment); + + if (!TransactionIdIsValid(stream_xid)) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("invalid transaction ID in streamed replication transaction"))); + + set_apply_error_context_xact(stream_xid, InvalidXLogRecPtr); + + /* Try to allocate a worker for the streaming transaction. */ + if (first_segment) + pa_allocate_worker(stream_xid); + + apply_action = get_transaction_apply_action(stream_xid, &winfo); + + switch (apply_action) + { + case TRANS_LEADER_SERIALIZE: + + /* + * Function stream_start_internal starts a transaction. This + * transaction will be committed on the stream stop unless it is a + * tablesync worker in which case it will be committed after + * processing all the messages. We need this transaction for + * handling the BufFile, used for serializing the streaming data + * and subxact info. + */ + stream_start_internal(stream_xid, first_segment); + break; + + case TRANS_LEADER_SEND_TO_PARALLEL: + Assert(winfo); + + /* + * Once we start serializing the changes, the parallel apply + * worker will wait for the leader to release the stream lock + * until the end of the transaction. So, we don't need to release + * the lock or increment the stream count in that case. + */ + if (pa_send_data(winfo, s->len, s->data)) + { + /* + * Unlock the shared object lock so that the parallel apply + * worker can continue to receive changes. + */ + if (!first_segment) + pa_unlock_stream(winfo->shared->xid, AccessExclusiveLock); + + /* + * Increment the number of streaming blocks waiting to be + * processed by parallel apply worker. + */ + pg_atomic_add_fetch_u32(&winfo->shared->pending_stream_count, 1); + + /* Cache the parallel apply worker for this transaction. */ + pa_set_stream_apply_worker(winfo); + break; + } + + /* + * Switch to serialize mode when we are not able to send the + * change to parallel apply worker. + */ + pa_switch_to_partial_serialize(winfo, !first_segment); + + /* fall through */ + case TRANS_LEADER_PARTIAL_SERIALIZE: + Assert(winfo); + + /* + * Open the spool file unless it was already opened when switching + * to serialize mode. The transaction started in + * stream_start_internal will be committed on the stream stop. + */ + if (apply_action != TRANS_LEADER_SEND_TO_PARALLEL) + stream_start_internal(stream_xid, first_segment); + + stream_write_change(LOGICAL_REP_MSG_STREAM_START, &original_msg); + + /* Cache the parallel apply worker for this transaction. */ + pa_set_stream_apply_worker(winfo); + break; + + case TRANS_PARALLEL_APPLY: + if (first_segment) + { + /* Hold the lock until the end of the transaction. */ + pa_lock_transaction(MyParallelShared->xid, AccessExclusiveLock); + pa_set_xact_state(MyParallelShared, PARALLEL_TRANS_STARTED); + + /* + * Signal the leader apply worker, as it may be waiting for + * us. 
+ */ + logicalrep_worker_wakeup(MyLogicalRepWorker->subid, InvalidOid); + } + + parallel_stream_nchanges = 0; + break; + + default: + elog(ERROR, "unexpected apply action: %d", (int) apply_action); + break; + } + + pgstat_report_activity(STATE_RUNNING, NULL); +} + +/* + * Update the information about subxacts and close the file. + * + * This function should be called when the stream_start_internal function has + * been called. + */ +void +stream_stop_internal(TransactionId xid) +{ + /* + * Serialize information about subxacts for the toplevel transaction, then + * close the stream messages spool file. + */ + subxact_info_write(MyLogicalRepWorker->subid, xid); + stream_close_file(); + + /* We must be in a valid transaction state */ + Assert(IsTransactionState()); + + /* Commit the per-stream transaction */ + CommitTransactionCommand(); + + /* Reset per-stream context */ + MemoryContextReset(LogicalStreamingContext); +} + +/* + * Handle STREAM STOP message. + */ +static void +apply_handle_stream_stop(StringInfo s) +{ + ParallelApplyWorkerInfo *winfo; + TransApplyAction apply_action; + + if (!in_streamed_transaction) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("STREAM STOP message without STREAM START"))); + + apply_action = get_transaction_apply_action(stream_xid, &winfo); + + switch (apply_action) + { + case TRANS_LEADER_SERIALIZE: + stream_stop_internal(stream_xid); + break; + + case TRANS_LEADER_SEND_TO_PARALLEL: + Assert(winfo); + + /* + * Lock before sending the STREAM_STOP message so that the leader + * can hold the lock first and the parallel apply worker will wait + * for leader to release the lock. See Locking Considerations atop + * applyparallelworker.c. + */ + pa_lock_stream(winfo->shared->xid, AccessExclusiveLock); + + if (pa_send_data(winfo, s->len, s->data)) + { + pa_set_stream_apply_worker(NULL); + break; + } + + /* + * Switch to serialize mode when we are not able to send the + * change to parallel apply worker. + */ + pa_switch_to_partial_serialize(winfo, true); + + /* fall through */ + case TRANS_LEADER_PARTIAL_SERIALIZE: + stream_write_change(LOGICAL_REP_MSG_STREAM_STOP, s); + stream_stop_internal(stream_xid); + pa_set_stream_apply_worker(NULL); + break; + + case TRANS_PARALLEL_APPLY: + elog(DEBUG1, "applied %u changes in the streaming chunk", + parallel_stream_nchanges); + + /* + * By the time parallel apply worker is processing the changes in + * the current streaming block, the leader apply worker may have + * sent multiple streaming blocks. This can lead to parallel apply + * worker start waiting even when there are more chunk of streams + * in the queue. So, try to lock only if there is no message left + * in the queue. See Locking Considerations atop + * applyparallelworker.c. + * + * Note that here we have a race condition where we can start + * waiting even when there are pending streaming chunks. This can + * happen if the leader sends another streaming block and acquires + * the stream lock again after the parallel apply worker checks + * that there is no pending streaming block and before it actually + * starts waiting on a lock. We can handle this case by not + * allowing the leader to increment the stream block count during + * the time parallel apply worker acquires the lock but it is not + * clear whether that is worth the complexity. + * + * Now, if this missed chunk contains rollback to savepoint, then + * there is a risk of deadlock which probably shouldn't happen + * after restart. 
+ */ + pa_decr_and_wait_stream_block(); + break; + + default: + elog(ERROR, "unexpected apply action: %d", (int) apply_action); + break; + } + + in_streamed_transaction = false; + stream_xid = InvalidTransactionId; + + /* + * The parallel apply worker could be in a transaction in which case we + * need to report the state as STATE_IDLEINTRANSACTION. + */ + if (IsTransactionOrTransactionBlock()) + pgstat_report_activity(STATE_IDLEINTRANSACTION, NULL); + else + pgstat_report_activity(STATE_IDLE, NULL); + + reset_apply_error_context_info(); +} + +/* + * Helper function to handle STREAM ABORT message when the transaction was + * serialized to file. + */ +static void +stream_abort_internal(TransactionId xid, TransactionId subxid) +{ + /* + * If the two XIDs are the same, it's in fact abort of toplevel xact, so + * just delete the files with serialized info. + */ + if (xid == subxid) + stream_cleanup_files(MyLogicalRepWorker->subid, xid); + else + { + /* + * OK, so it's a subxact. We need to read the subxact file for the + * toplevel transaction, determine the offset tracked for the subxact, + * and truncate the file with changes. We also remove the subxacts + * with higher offsets (or rather higher XIDs). + * + * We intentionally scan the array from the tail, because we're likely + * aborting a change for the most recent subtransactions. + * + * We can't use the binary search here as subxact XIDs won't + * necessarily arrive in sorted order, consider the case where we have + * released the savepoint for multiple subtransactions and then + * performed rollback to savepoint for one of the earlier + * sub-transaction. + */ + int64 i; + int64 subidx; + BufFile *fd; + bool found = false; + char path[MAXPGPATH]; + + subidx = -1; + begin_replication_step(); + subxact_info_read(MyLogicalRepWorker->subid, xid); + + for (i = subxact_data.nsubxacts; i > 0; i--) + { + if (subxact_data.subxacts[i - 1].xid == subxid) + { + subidx = (i - 1); + found = true; + break; + } + } + + /* + * If it's an empty sub-transaction then we will not find the subxid + * here so just cleanup the subxact info and return. + */ + if (!found) + { + /* Cleanup the subxact info */ + cleanup_subxact_info(); + end_replication_step(); + CommitTransactionCommand(); + return; + } + + /* open the changes file */ + changes_filename(path, MyLogicalRepWorker->subid, xid); + fd = BufFileOpenFileSet(MyLogicalRepWorker->stream_fileset, path, + O_RDWR, false); + + /* OK, truncate the file at the right offset */ + BufFileTruncateFileSet(fd, subxact_data.subxacts[subidx].fileno, + subxact_data.subxacts[subidx].offset); + BufFileClose(fd); + + /* discard the subxacts added later */ + subxact_data.nsubxacts = subidx; + + /* write the updated subxact list */ + subxact_info_write(MyLogicalRepWorker->subid, xid); + + end_replication_step(); + CommitTransactionCommand(); + } +} + +/* + * Handle STREAM ABORT message. + */ +static void +apply_handle_stream_abort(StringInfo s) +{ + TransactionId xid; + TransactionId subxid; + LogicalRepStreamAbortData abort_data; + ParallelApplyWorkerInfo *winfo; + TransApplyAction apply_action; + + /* Save the message before it is consumed. */ + StringInfoData original_msg = *s; + bool toplevel_xact; + + if (in_streamed_transaction) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("STREAM ABORT message without STREAM STOP"))); + + /* We receive abort information only when we can apply in parallel. 
*/ + logicalrep_read_stream_abort(s, &abort_data, + MyLogicalRepWorker->parallel_apply); + + xid = abort_data.xid; + subxid = abort_data.subxid; + toplevel_xact = (xid == subxid); + + set_apply_error_context_xact(subxid, abort_data.abort_lsn); + + apply_action = get_transaction_apply_action(xid, &winfo); + + switch (apply_action) + { + case TRANS_LEADER_APPLY: + + /* + * We are in the leader apply worker and the transaction has been + * serialized to file. + */ + stream_abort_internal(xid, subxid); + + elog(DEBUG1, "finished processing the STREAM ABORT command"); + break; + + case TRANS_LEADER_SEND_TO_PARALLEL: + Assert(winfo); + + /* + * For the case of aborting the subtransaction, we increment the + * number of streaming blocks and take the lock again before + * sending the STREAM_ABORT to ensure that the parallel apply + * worker will wait on the lock for the next set of changes after + * processing the STREAM_ABORT message if it is not already + * waiting for STREAM_STOP message. + * + * It is important to perform this locking before sending the + * STREAM_ABORT message so that the leader can hold the lock first + * and the parallel apply worker will wait for the leader to + * release the lock. This is the same as what we do in + * apply_handle_stream_stop. See Locking Considerations atop + * applyparallelworker.c. + */ + if (!toplevel_xact) + { + pa_unlock_stream(xid, AccessExclusiveLock); + pg_atomic_add_fetch_u32(&winfo->shared->pending_stream_count, 1); + pa_lock_stream(xid, AccessExclusiveLock); + } + + if (pa_send_data(winfo, s->len, s->data)) + { + /* + * Unlike STREAM_COMMIT and STREAM_PREPARE, we don't need to + * wait here for the parallel apply worker to finish as that + * is not required to maintain the commit order and won't have + * the risk of failures due to transaction dependencies and + * deadlocks. However, it is possible that before the parallel + * worker finishes and we clear the worker info, the xid + * wraparound happens on the upstream and a new transaction + * with the same xid can appear and that can lead to duplicate + * entries in ParallelApplyTxnHash. Yet another problem could + * be that we may have serialized the changes in partial + * serialize mode and the file containing xact changes may + * already exist, and after xid wraparound trying to create + * the file for the same xid can lead to an error. To avoid + * these problems, we decide to wait for the aborts to finish. + * + * Note, it is okay to not update the flush location position + * for aborts as in worst case that means such a transaction + * won't be sent again after restart. + */ + if (toplevel_xact) + pa_xact_finish(winfo, InvalidXLogRecPtr); + + break; + } + + /* + * Switch to serialize mode when we are not able to send the + * change to parallel apply worker. + */ + pa_switch_to_partial_serialize(winfo, true); + + /* fall through */ + case TRANS_LEADER_PARTIAL_SERIALIZE: + Assert(winfo); + + /* + * Parallel apply worker might have applied some changes, so write + * the STREAM_ABORT message so that it can rollback the + * subtransaction if needed. + */ + stream_open_and_write_change(xid, LOGICAL_REP_MSG_STREAM_ABORT, + &original_msg); + + if (toplevel_xact) + { + pa_set_fileset_state(winfo->shared, FS_SERIALIZE_DONE); + pa_xact_finish(winfo, InvalidXLogRecPtr); + } + break; + + case TRANS_PARALLEL_APPLY: + + /* + * If the parallel apply worker is applying spooled messages then + * close the file before aborting. 
+ */ + if (toplevel_xact && stream_fd) + stream_close_file(); + + pa_stream_abort(&abort_data); + + /* + * We need to wait after processing rollback to savepoint for the + * next set of changes. + * + * We have a race condition here due to which we can start waiting + * here when there are more chunk of streams in the queue. See + * apply_handle_stream_stop. + */ + if (!toplevel_xact) + pa_decr_and_wait_stream_block(); + + elog(DEBUG1, "finished processing the STREAM ABORT command"); + break; + + default: + elog(ERROR, "unexpected apply action: %d", (int) apply_action); + break; + } + + reset_apply_error_context_info(); +} + +/* + * Ensure that the passed location is fileset's end. + */ +static void +ensure_last_message(FileSet *stream_fileset, TransactionId xid, int fileno, + off_t offset) +{ + char path[MAXPGPATH]; + BufFile *fd; + int last_fileno; + off_t last_offset; + + Assert(!IsTransactionState()); + + begin_replication_step(); + + changes_filename(path, MyLogicalRepWorker->subid, xid); + + fd = BufFileOpenFileSet(stream_fileset, path, O_RDONLY, false); + + BufFileSeek(fd, 0, 0, SEEK_END); + BufFileTell(fd, &last_fileno, &last_offset); + + BufFileClose(fd); + + end_replication_step(); + + if (last_fileno != fileno || last_offset != offset) + elog(ERROR, "unexpected message left in streaming transaction's changes file \"%s\"", + path); +} + +/* + * Common spoolfile processing. + */ +void +apply_spooled_messages(FileSet *stream_fileset, TransactionId xid, + XLogRecPtr lsn) +{ + StringInfoData s2; + int nchanges; + char path[MAXPGPATH]; + char *buffer = NULL; + MemoryContext oldcxt; + ResourceOwner oldowner; + int fileno; + off_t offset; + + if (!am_parallel_apply_worker()) + maybe_start_skipping_changes(lsn); + + /* Make sure we have an open transaction */ + begin_replication_step(); + + /* + * Allocate file handle and memory required to process all the messages in + * TopTransactionContext to avoid them getting reset after each message is + * processed. + */ + oldcxt = MemoryContextSwitchTo(TopTransactionContext); + + /* Open the spool file for the committed/prepared transaction */ + changes_filename(path, MyLogicalRepWorker->subid, xid); + elog(DEBUG1, "replaying changes from file \"%s\"", path); + + /* + * Make sure the file is owned by the toplevel transaction so that the + * file will not be accidentally closed when aborting a subtransaction. + */ + oldowner = CurrentResourceOwner; + CurrentResourceOwner = TopTransactionResourceOwner; + + stream_fd = BufFileOpenFileSet(stream_fileset, path, O_RDONLY, false); + + CurrentResourceOwner = oldowner; + + buffer = palloc(BLCKSZ); + initStringInfo(&s2); + + MemoryContextSwitchTo(oldcxt); + + remote_final_lsn = lsn; + + /* + * Make sure the handle apply_dispatch methods are aware we're in a remote + * transaction. + */ + in_remote_transaction = true; + pgstat_report_activity(STATE_RUNNING, NULL); + + end_replication_step(); + + /* + * Read the entries one by one and pass them through the same logic as in + * apply_dispatch. + */ + nchanges = 0; + while (true) + { + size_t nbytes; + int len; + + CHECK_FOR_INTERRUPTS(); + + /* read length of the on-disk record */ + nbytes = BufFileReadMaybeEOF(stream_fd, &len, sizeof(len), true); + + /* have we reached end of the file? */ + if (nbytes == 0) + break; + + /* do we have a correct length? 
*/ + if (len <= 0) + elog(ERROR, "incorrect length %d in streaming transaction's changes file \"%s\"", + len, path); + + /* make sure we have sufficiently large buffer */ + buffer = repalloc(buffer, len); + + /* and finally read the data into the buffer */ + BufFileReadExact(stream_fd, buffer, len); + + BufFileTell(stream_fd, &fileno, &offset); + + /* copy the buffer to the stringinfo and call apply_dispatch */ + resetStringInfo(&s2); + appendBinaryStringInfo(&s2, buffer, len); + + /* Ensure we are reading the data into our memory context. */ + oldcxt = MemoryContextSwitchTo(ApplyMessageContext); + + apply_dispatch(&s2); + + MemoryContextReset(ApplyMessageContext); + + MemoryContextSwitchTo(oldcxt); + + nchanges++; + + /* + * It is possible the file has been closed because we have processed + * the transaction end message like stream_commit in which case that + * must be the last message. + */ + if (!stream_fd) + { + ensure_last_message(stream_fileset, xid, fileno, offset); + break; + } + + if (nchanges % 1000 == 0) + elog(DEBUG1, "replayed %d changes from file \"%s\"", + nchanges, path); + } + + if (stream_fd) + stream_close_file(); + + elog(DEBUG1, "replayed %d (all) changes from file \"%s\"", + nchanges, path); + + return; +} + +/* + * Handle STREAM COMMIT message. + */ +static void +apply_handle_stream_commit(StringInfo s) +{ + TransactionId xid; + LogicalRepCommitData commit_data; + ParallelApplyWorkerInfo *winfo; + TransApplyAction apply_action; + + /* Save the message before it is consumed. */ + StringInfoData original_msg = *s; + + if (in_streamed_transaction) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("STREAM COMMIT message without STREAM STOP"))); + + xid = logicalrep_read_stream_commit(s, &commit_data); + set_apply_error_context_xact(xid, commit_data.commit_lsn); + + apply_action = get_transaction_apply_action(xid, &winfo); + + switch (apply_action) + { + case TRANS_LEADER_APPLY: + + /* + * The transaction has been serialized to file, so replay all the + * spooled operations. + */ + apply_spooled_messages(MyLogicalRepWorker->stream_fileset, xid, + commit_data.commit_lsn); + + apply_handle_commit_internal(&commit_data); + + /* Unlink the files with serialized changes and subxact info. */ + stream_cleanup_files(MyLogicalRepWorker->subid, xid); + + elog(DEBUG1, "finished processing the STREAM COMMIT command"); + break; + + case TRANS_LEADER_SEND_TO_PARALLEL: + Assert(winfo); + + if (pa_send_data(winfo, s->len, s->data)) + { + /* Finish processing the streaming transaction. */ + pa_xact_finish(winfo, commit_data.end_lsn); + break; + } + + /* + * Switch to serialize mode when we are not able to send the + * change to parallel apply worker. + */ + pa_switch_to_partial_serialize(winfo, true); + + /* fall through */ + case TRANS_LEADER_PARTIAL_SERIALIZE: + Assert(winfo); + + stream_open_and_write_change(xid, LOGICAL_REP_MSG_STREAM_COMMIT, + &original_msg); + + pa_set_fileset_state(winfo->shared, FS_SERIALIZE_DONE); + + /* Finish processing the streaming transaction. */ + pa_xact_finish(winfo, commit_data.end_lsn); + break; + + case TRANS_PARALLEL_APPLY: + + /* + * If the parallel apply worker is applying spooled messages then + * close the file before committing. + */ + if (stream_fd) + stream_close_file(); + + apply_handle_commit_internal(&commit_data); + + MyParallelShared->last_commit_end = XactLastCommitEnd; + + /* + * It is important to set the transaction state as finished before + * releasing the lock. See pa_wait_for_xact_finish. 
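+ * + * A simplified sketch of the intended interleaving, using the names in + * this file: the parallel apply worker calls pa_set_xact_state(FINISHED) + * and only then pa_unlock_transaction(xid), while the leader blocks in + * pa_lock_transaction(xid) and, once it acquires the lock, must observe + * PARALLEL_TRANS_FINISHED. Unlocking first could let the leader see a + * state that still reads as in progress.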
+ */ + pa_set_xact_state(MyParallelShared, PARALLEL_TRANS_FINISHED); + pa_unlock_transaction(xid, AccessExclusiveLock); + + pa_reset_subtrans(); + + elog(DEBUG1, "finished processing the STREAM COMMIT command"); + break; + + default: + elog(ERROR, "unexpected apply action: %d", (int) apply_action); + break; + } + + /* Process any tables that are being synchronized in parallel. */ + process_syncing_tables(commit_data.end_lsn); + + pgstat_report_activity(STATE_IDLE, NULL); + + reset_apply_error_context_info(); +} + +/* + * Helper function for apply_handle_commit and apply_handle_stream_commit. + */ +static void +apply_handle_commit_internal(LogicalRepCommitData *commit_data) +{ + if (is_skipping_changes()) + { + stop_skipping_changes(); + + /* + * Start a new transaction to clear the subskiplsn, if not started + * yet. + */ + if (!IsTransactionState()) + StartTransactionCommand(); + } + + if (IsTransactionState()) + { + /* + * The transaction is either non-empty or skipped, so we clear the + * subskiplsn. + */ + clear_subscription_skip_lsn(commit_data->commit_lsn); + + /* + * Update origin state so we can restart streaming from correct + * position in case of crash. + */ + replorigin_session_origin_lsn = commit_data->end_lsn; + replorigin_session_origin_timestamp = commit_data->committime; + + CommitTransactionCommand(); + + if (IsTransactionBlock()) + { + EndTransactionBlock(false); + CommitTransactionCommand(); + } + + pgstat_report_stat(false); + + store_flush_position(commit_data->end_lsn, XactLastCommitEnd); + } + else + { + /* Process any invalidation messages that might have accumulated. */ + AcceptInvalidationMessages(); + maybe_reread_subscription(); + } + + in_remote_transaction = false; +} + +/* + * Handle RELATION message. + * + * Note we don't do validation against local schema here. The validation + * against local schema is postponed until first change for given relation + * comes as we only care about it when applying changes for it anyway and we + * do less locking this way. + */ +static void +apply_handle_relation(StringInfo s) +{ + LogicalRepRelation *rel; + + if (handle_streamed_transaction(LOGICAL_REP_MSG_RELATION, s)) + return; + + rel = logicalrep_read_rel(s); + logicalrep_relmap_update(rel); + + /* Also reset all entries in the partition map that refer to remoterel. */ + logicalrep_partmap_reset_relmap(rel); +} + +/* + * Handle TYPE message. + * + * This implementation pays no attention to TYPE messages; we expect the user + * to have set things up so that the incoming data is acceptable to the input + * functions for the locally subscribed tables. Hence, we just read and + * discard the message. + */ +static void +apply_handle_type(StringInfo s) +{ + LogicalRepTyp typ; + + if (handle_streamed_transaction(LOGICAL_REP_MSG_TYPE, s)) + return; + + logicalrep_read_typ(s, &typ); +} + +/* + * Check that we (the subscription owner) have sufficient privileges on the + * target relation to perform the given operation. + */ +static void +TargetPrivilegesCheck(Relation rel, AclMode mode) +{ + Oid relid; + AclResult aclresult; + + relid = RelationGetRelid(rel); + aclresult = pg_class_aclcheck(relid, GetUserId(), mode); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, + get_relkind_objtype(rel->rd_rel->relkind), + get_rel_name(relid)); + + /* + * We lack the infrastructure to honor RLS policies. It might be possible + * to add such infrastructure here, but tablesync workers lack it, too, so + * we don't bother. 
RLS does not ordinarily apply to TRUNCATE commands, + * but it seems dangerous to replicate a TRUNCATE and then refuse to + * replicate subsequent INSERTs, so we forbid all commands the same. + */ + if (check_enable_rls(relid, InvalidOid, false) == RLS_ENABLED) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("user \"%s\" cannot replicate into relation with row-level security enabled: \"%s\"", + GetUserNameFromId(GetUserId(), true), + RelationGetRelationName(rel)))); +} + +/* + * Handle INSERT message. + */ + +static void +apply_handle_insert(StringInfo s) +{ + LogicalRepRelMapEntry *rel; + LogicalRepTupleData newtup; + LogicalRepRelId relid; + UserContext ucxt; + ApplyExecutionData *edata; + EState *estate; + TupleTableSlot *remoteslot; + MemoryContext oldctx; + bool run_as_owner; + + /* + * Quick return if we are skipping data modification changes or handling + * streamed transactions. + */ + if (is_skipping_changes() || + handle_streamed_transaction(LOGICAL_REP_MSG_INSERT, s)) + return; + + begin_replication_step(); + + relid = logicalrep_read_insert(s, &newtup); + rel = logicalrep_rel_open(relid, RowExclusiveLock); + if (!should_apply_changes_for_rel(rel)) + { + /* + * The relation can't become interesting in the middle of the + * transaction so it's safe to unlock it. + */ + logicalrep_rel_close(rel, RowExclusiveLock); + end_replication_step(); + return; + } + + /* + * Make sure that any user-supplied code runs as the table owner, unless + * the user has opted out of that behavior. + */ + run_as_owner = MySubscription->runasowner; + if (!run_as_owner) + SwitchToUntrustedUser(rel->localrel->rd_rel->relowner, &ucxt); + + /* Set relation for error callback */ + apply_error_callback_arg.rel = rel; + + /* Initialize the executor state. */ + edata = create_edata_for_relation(rel); + estate = edata->estate; + remoteslot = ExecInitExtraTupleSlot(estate, + RelationGetDescr(rel->localrel), + &TTSOpsVirtual); + + /* Process and store remote tuple in the slot */ + oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + slot_store_data(remoteslot, rel, &newtup); + slot_fill_defaults(rel, estate, remoteslot); + MemoryContextSwitchTo(oldctx); + + /* For a partitioned table, insert the tuple into a partition. */ + if (rel->localrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + apply_handle_tuple_routing(edata, + remoteslot, NULL, CMD_INSERT); + else + apply_handle_insert_internal(edata, edata->targetRelInfo, + remoteslot); + + finish_edata(edata); + + /* Reset relation for error callback */ + apply_error_callback_arg.rel = NULL; + + if (!run_as_owner) + RestoreUserContext(&ucxt); + + logicalrep_rel_close(rel, NoLock); + + end_replication_step(); +} + +/* + * Workhorse for apply_handle_insert() + * relinfo is for the relation we're actually inserting into + * (could be a child partition of edata->targetRelInfo) + */ +static void +apply_handle_insert_internal(ApplyExecutionData *edata, + ResultRelInfo *relinfo, + TupleTableSlot *remoteslot) +{ + EState *estate = edata->estate; + + /* We must open indexes here. */ + ExecOpenIndices(relinfo, false); + + /* Do the insert. */ + TargetPrivilegesCheck(relinfo->ri_RelationDesc, ACL_INSERT); + ExecSimpleRelationInsert(relinfo, estate, remoteslot); + + /* Cleanup. */ + ExecCloseIndices(relinfo); +} + +/* + * Check if the logical replication relation is updatable and throw + * appropriate error if it isn't. 
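+ * + * ("Updatable" here means the local relation has a usable replica + * identity to match incoming rows against, i.e. a REPLICA IDENTITY index + * or primary key, as the checks below spell out.)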
+ */ +static void +check_relation_updatable(LogicalRepRelMapEntry *rel) +{ + /* + * For partitioned tables, we only need to care if the target partition is + * updatable (aka has PK or RI defined for it). + */ + if (rel->localrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + return; + + /* Updatable, no error. */ + if (rel->updatable) + return; + + /* + * We are in error mode so it's fine this is somewhat slow. It's better to + * give user correct error. + */ + if (OidIsValid(GetRelationIdentityOrPK(rel->localrel))) + { + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("publisher did not send replica identity column " + "expected by the logical replication target relation \"%s.%s\"", + rel->remoterel.nspname, rel->remoterel.relname))); + } + + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication target relation \"%s.%s\" has " + "neither REPLICA IDENTITY index nor PRIMARY " + "KEY and published relation does not have " + "REPLICA IDENTITY FULL", + rel->remoterel.nspname, rel->remoterel.relname))); +} + +/* + * Handle UPDATE message. + * + * TODO: FDW support + */ +static void +apply_handle_update(StringInfo s) +{ + LogicalRepRelMapEntry *rel; + LogicalRepRelId relid; + UserContext ucxt; + ApplyExecutionData *edata; + EState *estate; + LogicalRepTupleData oldtup; + LogicalRepTupleData newtup; + bool has_oldtup; + TupleTableSlot *remoteslot; + RTEPermissionInfo *target_perminfo; + MemoryContext oldctx; + bool run_as_owner; + + /* + * Quick return if we are skipping data modification changes or handling + * streamed transactions. + */ + if (is_skipping_changes() || + handle_streamed_transaction(LOGICAL_REP_MSG_UPDATE, s)) + return; + + begin_replication_step(); + + relid = logicalrep_read_update(s, &has_oldtup, &oldtup, + &newtup); + rel = logicalrep_rel_open(relid, RowExclusiveLock); + if (!should_apply_changes_for_rel(rel)) + { + /* + * The relation can't become interesting in the middle of the + * transaction so it's safe to unlock it. + */ + logicalrep_rel_close(rel, RowExclusiveLock); + end_replication_step(); + return; + } + + /* Set relation for error callback */ + apply_error_callback_arg.rel = rel; + + /* Check if we can do the update. */ + check_relation_updatable(rel); + + /* + * Make sure that any user-supplied code runs as the table owner, unless + * the user has opted out of that behavior. + */ + run_as_owner = MySubscription->runasowner; + if (!run_as_owner) + SwitchToUntrustedUser(rel->localrel->rd_rel->relowner, &ucxt); + + /* Initialize the executor state. */ + edata = create_edata_for_relation(rel); + estate = edata->estate; + remoteslot = ExecInitExtraTupleSlot(estate, + RelationGetDescr(rel->localrel), + &TTSOpsVirtual); + + /* + * Populate updatedCols so that per-column triggers can fire, and so + * executor can correctly pass down indexUnchanged hint. This could + * include more columns than were actually changed on the publisher + * because the logical replication protocol doesn't contain that + * information. But it would for example exclude columns that only exist + * on the subscriber, since we are not touching those. 
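+ * + * A hypothetical example: if the publisher table has columns (a, b) and + * the subscriber relation adds a local column c, updatedCols can end up + * containing a and b (whichever of them the remote side sent) but + * never c.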
+ */ + target_perminfo = list_nth(estate->es_rteperminfos, 0); + for (int i = 0; i < remoteslot->tts_tupleDescriptor->natts; i++) + { + Form_pg_attribute att = TupleDescAttr(remoteslot->tts_tupleDescriptor, i); + int remoteattnum = rel->attrmap->attnums[i]; + + if (!att->attisdropped && remoteattnum >= 0) + { + Assert(remoteattnum < newtup.ncols); + if (newtup.colstatus[remoteattnum] != LOGICALREP_COLUMN_UNCHANGED) + target_perminfo->updatedCols = + bms_add_member(target_perminfo->updatedCols, + i + 1 - FirstLowInvalidHeapAttributeNumber); + } + } + + /* Build the search tuple. */ + oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + slot_store_data(remoteslot, rel, + has_oldtup ? &oldtup : &newtup); + MemoryContextSwitchTo(oldctx); + + /* For a partitioned table, apply update to correct partition. */ + if (rel->localrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + apply_handle_tuple_routing(edata, + remoteslot, &newtup, CMD_UPDATE); + else + apply_handle_update_internal(edata, edata->targetRelInfo, + remoteslot, &newtup, rel->localindexoid); + + finish_edata(edata); + + /* Reset relation for error callback */ + apply_error_callback_arg.rel = NULL; + + if (!run_as_owner) + RestoreUserContext(&ucxt); + + logicalrep_rel_close(rel, NoLock); + + end_replication_step(); +} + +/* + * Workhorse for apply_handle_update() + * relinfo is for the relation we're actually updating in + * (could be a child partition of edata->targetRelInfo) + */ +static void +apply_handle_update_internal(ApplyExecutionData *edata, + ResultRelInfo *relinfo, + TupleTableSlot *remoteslot, + LogicalRepTupleData *newtup, + Oid localindexoid) +{ + EState *estate = edata->estate; + LogicalRepRelMapEntry *relmapentry = edata->targetRel; + Relation localrel = relinfo->ri_RelationDesc; + EPQState epqstate; + TupleTableSlot *localslot; + bool found; + MemoryContext oldctx; + + EvalPlanQualInit(&epqstate, estate, NULL, NIL, -1, NIL); + ExecOpenIndices(relinfo, false); + + found = FindReplTupleInLocalRel(edata, localrel, + &relmapentry->remoterel, + localindexoid, + remoteslot, &localslot); + ExecClearTuple(remoteslot); + + /* + * Tuple found. + * + * Note this will fail if there are other conflicting unique indexes. + */ + if (found) + { + /* Process and store remote tuple in the slot */ + oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + slot_modify_data(remoteslot, localslot, relmapentry, newtup); + MemoryContextSwitchTo(oldctx); + + EvalPlanQualSetSlot(&epqstate, remoteslot); + + /* Do the actual update. */ + TargetPrivilegesCheck(relinfo->ri_RelationDesc, ACL_UPDATE); + ExecSimpleRelationUpdate(relinfo, estate, &epqstate, localslot, + remoteslot); + } + else + { + /* + * The tuple to be updated could not be found. Do nothing except for + * emitting a log message. + * + * XXX should this be promoted to ereport(LOG) perhaps? + */ + elog(DEBUG1, + "logical replication did not find row to be updated " + "in replication target relation \"%s\"", + RelationGetRelationName(localrel)); + } + + /* Cleanup. */ + ExecCloseIndices(relinfo); + EvalPlanQualEnd(&epqstate); +} + +/* + * Handle DELETE message. 
+ * + * TODO: FDW support + */ +static void +apply_handle_delete(StringInfo s) +{ + LogicalRepRelMapEntry *rel; + LogicalRepTupleData oldtup; + LogicalRepRelId relid; + UserContext ucxt; + ApplyExecutionData *edata; + EState *estate; + TupleTableSlot *remoteslot; + MemoryContext oldctx; + bool run_as_owner; + + /* + * Quick return if we are skipping data modification changes or handling + * streamed transactions. + */ + if (is_skipping_changes() || + handle_streamed_transaction(LOGICAL_REP_MSG_DELETE, s)) + return; + + begin_replication_step(); + + relid = logicalrep_read_delete(s, &oldtup); + rel = logicalrep_rel_open(relid, RowExclusiveLock); + if (!should_apply_changes_for_rel(rel)) + { + /* + * The relation can't become interesting in the middle of the + * transaction so it's safe to unlock it. + */ + logicalrep_rel_close(rel, RowExclusiveLock); + end_replication_step(); + return; + } + + /* Set relation for error callback */ + apply_error_callback_arg.rel = rel; + + /* Check if we can do the delete. */ + check_relation_updatable(rel); + + /* + * Make sure that any user-supplied code runs as the table owner, unless + * the user has opted out of that behavior. + */ + run_as_owner = MySubscription->runasowner; + if (!run_as_owner) + SwitchToUntrustedUser(rel->localrel->rd_rel->relowner, &ucxt); + + /* Initialize the executor state. */ + edata = create_edata_for_relation(rel); + estate = edata->estate; + remoteslot = ExecInitExtraTupleSlot(estate, + RelationGetDescr(rel->localrel), + &TTSOpsVirtual); + + /* Build the search tuple. */ + oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + slot_store_data(remoteslot, rel, &oldtup); + MemoryContextSwitchTo(oldctx); + + /* For a partitioned table, apply delete to correct partition. */ + if (rel->localrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + apply_handle_tuple_routing(edata, + remoteslot, NULL, CMD_DELETE); + else + apply_handle_delete_internal(edata, edata->targetRelInfo, + remoteslot, rel->localindexoid); + + finish_edata(edata); + + /* Reset relation for error callback */ + apply_error_callback_arg.rel = NULL; + + if (!run_as_owner) + RestoreUserContext(&ucxt); + + logicalrep_rel_close(rel, NoLock); + + end_replication_step(); +} + +/* + * Workhorse for apply_handle_delete() + * relinfo is for the relation we're actually deleting from + * (could be a child partition of edata->targetRelInfo) + */ +static void +apply_handle_delete_internal(ApplyExecutionData *edata, + ResultRelInfo *relinfo, + TupleTableSlot *remoteslot, + Oid localindexoid) +{ + EState *estate = edata->estate; + Relation localrel = relinfo->ri_RelationDesc; + LogicalRepRelation *remoterel = &edata->targetRel->remoterel; + EPQState epqstate; + TupleTableSlot *localslot; + bool found; + + EvalPlanQualInit(&epqstate, estate, NULL, NIL, -1, NIL); + ExecOpenIndices(relinfo, false); + + found = FindReplTupleInLocalRel(edata, localrel, remoterel, localindexoid, + remoteslot, &localslot); + + /* If found delete it. */ + if (found) + { + EvalPlanQualSetSlot(&epqstate, localslot); + + /* Do the actual delete. */ + TargetPrivilegesCheck(relinfo->ri_RelationDesc, ACL_DELETE); + ExecSimpleRelationDelete(relinfo, estate, &epqstate, localslot); + } + else + { + /* + * The tuple to be deleted could not be found. Do nothing except for + * emitting a log message. + * + * XXX should this be promoted to ereport(LOG) perhaps? 
+ */ + elog(DEBUG1, + "logical replication did not find row to be deleted " + "in replication target relation \"%s\"", + RelationGetRelationName(localrel)); + } + + /* Cleanup. */ + ExecCloseIndices(relinfo); + EvalPlanQualEnd(&epqstate); +} + +/* + * Try to find a tuple received from the publication side (in 'remoteslot') in + * the corresponding local relation using either replica identity index, + * primary key, index or if needed, sequential scan. + * + * Local tuple, if found, is returned in '*localslot'. + */ +static bool +FindReplTupleInLocalRel(ApplyExecutionData *edata, Relation localrel, + LogicalRepRelation *remoterel, + Oid localidxoid, + TupleTableSlot *remoteslot, + TupleTableSlot **localslot) +{ + EState *estate = edata->estate; + bool found; + + /* + * Regardless of the top-level operation, we're performing a read here, so + * check for SELECT privileges. + */ + TargetPrivilegesCheck(localrel, ACL_SELECT); + + *localslot = table_slot_create(localrel, &estate->es_tupleTable); + + Assert(OidIsValid(localidxoid) || + (remoterel->replident == REPLICA_IDENTITY_FULL)); + + if (OidIsValid(localidxoid)) + { +#ifdef USE_ASSERT_CHECKING + Relation idxrel = index_open(localidxoid, AccessShareLock); + + /* Index must be PK, RI, or usable for REPLICA IDENTITY FULL tables */ + Assert(GetRelationIdentityOrPK(idxrel) == localidxoid || + IsIndexUsableForReplicaIdentityFull(BuildIndexInfo(idxrel), + edata->targetRel->attrmap)); + index_close(idxrel, AccessShareLock); +#endif + + found = RelationFindReplTupleByIndex(localrel, localidxoid, + LockTupleExclusive, + remoteslot, *localslot); + } + else + found = RelationFindReplTupleSeq(localrel, LockTupleExclusive, + remoteslot, *localslot); + + return found; +} + +/* + * This handles insert, update, delete on a partitioned table. + */ +static void +apply_handle_tuple_routing(ApplyExecutionData *edata, + TupleTableSlot *remoteslot, + LogicalRepTupleData *newtup, + CmdType operation) +{ + EState *estate = edata->estate; + LogicalRepRelMapEntry *relmapentry = edata->targetRel; + ResultRelInfo *relinfo = edata->targetRelInfo; + Relation parentrel = relinfo->ri_RelationDesc; + ModifyTableState *mtstate; + PartitionTupleRouting *proute; + ResultRelInfo *partrelinfo; + Relation partrel; + TupleTableSlot *remoteslot_part; + TupleConversionMap *map; + MemoryContext oldctx; + LogicalRepRelMapEntry *part_entry = NULL; + AttrMap *attrmap = NULL; + + /* ModifyTableState is needed for ExecFindPartition(). */ + edata->mtstate = mtstate = makeNode(ModifyTableState); + mtstate->ps.plan = NULL; + mtstate->ps.state = estate; + mtstate->operation = operation; + mtstate->resultRelInfo = relinfo; + + /* ... as is PartitionTupleRouting. */ + edata->proute = proute = ExecSetupPartitionTupleRouting(estate, parentrel); + + /* + * Find the partition to which the "search tuple" belongs. + */ + Assert(remoteslot != NULL); + oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + partrelinfo = ExecFindPartition(mtstate, relinfo, proute, + remoteslot, estate); + Assert(partrelinfo != NULL); + partrel = partrelinfo->ri_RelationDesc; + + /* + * Check for supported relkind. We need this since partitions might be of + * unsupported relkinds; and the set of partitions can change, so checking + * at CREATE/ALTER SUBSCRIPTION would be insufficient. 
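+ * + * (For instance, a partition attached after the subscription was created + * could plausibly be a foreign table, which is not a supported target + * for applying changes.)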
+ */ + CheckSubscriptionRelkind(partrel->rd_rel->relkind, + get_namespace_name(RelationGetNamespace(partrel)), + RelationGetRelationName(partrel)); + + /* + * To perform any of the operations below, the tuple must match the + * partition's rowtype. Convert if needed or just copy, using a dedicated + * slot to store the tuple in any case. + */ + remoteslot_part = partrelinfo->ri_PartitionTupleSlot; + if (remoteslot_part == NULL) + remoteslot_part = table_slot_create(partrel, &estate->es_tupleTable); + map = ExecGetRootToChildMap(partrelinfo, estate); + if (map != NULL) + { + attrmap = map->attrMap; + remoteslot_part = execute_attr_map_slot(attrmap, remoteslot, + remoteslot_part); + } + else + { + remoteslot_part = ExecCopySlot(remoteslot_part, remoteslot); + slot_getallattrs(remoteslot_part); + } + MemoryContextSwitchTo(oldctx); + + /* Check if we can do the update or delete on the leaf partition. */ + if (operation == CMD_UPDATE || operation == CMD_DELETE) + { + part_entry = logicalrep_partition_open(relmapentry, partrel, + attrmap); + check_relation_updatable(part_entry); + } + + switch (operation) + { + case CMD_INSERT: + apply_handle_insert_internal(edata, partrelinfo, + remoteslot_part); + break; + + case CMD_DELETE: + apply_handle_delete_internal(edata, partrelinfo, + remoteslot_part, + part_entry->localindexoid); + break; + + case CMD_UPDATE: + + /* + * For UPDATE, depending on whether or not the updated tuple + * satisfies the partition's constraint, perform a simple UPDATE + * of the partition or move the updated tuple into a different + * suitable partition. + */ + { + TupleTableSlot *localslot; + ResultRelInfo *partrelinfo_new; + Relation partrel_new; + bool found; + + /* Get the matching local tuple from the partition. */ + found = FindReplTupleInLocalRel(edata, partrel, + &part_entry->remoterel, + part_entry->localindexoid, + remoteslot_part, &localslot); + if (!found) + { + /* + * The tuple to be updated could not be found. Do nothing + * except for emitting a log message. + * + * XXX should this be promoted to ereport(LOG) perhaps? + */ + elog(DEBUG1, + "logical replication did not find row to be updated " + "in replication target relation's partition \"%s\"", + RelationGetRelationName(partrel)); + return; + } + + /* + * Apply the update to the local tuple, putting the result in + * remoteslot_part. + */ + oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + slot_modify_data(remoteslot_part, localslot, part_entry, + newtup); + MemoryContextSwitchTo(oldctx); + + /* + * Does the updated tuple still satisfy the current + * partition's constraint? + */ + if (!partrel->rd_rel->relispartition || + ExecPartitionCheck(partrelinfo, remoteslot_part, estate, + false)) + { + /* + * Yes, so simply UPDATE the partition. We don't call + * apply_handle_update_internal() here, which would + * normally do the following work, to avoid repeating some + * work already done above to find the local tuple in the + * partition. + */ + EPQState epqstate; + + EvalPlanQualInit(&epqstate, estate, NULL, NIL, -1, NIL); + ExecOpenIndices(partrelinfo, false); + + EvalPlanQualSetSlot(&epqstate, remoteslot_part); + TargetPrivilegesCheck(partrelinfo->ri_RelationDesc, + ACL_UPDATE); + ExecSimpleRelationUpdate(partrelinfo, estate, &epqstate, + localslot, remoteslot_part); + ExecCloseIndices(partrelinfo); + EvalPlanQualEnd(&epqstate); + } + else + { + /* Move the tuple into the new partition. 
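+ * (i.e. replicate the cross-partition UPDATE, as the code below does, as + * a DELETE from the old partition followed by an INSERT of the converted + * tuple into the newly routed partition)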
*/ + + /* + * New partition will be found using tuple routing, which + * can only occur via the parent table. We might need to + * convert the tuple to the parent's rowtype. Note that + * this is the tuple found in the partition, not the + * original search tuple received by this function. + */ + if (map) + { + TupleConversionMap *PartitionToRootMap = + convert_tuples_by_name(RelationGetDescr(partrel), + RelationGetDescr(parentrel)); + + remoteslot = + execute_attr_map_slot(PartitionToRootMap->attrMap, + remoteslot_part, remoteslot); + } + else + { + remoteslot = ExecCopySlot(remoteslot, remoteslot_part); + slot_getallattrs(remoteslot); + } + + /* Find the new partition. */ + oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + partrelinfo_new = ExecFindPartition(mtstate, relinfo, + proute, remoteslot, + estate); + MemoryContextSwitchTo(oldctx); + Assert(partrelinfo_new != partrelinfo); + partrel_new = partrelinfo_new->ri_RelationDesc; + + /* Check that new partition also has supported relkind. */ + CheckSubscriptionRelkind(partrel_new->rd_rel->relkind, + get_namespace_name(RelationGetNamespace(partrel_new)), + RelationGetRelationName(partrel_new)); + + /* DELETE old tuple found in the old partition. */ + apply_handle_delete_internal(edata, partrelinfo, + localslot, + part_entry->localindexoid); + + /* INSERT new tuple into the new partition. */ + + /* + * Convert the replacement tuple to match the destination + * partition rowtype. + */ + oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + remoteslot_part = partrelinfo_new->ri_PartitionTupleSlot; + if (remoteslot_part == NULL) + remoteslot_part = table_slot_create(partrel_new, + &estate->es_tupleTable); + map = ExecGetRootToChildMap(partrelinfo_new, estate); + if (map != NULL) + { + remoteslot_part = execute_attr_map_slot(map->attrMap, + remoteslot, + remoteslot_part); + } + else + { + remoteslot_part = ExecCopySlot(remoteslot_part, + remoteslot); + slot_getallattrs(remoteslot); + } + MemoryContextSwitchTo(oldctx); + apply_handle_insert_internal(edata, partrelinfo_new, + remoteslot_part); + } + } + break; + + default: + elog(ERROR, "unrecognized CmdType: %d", (int) operation); + break; + } +} + +/* + * Handle TRUNCATE message. + * + * TODO: FDW support + */ +static void +apply_handle_truncate(StringInfo s) +{ + bool cascade = false; + bool restart_seqs = false; + List *remote_relids = NIL; + List *remote_rels = NIL; + List *rels = NIL; + List *part_rels = NIL; + List *relids = NIL; + List *relids_logged = NIL; + ListCell *lc; + LOCKMODE lockmode = AccessExclusiveLock; + + /* + * Quick return if we are skipping data modification changes or handling + * streamed transactions. + */ + if (is_skipping_changes() || + handle_streamed_transaction(LOGICAL_REP_MSG_TRUNCATE, s)) + return; + + begin_replication_step(); + + remote_relids = logicalrep_read_truncate(s, &cascade, &restart_seqs); + + foreach(lc, remote_relids) + { + LogicalRepRelId relid = lfirst_oid(lc); + LogicalRepRelMapEntry *rel; + + rel = logicalrep_rel_open(relid, lockmode); + if (!should_apply_changes_for_rel(rel)) + { + /* + * The relation can't become interesting in the middle of the + * transaction so it's safe to unlock it. 
+ */ + logicalrep_rel_close(rel, lockmode); + continue; + } + + remote_rels = lappend(remote_rels, rel); + TargetPrivilegesCheck(rel->localrel, ACL_TRUNCATE); + rels = lappend(rels, rel->localrel); + relids = lappend_oid(relids, rel->localreloid); + if (RelationIsLogicallyLogged(rel->localrel)) + relids_logged = lappend_oid(relids_logged, rel->localreloid); + + /* + * Truncate partitions if we got a message to truncate a partitioned + * table. + */ + if (rel->localrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + ListCell *child; + List *children = find_all_inheritors(rel->localreloid, + lockmode, + NULL); + + foreach(child, children) + { + Oid childrelid = lfirst_oid(child); + Relation childrel; + + if (list_member_oid(relids, childrelid)) + continue; + + /* find_all_inheritors already got lock */ + childrel = table_open(childrelid, NoLock); + + /* + * Ignore temp tables of other backends. See similar code in + * ExecuteTruncate(). + */ + if (RELATION_IS_OTHER_TEMP(childrel)) + { + table_close(childrel, lockmode); + continue; + } + + TargetPrivilegesCheck(childrel, ACL_TRUNCATE); + rels = lappend(rels, childrel); + part_rels = lappend(part_rels, childrel); + relids = lappend_oid(relids, childrelid); + /* Log this relation only if needed for logical decoding */ + if (RelationIsLogicallyLogged(childrel)) + relids_logged = lappend_oid(relids_logged, childrelid); + } + } + } + + /* + * Even if we used CASCADE on the upstream primary we explicitly default + * to replaying changes without further cascading. This might be later + * changeable with a user specified option. + * + * MySubscription->runasowner tells us whether we want to execute + * replication actions as the subscription owner; the last argument to + * TruncateGuts tells it whether we want to switch to the table owner. + * Those are exactly opposite conditions. + */ + ExecuteTruncateGuts(rels, + relids, + relids_logged, + DROP_RESTRICT, + restart_seqs, + !MySubscription->runasowner); + foreach(lc, remote_rels) + { + LogicalRepRelMapEntry *rel = lfirst(lc); + + logicalrep_rel_close(rel, NoLock); + } + foreach(lc, part_rels) + { + Relation rel = lfirst(lc); + + table_close(rel, NoLock); + } + + end_replication_step(); +} + + +/* + * Logical replication protocol message dispatcher. + */ +void +apply_dispatch(StringInfo s) +{ + LogicalRepMsgType action = pq_getmsgbyte(s); + LogicalRepMsgType saved_command; + + /* + * Set the current command being applied. Since this function can be + * called recursively when applying spooled changes, save the current + * command. + */ + saved_command = apply_error_callback_arg.command; + apply_error_callback_arg.command = action; + + switch (action) + { + case LOGICAL_REP_MSG_BEGIN: + apply_handle_begin(s); + break; + + case LOGICAL_REP_MSG_COMMIT: + apply_handle_commit(s); + break; + + case LOGICAL_REP_MSG_INSERT: + apply_handle_insert(s); + break; + + case LOGICAL_REP_MSG_UPDATE: + apply_handle_update(s); + break; + + case LOGICAL_REP_MSG_DELETE: + apply_handle_delete(s); + break; + + case LOGICAL_REP_MSG_TRUNCATE: + apply_handle_truncate(s); + break; + + case LOGICAL_REP_MSG_RELATION: + apply_handle_relation(s); + break; + + case LOGICAL_REP_MSG_TYPE: + apply_handle_type(s); + break; + + case LOGICAL_REP_MSG_ORIGIN: + apply_handle_origin(s); + break; + + case LOGICAL_REP_MSG_MESSAGE: + + /* + * Logical replication does not use generic logical messages yet. + * Although, it could be used by other applications that use this + * output plugin. 
+ */ + break; + + case LOGICAL_REP_MSG_STREAM_START: + apply_handle_stream_start(s); + break; + + case LOGICAL_REP_MSG_STREAM_STOP: + apply_handle_stream_stop(s); + break; + + case LOGICAL_REP_MSG_STREAM_ABORT: + apply_handle_stream_abort(s); + break; + + case LOGICAL_REP_MSG_STREAM_COMMIT: + apply_handle_stream_commit(s); + break; + + case LOGICAL_REP_MSG_BEGIN_PREPARE: + apply_handle_begin_prepare(s); + break; + + case LOGICAL_REP_MSG_PREPARE: + apply_handle_prepare(s); + break; + + case LOGICAL_REP_MSG_COMMIT_PREPARED: + apply_handle_commit_prepared(s); + break; + + case LOGICAL_REP_MSG_ROLLBACK_PREPARED: + apply_handle_rollback_prepared(s); + break; + + case LOGICAL_REP_MSG_STREAM_PREPARE: + apply_handle_stream_prepare(s); + break; + + default: + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("invalid logical replication message type \"??? (%d)\"", action))); + } + + /* Reset the current command */ + apply_error_callback_arg.command = saved_command; +} + +/* + * Figure out which write/flush positions to report to the walsender process. + * + * We can't simply report back the last LSN the walsender sent us because the + * local transaction might not yet be flushed to disk locally. Instead we + * build a list that associates local with remote LSNs for every commit. When + * reporting back the flush position to the sender we iterate that list and + * check which entries on it are already locally flushed. Those we can report + * as having been flushed. + * + * The have_pending_txes is true if there are outstanding transactions that + * need to be flushed. + */ +static void +get_flush_position(XLogRecPtr *write, XLogRecPtr *flush, + bool *have_pending_txes) +{ + dlist_mutable_iter iter; + XLogRecPtr local_flush = GetFlushRecPtr(NULL); + + *write = InvalidXLogRecPtr; + *flush = InvalidXLogRecPtr; + + dlist_foreach_modify(iter, &lsn_mapping) + { + FlushPosition *pos = + dlist_container(FlushPosition, node, iter.cur); + + *write = pos->remote_end; + + if (pos->local_end <= local_flush) + { + *flush = pos->remote_end; + dlist_delete(iter.cur); + pfree(pos); + } + else + { + /* + * Don't want to uselessly iterate over the rest of the list which + * could potentially be long. Instead get the last element and + * grab the write position from there. + */ + pos = dlist_tail_element(FlushPosition, node, + &lsn_mapping); + *write = pos->remote_end; + *have_pending_txes = true; + return; + } + } + + *have_pending_txes = !dlist_is_empty(&lsn_mapping); +} + +/* + * Store current remote/local lsn pair in the tracking list. + */ +void +store_flush_position(XLogRecPtr remote_lsn, XLogRecPtr local_lsn) +{ + FlushPosition *flushpos; + + /* + * Skip for parallel apply workers, because the lsn_mapping is maintained + * by the leader apply worker. + */ + if (am_parallel_apply_worker()) + return; + + /* Need to do this in permanent context */ + MemoryContextSwitchTo(ApplyContext); + + /* Track commit lsn */ + flushpos = (FlushPosition *) palloc(sizeof(FlushPosition)); + flushpos->local_end = local_lsn; + flushpos->remote_end = remote_lsn; + + dlist_push_tail(&lsn_mapping, &flushpos->node); + MemoryContextSwitchTo(ApplyMessageContext); +} + + +/* Update statistics of the worker. 
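+ * (A non-authoritative note: these fields back monitoring such as the + * pg_stat_subscription view; 'reply' marks keepalive messages, whose + * send timestamp is recorded as the reply time as well.)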
*/ +static void +UpdateWorkerStats(XLogRecPtr last_lsn, TimestampTz send_time, bool reply) +{ + MyLogicalRepWorker->last_lsn = last_lsn; + MyLogicalRepWorker->last_send_time = send_time; + MyLogicalRepWorker->last_recv_time = GetCurrentTimestamp(); + if (reply) + { + MyLogicalRepWorker->reply_lsn = last_lsn; + MyLogicalRepWorker->reply_time = send_time; + } +} + +/* + * Apply main loop. + */ +static void +LogicalRepApplyLoop(XLogRecPtr last_received) +{ + TimestampTz last_recv_timestamp = GetCurrentTimestamp(); + bool ping_sent = false; + TimeLineID tli; + ErrorContextCallback errcallback; + + /* + * Init the ApplyMessageContext which we clean up after each replication + * protocol message. + */ + ApplyMessageContext = AllocSetContextCreate(ApplyContext, + "ApplyMessageContext", + ALLOCSET_DEFAULT_SIZES); + + /* + * This memory context is used for per-stream data when the streaming mode + * is enabled. This context is reset on each stream stop. + */ + LogicalStreamingContext = AllocSetContextCreate(ApplyContext, + "LogicalStreamingContext", + ALLOCSET_DEFAULT_SIZES); + + /* mark as idle, before starting to loop */ + pgstat_report_activity(STATE_IDLE, NULL); + + /* + * Push apply error context callback. Fields will be filled while applying + * a change. + */ + errcallback.callback = apply_error_callback; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + apply_error_context_stack = error_context_stack; + + /* This outer loop iterates once per wait. */ + for (;;) + { + pgsocket fd = PGINVALID_SOCKET; + int rc; + int len; + char *buf = NULL; + bool endofstream = false; + long wait_time; + + CHECK_FOR_INTERRUPTS(); + + MemoryContextSwitchTo(ApplyMessageContext); + + len = walrcv_receive(LogRepWorkerWalRcvConn, &buf, &fd); + + if (len != 0) + { + /* Loop to process all available data (without blocking). */ + for (;;) + { + CHECK_FOR_INTERRUPTS(); + + if (len == 0) + { + break; + } + else if (len < 0) + { + ereport(LOG, + (errmsg("data stream from publisher has ended"))); + endofstream = true; + break; + } + else + { + int c; + StringInfoData s; + + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + + /* Reset timeout. */ + last_recv_timestamp = GetCurrentTimestamp(); + ping_sent = false; + + /* Ensure we are reading the data into our memory context. 
*/ + MemoryContextSwitchTo(ApplyMessageContext); + + s.data = buf; + s.len = len; + s.cursor = 0; + s.maxlen = -1; + + c = pq_getmsgbyte(&s); + + if (c == 'w') + { + XLogRecPtr start_lsn; + XLogRecPtr end_lsn; + TimestampTz send_time; + + start_lsn = pq_getmsgint64(&s); + end_lsn = pq_getmsgint64(&s); + send_time = pq_getmsgint64(&s); + + if (last_received < start_lsn) + last_received = start_lsn; + + if (last_received < end_lsn) + last_received = end_lsn; + + UpdateWorkerStats(last_received, send_time, false); + + apply_dispatch(&s); + } + else if (c == 'k') + { + XLogRecPtr end_lsn; + TimestampTz timestamp; + bool reply_requested; + + end_lsn = pq_getmsgint64(&s); + timestamp = pq_getmsgint64(&s); + reply_requested = pq_getmsgbyte(&s); + + if (last_received < end_lsn) + last_received = end_lsn; + + send_feedback(last_received, reply_requested, false); + UpdateWorkerStats(last_received, timestamp, true); + } + /* other message types are purposefully ignored */ + + MemoryContextReset(ApplyMessageContext); + } + + len = walrcv_receive(LogRepWorkerWalRcvConn, &buf, &fd); + } + } + + /* confirm all writes so far */ + send_feedback(last_received, false, false); + + if (!in_remote_transaction && !in_streamed_transaction) + { + /* + * If we didn't get any transactions for a while there might be + * unconsumed invalidation messages in the queue, consume them + * now. + */ + AcceptInvalidationMessages(); + maybe_reread_subscription(); + + /* Process any table synchronization changes. */ + process_syncing_tables(last_received); + } + + /* Cleanup the memory. */ + MemoryContextResetAndDeleteChildren(ApplyMessageContext); + MemoryContextSwitchTo(TopMemoryContext); + + /* Check if we need to exit the streaming loop. */ + if (endofstream) + break; + + /* + * Wait for more data or latch. If we have unflushed transactions, + * wake up after WalWriterDelay to see if they've been flushed yet (in + * which case we should send a feedback message). Otherwise, there's + * no particular urgency about waking up unless we get data or a + * signal. + */ + if (!dlist_is_empty(&lsn_mapping)) + wait_time = WalWriterDelay; + else + wait_time = NAPTIME_PER_CYCLE; + + rc = WaitLatchOrSocket(MyLatch, + WL_SOCKET_READABLE | WL_LATCH_SET | + WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + fd, wait_time, + WAIT_EVENT_LOGICAL_APPLY_MAIN); + + if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + CHECK_FOR_INTERRUPTS(); + } + + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + + if (rc & WL_TIMEOUT) + { + /* + * We didn't receive anything new. If we haven't heard anything + * from the server for more than wal_receiver_timeout / 2, ping + * the server. Also, if it's been longer than + * wal_receiver_status_interval since the last update we sent, + * send a status update to the primary anyway, to report any + * progress in applying WAL. + */ + bool requestReply = false; + + /* + * Check if time since last receive from primary has reached the + * configured limit. + */ + if (wal_receiver_timeout > 0) + { + TimestampTz now = GetCurrentTimestamp(); + TimestampTz timeout; + + timeout = + TimestampTzPlusMilliseconds(last_recv_timestamp, + wal_receiver_timeout); + + if (now >= timeout) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("terminating logical replication worker due to timeout"))); + + /* Check to see if it's time for a ping. 
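+ * (Worked example: with wal_receiver_timeout = 60s we request a reply + * once 30s pass without receiving anything, and the check above gives up + * with an ERROR at the full 60s mark.)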
*/ + if (!ping_sent) + { + timeout = TimestampTzPlusMilliseconds(last_recv_timestamp, + (wal_receiver_timeout / 2)); + if (now >= timeout) + { + requestReply = true; + ping_sent = true; + } + } + } + + send_feedback(last_received, requestReply, requestReply); + + /* + * Force reporting to ensure long idle periods don't lead to + * arbitrarily delayed stats. Stats can only be reported outside + * of (implicit or explicit) transactions. That shouldn't lead to + * stats being delayed for long, because transactions are either + * sent as a whole on commit or streamed. Streamed transactions + * are spilled to disk and applied on commit. + */ + if (!IsTransactionState()) + pgstat_report_stat(true); + } + } + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; + apply_error_context_stack = error_context_stack; + + /* All done */ + walrcv_endstreaming(LogRepWorkerWalRcvConn, &tli); +} + +/* + * Send a Standby Status Update message to server. + * + * 'recvpos' is the latest LSN we've received data to, force is set if we need + * to send a response to avoid timeouts. + */ +static void +send_feedback(XLogRecPtr recvpos, bool force, bool requestReply) +{ + static __thread StringInfo reply_message = NULL; + static __thread TimestampTz send_time = 0; + + static __thread XLogRecPtr last_recvpos = InvalidXLogRecPtr; + static __thread XLogRecPtr last_writepos = InvalidXLogRecPtr; + static __thread XLogRecPtr last_flushpos = InvalidXLogRecPtr; + + XLogRecPtr writepos; + XLogRecPtr flushpos; + TimestampTz now; + bool have_pending_txes; + + /* + * If the user doesn't want status to be reported to the publisher, be + * sure to exit before doing anything at all. + */ + if (!force && wal_receiver_status_interval <= 0) + return; + + /* It's legal to not pass a recvpos */ + if (recvpos < last_recvpos) + recvpos = last_recvpos; + + get_flush_position(&writepos, &flushpos, &have_pending_txes); + + /* + * No outstanding transactions to flush, we can report the latest received + * position. This is important for synchronous replication. 
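+ * + * (For example, a publisher-side backend committing with + * synchronous_commit waiting on this subscriber would otherwise keep + * waiting even though nothing remains to be applied locally.)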
+ */ + if (!have_pending_txes) + flushpos = writepos = recvpos; + + if (writepos < last_writepos) + writepos = last_writepos; + + if (flushpos < last_flushpos) + flushpos = last_flushpos; + + now = GetCurrentTimestamp(); + + /* if we've already reported everything we're good */ + if (!force && + writepos == last_writepos && + flushpos == last_flushpos && + !TimestampDifferenceExceeds(send_time, now, + wal_receiver_status_interval * 1000)) + return; + send_time = now; + + if (!reply_message) + { + MemoryContext oldctx = MemoryContextSwitchTo(ApplyContext); + + reply_message = makeStringInfo(); + MemoryContextSwitchTo(oldctx); + } + else + resetStringInfo(reply_message); + + pq_sendbyte(reply_message, 'r'); + pq_sendint64(reply_message, recvpos); /* write */ + pq_sendint64(reply_message, flushpos); /* flush */ + pq_sendint64(reply_message, writepos); /* apply */ + pq_sendint64(reply_message, now); /* sendTime */ + pq_sendbyte(reply_message, requestReply); /* replyRequested */ + + elog(DEBUG2, "sending feedback (force %d) to recv %X/%X, write %X/%X, flush %X/%X", + force, + LSN_FORMAT_ARGS(recvpos), + LSN_FORMAT_ARGS(writepos), + LSN_FORMAT_ARGS(flushpos)); + + walrcv_send(LogRepWorkerWalRcvConn, + reply_message->data, reply_message->len); + + if (recvpos > last_recvpos) + last_recvpos = recvpos; + if (writepos > last_writepos) + last_writepos = writepos; + if (flushpos > last_flushpos) + last_flushpos = flushpos; +} + +/* + * Exit routine for apply workers due to subscription parameter changes. + */ +static void +apply_worker_exit(void) +{ + if (am_parallel_apply_worker()) + { + /* + * Don't stop the parallel apply worker as the leader will detect the + * subscription parameter change and restart logical replication later + * anyway. This also prevents the leader from reporting errors when + * trying to communicate with a stopped parallel apply worker, which + * would accidentally disable subscriptions if disable_on_error was + * set. + */ + return; + } + + /* + * Reset the last-start time for this apply worker so that the launcher + * will restart it without waiting for wal_retrieve_retry_interval if the + * subscription is still active, and so that we won't leak that hash table + * entry if it isn't. + */ + if (!am_tablesync_worker()) + ApplyLauncherForgetWorkerStartTime(MyLogicalRepWorker->subid); + + proc_exit(0); +} + +/* + * Reread subscription info if needed. Most changes will cause the worker + * to exit. + */ +void +maybe_reread_subscription(void) +{ + MemoryContext oldctx; + Subscription *newsub; + bool started_tx = false; + + /* When cache state is valid there is nothing to do here. */ + if (MySubscriptionValid) + return; + + /* This function might be called inside or outside of a transaction. */ + if (!IsTransactionState()) + { + StartTransactionCommand(); + started_tx = true; + } + + /* Ensure allocations in permanent context. */ + oldctx = MemoryContextSwitchTo(ApplyContext); + + newsub = GetSubscription(MyLogicalRepWorker->subid, true); + + /* + * Exit if the subscription was removed. This normally should not happen + * as the worker gets killed during DROP SUBSCRIPTION.
+ */ + if (!newsub) + { + ereport(LOG, + (errmsg("logical replication worker for subscription \"%s\" will stop because the subscription was removed", + MySubscription->name))); + + /* Ensure we remove no-longer-useful entry for worker's start time */ + if (!am_tablesync_worker() && !am_parallel_apply_worker()) + ApplyLauncherForgetWorkerStartTime(MyLogicalRepWorker->subid); + proc_exit(0); + } + + /* Exit if the subscription was disabled. */ + if (!newsub->enabled) + { + ereport(LOG, + (errmsg("logical replication worker for subscription \"%s\" will stop because the subscription was disabled", + MySubscription->name))); + + apply_worker_exit(); + } + + /* !slotname should never happen when enabled is true. */ + Assert(newsub->slotname); + + /* two-phase should not be altered */ + Assert(newsub->twophasestate == MySubscription->twophasestate); + + /* + * Exit if any parameter that affects the remote connection was changed. + * The launcher will start a new worker but note that the parallel apply + * worker won't restart if the streaming option's value is changed from + * 'parallel' to any other value or the server decides not to stream the + * in-progress transaction. + */ + if (strcmp(newsub->conninfo, MySubscription->conninfo) != 0 || + strcmp(newsub->name, MySubscription->name) != 0 || + strcmp(newsub->slotname, MySubscription->slotname) != 0 || + newsub->binary != MySubscription->binary || + newsub->stream != MySubscription->stream || + newsub->passwordrequired != MySubscription->passwordrequired || + strcmp(newsub->origin, MySubscription->origin) != 0 || + newsub->owner != MySubscription->owner || + !equal(newsub->publications, MySubscription->publications)) + { + if (am_parallel_apply_worker()) + ereport(LOG, + (errmsg("logical replication parallel apply worker for subscription \"%s\" will stop because of a parameter change", + MySubscription->name))); + else + ereport(LOG, + (errmsg("logical replication worker for subscription \"%s\" will restart because of a parameter change", + MySubscription->name))); + + apply_worker_exit(); + } + + /* Also check for other changes that should never happen. */ + if (newsub->dbid != MySubscription->dbid) + { + elog(ERROR, "subscription %u changed unexpectedly", + MyLogicalRepWorker->subid); + } + + /* Clean old subscription info and switch to new one. */ + FreeSubscription(MySubscription); + MySubscription = newsub; + + MemoryContextSwitchTo(oldctx); + + /* Change synchronous commit according to the user's wishes */ + SetConfigOption("synchronous_commit", MySubscription->synccommit, + PGC_BACKEND, PGC_S_OVERRIDE); + + if (started_tx) + CommitTransactionCommand(); + + MySubscriptionValid = true; +} + +/* + * Callback from subscription syscache invalidation. + */ +static void +subscription_change_cb(Datum arg, int cacheid, uint32 hashvalue) +{ + MySubscriptionValid = false; +} + +/* + * subxact_info_write + * Store information about subxacts for a toplevel transaction. + * + * For each subxact we store the offset of its first change in the main file. + * The file is always over-written as a whole. + * + * XXX We should only store subxacts that were not aborted yet. + */ +static void +subxact_info_write(Oid subid, TransactionId xid) +{ + char path[MAXPGPATH]; + Size len; + BufFile *fd; + + Assert(TransactionIdIsValid(xid)); + + /* construct the subxact filename */ + subxact_filename(path, subid, xid); + + /* Delete the subxacts file, if it exists.
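+ * (On-disk layout sketch, as written below: the nsubxacts count followed + * by that many SubXactInfo entries, i.e. (xid, fileno, offset) triples; + * with zero subxacts we remove the file instead of writing an empty one.)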
*/ + if (subxact_data.nsubxacts == 0) + { + cleanup_subxact_info(); + BufFileDeleteFileSet(MyLogicalRepWorker->stream_fileset, path, true); + + return; + } + + /* + * Create the subxact file if it is not already created; otherwise open + * the existing file. + */ + fd = BufFileOpenFileSet(MyLogicalRepWorker->stream_fileset, path, O_RDWR, + true); + if (fd == NULL) + fd = BufFileCreateFileSet(MyLogicalRepWorker->stream_fileset, path); + + len = sizeof(SubXactInfo) * subxact_data.nsubxacts; + + /* Write the subxact count and subxact info */ + BufFileWrite(fd, &subxact_data.nsubxacts, sizeof(subxact_data.nsubxacts)); + BufFileWrite(fd, subxact_data.subxacts, len); + + BufFileClose(fd); + + /* free the memory allocated for subxact info */ + cleanup_subxact_info(); +} + +/* + * subxact_info_read + * Restore information about subxacts of a streamed transaction. + * + * Read information about subxacts into the structure subxact_data that can be + * used later. + */ +static void +subxact_info_read(Oid subid, TransactionId xid) +{ + char path[MAXPGPATH]; + Size len; + BufFile *fd; + MemoryContext oldctx; + + Assert(!subxact_data.subxacts); + Assert(subxact_data.nsubxacts == 0); + Assert(subxact_data.nsubxacts_max == 0); + + /* + * If the subxact file doesn't exist, that means we don't have any subxact + * info. + */ + subxact_filename(path, subid, xid); + fd = BufFileOpenFileSet(MyLogicalRepWorker->stream_fileset, path, O_RDONLY, + true); + if (fd == NULL) + return; + + /* read number of subxact items */ + BufFileReadExact(fd, &subxact_data.nsubxacts, sizeof(subxact_data.nsubxacts)); + + len = sizeof(SubXactInfo) * subxact_data.nsubxacts; + + /* we keep the maximum as a power of 2 */ + subxact_data.nsubxacts_max = 1 << my_log2(subxact_data.nsubxacts); + + /* + * Allocate subxact information in the logical streaming context. We need + * this information during the complete stream so that we can add the + * subtransaction info to it. On stream stop we will flush this information + * to the subxact file and reset the logical streaming context. + */ + oldctx = MemoryContextSwitchTo(LogicalStreamingContext); + subxact_data.subxacts = palloc(subxact_data.nsubxacts_max * + sizeof(SubXactInfo)); + MemoryContextSwitchTo(oldctx); + + if (len > 0) + BufFileReadExact(fd, subxact_data.subxacts, len); + + BufFileClose(fd); +} + +/* + * subxact_info_add + * Add information about a subxact (offset in the main file). + */ +static void +subxact_info_add(TransactionId xid) +{ + SubXactInfo *subxacts = subxact_data.subxacts; + int64 i; + + /* We must have a valid top level stream xid and a stream fd. */ + Assert(TransactionIdIsValid(stream_xid)); + Assert(stream_fd != NULL); + + /* + * If the XID matches the toplevel transaction, we don't want to add it. + */ + if (stream_xid == xid) + return; + + /* + * In most cases we're checking the same subxact as we've already seen in + * the last call, so make sure to ignore it (this change comes later). + */ + if (subxact_data.subxact_last == xid) + return; + + /* OK, remember we're processing this XID. */ + subxact_data.subxact_last = xid; + + /* + * Check if the transaction is already present in the array of subxacts. + * We intentionally scan the array from the tail, because we're likely + * adding a change for the most recent subtransactions. + * + * XXX Can we rely on the subxact XIDs arriving in sorted order? That + * would allow us to use binary search here.
+ */ + for (i = subxact_data.nsubxacts; i > 0; i--) + { + /* found, so we're done */ + if (subxacts[i - 1].xid == xid) + return; + } + + /* This is a new subxact, so we need to add it to the array. */ + if (subxact_data.nsubxacts == 0) + { + MemoryContext oldctx; + + subxact_data.nsubxacts_max = 128; + + /* + * Allocate this memory for subxacts in per-stream context, see + * subxact_info_read. + */ + oldctx = MemoryContextSwitchTo(LogicalStreamingContext); + subxacts = palloc(subxact_data.nsubxacts_max * sizeof(SubXactInfo)); + MemoryContextSwitchTo(oldctx); + } + else if (subxact_data.nsubxacts == subxact_data.nsubxacts_max) + { + subxact_data.nsubxacts_max *= 2; + subxacts = repalloc(subxacts, + subxact_data.nsubxacts_max * sizeof(SubXactInfo)); + } + + subxacts[subxact_data.nsubxacts].xid = xid; + + /* + * Get the current offset of the stream file and store it as offset of + * this subxact. + */ + BufFileTell(stream_fd, + &subxacts[subxact_data.nsubxacts].fileno, + &subxacts[subxact_data.nsubxacts].offset); + + subxact_data.nsubxacts++; + subxact_data.subxacts = subxacts; +} + +/* format filename for file containing the info about subxacts */ +static inline void +subxact_filename(char *path, Oid subid, TransactionId xid) +{ + snprintf(path, MAXPGPATH, "%u-%u.subxacts", subid, xid); +} + +/* format filename for file containing serialized changes */ +static inline void +changes_filename(char *path, Oid subid, TransactionId xid) +{ + snprintf(path, MAXPGPATH, "%u-%u.changes", subid, xid); +} + +/* + * stream_cleanup_files + * Cleanup files for a subscription / toplevel transaction. + * + * Remove files with serialized changes and subxact info for a particular + * toplevel transaction. Each subscription has a separate set of files + * for any toplevel transaction. + */ +void +stream_cleanup_files(Oid subid, TransactionId xid) +{ + char path[MAXPGPATH]; + + /* Delete the changes file. */ + changes_filename(path, subid, xid); + BufFileDeleteFileSet(MyLogicalRepWorker->stream_fileset, path, false); + + /* Delete the subxact file, if it exists. */ + subxact_filename(path, subid, xid); + BufFileDeleteFileSet(MyLogicalRepWorker->stream_fileset, path, true); +} + +/* + * stream_open_file + * Open a file that we'll use to serialize changes for a toplevel + * transaction. + * + * Open a file for streamed changes from a toplevel transaction identified + * by stream_xid (global variable). If it's the first chunk of streamed + * changes for this transaction, create the buffile, otherwise open the + * previously created file. + */ +static void +stream_open_file(Oid subid, TransactionId xid, bool first_segment) +{ + char path[MAXPGPATH]; + MemoryContext oldcxt; + + Assert(OidIsValid(subid)); + Assert(TransactionIdIsValid(xid)); + Assert(stream_fd == NULL); + + + changes_filename(path, subid, xid); + elog(DEBUG1, "opening file \"%s\" for streamed changes", path); + + /* + * Create/open the buffiles under the logical streaming context so that we + * have those files until stream stop. + */ + oldcxt = MemoryContextSwitchTo(LogicalStreamingContext); + + /* + * If this is the first streamed segment, create the changes file. + * Otherwise, just open the file for writing, in append mode. + */ + if (first_segment) + stream_fd = BufFileCreateFileSet(MyLogicalRepWorker->stream_fileset, + path); + else + { + /* + * Open the file and seek to the end of the file because we always + * append the changes file. 
stream_fd = BufFileOpenFileSet(MyLogicalRepWorker->stream_fileset, + path, O_RDWR, false); + BufFileSeek(stream_fd, 0, 0, SEEK_END); + } + + MemoryContextSwitchTo(oldcxt); +} + +/* + * stream_close_file + * Close the currently open file with streamed changes. + */ +static void +stream_close_file(void) +{ + Assert(stream_fd != NULL); + + BufFileClose(stream_fd); + + stream_fd = NULL; +} + +/* + * stream_write_change + * Serialize a change to a file for the current toplevel transaction. + * + * The change is serialized in a simple format, with the length (not + * including the length field itself), the action code (identifying the + * message type) and the message contents (without the subxact + * TransactionId value). + */ +static void +stream_write_change(char action, StringInfo s) +{ + int len; + + Assert(stream_fd != NULL); + + /* total on-disk size, including the action type character */ + len = (s->len - s->cursor) + sizeof(char); + + /* first write the size */ + BufFileWrite(stream_fd, &len, sizeof(len)); + + /* then the action */ + BufFileWrite(stream_fd, &action, sizeof(action)); + + /* and finally the remaining part of the buffer (after the XID) */ + len = (s->len - s->cursor); + + BufFileWrite(stream_fd, &s->data[s->cursor], len); +}
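+ +/* + * Illustrative only (not part of the upstream sources): each record written + * above is laid out on disk as + * + * int len payload size plus one byte for the action + * char action logical replication message type, e.g. 'I' + * char data[len - 1] message contents, minus the subxact XID + * + * so a reader consumes sizeof(int) bytes of length, one action byte, and + * then len - 1 bytes of payload. + */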
+ +/* + * stream_open_and_write_change + * Serialize a message to a file for the given transaction. + * + * This function is similar to stream_write_change except that it will open + * the target file if it is not already open before writing the message, and + * close the file at the end. + */ +static void +stream_open_and_write_change(TransactionId xid, char action, StringInfo s) +{ + Assert(!in_streamed_transaction); + + if (!stream_fd) + stream_start_internal(xid, false); + + stream_write_change(action, s); + stream_stop_internal(xid); +} + +/* + * Clean up the memory for subxacts and reset the related variables. + */ +static inline void +cleanup_subxact_info() +{ + if (subxact_data.subxacts) + pfree(subxact_data.subxacts); + + subxact_data.subxacts = NULL; + subxact_data.subxact_last = InvalidTransactionId; + subxact_data.nsubxacts = 0; + subxact_data.nsubxacts_max = 0; +} + +/* + * Form the prepared transaction GID for two_phase transactions. + * + * Return the GID in the supplied buffer. + */ +static void +TwoPhaseTransactionGid(Oid subid, TransactionId xid, char *gid, int szgid) +{ + Assert(subid != InvalidRepOriginId); + + if (!TransactionIdIsValid(xid)) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("invalid two-phase transaction ID"))); + + snprintf(gid, szgid, "pg_gid_%u_%u", subid, xid); +} + +/* + * Execute the initial sync with error handling. Disable the subscription if + * it's required. + * + * Allocate the slot name in long-lived context on return. Note that we don't + * handle FATAL errors, which are probably due to system resource errors and + * are not repeatable. + */ +static void +start_table_sync(XLogRecPtr *origin_startpos, char **myslotname) +{ + char *syncslotname = NULL; + + Assert(am_tablesync_worker()); + + PG_TRY(); + { + /* Call initial sync. */ + syncslotname = LogicalRepSyncTableStart(origin_startpos); + } + PG_CATCH(); + { + if (MySubscription->disableonerr) + DisableSubscriptionAndExit(); + else + { + /* + * Report the worker failed during table synchronization. Abort + * the current transaction so that the stats message is sent in an + * idle state. + */ + AbortOutOfAnyTransaction(); + pgstat_report_subscription_error(MySubscription->oid, false); + + PG_RE_THROW(); + } + } + PG_END_TRY(); + + /* allocate slot name in long-lived context */ + *myslotname = MemoryContextStrdup(ApplyContext, syncslotname); + pfree(syncslotname); +} + +/* + * Run the apply loop with error handling. Disable the subscription, + * if necessary. + * + * Note that we don't handle FATAL errors, which are probably due to + * system resource errors and are not repeatable. + */ +static void +start_apply(XLogRecPtr origin_startpos) +{ + PG_TRY(); + { + LogicalRepApplyLoop(origin_startpos); + } + PG_CATCH(); + { + if (MySubscription->disableonerr) + DisableSubscriptionAndExit(); + else + { + /* + * Report the worker failed while applying changes. Abort the + * current transaction so that the stats message is sent in an + * idle state. + */ + AbortOutOfAnyTransaction(); + pgstat_report_subscription_error(MySubscription->oid, !am_tablesync_worker()); + + PG_RE_THROW(); + } + } + PG_END_TRY(); +} + +/* + * Common initialization for leader apply worker and parallel apply worker. + * + * Initialize the database connection, in-memory subscription and necessary + * config options. + */ +void +InitializeApplyWorker(void) +{ + MemoryContext oldctx; + + /* Run as replica session replication role. */ + SetConfigOption("session_replication_role", "replica", + PGC_SUSET, PGC_S_OVERRIDE); + + /* Connect to our database. */ + BackgroundWorkerInitializeConnectionByOid(MyLogicalRepWorker->dbid, + MyLogicalRepWorker->userid, + 0); + + /* + * Set always-secure search path, so malicious users can't redirect user + * code (e.g. pg_index.indexprs). + */ + SetConfigOption("search_path", "", PGC_SUSET, PGC_S_OVERRIDE); + + /* Load the subscription into persistent memory context. */ + ApplyContext = AllocSetContextCreate(TopMemoryContext, + "ApplyContext", + ALLOCSET_DEFAULT_SIZES); + StartTransactionCommand(); + oldctx = MemoryContextSwitchTo(ApplyContext); + + MySubscription = GetSubscription(MyLogicalRepWorker->subid, true); + if (!MySubscription) + { + ereport(LOG, + (errmsg("logical replication worker for subscription %u will not start because the subscription was removed during startup", + MyLogicalRepWorker->subid))); + + /* Ensure we remove no-longer-useful entry for worker's start time */ + if (!am_tablesync_worker() && !am_parallel_apply_worker()) + ApplyLauncherForgetWorkerStartTime(MyLogicalRepWorker->subid); + proc_exit(0); + } + + MySubscriptionValid = true; + MemoryContextSwitchTo(oldctx); + + if (!MySubscription->enabled) + { + ereport(LOG, + (errmsg("logical replication worker for subscription \"%s\" will not start because the subscription was disabled during startup", + MySubscription->name))); + + apply_worker_exit(); + } + + /* Setup synchronous commit according to the user's wishes */ + SetConfigOption("synchronous_commit", MySubscription->synccommit, + PGC_BACKEND, PGC_S_OVERRIDE); + + /* Keep us informed about subscription changes. */
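+/* + * (Illustrative note, not part of the upstream sources: subscription_change_cb + * is defined elsewhere in this file; on a pg_subscription syscache + * invalidation it is expected simply to mark the cached MySubscription as + * stale so the main apply loop reloads it, since doing real work from a + * syscache callback would be unsafe.) + */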
CacheRegisterSyscacheCallback(SUBSCRIPTIONOID, + subscription_change_cb, + (Datum) 0); + + if (am_tablesync_worker()) + ereport(LOG, + (errmsg("logical replication table synchronization worker for subscription \"%s\", table \"%s\" has started", + MySubscription->name, + get_rel_name(MyLogicalRepWorker->relid)))); + else + ereport(LOG, + (errmsg("logical replication apply worker for subscription \"%s\" has started", + MySubscription->name))); + + CommitTransactionCommand(); +} + +/* Logical Replication Apply worker entry point */ +void +ApplyWorkerMain(Datum main_arg) +{ + int worker_slot = DatumGetInt32(main_arg); + char originname[NAMEDATALEN]; + XLogRecPtr origin_startpos = InvalidXLogRecPtr; + char *myslotname = NULL; + WalRcvStreamOptions options; + int server_version; + + InitializingApplyWorker = true; + + /* Attach to slot */ + logicalrep_worker_attach(worker_slot); + + /* Setup signal handling */ + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGTERM, die); + BackgroundWorkerUnblockSignals(); + + /* + * We don't currently need any ResourceOwner in a walreceiver process, but + * if we did, we could call CreateAuxProcessResourceOwner here. + */ + + /* Initialise stats to a sane-ish value */ + MyLogicalRepWorker->last_send_time = MyLogicalRepWorker->last_recv_time = + MyLogicalRepWorker->reply_time = GetCurrentTimestamp(); + + /* Load the libpq-specific functions */ + load_file("libpqwalreceiver", false); + + InitializeApplyWorker(); + + InitializingApplyWorker = false; + + /* Connect to the origin and start the replication. */ + elog(DEBUG1, "connecting to publisher using connection string \"%s\"", + MySubscription->conninfo); + + if (am_tablesync_worker()) + { + start_table_sync(&origin_startpos, &myslotname); + + ReplicationOriginNameForLogicalRep(MySubscription->oid, + MyLogicalRepWorker->relid, + originname, + sizeof(originname)); + set_apply_error_context_origin(originname); + } + else + { + /* This is the leader apply worker */ + RepOriginId originid; + TimeLineID startpointTLI; + char *err; + bool must_use_password; + + myslotname = MySubscription->slotname; + + /* + * This shouldn't happen if the subscription is enabled, but guard + * against DDL bugs or manual catalog changes. (libpqwalreceiver will + * crash if slot is NULL.) + */ + if (!myslotname) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("subscription has no replication slot set"))); + + /* Setup replication origin tracking. */ + StartTransactionCommand(); + ReplicationOriginNameForLogicalRep(MySubscription->oid, InvalidOid, + originname, sizeof(originname)); + originid = replorigin_by_name(originname, true); + if (!OidIsValid(originid)) + originid = replorigin_create(originname); + replorigin_session_setup(originid, 0); + replorigin_session_origin = originid; + origin_startpos = replorigin_session_get_progress(false); + + /* Is the use of a password mandatory? */ + must_use_password = MySubscription->passwordrequired && + !superuser_arg(MySubscription->owner); + + /* Note that the superuser_arg call can access the DB */ + CommitTransactionCommand(); + + LogRepWorkerWalRcvConn = walrcv_connect(MySubscription->conninfo, true, + must_use_password, + MySubscription->name, &err); + if (LogRepWorkerWalRcvConn == NULL) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not connect to the publisher: %s", err))); + + /* + * We don't really use the output of IDENTIFY_SYSTEM for anything, but + * it does some initialization on the upstream, so let's still call it. + */ + (void) walrcv_identify_system(LogRepWorkerWalRcvConn, &startpointTLI); + + set_apply_error_context_origin(originname); + } + + /* + * Setup callback for syscache so that we know when something changes in + * the subscription relation state. + */ + CacheRegisterSyscacheCallback(SUBSCRIPTIONRELMAP, + invalidate_syncing_table_states, + (Datum) 0); + + /* Build logical replication streaming options. */ + options.logical = true; + options.startpoint = origin_startpos; + options.slotname = myslotname; + + server_version = walrcv_server_version(LogRepWorkerWalRcvConn); + options.proto.logical.proto_version = + server_version >= 160000 ? LOGICALREP_PROTO_STREAM_PARALLEL_VERSION_NUM : + server_version >= 150000 ? LOGICALREP_PROTO_TWOPHASE_VERSION_NUM : + server_version >= 140000 ? LOGICALREP_PROTO_STREAM_VERSION_NUM : + LOGICALREP_PROTO_VERSION_NUM;
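+ +/* + * Illustrative only (not part of the upstream sources): the ladder above + * picks the newest protocol version both sides understand. For example, a + * publisher reporting server_version 150004 (15.4) yields + * LOGICALREP_PROTO_TWOPHASE_VERSION_NUM, while a 16.x publisher yields + * LOGICALREP_PROTO_STREAM_PARALLEL_VERSION_NUM, which is what makes the + * "parallel" streaming mode below possible. + */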
+ + options.proto.logical.publication_names = MySubscription->publications; + options.proto.logical.binary = MySubscription->binary; + + /* + * Assign the appropriate value for the streaming option according to the + * 'streaming' mode and the publisher's ability to support that mode. + */ + if (server_version >= 160000 && + MySubscription->stream == LOGICALREP_STREAM_PARALLEL) + { + options.proto.logical.streaming_str = "parallel"; + MyLogicalRepWorker->parallel_apply = true; + } + else if (server_version >= 140000 && + MySubscription->stream != LOGICALREP_STREAM_OFF) + { + options.proto.logical.streaming_str = "on"; + MyLogicalRepWorker->parallel_apply = false; + } + else + { + options.proto.logical.streaming_str = NULL; + MyLogicalRepWorker->parallel_apply = false; + } + + options.proto.logical.twophase = false; + options.proto.logical.origin = pstrdup(MySubscription->origin); + + if (!am_tablesync_worker()) + { + /* + * Even when the two_phase mode is requested by the user, it remains + * as the tri-state PENDING until all tablesyncs have reached READY + * state. Only then can it become ENABLED. + * + * Note: If the subscription has no tables then leave the state as + * PENDING, which allows ALTER SUBSCRIPTION ... REFRESH PUBLICATION to + * work. + */ + if (MySubscription->twophasestate == LOGICALREP_TWOPHASE_STATE_PENDING && + AllTablesyncsReady()) + { + /* Start streaming with two_phase enabled */ + options.proto.logical.twophase = true; + walrcv_startstreaming(LogRepWorkerWalRcvConn, &options); + + StartTransactionCommand(); + UpdateTwoPhaseState(MySubscription->oid, LOGICALREP_TWOPHASE_STATE_ENABLED); + MySubscription->twophasestate = LOGICALREP_TWOPHASE_STATE_ENABLED; + CommitTransactionCommand(); + } + else + { + walrcv_startstreaming(LogRepWorkerWalRcvConn, &options); + } + + ereport(DEBUG1, + (errmsg_internal("logical replication apply worker for subscription \"%s\" two_phase is %s", + MySubscription->name, + MySubscription->twophasestate == LOGICALREP_TWOPHASE_STATE_DISABLED ? "DISABLED" : + MySubscription->twophasestate == LOGICALREP_TWOPHASE_STATE_PENDING ? "PENDING" : + MySubscription->twophasestate == LOGICALREP_TWOPHASE_STATE_ENABLED ? "ENABLED" : + "?"))); + } + else + { + /* Start normal logical streaming replication. */ + walrcv_startstreaming(LogRepWorkerWalRcvConn, &options); + }
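+ +/* + * Illustrative only (not part of the upstream sources): the streaming + * negotiation above reduces to + * + * publisher >= 16 and streaming = parallel -> "parallel" + * publisher >= 14 and streaming = on/parallel -> "on" + * otherwise -> option omitted + * + * e.g. streaming = parallel against a v14 publisher silently degrades to + * single-process streaming ("on") rather than failing. + */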
"DISABLED" : + MySubscription->twophasestate == LOGICALREP_TWOPHASE_STATE_PENDING ? "PENDING" : + MySubscription->twophasestate == LOGICALREP_TWOPHASE_STATE_ENABLED ? "ENABLED" : + "?"))); + } + else + { + /* Start normal logical streaming replication. */ + walrcv_startstreaming(LogRepWorkerWalRcvConn, &options); + } + + /* Run the main loop. */ + start_apply(origin_startpos); + + proc_exit(0); +} + +/* + * After error recovery, disable the subscription in a new transaction + * and exit cleanly. + */ +static void +DisableSubscriptionAndExit(void) +{ + /* + * Emit the error message, and recover from the error state to an idle + * state + */ + HOLD_INTERRUPTS(); + + EmitErrorReport(); + AbortOutOfAnyTransaction(); + FlushErrorState(); + + RESUME_INTERRUPTS(); + + /* Report the worker failed during either table synchronization or apply */ + pgstat_report_subscription_error(MyLogicalRepWorker->subid, + !am_tablesync_worker()); + + /* Disable the subscription */ + StartTransactionCommand(); + DisableSubscription(MySubscription->oid); + CommitTransactionCommand(); + + /* Ensure we remove no-longer-useful entry for worker's start time */ + if (!am_tablesync_worker() && !am_parallel_apply_worker()) + ApplyLauncherForgetWorkerStartTime(MyLogicalRepWorker->subid); + + /* Notify the subscription has been disabled and exit */ + ereport(LOG, + errmsg("subscription \"%s\" has been disabled because of an error", + MySubscription->name)); + + proc_exit(0); +} + +/* + * Is current process a logical replication worker? + */ +bool +IsLogicalWorker(void) +{ + return MyLogicalRepWorker != NULL; +} + +/* + * Is current process a logical replication parallel apply worker? + */ +bool +IsLogicalParallelApplyWorker(void) +{ + return IsLogicalWorker() && am_parallel_apply_worker(); +} + +/* + * Start skipping changes of the transaction if the given LSN matches the + * LSN specified by subscription's skiplsn. + */ +static void +maybe_start_skipping_changes(XLogRecPtr finish_lsn) +{ + Assert(!is_skipping_changes()); + Assert(!in_remote_transaction); + Assert(!in_streamed_transaction); + + /* + * Quick return if it's not requested to skip this transaction. This + * function is called for every remote transaction and we assume that + * skipping the transaction is not used often. + */ + if (likely(XLogRecPtrIsInvalid(MySubscription->skiplsn) || + MySubscription->skiplsn != finish_lsn)) + return; + + /* Start skipping all changes of this transaction */ + skip_xact_finish_lsn = finish_lsn; + + ereport(LOG, + errmsg("logical replication starts skipping transaction at LSN %X/%X", + LSN_FORMAT_ARGS(skip_xact_finish_lsn))); +} + +/* + * Stop skipping changes by resetting skip_xact_finish_lsn if enabled. + */ +static void +stop_skipping_changes(void) +{ + if (!is_skipping_changes()) + return; + + ereport(LOG, + (errmsg("logical replication completed skipping transaction at LSN %X/%X", + LSN_FORMAT_ARGS(skip_xact_finish_lsn)))); + + /* Stop skipping changes */ + skip_xact_finish_lsn = InvalidXLogRecPtr; +} + +/* + * Clear subskiplsn of pg_subscription catalog. + * + * finish_lsn is the transaction's finish LSN that is used to check if the + * subskiplsn matches it. If not matched, we raise a warning when clearing the + * subskiplsn in order to inform users for cases e.g., where the user mistakenly + * specified the wrong subskiplsn. 
+ */ +static void +clear_subscription_skip_lsn(XLogRecPtr finish_lsn) +{ + Relation rel; + Form_pg_subscription subform; + HeapTuple tup; + XLogRecPtr myskiplsn = MySubscription->skiplsn; + bool started_tx = false; + + if (likely(XLogRecPtrIsInvalid(myskiplsn)) || am_parallel_apply_worker()) + return; + + if (!IsTransactionState()) + { + StartTransactionCommand(); + started_tx = true; + } + + /* + * Protect subskiplsn of pg_subscription from being concurrently updated + * while clearing it. + */ + LockSharedObject(SubscriptionRelationId, MySubscription->oid, 0, + AccessShareLock); + + rel = table_open(SubscriptionRelationId, RowExclusiveLock); + + /* Fetch the existing tuple. */ + tup = SearchSysCacheCopy1(SUBSCRIPTIONOID, + ObjectIdGetDatum(MySubscription->oid)); + + if (!HeapTupleIsValid(tup)) + elog(ERROR, "subscription \"%s\" does not exist", MySubscription->name); + + subform = (Form_pg_subscription) GETSTRUCT(tup); + + /* + * Clear the subskiplsn. If the user has already changed subskiplsn before + * clearing it we don't update the catalog and the replication origin + * state won't get advanced. So in the worst case, if the server crashes + * before sending an acknowledgment of the flush position the transaction + * will be sent again and the user needs to set subskiplsn again. We can + * reduce the possibility by logging a replication origin WAL record to + * advance the origin LSN instead but there is no way to advance the + * origin timestamp and it doesn't seem to be worth doing anything about + * it since it's a very rare case. + */ + if (subform->subskiplsn == myskiplsn) + { + bool nulls[Natts_pg_subscription]; + bool replaces[Natts_pg_subscription]; + Datum values[Natts_pg_subscription]; + + memset(values, 0, sizeof(values)); + memset(nulls, false, sizeof(nulls)); + memset(replaces, false, sizeof(replaces)); + + /* reset subskiplsn */ + values[Anum_pg_subscription_subskiplsn - 1] = LSNGetDatum(InvalidXLogRecPtr); + replaces[Anum_pg_subscription_subskiplsn - 1] = true; + + tup = heap_modify_tuple(tup, RelationGetDescr(rel), values, nulls, + replaces); + CatalogTupleUpdate(rel, &tup->t_self, tup); + + if (myskiplsn != finish_lsn) + ereport(WARNING, + errmsg("skip-LSN of subscription \"%s\" cleared", MySubscription->name), + errdetail("Remote transaction's finish WAL location (LSN) %X/%X did not match skip-LSN %X/%X.", + LSN_FORMAT_ARGS(finish_lsn), + LSN_FORMAT_ARGS(myskiplsn))); + } + + heap_freetuple(tup); + table_close(rel, NoLock); + + if (started_tx) + CommitTransactionCommand(); +} + +/* Error callback to give more context info about the change being applied */ +void +apply_error_callback(void *arg) +{ + ApplyErrorCallbackArg *errarg = &apply_error_callback_arg; + + if (apply_error_callback_arg.command == 0) + return; + + Assert(errarg->origin_name); + + if (errarg->rel == NULL) + { + if (!TransactionIdIsValid(errarg->remote_xid)) + errcontext("processing remote data for replication origin \"%s\" during message type \"%s\"", + errarg->origin_name, + logicalrep_message_type(errarg->command)); + else if (XLogRecPtrIsInvalid(errarg->finish_lsn)) + errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" in transaction %u", + errarg->origin_name, + logicalrep_message_type(errarg->command), + errarg->remote_xid); + else + errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" in transaction %u, finished at %X/%X", + errarg->origin_name, + logicalrep_message_type(errarg->command), + errarg->remote_xid, + 
LSN_FORMAT_ARGS(errarg->finish_lsn)); + } + else + { + if (errarg->remote_attnum < 0) + { + if (XLogRecPtrIsInvalid(errarg->finish_lsn)) + errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" for replication target relation \"%s.%s\" in transaction %u", + errarg->origin_name, + logicalrep_message_type(errarg->command), + errarg->rel->remoterel.nspname, + errarg->rel->remoterel.relname, + errarg->remote_xid); + else + errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" for replication target relation \"%s.%s\" in transaction %u, finished at %X/%X", + errarg->origin_name, + logicalrep_message_type(errarg->command), + errarg->rel->remoterel.nspname, + errarg->rel->remoterel.relname, + errarg->remote_xid, + LSN_FORMAT_ARGS(errarg->finish_lsn)); + } + else + { + if (XLogRecPtrIsInvalid(errarg->finish_lsn)) + errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" for replication target relation \"%s.%s\" column \"%s\" in transaction %u", + errarg->origin_name, + logicalrep_message_type(errarg->command), + errarg->rel->remoterel.nspname, + errarg->rel->remoterel.relname, + errarg->rel->remoterel.attnames[errarg->remote_attnum], + errarg->remote_xid); + else + errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" for replication target relation \"%s.%s\" column \"%s\" in transaction %u, finished at %X/%X", + errarg->origin_name, + logicalrep_message_type(errarg->command), + errarg->rel->remoterel.nspname, + errarg->rel->remoterel.relname, + errarg->rel->remoterel.attnames[errarg->remote_attnum], + errarg->remote_xid, + LSN_FORMAT_ARGS(errarg->finish_lsn)); + } + } +} + +/* Set transaction information of apply error callback */ +static inline void +set_apply_error_context_xact(TransactionId xid, XLogRecPtr lsn) +{ + apply_error_callback_arg.remote_xid = xid; + apply_error_callback_arg.finish_lsn = lsn; +} + +/* Reset all information of apply error callback */ +static inline void +reset_apply_error_context_info(void) +{ + apply_error_callback_arg.command = 0; + apply_error_callback_arg.rel = NULL; + apply_error_callback_arg.remote_attnum = -1; + set_apply_error_context_xact(InvalidTransactionId, InvalidXLogRecPtr); +} + +/* + * Request wakeup of the workers for the given subscription OID + * at commit of the current transaction. + * + * This is used to ensure that the workers process assorted changes + * as soon as possible. + */ +void +LogicalRepWorkersWakeupAtCommit(Oid subid) +{ + MemoryContext oldcxt; + + oldcxt = MemoryContextSwitchTo(TopTransactionContext); + on_commit_wakeup_workers_subids = + list_append_unique_oid(on_commit_wakeup_workers_subids, subid); + MemoryContextSwitchTo(oldcxt); +} + +/* + * Wake up the workers of any subscriptions that were changed in this xact. + */ +void +AtEOXact_LogicalRepWorkers(bool isCommit) +{ + if (isCommit && on_commit_wakeup_workers_subids != NIL) + { + ListCell *lc; + + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + foreach(lc, on_commit_wakeup_workers_subids) + { + Oid subid = lfirst_oid(lc); + List *workers; + ListCell *lc2; + + workers = logicalrep_workers_find(subid, true); + foreach(lc2, workers) + { + LogicalRepWorker *worker = (LogicalRepWorker *) lfirst(lc2); + + logicalrep_worker_wakeup_ptr(worker); + } + } + LWLockRelease(LogicalRepWorkerLock); + } + + /* The List storage will be reclaimed automatically in xact cleanup. 
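+ * (Illustrative note, not part of the upstream sources: the list was + * allocated in TopTransactionContext by LogicalRepWorkersWakeupAtCommit + * above, and that context goes away at end of transaction, which is why + * resetting the pointer below is all that is needed.)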
*/ + on_commit_wakeup_workers_subids = NIL; +} + +/* + * Allocate the origin name in long-lived context for error context message. + */ +void +set_apply_error_context_origin(char *originname) +{ + apply_error_callback_arg.origin_name = MemoryContextStrdup(ApplyContext, + originname); +} + +/* + * Return the action to be taken for the given transaction. See + * TransApplyAction for information on each of the actions. + * + * *winfo is assigned to the destination parallel worker info when the leader + * apply worker has to pass all the transaction's changes to the parallel + * apply worker. + */ +static TransApplyAction +get_transaction_apply_action(TransactionId xid, ParallelApplyWorkerInfo **winfo) +{ + *winfo = NULL; + + if (am_parallel_apply_worker()) + { + return TRANS_PARALLEL_APPLY; + } + + /* + * If we are processing this transaction using a parallel apply worker, we + * either send the changes to that worker or, if the worker is busy, + * serialize the changes to a file that the parallel worker will process + * later. + */ + *winfo = pa_find_worker(xid); + + if (*winfo && (*winfo)->serialize_changes) + { + return TRANS_LEADER_PARTIAL_SERIALIZE; + } + else if (*winfo) + { + return TRANS_LEADER_SEND_TO_PARALLEL; + } + + /* + * If no parallel worker is involved in processing this transaction, we + * either apply the change directly or serialize it to a file, to be + * applied later when the transaction finish message is processed. + */ + else if (in_streamed_transaction) + { + return TRANS_LEADER_SERIALIZE; + } + else + { + return TRANS_LEADER_APPLY; + } +}
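+ +/* + * Illustrative only (not part of the upstream sources): the decision above + * collapses to + * + * in a parallel apply worker -> TRANS_PARALLEL_APPLY + * leader, worker found, serializing -> TRANS_LEADER_PARTIAL_SERIALIZE + * leader, worker found -> TRANS_LEADER_SEND_TO_PARALLEL + * leader, streamed xact, no worker -> TRANS_LEADER_SERIALIZE + * leader, non-streamed xact -> TRANS_LEADER_APPLY + */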
