Skip to content

Commit cf14d6a

Browse files
author
Commitfest Bot
committed
[CF 5734] v3 - Improve logicalrep_worker_launch() logic
This branch was automatically generated by a robot using patches from an email thread registered at: https://commitfest.postgresql.org/patch/5734 The branch will be overwritten each time a new patch version is posted to the thread, and also periodically to check for bitrot caused by changes on the master branch. Patch(es): https://www.postgresql.org/message-id/[email protected] Author(s): Fujii Masao
2 parents 2782f3b + 815bdde commit cf14d6a

File tree

1 file changed

+42
-36
lines changed

1 file changed

+42
-36
lines changed

src/backend/replication/logical/launcher.c

+42-36
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ static void logicalrep_launcher_onexit(int code, Datum arg);
9696
static void logicalrep_worker_onexit(int code, Datum arg);
9797
static void logicalrep_worker_detach(void);
9898
static void logicalrep_worker_cleanup(LogicalRepWorker *worker);
99-
static int logicalrep_pa_worker_count(Oid subid);
99+
static void logicalrep_worker_count(Oid subid, int *nsync, int *nparallelapply);
100100
static void logicalrep_launcher_attach_dshmem(void);
101101
static void ApplyLauncherSetWorkerStartTime(Oid subid, TimestampTz start_time);
102102
static TimestampTz ApplyLauncherGetWorkerStartTime(Oid subid);
@@ -336,7 +336,6 @@ logicalrep_worker_launch(LogicalRepWorkerType wtype,
336336
*/
337337
LWLockAcquire(LogicalRepWorkerLock, LW_EXCLUSIVE);
338338

339-
retry:
340339
/* Find unused worker slot. */
341340
for (i = 0; i < max_logical_replication_workers; i++)
342341
{
@@ -350,16 +349,21 @@ logicalrep_worker_launch(LogicalRepWorkerType wtype,
350349
}
351350
}
352351

353-
nsyncworkers = logicalrep_sync_worker_count(subid);
352+
logicalrep_worker_count(subid, &nsyncworkers, &nparallelapplyworkers);
354353

355354
now = GetCurrentTimestamp();
356355

357356
/*
358-
* If we didn't find a free slot, try to do garbage collection. The
359-
* reason we do this is because if some worker failed to start up and its
360-
* parent has crashed while waiting, the in_use state was never cleared.
357+
* If we can't start a new logical replication background worker because
358+
* no free slot is available, or because the number of sync workers or
359+
* parallel apply workers has reached the limit per subscription, try
360+
* running garbage collection. The reason we do this is because if some
361+
* workers failed to start up and their parent has crashed while waiting,
362+
* the in_use state was never cleared. By freeing up these stale worker
363+
* slots, we may be able to start a new worker.
361364
*/
362-
if (worker == NULL || nsyncworkers >= max_sync_workers_per_subscription)
365+
if (worker == NULL || nsyncworkers >= max_sync_workers_per_subscription ||
366+
nparallelapplyworkers >= max_parallel_apply_workers_per_subscription)
363367
{
364368
bool did_cleanup = false;
365369

@@ -381,11 +385,21 @@ logicalrep_worker_launch(LogicalRepWorkerType wtype,
381385

382386
logicalrep_worker_cleanup(w);
383387
did_cleanup = true;
388+
389+
if (worker == NULL)
390+
{
391+
worker = w;
392+
slot = i;
393+
}
384394
}
385395
}
386396

397+
/*
398+
* Count the current number of sync and parallel apply workers again,
399+
* since garbage collection may have changed it.
400+
*/
387401
if (did_cleanup)
388-
goto retry;
402+
logicalrep_worker_count(subid, &nsyncworkers, &nparallelapplyworkers);
389403
}
390404

391405
/*
@@ -399,8 +413,6 @@ logicalrep_worker_launch(LogicalRepWorkerType wtype,
399413
return false;
400414
}
401415

402-
nparallelapplyworkers = logicalrep_pa_worker_count(subid);
403-
404416
/*
405417
* Return false if the number of parallel apply workers reached the limit
406418
* per subscription.
@@ -844,48 +856,42 @@ logicalrep_worker_onexit(int code, Datum arg)
844856
int
845857
logicalrep_sync_worker_count(Oid subid)
846858
{
847-
int i;
848859
int res = 0;
849860

850-
Assert(LWLockHeldByMe(LogicalRepWorkerLock));
851-
852-
/* Search for attached worker for a given subscription id. */
853-
for (i = 0; i < max_logical_replication_workers; i++)
854-
{
855-
LogicalRepWorker *w = &LogicalRepCtx->workers[i];
856-
857-
if (isTablesyncWorker(w) && w->subid == subid)
858-
res++;
859-
}
860-
861+
logicalrep_worker_count(subid, &res, NULL);
861862
return res;
862863
}
863864

864865
/*
865-
* Count the number of registered (but not necessarily running) parallel apply
866-
* workers for a subscription.
866+
* Count the number of registered (but not necessarily running) sync workers
867+
* and parallel apply workers for a subscription.
867868
*/
868-
static int
869-
logicalrep_pa_worker_count(Oid subid)
869+
static void
870+
logicalrep_worker_count(Oid subid, int *nsync, int *nparallelapply)
870871
{
871-
int i;
872-
int res = 0;
873-
874872
Assert(LWLockHeldByMe(LogicalRepWorkerLock));
875873

874+
if (nsync != NULL)
875+
*nsync = 0;
876+
if (nparallelapply != NULL)
877+
*nparallelapply = 0;
878+
876879
/*
877-
* Scan all attached parallel apply workers, only counting those which
878-
* have the given subscription id.
880+
* Scan all attached sync and parallel apply workers, only counting those
881+
* which have the given subscription id.
879882
*/
880-
for (i = 0; i < max_logical_replication_workers; i++)
883+
for (int i = 0; i < max_logical_replication_workers; i++)
881884
{
882885
LogicalRepWorker *w = &LogicalRepCtx->workers[i];
883886

884-
if (isParallelApplyWorker(w) && w->subid == subid)
885-
res++;
887+
if (w->subid == subid)
888+
{
889+
if (nsync != NULL && isTablesyncWorker(w))
890+
(*nsync)++;
891+
if (nparallelapply != NULL && isParallelApplyWorker(w))
892+
(*nparallelapply)++;
893+
}
886894
}
887-
888-
return res;
889895
}
890896

891897
/*

0 commit comments

Comments
 (0)