postgresql之page分配管理(一)

本文详细介绍了PostgreSQL的缓冲区管理,包括页大小的配置(默认8kB,最大32kB)、初始化过程,以及缓冲区的分配和组织管理。在初始化时,通过`InitBufferPool`创建并分配缓冲区,并使用哈希表和自由列表进行快速查找和管理。当分配完所有缓冲区后,系统会进行扩容或淘汰策略,而自由列表为空时则需要进一步的操作。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

一、简介

postgresql是多进程架构,同时也是一个庞大的共享内存系统,大部分的数据都是在共享内存中,供多进程进行协作处理,今天主要是buffer(page)。

二、page 大小

page默认大小是8kB,可以在编译时进行调整,最大32kB

2.1 如何配置

configure.ac

#
# Block size
#
AC_MSG_CHECKING([for block size])
PGAC_ARG_REQ(with, blocksize, [BLOCKSIZE], [set table block size in kB [8]],
             [blocksize=$withval],
             [blocksize=8])
case ${blocksize} in
  1) BLCKSZ=1024;;
  2) BLCKSZ=2048;;
  4) BLCKSZ=4096;;
  8) BLCKSZ=8192;;
 16) BLCKSZ=16384;;
 32) BLCKSZ=32768;;
  *) AC_MSG_ERROR([Invalid block size. Allowed values are 1,2,4,8,16,32.])
esac
AC_MSG_RESULT([${blocksize}kB])

AC_DEFINE_UNQUOTED([BLCKSZ], ${BLCKSZ}, [
 Size of a disk block --- this also limits the size of a tuple.  You
 can set it bigger if you need bigger tuples (although TOAST should
 reduce the need to have large tuples, since fields can be spread
 across multiple tuples).

 BLCKSZ must be a power of 2.  The maximum possible value of BLCKSZ
 is currently 2^15 (32768).  This is determined by the 15-bit widths
 of the lp_off and lp_len fields in ItemIdData (see
 include/storage/itemid.h).

 Changing BLCKSZ requires an initdb.
])

默认值为8kB,可以通过 --with-blocksize=x进行设置,x取值范围[1,2,4,8,16,32], 最大值32768,2的15次方

2.2 为啥最大只能32kB

src/include/storage/itemid.h

typedef struct ItemIdData
{
	unsigned	lp_off:15,		/* offset to tuple (from start of page) */
				lp_flags:2,		/* state of line pointer, see below */
				lp_len:15;		/* byte length of tuple */
} ItemIdData;

指定每个记录在page中的偏移量和长度字段只有15bit,这就限制了page的大小。

三、初始化

postgresql在启动时,会进行相应的初始化。

初始化调用栈

src/backend/main/main.c
main()
	src/backend/postmaster/postmaster.c
	PostmasterMain()
		reset_shared();
			src/backend/storage/ipc/ipci.c
			CreateSharedMemoryAndSemaphores()
				src/backend/storage/buffer/buf_init.c
				InitBufferPool();
					src/backend/storage/buffer/freelist.c
					StrategyInitialize(!foundDescs);
						src/backend/storage/buffer/buf_table.c
						InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS);
void
CreateSharedMemoryAndSemaphores(void)
{
	...
//第一步 计算总共需要的共享内存大小
		size = 100000;
		...
		size = add_size(size, BufferShmemSize());
		size = add_size(size, LockShmemSize());
		...
		
//第二步 将计算的size进行8k对齐		
		/* might as well round it off to a multiple of a typical page size */
		size = add_size(size, 8192 - (size % 8192));

	
// 第三步 创建共享内存
		seghdr = PGSharedMemoryCreate(size, &shim);

// 第四步 初始化共享内存各个部分
		InitShmemAccess(seghdr);

		
	/*
	 * Set up shared memory allocation mechanism
	 */
	if (!IsUnderPostmaster)
		InitShmemAllocation();

	/*
	 * Now initialize LWLocks, which do shared memory allocation and are
	 * needed for InitShmemIndex.
	 */
	CreateLWLocks();

	/*
	 * Set up shmem.c index hashtable
	 */
	InitShmemIndex();

	dsm_shmem_init();

	...
	InitBufferPool();

	/*
	 * Set up lock manager
	 */
	InitLocks();

	/*
	 * Set up predicate lock manager
	 */
	InitPredicateLocks();

	/*
	 * Set up process table
	 */
	if (!IsUnderPostmaster)
		InitProcGlobal();
	CreateSharedProcArray();
	CreateSharedBackendStatus();
	TwoPhaseShmemInit();
	BackgroundWorkerShmemInit();

	/*
	 * Set up shared-inval messaging
	 */
	CreateSharedInvalidationState();

	/*
	 * Set up interprocess signaling mechanisms
	 */
	PMSignalShmemInit();
	ProcSignalShmemInit();
	CheckpointerShmemInit();
	AutoVacuumShmemInit();
	ReplicationSlotsShmemInit();
	ReplicationOriginShmemInit();
	WalSndShmemInit();
	WalRcvShmemInit();
	PgArchShmemInit();
	ApplyLauncherShmemInit();

	/*
	 * Set up other modules that need some shared memory space
	 */
	SnapMgrInit();
	BTreeShmemInit();
	SyncScanShmemInit();
	AsyncShmemInit();
	...
}

这里主要关注InitBufferPool

3.1 创建buffer

src/backend/utils/init/globals.c

int			NBuffers = 1000;

src/backend/storage/buffer/buf_init.c

void
InitBufferPool(void)
{
	...

	/* Align descriptors to a cacheline boundary. */
	BufferDescriptors = (BufferDescPadded *)
		ShmemInitStruct("Buffer Descriptors",
						NBuffers * sizeof(BufferDescPadded),
						&foundDescs);

	BufferBlocks = (char *)
		ShmemInitStruct("Buffer Blocks",
						NBuffers * (Size) BLCKSZ, &foundBufs);

	/* Align condition variables to cacheline boundary. */
	BufferIOCVArray = (ConditionVariableMinimallyPadded *)
		ShmemInitStruct("Buffer IO Condition Variables",
						NBuffers * sizeof(ConditionVariableMinimallyPadded),
						&foundIOCV);
}

3.2 初始化buffer header

		/*
		 * Initialize all the buffer headers.
		 */
		for (i = 0; i < NBuffers; i++)
		{
			BufferDesc *buf = GetBufferDescriptor(i);

			CLEAR_BUFFERTAG(buf->tag);

			pg_atomic_init_u32(&buf->state, 0);
			buf->wait_backend_pid = 0;

			buf->buf_id = i;

			/*
			 * Initially link all the buffers together as unused. Subsequent
			 * management of this list is done by freelist.c.
			 */
			buf->freeNext = i + 1;

			...
		}

		/* Correct last entry of linked list */
		GetBufferDescriptor(NBuffers - 1)->freeNext = FREENEXT_END_OF_LIST;
	}

四、buffer组织管理

buffer已经有了,但是要如何快速的找到一个buffer?哪些buffer没有使用?哪些已经使用? 这里使用了一个hash表以及一个freelist。

4.1 创建hashtable

void
InitBufTable(int size)
{
	HASHCTL		info;

	/* assume no locking is needed yet */

	/* BufferTag maps to Buffer */
	info.keysize = sizeof(BufferTag);
	info.entrysize = sizeof(BufferLookupEnt);
	info.num_partitions = NUM_BUFFER_PARTITIONS;

	SharedBufHash = ShmemInitHash("Shared Buffer Lookup Table",
								  size, size,
								  &info,
								  HASH_ELEM | HASH_BLOBS | HASH_PARTITION);
}
HTAB *
ShmemInitHash(const char *name,		/* table string name for shmem index */
			  long init_size,	/* initial table size */
			  long max_size,	/* max size of the table */
			  HASHCTL *infoP,	/* info about key and bucket size */
			  int hash_flags)	/* info about infoP */
{
	...
	/* look it up in the shmem index */
	location = ShmemInitStruct(name,
							   hash_get_shared_size(infoP, hash_flags),
							   &found);
  ...
Size
hash_get_shared_size(HASHCTL *info, int flags)
{
	Assert(flags & HASH_DIRSIZE);
	Assert(info->dsize == info->max_dsize);
	return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
}

4.2 初始化hashtable

src/backend/utils/hash/dynahash.c

static bool
init_htab(HTAB *hashp, long nelem)
{
	HASHHDR    *hctl = hashp->hctl;
	HASHSEGMENT *segp;
	int			nbuckets;
	int			nsegs;
	int			i;

	/*
	 * initialize mutexes if it's a partitioned table
	 */
	if (IS_PARTITIONED(hctl))
		for (i = 0; i < NUM_FREELISTS; i++)
			SpinLockInit(&(hctl->freeList[i].mutex));

	/*
	 * Allocate space for the next greater power of two number of buckets,
	 * assuming a desired maximum load factor of 1.
	 */
	nbuckets = next_pow2_int(nelem);

	/*
	 * In a partitioned table, nbuckets must be at least equal to
	 * num_partitions; were it less, keys with apparently different partition
	 * numbers would map to the same bucket, breaking partition independence.
	 * (Normally nbuckets will be much bigger; this is just a safety check.)
	 */
	while (nbuckets < hctl->num_partitions)
		nbuckets <<= 1;

	hctl->max_bucket = hctl->low_mask = nbuckets - 1;
	hctl->high_mask = (nbuckets << 1) - 1;

	/*
	 * Figure number of directory segments needed, round up to a power of 2
	 */
	nsegs = (nbuckets - 1) / hctl->ssize + 1;
	nsegs = next_pow2_int(nsegs);
....

	/* Allocate initial segments */
	for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
	{
		*segp = seg_alloc(hashp);
		if (*segp == NULL)
			return false;
	}

	/* Choose number of entries to allocate at a time */
	hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);

	return true;
}
static HASHSEGMENT
seg_alloc(HTAB *hashp)
{
	HASHSEGMENT segp;

	CurrentDynaHashCxt = hashp->hcxt;
	segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize);

	if (!segp)
		return NULL;

	MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize);

	return segp;
}

4.3 创建freelist

src/backend/utils/hash/dynahash.c

...
for (i= 0; i < freelist_partitions; i++)
{
	int			temp = (i == 0) ? nelem_alloc_first : nelem_alloc;

	if (!element_alloc(hashp, temp, i))
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
}
...
static bool
element_alloc(HTAB *hashp, int nelem, int freelist_idx)
{
	HASHHDR    *hctl = hashp->hctl;
	Size		elementSize;
	HASHELEMENT *firstElement;
	HASHELEMENT *tmpElement;
	HASHELEMENT *prevElement;
	int			i;

	if (hashp->isfixed)
		return false;

	/* Each element has a HASHELEMENT header plus user data. */
	elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);

	CurrentDynaHashCxt = hashp->hcxt;
	firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);

	if (!firstElement)
		return false;

	/* prepare to link all the new entries into the freelist */
	prevElement = NULL;
	tmpElement = firstElement;
	for (i = 0; i < nelem; i++)
	{
		tmpElement->link = prevElement;
		prevElement = tmpElement;
		tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
	}

	/* if partitioned, must lock to touch freeList */
	if (IS_PARTITIONED(hctl))
		SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);

	/* freelist could be nonempty if two backends did this concurrently */
	firstElement->link = hctl->freeList[freelist_idx].freeList;
	hctl->freeList[freelist_idx].freeList = prevElement;

	if (IS_PARTITIONED(hctl))
		SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

	return true;
}

五、结构图

在这里插入图片描述

六、分配

src/backend/storage/buffer/bufmgr.c

static BufferDesc *
BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
			BlockNumber blockNum,
			BufferAccessStrategy strategy,
			bool *foundPtr){
...
}

6.1 根据BufferTag计算hash值

	/* create a tag so we can lookup the buffer */
	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
	
	newHash = BufTableHashCode(&newTag);

6.2 hash表中查找

	buf_id = BufTableLookup(&newTag, newHash);

6.3 没有找到,分配新的一个

for (;;)
	{
	...
		buf = StrategyGetBuffer(strategy, &buf_state);
	...
}
BufferDesc *
StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state)
{
	BufferDesc *buf;
	......
	pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);
	...
	if (StrategyControl->firstFreeBuffer >= 0)
	{
		while (true)
		{
			...
			if (StrategyControl->firstFreeBuffer < 0)
			{
				...
				break;
			}

			buf = GetBufferDescriptor(StrategyControl->firstFreeBuffer);

			/* Unconditionally remove buffer from freelist */
			StrategyControl->firstFreeBuffer = buf->freeNext;
			buf->freeNext = FREENEXT_NOT_IN_LIST;

			...
				return buf;
			...
		}
	}
	....
}

以上的代码把各种判断逻辑都删除了,只剩整个结构,可以看出通过StrategyControl->firstFreeBuffer获取到第一个空闲的buffer。

然后从freelist中获取一个节点将这个id和buffer进行关联,将节点从freelist中移除(指针改变),然后将此节点加入hash中。

在这里插入图片描述

七、后续

  1. 随着page的分配,当分配完后,是扩容还是淘汰? 如何淘汰?
  2. freelist中的节点分配完了,如何操作?
### PostgreSQL 常用运维管理命令汇总 以下是 PostgreSQL 数据库常用的运维管理和维护命令: #### 1. 启动和停止服务 用于启动、重启或停止 PostgreSQL 服务。 ```bash sudo systemctl start postgresql # 启动 PostgreSQL 服务 sudo systemctl restart postgresql # 重启 PostgreSQL 服务 sudo systemctl stop postgresql # 停止 PostgreSQL 服务 ``` #### 2. 查看当前连接会话 显示当前数据库中的活动连接及其状态。 ```sql SELECT pid, usename, application_name, client_addr, backend_start, state FROM pg_stat_activity; ``` 此查询可以用来监控哪些客户端正在访问数据库以及它们的状态[^3]。 #### 3. 创建新用户和分配权限 创建新的数据库用户并授予其特定权限。 ```sql CREATE USER new_user WITH PASSWORD 'password'; -- 创建用户 GRANT ALL PRIVILEGES ON DATABASE database_name TO new_user; -- 授予数据库权限 ALTER ROLE new_user CREATEDB; -- 赋予创建数据库的权限 ``` #### 4. 备份与恢复数据 备份整个数据库或将备份还原到目标环境。 ```bash pg_dump -U username dbname > backup.sql # 导出数据库结构和数据 psql -U username dbname < backup.sql # 还原数据库结构和数据 ``` #### 5. 表空间管理 表空间允许管理员将对象存储在不同的物理位置。 ```sql CREATE TABLESPACE tablespace_name LOCATION '/path/to/directory'; ALTER DATABASE dbname SET TABLESPACE tablespace_name; ``` #### 6. 查询性能优化工具 分析慢查询日志以提高数据库性能。 ```sql EXPLAIN ANALYZE SELECT * FROM large_table WHERE condition = value; -- 分析 SQL 执行计划 SHOW log_min_duration_statement; -- 显示记录语句执行时间阈值 SET log_min_duration_statement = 100; -- 设置超过 100ms 的查询会被记录 ``` #### 7. 锁定情况检测 查看是否有锁等待的情况发生,防止死锁问题。 ```sql SELECT blocked_locks.pid AS blocked_pid, blocking_locks.pid AS blocking_pid, blocked_activity.usename AS blocked_user, blocking_activity.usename AS blocking_user, blocked_activity.query AS blocked_query, blocking_activity.query AS current_query_in_blocking_process FROM pg_catalog.pg_locks blocked_locks JOIN pg_catalog.pg_stat_activity blocked_activity ON blocked_locks.pid = blocked_activity.pid JOIN pg_catalog.pg_locks blocking_locks ON blocking_locks.locktype = blocked_locks.locktype AND blocking_locks.database IS NOT DISTINCT FROM blocked_locks.database AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid AND blocking_locks.pid != blocked_locks.pid JOIN pg_catalog.pg_stat_activity blocking_activity ON blocking_locks.pid = blocking_activity.pid WHERE NOT blocked_locks.granted; ``` #### 8. 统计数据库大小 获取单个数据库或者所有数据库的空间占用信息。 ```sql SELECT datname, pg_size_pretty(pg_database_size(datname)) AS size FROM pg_database ORDER BY pg_database_size(datname) DESC; ``` #### 9. 清理无用索引 删除那些不再被使用的索引从而节省磁盘空间。 ```sql WITH index_usage_stats AS ( SELECT s.relname as table_name, i.relname as index_name, idx_scan as total_scans FROM pg_stat_all_indexes s JOIN pg_index USING(indexrelid) JOIN pg_class c ON c.oid=s.indexrelid LEFT JOIN pg_class i ON i.oid=indexrelid WHERE schemaname='public' ) DELETE FROM pg_index WHERE indrelid IN (SELECT oid FROM pg_class WHERE relkind='r') AND indexrelid NOT IN (SELECT indexrelid FROM index_usage_stats WHERE total_scans>0); ``` #### 10. 自定义函数编写 简化复杂逻辑处理过程。 ```plpgsql CREATE OR REPLACE FUNCTION add_two_numbers(a integer, b integer) RETURNS integer AS $$ BEGIN RETURN a + b; END; $$ LANGUAGE plpgsql; -- 使用自定义函数 SELECT add_two_numbers(5, 3); -- 返回 8 ``` 以上列举了些常见的 PostgreSQL 运维管理命令,具体应用还需根据实际需求调整参数设置。如果需要更深入的功能支持,则可能涉及配置文件修改或其他高级特性操作[^1]。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值