/*-------------------------------------------------------------------------
*
* bufmgr.c
* buffer manager interface routines
*
* Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/storage/buffer/bufmgr.c
*
*-------------------------------------------------------------------------
*/
/*
* Principal entry points:
*
* ReadBuffer() -- find or create a buffer holding the requested page,
* and pin it so that no one can destroy it while this process
* is using it.
*
* ReleaseBuffer() -- unpin a buffer
*
* MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
* The disk write is delayed until buffer replacement or checkpoint.
*
* See also these files:
* freelist.c -- chooses victim for buffer replacement
* buf_table.c -- manages the buffer lookup table
*/
#include "postgres.h"
#include <sys/file.h>
#include <unistd.h>
#include "access/xlog.h"
#include "catalog/catalog.h"
#include "catalog/storage.h"
#include "executor/instrument.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/ipc.h"
#include "storage/proc.h"
#include "storage/smgr.h"
#include "storage/standby.h"
#include "utils/rel.h"
#include "utils/resowner_private.h"
#include "utils/timestamp.h"
/* Note: these two macros only work on shared buffers, not local ones! */
#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
#define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
/* Note: this macro only works on local buffers, not shared ones! */
#define LocalBufHdrGetBlock(bufHdr) \
LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
/* Bits in SyncOneBuffer's return value */
#define BUF_WRITTEN 0x01
#define BUF_REUSABLE 0x02
#define DROP_RELS_BSEARCH_THRESHOLD 20
typedef struct PrivateRefCountEntry
{
Buffer buffer;
int32 refcount;
} PrivateRefCountEntry;
/* 64 bytes, about the size of a cache line on common systems */
#define REFCOUNT_ARRAY_ENTRIES 8
/* GUC variables */
bool zero_damaged_pages = false;
int bgwriter_lru_maxpages = 100;
double bgwriter_lru_multiplier = 2.0;
bool track_io_timing = false;
int effective_io_concurrency = 0;
/*
* How many buffers PrefetchBuffer callers should try to stay ahead of their
* ReadBuffer calls by. This is maintained by the assign hook for
* effective_io_concurrency. Zero means "never prefetch". This value is
* only used for buffers not belonging to tablespaces that have their
* effective_io_concurrency parameter set.
*/
int target_prefetch_pages = 0;
/* local state for StartBufferIO and related functions */
static BufferDesc *InProgressBuf = NULL;
static bool IsForInput;
/* local state for LockBufferForCleanup */
static BufferDesc *PinCountWaitBuf = NULL;
/*
* Backend-Private refcount management:
*
* Each buffer also has a private refcount that keeps track of the number of
* times the buffer is pinned in the current process. This is so that the
* shared refcount needs to be modified only once if a buffer is pinned more
* than once by an individual backend. It's also used to check that no buffers
* are still pinned at the end of transactions and when exiting.
*
*
* To avoid - as we used to - requiring an array with NBuffers entries to keep
* track of local buffers, we use a small sequentially searched array
* (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
* keep track of backend local pins.
*
* Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
* refcounts are kept track of in the array; after that, new array entries
* displace old ones into the hash table. That way a frequently used entry
* can't get "stuck" in the hashtable while infrequent ones clog the array.
*
* Note that in most scenarios the number of pinned buffers will not exceed
* REFCOUNT_ARRAY_ENTRIES.
*
*
* To enter a buffer into the refcount tracking mechanism first reserve a free
* entry using ReservePrivateRefCountEntry() and then later, if necessary,
* fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
* memory allocations in NewPrivateRefCountEntry() which can be important
* because in some scenarios it's called with a spinlock held...
*/
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
static HTAB *PrivateRefCountHash = NULL;
static int32 PrivateRefCountOverflowed = 0;
static uint32 PrivateRefCountClock = 0;
static PrivateRefCountEntry *ReservedRefCountEntry = NULL;
static void ReservePrivateRefCountEntry(void);
static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
static inline int32 GetPrivateRefCount(Buffer buffer);
static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
/*
* Ensure that the PrivateRefCountArray has sufficient space to store one more
* entry. This has to be called before using NewPrivateRefCountEntry() to fill
* a new entry - but it's perfectly fine to not use a reserved entry.
*/
static void
ReservePrivateRefCountEntry(void)
{
/* Already reserved (or freed), nothing to do */
if (ReservedRefCountEntry != NULL)
return;
/*
* First search for a free entry the array, that'll be sufficient in the
* majority of cases.
*/
{
int i;
for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
{
PrivateRefCountEntry *res;
res = &PrivateRefCountArray[i];
if (res->buffer == InvalidBuffer)
{
ReservedRefCountEntry = res;
return;
}
}
}
/*
* No luck. All array entries are full. Move one array entry into the hash
* table.
*/
{
/*
* Move entry from the current clock position in the array into the
* hashtable. Use that slot.
*/
PrivateRefCountEntry *hashent;
bool found;
/* select victim slot */
ReservedRefCountEntry =
&PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
/* Better be used, otherwise we shouldn't get here. */
Assert(ReservedRefCountEntry->buffer != InvalidBuffer);
/* enter victim array entry into hashtable */
hashent = hash_search(PrivateRefCountHash,
(void *) &(ReservedRefCountEntry->buffer),
HASH_ENTER,
&found);
Assert(!found);
hashent->refcount = ReservedRefCountEntry->refcount;
/* clear the now free array slot */
ReservedRefCountEntry->buffer = InvalidBuffer;
ReservedRefCountEntry->refcount = 0;
PrivateRefCountOverflowed++;
}
}
/*
* Fill a previously reserved refcount entry.
*/
static PrivateRefCountEntry *
NewPrivateRefCountEntry(Buffer buffer)
{
PrivateRefCountEntry *res;
/* only allowed to be called when a reservation has been made */
Assert(ReservedRefCountEntry != NULL);
/* use up the reserved entry */
res = ReservedRefCountEntry;
ReservedRefCountEntry = NULL;
/* and fill it */
res->buffer = buffer;
res->refcount = 0;
return res;
}
/*
* Return the PrivateRefCount entry for the passed buffer.
*
* Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
* do_move is true, and the entry resides in the hashtable the entry is
* optimized for frequent access by moving it to the array.
*/
static PrivateRefCountEntry *
GetPrivateRefCountEntry(Buffer buffer, bool do_move)
{
PrivateRefCountEntry *res;
int i;
Assert(BufferIsValid(buffer));
Assert(!BufferIsLocal(buffer));
/*
* First search for references in the array, that'll be sufficient in the
* majority of cases.
*/
for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
{
res = &PrivateRefCountArray[i];
if (res->buffer == buffer)
return res;
}
/*
* By here we know that the buffer, if already pinned, isn't residing in
* the array.
*
* Only look up the buffer in the hashtable if we've previously overflowed
* into it.
*/
if (PrivateRefCountOverflowed == 0)
return NULL;
res = hash_search(PrivateRefCountHash,
(void *) &buffer,
HASH_FIND,
NULL);
if (res == NULL)
return NULL;
else if (!do_move)
{
/* caller doesn't want us to move the hash entry into the array */
return res;
}
else
{
/* move buffer from hashtable into the free array slot */
bool found;
PrivateRefCountEntry *free;
/* Ensure there's a free array slot */
ReservePrivateRefCountEntry();
/* Use up the reserved slot */
Assert(ReservedRefCountEntry != NULL);
free = ReservedRefCountEntry;
ReservedRefCountEntry = NULL;
Assert(free->buffer == InvalidBuffer);
/* and fill it */
free->buffer = buffer;
free->refcount = res->refcount;
/* delete from hashtable */
hash_search(PrivateRefCountHash,
(void *) &buffer,
HASH_REMOVE,
&found);
Assert(found);
Assert(PrivateRefCountOverflowed > 0);
PrivateRefCountOverflowed--;
return free;
}
}
/*
* Returns how many times the passed buffer is pinned by this backend.
*
* Only works for shared memory buffers!
*/
static inline int32
GetPrivateRefCount(Buffer buffer)
{
PrivateRefCountEntry *ref;
Assert(BufferIsValid(buffer));
Assert(!BufferIsLocal(buffer));
/*
* Not moving the entry - that's ok for the current users, but we might
* want to change this one day.
*/
ref = GetPrivateRefCountEntry(buffer, false);
if (ref == NULL)
return 0;
return ref->refcount;
}
/*
* Release resources used to track the reference count of a buffer which we no
* longer have pinned and don't want to pin again immediately.
*/
static void
ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
{
Assert(ref->refcount == 0);
if (ref >= &PrivateRefCountArray[0] &&
ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
{
ref->buffer = InvalidBuffer;
/*
* Mark the just used entry as reserved - in many scenarios that
* allows us to avoid ever having to search the array/hash for free
* entries.
*/
ReservedRefCountEntry = ref;
}
else
{
bool found;
Buffer buffer = ref->buffer;
hash_search(PrivateRefCountHash,
(void *) &buffer,
HASH_REMOVE,
&found);
Assert(found);
Assert(PrivateRefCountOverflowed > 0);
PrivateRefCountOverflowed--;
}
}
/*
* BufferIsPinned
* True iff the buffer is pinned (also checks for valid buffer number).
*
* NOTE: what we check here is that *this* backend holds a pin on
* the buffer. We do not care whether some other backend does.
*/
#define BufferIsPinned(bufnum) \
( \
!BufferIsValid(bufnum) ? \
false \
: \
BufferIsLocal(bufnum) ? \
(LocalRefCount[-(bufnum) - 1] > 0) \
: \
(GetPrivateRefCount(bufnum) > 0) \
)
static Buffer ReadBuffer_common(SMgrRelation reln, char relpersistence,
ForkNumber forkNum, BlockNumber blockNum,
ReadBufferMode mode, BufferAccessStrategy strategy,
bool *hit);
static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
static void PinBuffer_Locked(BufferDesc *buf);
static void UnpinBuffer(BufferDesc *buf, bool fixOwner);
static void BufferSync(int flags);
static int SyncOneBuffer(int buf_id, bool skip_recently_used);
static void WaitIO(BufferDesc *buf);
static bool StartBufferIO(BufferDesc *buf, bool forInput);
static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
int set_flag_bits);
static void shared_buffer_write_error_callback(void *arg);
static void local_buffer_write_error_callback(void *arg);
static BufferDesc *BufferAlloc(SMgrRelation smgr,
char relpersistence,
ForkNumber forkNum,
BlockNumber blockNum,
BufferAccessStrategy strategy,
bool *foundPtr);
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
static void AtProcExit_Buffers(int code, Datum arg);
static void CheckForBufferLeaks(void);
static int rnode_comparator(const void *p1, const void *p2);
/*
* ComputeIoConcurrency -- get the number of pages to prefetch for a given
* number of spindles.
*/
bool
ComputeIoConcurrency(int io_concurrency, double *target)
{
double new_prefetch_pages = 0.0;
int i;
/*
* Make sure the io_concurrency value is within valid range; it may have
* been forced with a manual pg_tablespace update.
*/
io_concurrency = Min(Max(io_concurrency, 0), MAX_IO_CONCURRENCY);
/*----------
* The user-visible GUC parameter is the number of drives (spindles),
* which we need to translate to a number-of-pages-to-prefetch target.
* The target value is stashed in *extra and then assigned to the actual
* variable by assign_effective_io_concurrency.
*
* The expected number of prefetch pages needed to keep N drives busy is:
*
* drives | I/O requests
* -------+----------------
* 1 | 1
* 2 | 2/1 + 2/2 = 3
* 3 | 3/1 + 3/2 + 3/3 = 5 1/2
* 4 | 4/1 + 4/2 + 4/3 + 4/4 = 8 1/3
* n | n * H(n)
*
* This is called the "coupon collector problem" and H(n) is called the
* harmonic series. This could be approximated by n * ln(n), but for
* reasonable numbers of drives we might as well just compute the series.
*
* Alternatively we could set the target to the number of pages necessary
* so that the expected number of active spindles is some arbitrary
* percentage of the total. This sounds the same but is actually slightly
* different. The result ends up being ln(1-P)/ln((n-1)/n) where P is
* that desired fraction.
*
* Experimental results show that both of these formulas aren't aggressive
* enough, but we don't really have any better proposals.
*
* Note that if io_concurrency = 0 (disabled), we must set target = 0.
*----------
*/
for (i = 1; i <= io_concurrency; i++)
new_prefetch_pages += (double) io_concurrency / (double) i;
*target = new_prefetch_pages;
/* This range check shouldn't fail, but let's be paranoid */
return (new_prefetch_pages > 0.0 && new_prefetch_pages < (double) INT_MAX);
}
/*
* PrefetchBuffer -- initiate asynchronous read of a block of a relation
*
* This is named by analogy to ReadBuffer but doesn't actually allocate a
* buffer. Instead it tries to ensure that a future ReadBuffer for the given
* block will not be delayed by the I/O. Prefetching is optional.
* No-op if prefetching isn't compiled in.
*/
void
PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
{
#ifdef USE_PREFETCH
Assert(RelationIsValid(reln));
Assert(BlockNumberIsValid(blockNum));
/* Open it at the smgr level if not already done */
RelationOpenSmgr(reln);
if (RelationUsesLocalBuffers(reln))
{
/* see comments in ReadBufferExtended */
if (RELATION_IS_OTHER_TEMP(reln))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot access temporary tables of other sessions")));
/* pass it off to localbuf.c */
LocalPrefetchBuffer(reln->rd_smgr, forkNum, blockNum);
}
else
{
BufferTag newTag; /* identity of requested block */
uint32 newHash; /* hash value for newTag */
LWLock *newPartitionLock; /* buffer partition lock for it */
int buf_id;
/* create a tag so we can lookup the buffer */
INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode.node,
forkNum, blockNum);
/* determine its hash code and partition lock ID */
newHash = BufTableHashCode(&newTag);
newPartitionLock = BufMappingPartitionLock(newHash);
/* see if the block is in the buffer pool already */
LWLockAcquire(newPartitionLock, LW_SHARED);
buf_id = BufTableLookup(&newTag, newHash);
LWLockRelease(newPartitionLock);
/* If not in buffers, initiate prefetch */
if (buf_id < 0)
smgrprefetch(reln->rd_smgr, forkNum, blockNum);
/*
* If the block *is* in buffers, we do nothing. This is not really
* ideal: the block might be just about to be evicted, which would be
* stupid since we know we are going to need it soon. But the only
* easy answer is to bump the usage_count, which does not seem like a
* great solution: when the caller does ultimately touch the block,
* usage_count would get bumped again, resulting in too much
* favoritism for blocks that are involved in a prefetch sequence. A
* real fix would involve some additional per-buffer state, and it's
* not clear that there's enough of a problem to justify that.
*/
}
#endif /* USE_PREFETCH */
}
/*
* ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
* fork with RBM_NORMAL mode and default strategy.
*/
Buffer
ReadBuffer(Relation reln, BlockNumber blockNum)
{
return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
}
/*
* ReadBufferExtended -- returns a buffer containing the requested
* block of the requested relation. If the blknum
* requested is P_NEW, extend the relation file and
* allocate a new block. (Caller is responsible for
* ensuring that only one backend tries to extend a
* relation at the same time!)
*
* Returns: the buffer number for the buffer containing
* the block read. The returned buffer has been pinned.
* Does not return on error --- elog's instead.
*
* Assume when this function is called, that reln has been opened already.
*
* In RBM_NORMAL mode, the page is read from disk, and the page header is
* validated. An error is thrown if the page header is not valid. (But
* note that an all-zero page is considered "valid"; see PageIsVerified().)
*
* RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
* valid, the page is zeroed instead of throwing an error. This is intended
* for non-critical data, where the caller is prepared to repair errors.
*
* In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
* filled with zeros instead of reading it from disk. Useful when the caller
* is going to fill the page from scratch, since this saves I/O and avoids
* unnecessary failure if the page-on-disk has corrupt page headers.
* The page is returned locked to ensure that the caller has a chance to
* initialize the page before it's made visible to others.
* Caution: do not use this mode to read a page that is beyond the relation's
* current physical EOF; that is likely to cause problems in md.c when
* the page is modified and written out. P_NEW is OK, though.
*
* RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
* a cleanup-strength lock on the page.
*
* RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
*
* If strategy is not NULL, a nondefault buffer access strategy is used.
* See buffer/README for details.
*/
Buffer
ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
ReadBufferMode mode, BufferAccessStrategy strategy)
{
bool hit;
Buffer buf;
/* Open it at the smgr level if not already done */
RelationOpenSmgr(reln);
/*
* Reject attempts to read non-local temporary relations; we would be
* likely to get wrong data since we have no visibility into the owning
* session's local buffers.
*/
if (RELATION_IS_OTHER_TEMP(reln))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot access temporary tables of other sessions")));
/*
* Read the buffer, and update pgstat counters to reflect a cache hit or
* miss.
*/
pgstat_count_buffer_read(reln);
buf = ReadBuffer_common(reln->rd_smgr, reln->rd_rel->relpersistence,
forkNum, blockNum, mode, strategy, &hit);
if (hit)
pgstat_count_buffer_hit(reln);
return buf;
}
/*
* ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
* a relcache entry for the relation.
*
* NB: At present, this function may only be used on permanent relations, which
* is OK, because we only use it during XLOG replay. If in the future we
* want to use it on temporary or unlogged relations, we could pass additional
* parameters.
*/
Buffer
ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
BlockNumber blockNum, ReadBufferMode mode,
BufferAccessStrategy strategy)
{
bool hit;
SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
Assert(InRecovery);
return ReadBuffer_common(smgr, RELPERSISTENCE_PERMANENT, forkNum, blockNum,
mode, strategy, &hit);
}
/*
* ReadBuffer_common -- common logic for all ReadBuffer variants
*
* *hit is set to true if the request was satisfied from shared buffer cache.
*/
static Buffer
ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
BlockNumber blockNum, ReadBufferMode mode,
BufferAccessStrategy strategy, bool *hit)
{
BufferDesc *bufHdr;
Block bufBlock;
bool found;
bool isExtend;
bool isLocalBuf = SmgrIsTemp(smgr);
*hit = false;
/* Make sure we will have room to remember the buffer pin */
ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
isExtend = (blockNum == P_NEW);
TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
smgr->smgr_rnode.node.spcNode,
smgr->smgr_rnode.node.dbNode,
smgr->smgr_rnode.node.relNode,
smgr->smgr_rnode.backend,
isExtend);
/* Substitute proper block number if caller asked for P_NEW */
if (isExtend)
blockNum = smgrnblocks(smgr, forkNum);
if (isLocalBuf)
{
bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
if (found)
pgBufferUsage.local_blks_hit++;
else
pgBufferUsage.local_blks_read++;
}
else
{
/*
* lookup the buffer. IO_IN_PROGRESS is set if the requested block is
* not currently in memory.
*/
bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
strategy, &found);
if (found)
pgBufferUsage.shared_blks_hit++;
else
pgBufferUsage.shared_blks_read++;
}
/* At this point we do NOT hold any locks. */
/* if it was already in the buffer pool, we're done */
if (found)
{
if (!isExtend)
{
/* Just need to update stats before we exit */
*hit = true;
VacuumPageHit++;
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageHit;
TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
smgr->smgr_rnode.node.spcNode,
smgr->smgr_rnode.node.dbNode,
smgr->smgr_rnode.node.relNode,
smgr->smgr_rnode.backend,
isExtend,
found);
/*
* In RBM_ZERO_AND_LOCK mode the caller expects the page to be
* locked on return.
*/
if (!isLocalBuf)
{
if (mode == RBM_ZERO_AND_LOCK)
LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
LW_EXCLUSIVE);
else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
}
return BufferDescriptorGetBuffer(bufHdr);
}
/*
* We get here only in the corner case where we are trying to extend
* the relation but we found a pre-existing buffer marked BM_VALID.
* This can happen because mdread doesn't complain about reads beyond
* EOF (when zero_damaged_pages is ON) and so a previous attempt to
* read a block beyond EOF could have left a "valid" zero-filled
* buffer. Unfortunately, we have also seen this case occurring
* because of buggy Linux kernels that sometimes return an
* lseek(SEEK_END) result that doesn't account for a recent write. In
* that situation, the pre-existing buffer would contain valid data
* that we don't want to overwrite. Since the legitimate case should
* always have left a zero-filled buffer, complain if not PageIsNew.
*/
bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
if (!PageIsNew((Page) bufBlock))
ereport(ERROR,
(errmsg("unexpected data beyond EOF in block %u of relation %s",
b
|