|
17 | 17 | * pending read. When that isn't possible, the existing pending read is sent
|
18 | 18 | * to StartReadBuffers() so that a new one can begin to form.
|
19 | 19 | *
|
20 |
| - * The algorithm for controlling the look-ahead distance tries to classify the |
21 |
| - * stream into three ideal behaviors: |
| 20 | + * The algorithm for controlling the look-ahead distance is based on recent |
| 21 | + * cache hits and misses: |
22 | 22 | *
|
23 |
| - * A) No I/O is necessary, because the requested blocks are fully cached |
24 |
| - * already. There is no benefit to looking ahead more than one block, so |
25 |
| - * distance is 1. This is the default initial assumption. |
26 |
| - * |
27 |
| - * B) I/O is necessary, but read-ahead advice is undesirable because the |
28 |
| - * access is sequential and we can rely on the kernel's read-ahead heuristics, |
29 |
| - * or impossible because direct I/O is enabled, or the system doesn't support |
30 |
| - * read-ahead advice. There is no benefit in looking ahead more than |
31 |
| - * io_combine_limit, because in this case the only goal is larger read system |
32 |
| - * calls. Looking further ahead would pin many buffers and perform |
33 |
| - * speculative work for no benefit. |
34 |
| - * |
35 |
| - * C) I/O is necessary, it appears to be random, and this system supports |
36 |
| - * read-ahead advice. We'll look further ahead in order to reach the |
37 |
| - * configured level of I/O concurrency. |
38 |
| - * |
39 |
| - * The distance increases rapidly and decays slowly, so that it moves towards |
40 |
| - * those levels as different I/O patterns are discovered. For example, a |
41 |
| - * sequential scan of fully cached data doesn't bother looking ahead, but a |
42 |
| - * sequential scan that hits a region of uncached blocks will start issuing |
43 |
| - * increasingly wide read calls until it plateaus at io_combine_limit. |
| 23 | + * When no I/O is necessary, there is no point in looking ahead more than one |
| 24 | + * block. This is the default initial assumption. Otherwise, the distance is |
| 25 | + * increased rapidly to try to benefit from I/O combining and I/O concurrency. |
44 | 26 | *
|
45 | 27 | * The main data structure is a circular queue of buffers of size
|
46 | 28 | * max_pinned_buffers plus some extra space for technical reasons, ready to be
|
@@ -336,7 +318,7 @@ read_stream_start_pending_read(ReadStream *stream)
|
336 | 318 | /* Remember whether we need to wait before returning this buffer. */
|
337 | 319 | if (!need_wait)
|
338 | 320 | {
|
339 |
| - /* Look-ahead distance decays, no I/O necessary (behavior A). */ |
| 321 | + /* Look-ahead distance decays, no I/O necessary. */ |
340 | 322 | if (stream->distance > 1)
|
341 | 323 | stream->distance--;
|
342 | 324 | }
|
@@ -517,6 +499,15 @@ read_stream_begin_impl(int flags,
|
517 | 499 | else
|
518 | 500 | max_ios = get_tablespace_io_concurrency(tablespace_id);
|
519 | 501 |
|
| 502 | + /* |
| 503 | + * XXX Since we don't have asynchronous I/O yet, if direct I/O is enabled |
| 504 | + * then just behave as though I/O concurrency is set to 0. Otherwise we |
| 505 | + * would look ahead, pinning many buffers for no benefit, because neither |
| 506 | + * read-ahead advice nor AIO is available with direct I/O. |
| 507 | + */ |
| 508 | + if (io_direct_flags & IO_DIRECT_DATA) |
| 509 | + max_ios = 0; |
| 510 | + |
520 | 511 | /* Cap to INT16_MAX to avoid overflowing below */
|
521 | 512 | max_ios = Min(max_ios, PG_INT16_MAX);
|
522 | 513 |
|
@@ -637,7 +628,7 @@ read_stream_begin_impl(int flags,
|
637 | 628 | /*
|
638 | 629 | * Skip the initial ramp-up phase if the caller says we're going to be
|
639 | 630 | * reading the whole relation. This way we start out assuming we'll be
|
640 |
| - * doing full io_combine_limit sized reads (behavior B). |
| 631 | + * doing full io_combine_limit sized reads. |
641 | 632 | */
|
642 | 633 | if (flags & READ_STREAM_FULL)
|
643 | 634 | stream->distance = Min(max_pinned_buffers, stream->io_combine_limit);
|
@@ -728,10 +719,10 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
|
728 | 719 | #ifndef READ_STREAM_DISABLE_FAST_PATH
|
729 | 720 |
|
730 | 721 | /*
|
731 |
| - * A fast path for all-cached scans (behavior A). This is the same as the |
732 |
| - * usual algorithm, but it is specialized for no I/O and no per-buffer |
733 |
| - * data, so we can skip the queue management code, stay in the same buffer |
734 |
| - * slot and use singular StartReadBuffer(). |
| 722 | + * A fast path for all-cached scans. This is the same as the usual |
| 723 | + * algorithm, but it is specialized for no I/O and no per-buffer data, so |
| 724 | + * we can skip the queue management code, stay in the same buffer slot and |
| 725 | + * use singular StartReadBuffer(). |
735 | 726 | */
|
736 | 727 | if (likely(stream->fast_path))
|
737 | 728 | {
|
@@ -851,37 +842,20 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
|
851 | 842 | if (++stream->oldest_io_index == stream->max_ios)
|
852 | 843 | stream->oldest_io_index = 0;
|
853 | 844 |
|
854 |
| - if (stream->ios[io_index].op.flags & READ_BUFFERS_ISSUE_ADVICE) |
855 |
| - { |
856 |
| - /* Distance ramps up fast (behavior C). */ |
857 |
| - distance = stream->distance * 2; |
858 |
| - distance = Min(distance, stream->max_pinned_buffers); |
859 |
| - stream->distance = distance; |
| 845 | + /* Look-ahead distance ramps up quickly after we do I/O. */ |
| 846 | + distance = stream->distance * 2; |
| 847 | + distance = Min(distance, stream->max_pinned_buffers); |
| 848 | + stream->distance = distance; |
860 | 849 |
|
861 |
| - /* |
862 |
| - * If we've caught up with the first advice issued for the current |
863 |
| - * sequential region, cancel further advice until the next random |
864 |
| - * jump. The kernel should be able to see the pattern now that |
865 |
| - * we're actually making sequential preadv() calls. |
866 |
| - */ |
867 |
| - if (stream->ios[io_index].op.blocknum == stream->seq_until_processed) |
868 |
| - stream->seq_until_processed = InvalidBlockNumber; |
869 |
| - } |
870 |
| - else |
871 |
| - { |
872 |
| - /* No advice; move towards io_combine_limit (behavior B). */ |
873 |
| - if (stream->distance > stream->io_combine_limit) |
874 |
| - { |
875 |
| - stream->distance--; |
876 |
| - } |
877 |
| - else |
878 |
| - { |
879 |
| - distance = stream->distance * 2; |
880 |
| - distance = Min(distance, stream->io_combine_limit); |
881 |
| - distance = Min(distance, stream->max_pinned_buffers); |
882 |
| - stream->distance = distance; |
883 |
| - } |
884 |
| - } |
| 850 | + /* |
| 851 | + * If we've caught up with the first advice issued for the current |
| 852 | + * sequential region, cancel further advice until the next random |
| 853 | + * jump. The kernel should be able to see the pattern now that we're |
| 854 | + * actually making sequential preadv() calls. |
| 855 | + */ |
| 856 | + if (stream->advice_enabled && |
| 857 | + stream->ios[io_index].op.blocknum == stream->seq_until_processed) |
| 858 | + stream->seq_until_processed = InvalidBlockNumber; |
885 | 859 | }
|
886 | 860 |
|
887 | 861 | #ifdef CLOBBER_FREED_MEMORY
|
|
0 commit comments