The Linux read system call: page_cache_sync_readahead()

Overview

1 page_cache_sync_readahead()

The previous article showed that do_generic_file_read(), at its find_page label, calls page_cache_sync_readahead() to start synchronous readahead.
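
For context, the call site inside do_generic_file_read() looks roughly like this (abbreviated from mm/filemap.c of the same kernel generation as the code quoted below):

find_page:
	page = find_get_page(mapping, index);
	if (!page) {
		/* nothing cached yet: start synchronous readahead, then retry */
		page_cache_sync_readahead(mapping, ra, filp,
				index, last_index - index);
		page = find_get_page(mapping, index);
		if (unlikely(page == NULL))
			goto no_cached_page;
	}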

void page_cache_sync_readahead(struct address_space *mapping,
			       struct file_ra_state *ra, struct file *filp,
			       pgoff_t offset, unsigned long req_size)
{
	/* no read-ahead */
	if (!ra->ra_pages)
		return;

	/* be dumb */
	if (filp && (filp->f_mode & FMODE_RANDOM)) {
		force_page_cache_readahead(mapping, filp, offset, req_size);
		return;
	}

	/* do read-ahead */
	ondemand_readahead(mapping, ra, filp, false, offset, req_size);
}
  • If the maximum number of readahead pages (ra->ra_pages) is 0, readahead is disabled and the function returns immediately.
  • If the file is marked for random access (FMODE_RANDOM), force_page_cache_readahead() is called instead; the user-space sketch after this list shows how that mode is normally set.
  • Otherwise, ondemand_readahead() is called to start the readahead.
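
FMODE_RANDOM is normally set on the struct file when user space calls posix_fadvise() with POSIX_FADV_RANDOM (and cleared again by POSIX_FADV_NORMAL or POSIX_FADV_SEQUENTIAL). A minimal user-space sketch of driving the force_page_cache_readahead() path (the file name and offsets are made up for illustration):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	int fd = open("/tmp/testfile", O_RDONLY);	/* hypothetical test file */

	if (fd < 0)
		return 1;

	/* Tell the kernel we will access the file randomly: this sets
	 * FMODE_RANDOM, so page_cache_sync_readahead() takes the "be dumb"
	 * branch and just reads the requested range without keeping
	 * readahead state. */
	posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM);

	pread(fd, buf, sizeof(buf), 1 << 20);	/* one random 4 KiB read at 1 MiB */

	close(fd);
	return 0;
}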

2 ondemand_readahead()

static unsigned long
ondemand_readahead(struct address_space *mapping,
		   struct file_ra_state *ra, struct file *filp,
		   bool hit_readahead_marker, pgoff_t offset,
		   unsigned long req_size)
{
	unsigned long max = max_sane_readahead(ra->ra_pages);
	pgoff_t prev_offset;

	/*
	 * start of file
	 */
	if (!offset)
		goto initial_readahead;

	/*
	 * It's the expected callback offset, assume sequential access.
	 * Ramp up sizes, and push forward the readahead window.
	 */
	if ((offset == (ra->start + ra->size - ra->async_size) ||
	     offset == (ra->start + ra->size))) {
		ra->start += ra->size;
		ra->size = get_next_ra_size(ra, max);
		ra->async_size = ra->size;
		goto readit;
	}

	/*
	 * Hit a marked page without valid readahead state.
	 * E.g. interleaved reads.
	 * Query the pagecache for async_size, which normally equals to
	 * readahead size. Ramp it up and use it as the new readahead size.
	 */
	if (hit_readahead_marker) {
		pgoff_t start;

		rcu_read_lock();
		start = page_cache_next_hole(mapping, offset + 1, max);
		rcu_read_unlock();

		if (!start || start - offset > max)
			return 0;

		ra->start = start;
		ra->size = start - offset;	/* old async_size */
		ra->size += req_size;
		ra->size = get_next_ra_size(ra, max);
		ra->async_size = ra->size;
		goto readit;
	}

	/*
	 * oversize read
	 */
	if (req_size > max)
		goto initial_readahead;

	/*
	 * sequential cache miss
	 * trivial case: (offset - prev_offset) == 1
	 * unaligned reads: (offset - prev_offset) == 0
	 */
	prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT;
	if (offset - prev_offset <= 1UL)
		goto initial_readahead;

	/*
	 * Query the page cache and look for the traces(cached history pages)
	 * that a sequential stream would leave behind.
	 */
	if (try_context_readahead(mapping, ra, offset, req_size, max))
		goto readit;

	/*
	 * standalone, small random read
	 * Read as is, and do not pollute the readahead state.
	 */
	return __do_page_cache_readahead(mapping, filp, offset, req_size, 0);

initial_readahead:
	ra->start = offset;
	ra->size = get_init_ra_size(req_size, max);
	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

readit:
	/*
	 * Will this read hit the readahead marker made by itself?
	 * If so, trigger the readahead marker hit now, and merge
	 * the resulted next readahead window into the current one.
	 */
	if (offset == ra->start && ra->size == ra->async_size) {
		ra->async_size = get_next_ra_size(ra, max);
		ra->size += ra->async_size;
	}

	return ra_submit(ra, mapping, filp);
}

Rather than walking through the code line by line, the main logic is:

  • If the read starts at the beginning of the file, it is assumed to be sequential and the readahead state is initialized; for a small request the initial window typically comes out to 4 pages.
  • If the read does not start at the file head, the code checks whether it continues the previous request; if so, the readahead window is enlarged, normally to twice the previous size (see the sketch after this list).
  • Otherwise the access is treated as random: readahead is not used and only the pages covered by the sys_read request are read.
  • Finally, ra_submit() is called to submit the read request.
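
The ramp-up is implemented by get_init_ra_size() and get_next_ra_size(). As a rough model of the growth policy, here is a minimal user-space sketch (not kernel code; the exact thresholds and the cap differ between kernel versions and devices):

#include <stdio.h>

#define MAX_RA_PAGES 32	/* assumed cap, e.g. 128 KiB with 4 KiB pages */

/* mirrors the policy described above: small windows grow 4x,
 * larger ones 2x, never exceeding the cap */
static unsigned long next_ra_size(unsigned long cur, unsigned long max)
{
	unsigned long next = (cur < max / 16) ? cur * 4 : cur * 2;
	return next < max ? next : max;
}

int main(void)
{
	unsigned long size = 4;	/* typical initial window for a one-page read */

	for (int step = 0; step < 6; step++) {
		printf("readahead window after step %d: %lu pages\n", step, size);
		size = next_ra_size(size, MAX_RA_PAGES);
	}
	return 0;
}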

3 ra_submit()

static inline unsigned long ra_submit(struct file_ra_state *ra,
				      struct address_space *mapping,
				      struct file *filp)
{
	return __do_page_cache_readahead(mapping, filp,
					 ra->start, ra->size, ra->async_size);
}

ra_submit() is just a thin wrapper around __do_page_cache_readahead(), handing over the readahead window that ondemand_readahead() has set up.

4 __do_page_cache_readahead()

int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
			pgoff_t offset, unsigned long nr_to_read,
			unsigned long lookahead_size)
{
	struct inode *inode = mapping->host;
	struct page *page;
	unsigned long end_index;	/* The last page we want to read */
	LIST_HEAD(page_pool);
	int page_idx;
	int ret = 0;
	loff_t isize = i_size_read(inode);

	if (isize == 0)
		goto out;

	end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);

	/*
	 * Preallocate as many pages as we will need.
	 */
	for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
		pgoff_t page_offset = offset + page_idx;

		if (page_offset > end_index)
			break;

		rcu_read_lock();
		page = radix_tree_lookup(&mapping->page_tree, page_offset);
		rcu_read_unlock();
		if (page && !radix_tree_exceptional_entry(page))
			continue;

		page = page_cache_alloc_readahead(mapping);
		if (!page)
			break;
		page->index = page_offset;
		list_add(&page->lru, &page_pool);
		if (page_idx == nr_to_read - lookahead_size)
			SetPageReadahead(page);
		ret++;
	}

	/*
	 * Now start the IO.  We ignore I/O errors - if the page is not
	 * uptodate then the caller will launch readpage again, and
	 * will then handle the error.
	 */
	if (ret)
		read_pages(mapping, filp, &page_pool, ret);
	BUG_ON(!list_empty(&page_pool));
out:
	return ret;
}
  • Before allocating any memory, check whether each page is already in the page cache, since another process may have read some of these pages in already.
  • If a page is not in the cache, allocate a new page and add it to the local page pool (page_pool).
  • The page at index nr_to_read - lookahead_size gets the PG_readahead flag; hitting that marker later is what triggers the next round of asynchronous readahead (a small sketch of the arithmetic follows this list).
  • Once the pages are prepared, read_pages() is called to actually read the file data.
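
To make the marker arithmetic concrete, here is a tiny stand-alone sketch (the window sizes below are made up for illustration):

#include <stdio.h>

int main(void)
{
	unsigned long nr_to_read = 16;		/* ra->size: pages submitted in this window */
	unsigned long lookahead_size = 8;	/* ra->async_size: the asynchronous part */

	/* same condition as in __do_page_cache_readahead(): the page that sits
	 * lookahead_size pages before the end of the window gets PG_readahead */
	for (unsigned long page_idx = 0; page_idx < nr_to_read; page_idx++)
		if (page_idx == nr_to_read - lookahead_size)
			printf("PG_readahead goes on window page %lu\n", page_idx);

	return 0;
}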

5 read_pages()

static int read_pages(struct address_space *mapping, struct file *filp,
		struct list_head *pages, unsigned nr_pages)
{
	struct blk_plug plug;
	unsigned page_idx;
	int ret;

	blk_start_plug(&plug);	/* block/blk-core.c, line 3033 */

	if (mapping->a_ops->readpages) {
		/* e.g. ext3_readpages(), fs/ext3/inode.c, line 1934 */
		ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
		/* Clean up the remaining pages */
		put_pages_list(pages);
		goto out;
	}

	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
		struct page *page = list_to_page(pages);
		list_del(&page->lru);
		if (!add_to_page_cache_lru(page, mapping,
					page->index, GFP_KERNEL)) {
			mapping->a_ops->readpage(filp, page);
		}
		page_cache_release(page);
	}
	ret = 0;

out:
	blk_finish_plug(&plug);	/* block/blk-core.c, line 3196 */
	return ret;
}
  • blk_start_plug() initializes plug->list and attaches the plug to the current task_struct, so that the I/O requests built below can be batched.
  • If the file system implements the readpages method in its address_space_operations, it is called to read all the pages in one go; otherwise the code falls back to calling the per-page readpage method for each page (see the aops sketch after this list).
  • blk_finish_plug() flushes plug->list down to the block layer and clears the plug from the task_struct.
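
mapping->a_ops->readpages comes from the file system's address_space_operations table; for ext3 it is wired up roughly like this (abbreviated sketch of fs/ext3/inode.c, the exact field set depends on the journalling mode and kernel version):

static const struct address_space_operations ext3_ordered_aops = {
	.readpage	= ext3_readpage,	/* per-page fallback path */
	.readpages	= ext3_readpages,	/* batched readahead path */
	.writepage	= ext3_ordered_writepage,
	/* ... write_begin/write_end, bmap, releasepage, etc. ... */
};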

6 ext3_readpages()

static int
ext3_readpages(struct file *file, struct address_space *mapping,
		struct list_head *pages, unsigned nr_pages)
{
	return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);	/* fs/mpage.c, line 356 */
}
  • ext3_readpages() is a thin wrapper around mpage_readpages(); note that the last argument, ext3_get_block, is a function pointer (its type is shown after this list).
  • ext3_get_block() computes, for each page to be read, the corresponding block number(s) on disk.
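
The function pointer has the kernel's get_block_t type (include/linux/fs.h): given a logical block number of the file, the callback fills in a buffer_head describing where that block lives on disk (for reads, create is 0, and a hole simply leaves the buffer unmapped):

typedef int (get_block_t)(struct inode *inode, sector_t iblock,
			struct buffer_head *bh_result, int create);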

7 mpage_readpages()

int
mpage_readpages(struct address_space *mapping, struct list_head *pages,
				unsigned nr_pages, get_block_t get_block)
{
	struct bio *bio = NULL;
	unsigned page_idx;
	sector_t last_block_in_bio = 0;
	struct buffer_head map_bh;
	unsigned long first_logical_block = 0;

	map_bh.b_state = 0;
	map_bh.b_size = 0;
	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
		struct page *page = list_entry(pages->prev, struct page, lru);

		prefetchw(&page->flags);
		list_del(&page->lru);
		if (!add_to_page_cache_lru(page, mapping,
					page->index, GFP_KERNEL)) {
			bio = do_mpage_readpage(bio, page,
					nr_pages - page_idx,
					&last_block_in_bio, &map_bh,
					&first_logical_block,
					get_block);
		}
		page_cache_release(page);
	}
	BUG_ON(!list_empty(pages));
	if (bio)
		mpage_bio_submit(READ, bio);
	return 0;
}
  • The loop calls do_mpage_readpage() to turn each page to be read into bio requests. If the pages are contiguous on disk, a single bio is enough; if not, several bios are used (a simplified sketch of that decision follows this list). bios themselves can be chained into a singly linked list via their bi_next pointer.
  • mpage_bio_submit() is then called to submit the last pending bio.
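
The key decision inside do_mpage_readpage() is whether the current page's first disk block continues the blocks already collected in the pending bio. A heavily simplified sketch of that logic (variable names such as first_block_of_page are illustrative; the real function also handles holes, partially mapped pages and a fallback to block_read_full_page()):

	if (bio && last_block_in_bio != first_block_of_page - 1)
		/* this page is not contiguous with the blocks already in
		 * the bio: submit what we have and start a new one */
		bio = mpage_bio_submit(READ, bio);

	if (bio == NULL)
		bio = mpage_alloc(bdev, first_block_of_page << (blkbits - 9),
				nr_pages, GFP_KERNEL);

	if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE)
		/* the bio is full: submit it and let the caller retry this page */
		bio = mpage_bio_submit(READ, bio);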

8 mpage_bio_submit()

static struct bio *mpage_bio_submit(int rw, struct bio *bio)
{
	bio->bi_end_io = mpage_end_io;
	guard_bio_eod(rw, bio);
	submit_bio(rw, bio);	/* block/blk-core.c, line 1965 */
	return NULL;
}
  • The bio completion callback is set to mpage_end_io(); it runs once the I/O finishes and sets the appropriate page flags (a simplified sketch follows this list).
  • The bio is then submitted to the generic block layer with submit_bio().
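
On the completion side, the callback looks roughly like this (simplified sketch modelled on mpage_end_io() in fs/mpage.c; the exact signature and helpers differ between kernel versions):

static void mpage_end_io(struct bio *bio, int err)
{
	struct bio_vec *bv;
	int i;

	/* walk every page segment carried by this bio */
	bio_for_each_segment_all(bv, bio, i) {
		struct page *page = bv->bv_page;

		if (!err)
			SetPageUptodate(page);	/* the data in the page is now valid */
		else
			SetPageError(page);

		unlock_page(page);	/* wakes readers waiting in lock_page() */
	}

	bio_put(bio);
}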
