cluster-pageout: when VM scanner reclaim decides to write a dirty page,
try to write out a whole cluster of dirty pages around it (up to
PAGE_CLUSTER_WING pages on each side, stopping at PAGE_CLUSTER_SIZE
alignment), via ->writepages(), instead of a single ->writepage() call.
The pivot page is already locked by the caller; wbc->pivot/->pivot_ret
communicate it to mpage_writepages(), which trylocks all other pages.

[review fix: the backward scan in pageout_cluster() had an unconditional
 "break" (missing braces around "-- start; break;"), so the cluster never
 extended backward past the pivot and a dirty unaligned predecessor page
 was dropped from the cluster; braces added, hunk headers and diffstat
 updated accordingly]

 fs/mpage.c                |  103 +++++++++++++++++++++++----------------------
 include/linux/writeback.h |    6 ++
 mm/vmscan.c               |   75 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 132 insertions(+), 52 deletions(-)

diff -puN mm/vmscan.c~cluster-pageout mm/vmscan.c
--- bk-linux/mm/vmscan.c~cluster-pageout	2004-11-08 15:08:29.706526504 +0300
+++ bk-linux-nikita/mm/vmscan.c	2004-11-08 15:08:29.748520120 +0300
@@ -287,6 +287,79 @@ static void handle_write_error(struct ad
 	unlock_page(page);
 }
 
+enum {
+	PAGE_CLUSTER_WING = 16,
+	PAGE_CLUSTER_SIZE = 2 * PAGE_CLUSTER_WING,
+};
+
+enum {
+	PIVOT_RET_MAGIC = 42
+};
+
+static int pageout_cluster(struct page *page, struct address_space *mapping,
+			   struct writeback_control *wbc)
+{
+	pgoff_t punct;
+	pgoff_t start;
+	pgoff_t end;
+	struct page *opage = page;
+
+	if (PageSwapCache(page) ||
+	    (!current_is_kswapd() &&
+	     bdi_write_congested(mapping->backing_dev_info)))
+		return mapping->a_ops->writepage(page, wbc);
+
+	wbc->pivot = page;
+	punct = page->index;
+	spin_lock_irq(&mapping->tree_lock);
+	for (start = punct - 1;
+	     start < punct && punct - start <= PAGE_CLUSTER_WING; -- start) {
+		page = radix_tree_lookup(&mapping->page_tree, start);
+		if (page == NULL || !PageDirty(page))
+			/*
+			 * no suitable page, stop cluster at this point
+			 */
+			break;
+		if ((start % PAGE_CLUSTER_SIZE) == 0) {
+			/*
+			 * we reached aligned page.
+			 */
+			-- start;
+			break;
+		}
+	}
+	++ start;
+	for (end = punct + 1;
+	     end > punct && end - start < PAGE_CLUSTER_SIZE; ++ end) {
+		/*
+		 * XXX nikita: consider find_get_pages_tag()
+		 */
+		page = radix_tree_lookup(&mapping->page_tree, end);
+		if (page == NULL || !PageDirty(page))
+			/*
+			 * no suitable page, stop cluster at this point
+			 */
+			break;
+	}
+	spin_unlock_irq(&mapping->tree_lock);
+	-- end;
+	wbc->pivot_ret = PIVOT_RET_MAGIC; /* magic */
+	if (end > start) {
+		wbc->start = ((loff_t)start) << PAGE_CACHE_SHIFT;
+		wbc->end = ((loff_t)end) << PAGE_CACHE_SHIFT;
+		wbc->end += PAGE_CACHE_SIZE - 1;
+		wbc->nr_to_write = end - start + 1;
+		do_writepages(mapping, wbc);
+	}
+	if (wbc->pivot_ret == PIVOT_RET_MAGIC)
+		/*
+		 * single page, or ->writepages() skipped pivot for any
+		 * reason: just call ->writepage()
+		 */
+		wbc->pivot_ret = mapping->a_ops->writepage(opage, wbc);
+	return wbc->pivot_ret;
+}
+
 /*
  * Called by shrink_list() for each dirty page.  Calls ->writepage().
  */
@@ -357,7 +430,7 @@ static pageout_t pageout(struct page *pa
 		ClearPageSkipped(page);
 		SetPageReclaim(page);
-		res = mapping->a_ops->writepage(page, &wbc);
+		res = pageout_cluster(page, mapping, &wbc);
 
 		if (res < 0)
 			handle_write_error(mapping, page, res);
 
diff -puN include/linux/writeback.h~cluster-pageout include/linux/writeback.h
--- bk-linux/include/linux/writeback.h~cluster-pageout	2004-11-08 15:08:29.709526048 +0300
+++ bk-linux-nikita/include/linux/writeback.h	2004-11-08 15:08:29.748520120 +0300
@@ -55,6 +55,12 @@ struct writeback_control {
 	unsigned encountered_congestion:1; /* An output: a queue is full */
 	unsigned for_kupdate:1;		/* A kupdate writeback */
 	unsigned for_reclaim:1;		/* Invoked from the page allocator */
+	/* if non-NULL, page already locked by ->writepages()
+	 * caller. ->writepages() should use trylock on all other pages it
+	 * submits for IO */
+	struct page *pivot;
+	/* if ->pivot is not NULL, result for pivot page is stored here */
+	int pivot_ret;
 };
 
 /*
diff -puN fs/mpage.c~cluster-pageout fs/mpage.c
--- bk-linux/fs/mpage.c~cluster-pageout	2004-11-08 15:08:29.744520728 +0300
+++ bk-linux-nikita/fs/mpage.c	2004-11-08 15:08:29.749519968 +0300
@@ -126,8 +126,8 @@ mpage_alloc(struct block_device *bdev,
  * them.  So when the buffer is up to date and the page size == block size,
  * this marks the page up to date instead of adding new buffers.
  */
-static void 
-map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) 
+static void
+map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block)
 {
 	struct inode *inode = page->mapping->host;
 	struct buffer_head *page_bh, *head;
@@ -138,9 +138,9 @@ map_buffer_to_page(struct page *page, st
 	 * don't make any buffers if there is only one buffer on
 	 * the page and the page just needs to be set up to date
 	 */
-	if (inode->i_blkbits == PAGE_CACHE_SHIFT && 
+	if (inode->i_blkbits == PAGE_CACHE_SHIFT &&
 	    buffer_uptodate(bh)) {
-		SetPageUptodate(page);	
+		SetPageUptodate(page);
 		return;
 	}
 	create_empty_buffers(page, 1 << inode->i_blkbits, 0);
@@ -197,7 +197,7 @@ map_buffer_to_page(struct page *page, st
  * 12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16
  * because the indirect block has to be read to get the mappings of blocks
  * 13,14,15,16.  Obviously, this impacts performance.
- * 
+ *
  * So what we do it to allow the filesystem's get_block() function to set
  * BH_Boundary when it maps block 11.  BH_Boundary says: mapping of the block
  * after this one will require I/O against a block which is probably close to
@@ -380,7 +380,7 @@ EXPORT_SYMBOL(mpage_readpage);
  *
  * If all blocks are found to be contiguous then the page can go into the
  * BIO.  Otherwise fall back to the mapping's writepage().
- * 
+ *
  * FIXME: This code wants an estimate of how many pages are still to be
  * written, so it can intelligently allocate a suitably-sized BIO.  For now,
  * just allocate full-size (16-page) BIOs.
@@ -407,6 +407,7 @@ mpage_writepage(struct bio *bio, struct
 	struct buffer_head map_bh;
 	loff_t i_size = i_size_read(inode);
 
+	*ret = 0;
 	if (page_has_buffers(page)) {
 		struct buffer_head *head = page_buffers(page);
 		struct buffer_head *bh = head;
@@ -581,15 +582,6 @@ confused:
 	if (bio)
 		bio = mpage_bio_submit(WRITE, bio);
 	*ret = page->mapping->a_ops->writepage(page, wbc);
-	/*
-	 * The caller has a ref on the inode, so *mapping is stable
-	 */
-	if (*ret) {
-		if (*ret == -ENOSPC)
-			set_bit(AS_ENOSPC, &mapping->flags);
-		else
-			set_bit(AS_EIO, &mapping->flags);
-	}
 out:
 	return bio;
 }
@@ -597,7 +589,7 @@
 /**
  * mpage_writepages - walk the list of dirty pages of the given
  * address space and writepage() all of them.
- * 
+ *
  * @mapping: address space structure to write
  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
  * @get_block: the filesystem's block mapper function.
@@ -665,50 +657,59 @@ retry:
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
 
-			/*
-			 * At this point we hold neither mapping->tree_lock nor
-			 * lock on the page itself: the page may be truncated or
-			 * invalidated (changing page->mapping to NULL), or even
-			 * swizzled back from swapper_space to tmpfs file
-			 * mapping
-			 */
-
-			lock_page(page);
+			if (page != wbc->pivot) {
+				/*
+				 * At this point we hold neither
+				 * mapping->tree_lock nor lock on the page
+				 * itself: the page may be truncated or
+				 * invalidated (changing page->mapping to
+				 * NULL), or even swizzled back from
+				 * swapper_space to tmpfs file mapping
+				 */
 
-			if (unlikely(page->mapping != mapping)) {
-				unlock_page(page);
-				continue;
-			}
+				if (wbc->pivot != NULL) {
+					if (unlikely(TestSetPageLocked(page)))
+						continue;
+				} else
+					lock_page(page);
+
+				if (unlikely(page->mapping != mapping)) {
+					unlock_page(page);
+					continue;
+				}
 
-			if (unlikely(is_range) && page->index > end) {
-				done = 1;
-				unlock_page(page);
-				continue;
-			}
+				if (unlikely(is_range) && page->index > end) {
+					done = 1;
+					unlock_page(page);
+					continue;
+				}
 
-			if (wbc->sync_mode != WB_SYNC_NONE)
-				wait_on_page_writeback(page);
+				if (wbc->sync_mode != WB_SYNC_NONE)
+					wait_on_page_writeback(page);
 
-			if (PageWriteback(page) ||
-			    !clear_page_dirty_for_io(page)) {
-				unlock_page(page);
-				continue;
+				if (PageWriteback(page) ||
+				    !clear_page_dirty_for_io(page)) {
+					unlock_page(page);
+					continue;
+				}
 			}
-
-			if (writepage) {
+			if (writepage)
 				ret = (*writepage)(page, wbc);
-				if (ret) {
-					if (ret == -ENOSPC)
-						set_bit(AS_ENOSPC,
-							&mapping->flags);
-					else
-						set_bit(AS_EIO,
-							&mapping->flags);
-				}
-			} else {
+			else
 				bio = mpage_writepage(bio, page, get_block,
 						&last_block_in_bio, &ret, wbc);
+			if (ret) {
+				/*
+				 * The caller has a ref on the inode, so
+				 * *mapping is stable
+				 */
+				if (ret == -ENOSPC)
+					set_bit(AS_ENOSPC, &mapping->flags);
+				else
+					set_bit(AS_EIO, &mapping->flags);
 			}
+			if (page == wbc->pivot)
+				wbc->pivot_ret = ret;
 			if (ret || (--(wbc->nr_to_write) <= 0))
 				done = 1;
 			if (wbc->nonblocking && bdi_write_congested(bdi)) {
_