cluster-pageout: when VM scanner reclaim decides to write a dirty page,
try to write out a whole cluster of dirty pages around it (up to
PAGE_CLUSTER_WING pages on each side, stopping at PAGE_CLUSTER_SIZE
alignment), via ->writepages(), instead of a single ->writepage() call.
The pivot page is already locked by the caller; wbc->pivot/->pivot_ret
communicate it to mpage_writepages(), which trylocks all other pages.

[review fix: the backward scan in pageout_cluster() had an unconditional
 "break" (missing braces around "-- start; break;"), so the cluster never
 extended backward past the pivot and a dirty unaligned predecessor page
 was dropped from the cluster; braces added, hunk headers and diffstat
 updated accordingly]

 fs/mpage.c                |  103 +++++++++++++++++++++++----------------------
 include/linux/writeback.h |    6 ++
 mm/vmscan.c               |   75 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 132 insertions(+), 52 deletions(-)

diff -puN mm/vmscan.c~cluster-pageout mm/vmscan.c
--- bk-linux/mm/vmscan.c~cluster-pageout	2004-11-08 15:08:29.706526504 +0300
+++ bk-linux-nikita/mm/vmscan.c	2004-11-08 15:08:29.748520120 +0300
@@ -287,6 +287,79 @@ static void handle_write_error(struct ad
 	unlock_page(page);
 }
 
+enum {
+	PAGE_CLUSTER_WING = 16,
+	PAGE_CLUSTER_SIZE = 2 * PAGE_CLUSTER_WING,
+};
+
+enum {
+	PIVOT_RET_MAGIC = 42
+};
+
+static int pageout_cluster(struct page *page, struct address_space *mapping,
+			   struct writeback_control *wbc)
+{
+	pgoff_t punct;
+	pgoff_t start;
+	pgoff_t end;
+	struct page *opage = page;
+
+	if (PageSwapCache(page) ||
+	    (!current_is_kswapd() &&
+	     bdi_write_congested(mapping->backing_dev_info)))
+		return mapping->a_ops->writepage(page, wbc);
+
+	wbc->pivot = page;
+	punct = page->index;
+	spin_lock_irq(&mapping->tree_lock);
+	for (start = punct - 1;
+	     start < punct && punct - start <= PAGE_CLUSTER_WING; -- start) {
+		page = radix_tree_lookup(&mapping->page_tree, start);
+		if (page == NULL || !PageDirty(page))
+			/*
+			 * no suitable page, stop cluster at this point
+			 */
+			break;
+		if ((start % PAGE_CLUSTER_SIZE) == 0) {
+			/*
+			 * we reached aligned page.
+			 */
+			-- start;
+			break;
+		}
+	}
+	++ start;
+	for (end = punct + 1;
+	     end > punct && end - start < PAGE_CLUSTER_SIZE; ++ end) {
+		/*
+		 * XXX nikita: consider find_get_pages_tag()
+		 */
+		page = radix_tree_lookup(&mapping->page_tree, end);
+		if (page == NULL || !PageDirty(page))
+			/*
+			 * no suitable page, stop cluster at this point
+			 */
+			break;
+	}
+	spin_unlock_irq(&mapping->tree_lock);
+	-- end;
+	wbc->pivot_ret = PIVOT_RET_MAGIC; /* magic */
+	if (end > start) {
+		wbc->start = ((loff_t)start) << PAGE_CACHE_SHIFT;
+		wbc->end = ((loff_t)end) << PAGE_CACHE_SHIFT;
+		wbc->end += PAGE_CACHE_SIZE - 1;
+		wbc->nr_to_write = end - start + 1;
+		do_writepages(mapping, wbc);
+	}
+	if (wbc->pivot_ret == PIVOT_RET_MAGIC)
+		/*
+		 * single page, or ->writepages() skipped pivot for any
+		 * reason: just call ->writepage()
+		 */
+		wbc->pivot_ret = mapping->a_ops->writepage(opage, wbc);
+	return wbc->pivot_ret;
+}
+
 /*
  * Called by shrink_list() for each dirty page.  Calls ->writepage().
  */
@@ -357,7 +430,7 @@ static pageout_t pageout(struct page *pa
 		ClearPageSkipped(page);
 		SetPageReclaim(page);
-		res = mapping->a_ops->writepage(page, &wbc);
+		res = pageout_cluster(page, mapping, &wbc);
 
 		if (res < 0)
 			handle_write_error(mapping, page, res);
 
diff -puN include/linux/writeback.h~cluster-pageout include/linux/writeback.h
--- bk-linux/include/linux/writeback.h~cluster-pageout	2004-11-08 15:08:29.709526048 +0300
+++ bk-linux-nikita/include/linux/writeback.h	2004-11-08 15:08:29.748520120 +0300
@@ -55,6 +55,12 @@ struct writeback_control {
 	unsigned encountered_congestion:1; /* An output: a queue is full */
 	unsigned for_kupdate:1;		/* A kupdate writeback */
 	unsigned for_reclaim:1;		/* Invoked from the page allocator */
+	/* if non-NULL, page already locked by ->writepages()
+	 * caller. ->writepages() should use trylock on all other pages it
+	 * submits for IO */
+	struct page *pivot;
+	/* if ->pivot is not NULL, result for pivot page is stored here */
+	int pivot_ret;
 };
 
 /*
diff -puN fs/mpage.c~cluster-pageout fs/mpage.c
--- bk-linux/fs/mpage.c~cluster-pageout	2004-11-08 15:08:29.744520728 +0300
+++ bk-linux-nikita/fs/mpage.c	2004-11-08 15:08:29.749519968 +0300
@@ -126,8 +126,8 @@ mpage_alloc(struct block_device *bdev,
  * them.  So when the buffer is up to date and the page size == block size,
  * this marks the page up to date instead of adding new buffers.
  */
-static void 
-map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) 
+static void
+map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block)
 {
 	struct inode *inode = page->mapping->host;
 	struct buffer_head *page_bh, *head;
@@ -138,9 +138,9 @@ map_buffer_to_page(struct page *page, st
 	 * don't make any buffers if there is only one buffer on
 	 * the page and the page just needs to be set up to date
 	 */
-	if (inode->i_blkbits == PAGE_CACHE_SHIFT && 
+	if (inode->i_blkbits == PAGE_CACHE_SHIFT &&
 	    buffer_uptodate(bh)) {
-		SetPageUptodate(page);	
+		SetPageUptodate(page);
 		return;
 	}
 	create_empty_buffers(page, 1 << inode->i_blkbits, 0);
@@ -197,7 +197,7 @@ map_buffer_to_page(struct page *page, st
  * 12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16
  * because the indirect block has to be read to get the mappings of blocks
  * 13,14,15,16.  Obviously, this impacts performance.
- * 
+ *
  * So what we do it to allow the filesystem's get_block() function to set
  * BH_Boundary when it maps block 11.  BH_Boundary says: mapping of the block
  * after this one will require I/O against a block which is probably close to
@@ -380,7 +380,7 @@ EXPORT_SYMBOL(mpage_readpage);
  *
  * If all blocks are found to be contiguous then the page can go into the
  * BIO.  Otherwise fall back to the mapping's writepage().
- * 
+ *
  * FIXME: This code wants an estimate of how many pages are still to be
  * written, so it can intelligently allocate a suitably-sized BIO.  For now,
  * just allocate full-size (16-page) BIOs.
@@ -407,6 +407,7 @@ mpage_writepage(struct bio *bio, struct
 	struct buffer_head map_bh;
 	loff_t i_size = i_size_read(inode);
 
+	*ret = 0;
 	if (page_has_buffers(page)) {
 		struct buffer_head *head = page_buffers(page);
 		struct buffer_head *bh = head;
@@ -581,15 +582,6 @@ confused:
 	if (bio)
 		bio = mpage_bio_submit(WRITE, bio);
 	*ret = page->mapping->a_ops->writepage(page, wbc);
-	/*
-	 * The caller has a ref on the inode, so *mapping is stable
-	 */
-	if (*ret) {
-		if (*ret == -ENOSPC)
-			set_bit(AS_ENOSPC, &mapping->flags);
-		else
-			set_bit(AS_EIO, &mapping->flags);
-	}
 out:
 	return bio;
 }
@@ -597,7 +589,7 @@
 /**
  * mpage_writepages - walk the list of dirty pages of the given
  * address space and writepage() all of them.
- * 
+ *
  * @mapping: address space structure to write
  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
  * @get_block: the filesystem's block mapper function.
@@ -665,50 +657,59 @@ retry:
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
 
-			/*
-			 * At this point we hold neither mapping->tree_lock nor
-			 * lock on the page itself: the page may be truncated or
-			 * invalidated (changing page->mapping to NULL), or even
-			 * swizzled back from swapper_space to tmpfs file
-			 * mapping
-			 */
-
-			lock_page(page);
+			if (page != wbc->pivot) {
+				/*
+				 * At this point we hold neither
+				 * mapping->tree_lock nor lock on the page
+				 * itself: the page may be truncated or
+				 * invalidated (changing page->mapping to
+				 * NULL), or even swizzled back from
+				 * swapper_space to tmpfs file mapping
+				 */
 
-			if (unlikely(page->mapping != mapping)) {
-				unlock_page(page);
-				continue;
-			}
+				if (wbc->pivot != NULL) {
+					if (unlikely(TestSetPageLocked(page)))
+						continue;
+				} else
+					lock_page(page);
+
+				if (unlikely(page->mapping != mapping)) {
+					unlock_page(page);
+					continue;
+				}
 
-			if (unlikely(is_range) && page->index > end) {
-				done = 1;
-				unlock_page(page);
-				continue;
-			}
+				if (unlikely(is_range) && page->index > end) {
+					done = 1;
+					unlock_page(page);
+					continue;
+				}
 
-			if (wbc->sync_mode != WB_SYNC_NONE)
-				wait_on_page_writeback(page);
+				if (wbc->sync_mode != WB_SYNC_NONE)
+					wait_on_page_writeback(page);
 
-			if (PageWriteback(page) ||
-			    !clear_page_dirty_for_io(page)) {
-				unlock_page(page);
-				continue;
+				if (PageWriteback(page) ||
+				    !clear_page_dirty_for_io(page)) {
+					unlock_page(page);
+					continue;
+				}
 			}
-
-			if (writepage) {
+			if (writepage)
 				ret = (*writepage)(page, wbc);
-				if (ret) {
-					if (ret == -ENOSPC)
-						set_bit(AS_ENOSPC,
-							&mapping->flags);
-					else
-						set_bit(AS_EIO,
-							&mapping->flags);
-				}
-			} else {
+			else
 				bio = mpage_writepage(bio, page, get_block,
 						&last_block_in_bio, &ret, wbc);
+			if (ret) {
+				/*
+				 * The caller has a ref on the inode, so
+				 * *mapping is stable
+				 */
+				if (ret == -ENOSPC)
+					set_bit(AS_ENOSPC, &mapping->flags);
+				else
+					set_bit(AS_EIO, &mapping->flags);
 			}
+			if (page == wbc->pivot)
+				wbc->pivot_ret = ret;
 			if (ret || (--(wbc->nr_to_write) <= 0))
 				done = 1;
 			if (wbc->nonblocking && bdi_write_congested(bdi)) {
_