Perform calls to ->writepage() asynchronously.

The VM scanner starts pageout for dirty pages found at the tail of the
inactive list during a scan.  It is assumed (or at least desired) that under
normal conditions the amount of such writeback is small.  Even if only a few
pages are paged out by the scanner, they still stall the "direct reclaim"
path (__alloc_pages()->try_to_free_pages()->...->shrink_list()->writepage()),
and to decrease allocation latency it makes sense to perform the pageout
asynchronously.

The current design is very simple: a fixed number of pageout threads is
started at boot.  If shrink_list() decides that a page is eligible for
asynchronous pageout, the page is placed on a shared queue and later
processed by one of the pageout threads.

The most interesting part of this patch is async_writepage(), which decides
when a page should be paged out asynchronously.  (A small user-space sketch
of the producer/consumer scheme is appended after the patch, for
illustration only.)

 mm/vmscan.c  |  162 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 page_alloc.c |    0 
 2 files changed, 159 insertions(+), 3 deletions(-)

diff -puN mm/vmscan.c~async-writepage mm/vmscan.c
--- bk-linux/mm/vmscan.c~async-writepage	2004-11-01 00:28:24.046731128 +0300
+++ bk-linux-nikita/mm/vmscan.c	2004-11-01 00:28:24.056729608 +0300
@@ -76,6 +76,21 @@ struct scan_control {
 };
 
 /*
+ * Asynchronous writepage tunables.
+ */
+enum {
+	KAIO_THROTTLE = 128,
+	KAIO_CLUSTER_SIZE = 4,
+	KAIO_THREADS_NR = 4
+};
+
+static spinlock_t kaio_queue_lock = SPIN_LOCK_UNLOCKED;
+static unsigned int kaio_nr_requests = 0;
+static unsigned int kaio_threads_active = KAIO_THREADS_NR;
+static LIST_HEAD(kaio_queue);
+static DECLARE_WAIT_QUEUE_HEAD(kaio_wait);
+
+/*
  * The list of shrinker callbacks used by to apply pressure to
  * ageable caches.
  */
@@ -371,13 +386,51 @@ static pageout_t pageout(struct page *pa
 }
 
 /*
+ * check whether writepage should be done asynchronously by kaiod.
+ */
+static int
+async_writepage(struct page *page, int nr_dirty)
+{
+	/* goal of doing writepage asynchronously is to decrease latency of
+	 * memory allocations involving direct reclaim, which is inapplicable
+	 * to the kswapd */
+	if (current_is_kswapd())
+		return 0;
+	/* limit number of pending async-writepage requests */
+	else if (kaio_nr_requests > KAIO_THROTTLE)
+		return 0;
+	/* if we are under memory pressure---do pageout synchronously to
+	 * throttle scanner. */
+	else if (page_zone(page)->prev_priority != DEF_PRIORITY)
+		return 0;
+	/* if expected number of writepage requests submitted by this
+	 * invocation of shrink_list() is small enough---do them
+	 * asynchronously */
+	else if (nr_dirty <= KAIO_CLUSTER_SIZE)
+		return 1;
+	else
+		return 0;
+}
+
+static void
+send_page_to_kaiod(struct page *page)
+{
+	spin_lock(&kaio_queue_lock);
+	list_add_tail(&page->lru, &kaio_queue);
+	kaio_nr_requests ++;
+	spin_unlock(&kaio_queue_lock);
+}
+
+/*
  * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
  */
-static int shrink_list(struct list_head *page_list, struct scan_control *sc)
+static int shrink_list(struct list_head *page_list, struct scan_control *sc,
+		       int nr_dirty)
 {
 	LIST_HEAD(ret_pages);
 	struct pagevec freed_pvec;
 	int pgactivate = 0;
+	int pgaio = 0;
 	int reclaimed = 0;
 
 	cond_resched();
@@ -447,6 +500,12 @@ static int shrink_list(struct list_head
 			goto keep_locked;
 		if (laptop_mode && !sc->may_writepage)
 			goto keep_locked;
+		if (async_writepage(page, nr_dirty)) {
+			pgaio ++;
+			unlock_page(page);
+			send_page_to_kaiod(page);
+			continue;
+		}
 
 		/* Page is dirty, try to write it out here */
 		switch(pageout(page, mapping)) {
@@ -545,6 +604,8 @@ keep:
 		list_add(&page->lru, &ret_pages);
 		BUG_ON(PageLRU(page));
 	}
+	if (pgaio > 0)
+		wake_up_interruptible(&kaio_wait);
 	list_splice(&ret_pages, page_list);
 	if (pagevec_count(&freed_pvec))
 		__pagevec_release_nonlru(&freed_pvec);
@@ -578,6 +639,7 @@ static void shrink_cache(struct zone *zo
 	int nr_taken = 0;
 	int nr_scan = 0;
 	int nr_freed;
+	int nr_dirty = 0;
 
 	while (nr_scan++ < SWAP_CLUSTER_MAX &&
 			!list_empty(&zone->inactive_list)) {
@@ -600,6 +662,8 @@ static void shrink_cache(struct zone *zo
 		}
 		list_add(&page->lru, &page_list);
 		nr_taken++;
+		if (PageDirty(page))
+			nr_dirty++;
 	}
 	zone->nr_inactive -= nr_taken;
 	zone->pages_scanned += nr_taken;
@@ -613,7 +677,7 @@ static void shrink_cache(struct zone *zo
 		mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
 	else
 		mod_page_state_zone(zone, pgscan_direct, nr_scan);
-	nr_freed = shrink_list(&page_list, sc);
+	nr_freed = shrink_list(&page_list, sc, nr_dirty);
 	if (current_is_kswapd())
 		mod_page_state(kswapd_steal, nr_freed);
 	mod_page_state_zone(zone, pgsteal, nr_freed);
@@ -647,7 +711,6 @@ done:
 	pagevec_release(&pvec);
 }
 
-
 /* move pages from @page_list to the @spot, that should be somewhere on the
  * @zone->active_list */
 static int
@@ -1185,6 +1248,96 @@ out:
 	return total_reclaimed;
 }
 
+static int kaiod(void *p)
+{
+	daemonize("kaiod%i", (int)p);
+
+	current->flags |= PF_MEMALLOC|PF_KSWAPD;
+
+	while (1) {
+		DEFINE_WAIT(wait);
+		LIST_HEAD(todo);
+		LIST_HEAD(done);
+		struct pagevec pvec;
+		struct zone *zone;
+		struct page *page;
+
+		if (current->flags & PF_FREEZE)
+			refrigerator(PF_FREEZE);
+
+		pagevec_init(&pvec, 1);
+
+		spin_lock(&kaio_queue_lock);
+		while (kaio_nr_requests == 0) {
+			prepare_to_wait_exclusive(&kaio_wait,
+						  &wait, TASK_INTERRUPTIBLE);
+			-- kaio_threads_active;
+			spin_unlock(&kaio_queue_lock);
+			schedule();
+			finish_wait(&kaio_wait, &wait);
+			spin_lock(&kaio_queue_lock);
+			++ kaio_threads_active;
+		}
+		list_splice_init(&kaio_queue, &todo);
+		kaio_nr_requests = 0;
+		spin_unlock(&kaio_queue_lock);
+		while (!list_empty(&todo)) {
+			pageout_t outcome;
+
+			page = lru_to_page(&todo);
+			list_del(&page->lru);
+
+			if (TestSetPageLocked(page))
+				outcome = PAGE_SUCCESS;
+			else if (PageWriteback(page))
+				outcome = PAGE_KEEP;
+			else if (!page_referenced(page, 1) && PageDirty(page))
+				outcome = pageout(page, page_mapping(page));
+			else
+				outcome = PAGE_KEEP;
+
+			switch (outcome) {
+			case PAGE_ACTIVATE:
+				SetPageActive(page);
+			case PAGE_KEEP:
+			case PAGE_CLEAN:
+				unlock_page(page);
+			case PAGE_SUCCESS:
+				list_add(&page->lru, &done);
+				BUG_ON(PageLRU(page));
+			}
+		}
+		zone = NULL;
+		while (!list_empty(&done)) {
+			page = lru_to_page(&done);
+			if (page_zone(page) != zone) {
+				if (zone != NULL)
+					spin_unlock_irq(&zone->lru_lock);
+				zone = page_zone(page);
+				spin_lock_irq(&zone->lru_lock);
+			}
+			if (TestSetPageLRU(page))
+				BUG();
+			list_del(&page->lru);
+			if (PageActive(page)) {
+				if (PageSkipped(page))
+					ClearPageSkipped(page);
+				add_page_to_active_list(zone, page);
+			} else {
+				add_page_to_inactive_list(zone, page);
+			}
+			if (!pagevec_add(&pvec, page)) {
+				spin_unlock_irq(&zone->lru_lock);
+				__pagevec_release(&pvec);
+				spin_lock_irq(&zone->lru_lock);
+			}
+		}
+		if (zone != NULL)
+			spin_unlock_irq(&zone->lru_lock);
+		pagevec_release(&pvec);
+	}
+}
+
 /*
  * The background pageout daemon, started as a kernel thread
  * from the init process.
@@ -1308,11 +1461,14 @@ static int __devinit cpu_callback(struct
 
 static int __init kswapd_init(void)
 {
+	int i;
 	pg_data_t *pgdat;
 	swap_setup();
 	for_each_pgdat(pgdat)
 		pgdat->kswapd
 		= find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL));
+	for (i = 0; i < KAIO_THREADS_NR; ++i)
+		kernel_thread(kaiod, (void *)i, CLONE_KERNEL);
 	total_memory = nr_free_pagecache_pages();
 	hotcpu_notifier(cpu_callback, 0);
 	return 0;
diff -puN include/linux/sched.h~async-writepage include/linux/sched.h
diff -puN include/linux/page-flags.h~async-writepage include/linux/page-flags.h
diff -puN mm/page_alloc.c~async-writepage mm/page_alloc.c
_
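
For anyone who wants to play with the queueing scheme outside the kernel,
below is a minimal user-space model of the design: a fixed pool of worker
threads drains a shared queue that the "scanner" feeds only when its own
batch of dirty pages is small and the backlog is not too long.  This is a
sketch only, not part of the patch; every identifier in it (should_defer(),
enqueue(), worker(), THROTTLE, ...) is invented for illustration, and it uses
pthreads in place of kernel threads.  It compiles stand-alone with -pthread.

/*
 * Minimal user-space model of the kaiod design above (illustration only,
 * not kernel code).  A fixed pool of worker threads drains a shared list
 * of "pages" that the scanner defers instead of writing out itself.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

enum { THROTTLE = 128, CLUSTER_SIZE = 4, THREADS_NR = 4 };

struct request {
	struct request *next;
	int page_id;			/* stands in for struct page */
};

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  queue_wait = PTHREAD_COND_INITIALIZER;
static struct request *queue_head;	/* shared work list (order unimportant) */
static unsigned int    nr_requests;

/* rough analogue of async_writepage(): should this writeout be deferred? */
static int should_defer(int nr_dirty, int under_pressure)
{
	unsigned int pending;

	/* the kernel heuristic reads its counter without the lock; the
	 * model takes the lock to stay free of data races */
	pthread_mutex_lock(&queue_lock);
	pending = nr_requests;
	pthread_mutex_unlock(&queue_lock);

	if (pending > THROTTLE)		/* backlog too long already */
		return 0;
	if (under_pressure)		/* throttle the scanner instead */
		return 0;
	return nr_dirty <= CLUSTER_SIZE;	/* only small batches */
}

/* rough analogue of send_page_to_kaiod() + wake_up_interruptible() */
static void enqueue(int page_id)
{
	struct request *rq = malloc(sizeof(*rq));

	rq->page_id = page_id;
	pthread_mutex_lock(&queue_lock);
	rq->next = queue_head;
	queue_head = rq;
	nr_requests++;
	pthread_mutex_unlock(&queue_lock);
	pthread_cond_signal(&queue_wait);
}

/* rough analogue of kaiod(): sleep until work arrives, grab the whole
 * list in one go (like list_splice_init()), then process it unlocked */
static void *worker(void *arg)
{
	(void)arg;
	for (;;) {
		struct request *todo;

		pthread_mutex_lock(&queue_lock);
		while (queue_head == NULL)
			pthread_cond_wait(&queue_wait, &queue_lock);
		todo = queue_head;
		queue_head = NULL;
		nr_requests = 0;
		pthread_mutex_unlock(&queue_lock);

		while (todo != NULL) {
			struct request *rq = todo;

			todo = rq->next;
			printf("worker: writing back page %d\n", rq->page_id);
			free(rq);
		}
	}
	return NULL;
}

int main(void)
{
	pthread_t tid[THREADS_NR];
	int i;

	for (i = 0; i < THREADS_NR; i++)
		pthread_create(&tid[i], NULL, worker, NULL);

	/* the "scanner": defer a few small batches of dirty pages */
	for (i = 0; i < 10; i++)
		if (should_defer(/* nr_dirty */ 3, /* under_pressure */ 0))
			enqueue(i);

	sleep(1);			/* let the workers drain the queue */
	return 0;
}

The only design point the model tries to capture is the one async_writepage()
encodes: hand writeback to the worker pool only when the backlog is short and
the caller is not already under memory pressure; otherwise write the page out
synchronously so the scanner is throttled by its own writeback.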