--- linux/fs/inode.c.orig Tue Mar 13 12:48:43 2001 +++ linux/fs/inode.c Tue Mar 13 12:48:47 2001 @@ -103,6 +103,7 @@ INIT_LIST_HEAD(&inode->i_dirty_buffers); sema_init(&inode->i_sem, 1); sema_init(&inode->i_zombie, 1); + spin_lock_init(&inode->i_data.page_lock); spin_lock_init(&inode->i_data.i_shared_lock); } } --- linux/mm/vmscan.c.orig Tue Mar 13 12:48:44 2001 +++ linux/mm/vmscan.c Tue Mar 13 12:48:48 2001 @@ -320,27 +320,38 @@ */ struct page * reclaim_page(zone_t * zone) { + struct address_space *mapping; struct page * page = NULL; struct list_head * page_lru; + spinlock_t *pg_lock; int maxscan; - /* - * We only need the pagemap_lru_lock if we don't reclaim the page, - * but we have to grab the pagecache_lock before the pagemap_lru_lock - * to avoid deadlocks and most of the time we'll succeed anyway. + /* The repeating is necessary because we are taking these + * locks out of the normal order. */ - spin_lock(&pagecache_lock); +repeat: spin_lock(&pagemap_lru_lock); maxscan = zone->inactive_clean_pages; while ((page_lru = zone->inactive_clean_list.prev) != &zone->inactive_clean_list && maxscan--) { page = list_entry(page_lru, struct page, lru); + if (!page->mapping) + goto bogus_page; + + pg_lock = PAGECACHE_LOCK(page); + if (!spin_trylock(pg_lock)) { + maxscan++; + spin_unlock(&pagemap_lru_lock); + goto repeat; + } + /* Wrong page on list?! 
(list corruption, should not happen) */ if (!PageInactiveClean(page)) { printk("VM: reclaim_page, wrong page on list.\n"); list_del(page_lru); page->zone->inactive_clean_pages--; + spin_unlock(pg_lock); continue; } @@ -349,6 +360,7 @@ (!page->buffers && page_count(page) > 1)) { del_page_from_inactive_clean_list(page); add_page_to_active_list(page); + spin_unlock(pg_lock); continue; } @@ -356,6 +368,7 @@ if (page->buffers || PageDirty(page) || TryLockPage(page)) { del_page_from_inactive_clean_list(page); add_page_to_inactive_dirty_list(page); + spin_unlock(pg_lock); continue; } @@ -365,12 +378,18 @@ goto found_page; } - if (page->mapping) { + mapping = page->mapping; + if (mapping) { + spin_lock(&mapping->page_lock); __remove_inode_page(page); + spin_unlock(&mapping->page_lock); goto found_page; } /* We should never ever get here. */ + spin_unlock(pg_lock); + + bogus_page: printk(KERN_ERR "VM: reclaim_page, found unknown page\n"); list_del(page_lru); zone->inactive_clean_pages--; @@ -381,15 +400,16 @@ goto out; found_page: + spin_unlock(pg_lock); del_page_from_inactive_clean_list(page); UnlockPage(page); page->age = PAGE_AGE_START; if (page_count(page) != 1) printk("VM: reclaim_page, found page with count %d!\n", page_count(page)); + out: spin_unlock(&pagemap_lru_lock); - spin_unlock(&pagecache_lock); memory_pressure++; return page; } @@ -700,12 +720,8 @@ page = list_entry(page_lru, struct page, lru); /* Wrong page on list?! (list corruption, should not happen) */ - if (!PageActive(page)) { - printk("VM: refill_inactive, wrong page on list.\n"); - list_del(page_lru); - nr_active_pages--; - continue; - } + if (!PageActive(page)) + BUG(); /* Do aging on the pages. */ if (PageTestandClearReferenced(page)) { @@ -1011,7 +1027,10 @@ * we'll be woken up earlier... 
*/ if (!free_shortage() || !inactive_shortage()) { +repeat_sleep: interruptible_sleep_on_timeout(&kswapd_wait, HZ); + if (!free_shortage() && !inactive_shortage()) + goto repeat_sleep; /* * If we couldn't free enough memory, we see if it was * due to the system just not having enough memory. --- linux/mm/filemap.c.orig Tue Mar 13 12:48:44 2001 +++ linux/mm/filemap.c Tue Mar 13 12:48:48 2001 @@ -42,13 +42,31 @@ atomic_t page_cache_size = ATOMIC_INIT(0); unsigned int page_hash_bits; -struct page **page_hash_table; +struct page_cache_bucket *page_hash_table; -spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED; -/* - * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with - * the pagemap_lru_lock held. +/* Page-cache SMP locking rules: + * + * 1) The identity of a page (mapping, index) is only changed + * under PAGECACHE_LOCK. + * + * 2) The deadlock-free ordering of lock acquisition is + * PAGECACHE_LOCK ==> pagemap_lru_lock ==> mapping->page_lock + * There are cases where two of these locks need to be held + * simultaneously but cannot be obtained in the correct order. + * The way to handle this situation is as follows: + * + * repeat: + * spin_lock(&pagemap_lru_lock); + * some_loop_over_lru_pages() { + * ... + * if (!spin_trylock(PAGECACHE_LOCK(page))) { + * spin_unlock(&pagemap_lru_lock); + * goto repeat; + * } + * ... 
+ * } */ + spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED; #define CLUSTER_PAGES (1 << page_cluster) @@ -72,15 +90,23 @@ { struct list_head *head = &mapping->clean_pages; + spin_lock(&mapping->page_lock); mapping->nrpages++; list_add(&page->list, head); page->mapping = mapping; + spin_unlock(&mapping->page_lock); } static inline void remove_page_from_inode_queue(struct page * page) { struct address_space * mapping = page->mapping; +#ifdef CONFIG_SMP + if (!spin_is_locked(PAGECACHE_LOCK(page))) + BUG(); + if (!spin_is_locked(&mapping->page_lock)) + BUG(); +#endif mapping->nrpages--; list_del(&page->list); page->mapping = NULL; @@ -113,12 +139,81 @@ void remove_inode_page(struct page *page) { + struct address_space * mapping; + spinlock_t *pg_lock; + if (!PageLocked(page)) PAGE_BUG(page); - spin_lock(&pagecache_lock); + pg_lock = PAGECACHE_LOCK(page); + + spin_lock(pg_lock); + mapping = page->mapping; + spin_lock(&mapping->page_lock); __remove_inode_page(page); - spin_unlock(&pagecache_lock); + spin_unlock(&mapping->page_lock); + spin_unlock(pg_lock); +} + +/* + * Flush clean pages from the pagecache. + */ +void flush_inode_pages (struct inode * inode) +{ + unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT; + struct address_space * mapping = inode->i_mapping; + struct list_head *head, *curr; + struct page * page; + +retry: + head = &inode->i_mapping->clean_pages; + spin_lock(&pagemap_lru_lock); + spin_lock(&mapping->page_lock); + curr = head->next; + + while (curr != head) { + spinlock_t *pg_lock; + + page = list_entry(curr, struct page, list); + curr = curr->next; + + if (page->index == end_index) + continue; + pg_lock = PAGECACHE_LOCK(page); + if (!spin_trylock(pg_lock)) { + spin_unlock(&mapping->page_lock); + spin_unlock(&pagemap_lru_lock); + goto retry; + } + + /* We cannot invalidate a locked page */ + if (TryLockPage(page)) { + spin_unlock(pg_lock); + continue; + } + + /* + * We cannot flush a page if buffers are still active. 
+ */ + if (page->buffers) { + spin_unlock(pg_lock); + spin_unlock(&mapping->page_lock); + spin_unlock(&pagemap_lru_lock); + try_to_free_buffers(page, 2); + UnlockPage(page); + goto retry; + } + + __lru_cache_del(page); + __remove_inode_page(page); + spin_unlock(pg_lock); + + UnlockPage(page); + page_cache_release(page); + } + + spin_unlock(&mapping->page_lock); + spin_unlock(&pagemap_lru_lock); } static inline int sync_page(struct page *page) @@ -135,12 +230,20 @@ */ void __set_page_dirty(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping; + spinlock_t *pg_lock; + + pg_lock = PAGECACHE_LOCK(page); + spin_lock(pg_lock); + + mapping = page->mapping; + spin_lock(&mapping->page_lock); - spin_lock(&pagecache_lock); list_del(&page->list); list_add(&page->list, &mapping->dirty_pages); - spin_unlock(&pagecache_lock); + + spin_unlock(&mapping->page_lock); + spin_unlock(pg_lock); if (mapping->host) mark_inode_dirty_pages(mapping->host); @@ -156,39 +259,58 @@ void invalidate_inode_pages(struct inode * inode) { + struct address_space * mapping = inode->i_mapping; struct list_head *head, *curr; struct page * page; head = &inode->i_mapping->clean_pages; - spin_lock(&pagecache_lock); - spin_lock(&pagemap_lru_lock); +retry: + spin_lock(&pagemap_lru_lock); + spin_lock(&mapping->page_lock); curr = head->next; while (curr != head) { + spinlock_t *pg_lock; + page = list_entry(curr, struct page, list); + pg_lock = PAGECACHE_LOCK(page); + if (!spin_trylock(pg_lock)) { + spin_unlock(&mapping->page_lock); + spin_unlock(&pagemap_lru_lock); + goto retry; + } curr = curr->next; /* We cannot invalidate something in use.. */ - if (page_count(page) != 1) + if (page_count(page) != 1) { + spin_unlock(pg_lock); continue; + } /* ..or dirty.. 
*/ - if (PageDirty(page)) + if (PageDirty(page)) { + spin_unlock(pg_lock); continue; + } /* ..or locked */ - if (TryLockPage(page)) + if (TryLockPage(page)) { + spin_unlock(pg_lock); continue; + } __lru_cache_del(page); __remove_inode_page(page); + spin_unlock(pg_lock); + UnlockPage(page); page_cache_release(page); + } + spin_unlock(&mapping->page_lock); spin_unlock(&pagemap_lru_lock); - spin_unlock(&pagecache_lock); } static inline void truncate_partial_page(struct page *page, unsigned partial) @@ -202,7 +324,6 @@ static inline void truncate_complete_page(struct page *page) { - /* Leave it on the LRU if it gets converted into anonymous buffers */ if (!page->buffers || block_flushpage(page, 0)) lru_cache_del(page); @@ -217,13 +338,12 @@ * all sorts of fun problems ... */ ClearPageDirty(page); - ClearPageUptodate(page); remove_inode_page(page); page_cache_release(page); } -static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *)); -static int truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial) +static int FASTCALL(truncate_list_pages(struct address_space * mapping, struct list_head *, unsigned long, unsigned *)); +static int truncate_list_pages(struct address_space * mapping, struct list_head *head, unsigned long start, unsigned *partial) { struct list_head *curr; struct page * page; @@ -234,24 +354,35 @@ page = list_entry(curr, struct page, list); curr = curr->next; - offset = page->index; /* Is one of the pages to truncate? 
*/ + offset = page->index; if ((offset >= start) || (*partial && (offset + 1) == start)) { + spinlock_t *pg_lock; + + pg_lock = PAGECACHE_LOCK(page); + + if (!spin_trylock(pg_lock)) { + spin_unlock(&mapping->page_lock); + return 1; + } + if (TryLockPage(page)) { page_cache_get(page); - spin_unlock(&pagecache_lock); + spin_unlock(pg_lock); + spin_unlock(&mapping->page_lock); wait_on_page(page); page_cache_release(page); return 1; } page_cache_get(page); - spin_unlock(&pagecache_lock); + spin_unlock(pg_lock); + spin_unlock(&mapping->page_lock); if (*partial && (offset + 1) == start) { truncate_partial_page(page, *partial); *partial = 0; - } else + } else truncate_complete_page(page); UnlockPage(page); @@ -278,14 +409,14 @@ unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); repeat: - spin_lock(&pagecache_lock); - if (truncate_list_pages(&mapping->clean_pages, start, &partial)) + spin_lock(&mapping->page_lock); + if (truncate_list_pages(mapping,&mapping->clean_pages, start, &partial)) goto repeat; - if (truncate_list_pages(&mapping->dirty_pages, start, &partial)) + if (truncate_list_pages(mapping,&mapping->dirty_pages, start, &partial)) goto repeat; - if (truncate_list_pages(&mapping->locked_pages, start, &partial)) + if (truncate_list_pages(mapping,&mapping->locked_pages,start, &partial)) goto repeat; - spin_unlock(&pagecache_lock); + spin_unlock(&mapping->page_lock); } /* @@ -294,7 +425,7 @@ * ideal for ->writepage() clustering and other places where you don't * want to mark the page referenced. * - * The caller needs to hold the pagecache_lock. + * The caller needs to hold the page bucket lock. 
*/ struct page * __find_page_simple(struct address_space *mapping, unsigned long index) { @@ -378,13 +509,13 @@ return error; } -static int do_buffer_fdatasync(struct list_head *head, unsigned long start, unsigned long end, int (*fn)(struct page *)) +static int do_buffer_fdatasync(struct address_space * mapping, struct list_head *head, unsigned long start, unsigned long end, int (*fn)(struct page *)) { struct list_head *curr; struct page *page; int retval = 0; - spin_lock(&pagecache_lock); + spin_lock(&mapping->page_lock); curr = head->next; while (curr != head) { page = list_entry(curr, struct page, list); @@ -397,7 +528,7 @@ continue; page_cache_get(page); - spin_unlock(&pagecache_lock); + spin_unlock(&mapping->page_lock); lock_page(page); /* The buffers could have been free'd while we waited for the page lock */ @@ -405,11 +536,11 @@ retval |= fn(page); UnlockPage(page); - spin_lock(&pagecache_lock); + spin_lock(&mapping->page_lock); curr = page->list.next; page_cache_release(page); } - spin_unlock(&pagecache_lock); + spin_unlock(&mapping->page_lock); return retval; } @@ -420,17 +551,18 @@ */ int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx) { + struct address_space * mapping = inode->i_mapping; int retval; /* writeout dirty buffers on pages from both clean and dirty lists */ - retval = do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, writeout_one_page); - retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, writeout_one_page); - retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, writeout_one_page); + retval = do_buffer_fdatasync(mapping, &mapping->dirty_pages, start_idx, end_idx, writeout_one_page); + retval |= do_buffer_fdatasync(mapping, &mapping->clean_pages, start_idx, end_idx, writeout_one_page); + retval |= do_buffer_fdatasync(mapping, &mapping->locked_pages, start_idx, end_idx, writeout_one_page); /* now wait for locked 
buffers on pages from both clean and dirty lists */ - retval |= do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, waitfor_one_page); - retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, waitfor_one_page); - retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, waitfor_one_page); + retval |= do_buffer_fdatasync(mapping, &mapping->dirty_pages, start_idx, end_idx, waitfor_one_page); + retval |= do_buffer_fdatasync(mapping, &mapping->clean_pages, start_idx, end_idx, waitfor_one_page); + retval |= do_buffer_fdatasync(mapping, &mapping->locked_pages, start_idx, end_idx, waitfor_one_page); return retval; } @@ -446,19 +578,29 @@ { int (*writepage)(struct page *) = mapping->a_ops->writepage; - spin_lock(&pagecache_lock); +repeat: + spin_lock(&mapping->page_lock); while (!list_empty(&mapping->dirty_pages)) { struct page *page = list_entry(mapping->dirty_pages.next, struct page, list); + spinlock_t *pg_lock; + pg_lock = PAGECACHE_LOCK(page); + if (!spin_trylock(pg_lock)) { + spin_unlock(&mapping->page_lock); + goto repeat; + } list_del(&page->list); list_add(&page->list, &mapping->locked_pages); - if (!PageDirty(page)) + if (!PageDirty(page)) { + spin_unlock(pg_lock); continue; + } page_cache_get(page); - spin_unlock(&pagecache_lock); + spin_unlock(pg_lock); + spin_unlock(&mapping->page_lock); lock_page(page); @@ -469,9 +611,9 @@ UnlockPage(page); page_cache_release(page); - spin_lock(&pagecache_lock); + spin_lock(&mapping->page_lock); } - spin_unlock(&pagecache_lock); + spin_unlock(&mapping->page_lock); } /** @@ -483,26 +625,36 @@ */ void filemap_fdatawait(struct address_space * mapping) { - spin_lock(&pagecache_lock); +repeat: + spin_lock(&mapping->page_lock); while (!list_empty(&mapping->locked_pages)) { struct page *page = list_entry(mapping->locked_pages.next, struct page, list); + spinlock_t *pg_lock; + pg_lock = PAGECACHE_LOCK(page); + if (!spin_trylock(pg_lock)) { + 
spin_unlock(&mapping->page_lock); + goto repeat; + } list_del(&page->list); list_add(&page->list, &mapping->clean_pages); - if (!PageLocked(page)) + if (!PageLocked(page)) { + spin_unlock(pg_lock); continue; + } page_cache_get(page); - spin_unlock(&pagecache_lock); + spin_unlock(pg_lock); + spin_unlock(&mapping->page_lock); ___wait_on_page(page); page_cache_release(page); - spin_lock(&pagecache_lock); + spin_lock(&mapping->page_lock); } - spin_unlock(&pagecache_lock); + spin_unlock(&mapping->page_lock); } /* @@ -513,16 +665,19 @@ */ void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index) { + spinlock_t *pg_lock; + if (!PageLocked(page)) BUG(); page_cache_get(page); - spin_lock(&pagecache_lock); + pg_lock = __PAGECACHE_LOCK(mapping, index); + spin_lock(pg_lock); page->index = index; add_page_to_inode_queue(mapping, page); add_page_to_hash_queue(page, page_hash(mapping, index)); lru_cache_add(page); - spin_unlock(&pagecache_lock); + spin_unlock(pg_lock); } /* @@ -549,9 +704,11 @@ void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset) { - spin_lock(&pagecache_lock); + spinlock_t *pg_lock = __PAGECACHE_LOCK(mapping, offset); + + spin_lock(pg_lock); __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset)); - spin_unlock(&pagecache_lock); + spin_unlock(pg_lock); } static int add_to_page_cache_unique(struct page * page, @@ -560,8 +717,11 @@ { int err; struct page *alias; + spinlock_t *pg_lock; + + pg_lock = __PAGECACHE_LOCK(mapping, offset); + spin_lock(pg_lock); - spin_lock(&pagecache_lock); alias = __find_page_nolock(mapping, offset, *hash); err = 1; @@ -570,7 +730,8 @@ err = 0; } - spin_unlock(&pagecache_lock); + spin_unlock(pg_lock); + return err; } @@ -585,9 +746,9 @@ struct page **hash = page_hash(mapping, offset); struct page *page; - spin_lock(&pagecache_lock); + spin_lock(__PAGECACHE_LOCK(mapping, offset)); page = __find_page_nolock(mapping, offset, *hash); - 
spin_unlock(&pagecache_lock); + spin_unlock(__PAGECACHE_LOCK(mapping, offset)); if (page) return 0; @@ -702,11 +863,11 @@ * We scan the hash list read-only. Addition to and removal from * the hash-list needs a held write-lock. */ - spin_lock(&pagecache_lock); + spin_lock(__PAGECACHE_LOCK(mapping, offset)); page = __find_page_nolock(mapping, offset, *hash); if (page) page_cache_get(page); - spin_unlock(&pagecache_lock); + spin_unlock(__PAGECACHE_LOCK(mapping, offset)); return page; } @@ -723,11 +884,11 @@ * the hash-list needs a held write-lock. */ repeat: - spin_lock(&pagecache_lock); + spin_lock(__PAGECACHE_LOCK(mapping, offset)); page = __find_page_nolock(mapping, offset, *hash); if (page) { page_cache_get(page); - spin_unlock(&pagecache_lock); + spin_unlock(__PAGECACHE_LOCK(mapping, offset)); lock_page(page); @@ -740,7 +901,7 @@ page_cache_release(page); goto repeat; } - spin_unlock(&pagecache_lock); + spin_unlock(__PAGECACHE_LOCK(mapping, offset)); return NULL; } @@ -784,14 +945,16 @@ * been increased since the last time we were called, we * stop when the page isn't there. */ - spin_lock(&pagecache_lock); while (--index >= start) { + spin_lock(__PAGECACHE_LOCK(mapping, index)); page = __find_page_simple(mapping, index); - if (!page) + if (!page) { + spin_unlock(__PAGECACHE_LOCK(mapping, index)); break; + } deactivate_page(page); + spin_unlock(__PAGECACHE_LOCK(mapping, index)); } - spin_unlock(&pagecache_lock); } /* @@ -1036,7 +1199,7 @@ * This is really ugly. But the goto's actually try to clarify some * of the logic when it comes to error handling etc. 
*/ -void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor) +void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor, int nonblock) { struct inode *inode = filp->f_dentry->d_inode; struct address_space *mapping = inode->i_mapping; @@ -1110,17 +1273,24 @@ */ hash = page_hash(mapping, index); - spin_lock(&pagecache_lock); + spin_lock(__PAGECACHE_LOCK(mapping, index)); page = __find_page_nolock(mapping, index, *hash); if (!page) goto no_cached_page; found_page: page_cache_get(page); - spin_unlock(&pagecache_lock); + spin_unlock(__PAGECACHE_LOCK(mapping, index)); - if (!Page_Uptodate(page)) + if (!Page_Uptodate(page)) { + if (nonblock) { + page_cache_release(page); + desc->error = -EWOULDBLOCKIO; + break; + } goto page_not_up_to_date; - generic_file_readahead(reada_ok, filp, inode, page); + } + if (!nonblock) + generic_file_readahead(reada_ok, filp, inode, page); page_ok: /* If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing @@ -1196,6 +1366,11 @@ break; no_cached_page: + if (nonblock) { + spin_unlock(__PAGECACHE_LOCK(mapping, index)); + desc->error = -EWOULDBLOCKIO; + break; + } /* * Ok, it wasn't cached, so we need to create a new * page.. @@ -1203,7 +1378,7 @@ * We get here with the page cache lock held. */ if (!cached_page) { - spin_unlock(&pagecache_lock); + spin_unlock(__PAGECACHE_LOCK(mapping, index)); cached_page = page_cache_alloc(mapping); if (!cached_page) { desc->error = -ENOMEM; @@ -1214,7 +1389,7 @@ * Somebody may have added the page while we * dropped the page cache lock. Check for that. 
*/ - spin_lock(&pagecache_lock); + spin_lock(__PAGECACHE_LOCK(mapping, index)); page = __find_page_nolock(mapping, index, *hash); if (page) goto found_page; @@ -1225,7 +1400,7 @@ */ page = cached_page; __add_to_page_cache(page, mapping, index, hash); - spin_unlock(&pagecache_lock); + spin_unlock(__PAGECACHE_LOCK(mapping, index)); cached_page = NULL; goto readpage; @@ -1279,7 +1454,7 @@ desc.count = count; desc.buf = buf; desc.error = 0; - do_generic_file_read(filp, ppos, &desc, file_read_actor); + do_generic_file_read(filp, ppos, &desc, file_read_actor, 0); retval = desc.written; if (!retval) @@ -1289,7 +1464,7 @@ return retval; } -static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size) +int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size) { ssize_t written; unsigned long count = desc->count; @@ -1382,7 +1557,7 @@ desc.count = count; desc.buf = (char *) out_file; desc.error = 0; - do_generic_file_read(in_file, ppos, &desc, file_send_actor); + do_generic_file_read(in_file, ppos, &desc, file_send_actor, 0); retval = desc.written; if (!retval) @@ -2201,11 +2376,11 @@ struct address_space * as = &vma->vm_file->f_dentry->d_inode->i_data; struct page * page, ** hash = page_hash(as, pgoff); - spin_lock(&pagecache_lock); + spin_lock(__PAGECACHE_LOCK(as, pgoff)); page = __find_page_nolock(as, pgoff, *hash); if ((page) && (Page_Uptodate(page))) present = 1; - spin_unlock(&pagecache_lock); + spin_unlock(__PAGECACHE_LOCK(as, pgoff)); return present; } @@ -2510,7 +2685,7 @@ * Check whether we've reached the file size limit. 
*/ err = -EFBIG; - + if (limit != RLIM_INFINITY) { if (pos >= limit) { send_sig(SIGXFSZ, current, 0); @@ -2669,21 +2844,21 @@ void __init page_cache_init(unsigned long mempages) { - unsigned long htable_size, order; + unsigned long htable_size, order, i; htable_size = mempages; - htable_size *= sizeof(struct page *); + htable_size *= sizeof(struct page_cache_bucket); for(order = 0; (PAGE_SIZE << order) < htable_size; order++) ; do { - unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *); + unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page_cache_bucket); page_hash_bits = 0; while((tmp >>= 1UL) != 0UL) page_hash_bits++; - page_hash_table = (struct page **) + page_hash_table = (struct page_cache_bucket *) __get_free_pages(GFP_ATOMIC, order); } while(page_hash_table == NULL && --order > 0); @@ -2691,5 +2866,9 @@ (1 << page_hash_bits), order, (PAGE_SIZE << order)); if (!page_hash_table) panic("Failed to allocate page hash table\n"); - memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *)); + + for (i = 0; i < PAGE_HASH_SIZE; i++) { + spin_lock_init(&page_hash_table[i].lock); + page_hash_table[i].chain = NULL; + } } --- linux/mm/swap_state.c.orig Sat Dec 30 00:04:27 2000 +++ linux/mm/swap_state.c Tue Mar 13 12:48:48 2001 @@ -33,6 +33,7 @@ LIST_HEAD_INIT(swapper_space.dirty_pages), LIST_HEAD_INIT(swapper_space.locked_pages), 0, /* nrpages */ + SPIN_LOCK_UNLOCKED, &swap_aops, }; @@ -80,7 +81,9 @@ PageClearSwapCache(page); ClearPageDirty(page); + spin_lock(&mapping->page_lock); __remove_inode_page(page); + spin_unlock(&mapping->page_lock); } /* @@ -106,16 +109,18 @@ */ void delete_from_swap_cache_nolock(struct page *page) { + spinlock_t *pg_lock; + if (!PageLocked(page)) BUG(); - if (block_flushpage(page, 0)) lru_cache_del(page); - spin_lock(&pagecache_lock); + pg_lock = PAGECACHE_LOCK(page); + spin_lock(pg_lock); ClearPageDirty(page); __delete_from_swap_cache(page); - spin_unlock(&pagecache_lock); + spin_unlock(pg_lock); 
page_cache_release(page); } --- linux/mm/swap.c.orig Tue Mar 13 12:48:34 2001 +++ linux/mm/swap.c Tue Mar 13 12:48:48 2001 @@ -41,6 +41,7 @@ /* How many pages do we try to swap or page in/out together? */ int page_cluster; + /* * This variable contains the amount of page steals the system * is doing, averaged over a minute. We use this to determine how @@ -92,11 +93,12 @@ */ if (!page->age) activate_page_nolock(page); - - /* The actual page aging bit */ - page->age += PAGE_AGE_ADV; - if (page->age > PAGE_AGE_MAX) - page->age = PAGE_AGE_MAX; + else { + /* The actual page aging bit */ + page->age += PAGE_AGE_ADV; + if (page->age > PAGE_AGE_MAX) + page->age = PAGE_AGE_MAX; + } } /* @@ -248,6 +250,10 @@ */ void __lru_cache_del(struct page * page) { +#ifdef CONFIG_SMP + if (!spin_is_locked(&pagemap_lru_lock)) + BUG(); +#endif if (PageActive(page)) { del_page_from_active_list(page); } else if (PageInactiveDirty(page)) { --- linux/include/linux/swap.h.orig Tue Mar 13 12:48:44 2001 +++ linux/include/linux/swap.h Tue Mar 13 12:48:48 2001 @@ -90,7 +90,6 @@ extern struct address_space swapper_space; extern atomic_t page_cache_size; extern atomic_t buffermem_pages; -extern spinlock_t pagecache_lock; extern void __remove_inode_page(struct page *); /* Incomplete types for prototype declarations: */ @@ -208,6 +207,10 @@ * Since we do exponential decay of the page age, we * can chose a fairly large maximum. 
*/ +#define DEFAULT_PAGE_AGE_START 1 +#define DEFAULT_PAGE_AGE_ADV 1 +#define DEFAULT_PAGE_AGE_MAX 8192 + #define PAGE_AGE_START 2 #define PAGE_AGE_ADV 3 #define PAGE_AGE_MAX 64 @@ -223,22 +226,22 @@ #define ZERO_PAGE_BUG \ if (page_count(page) == 0) BUG(); -#define add_page_to_active_list(page) { \ +#define add_page_to_active_list(page) ({ \ DEBUG_ADD_PAGE \ ZERO_PAGE_BUG \ SetPageActive(page); \ list_add(&(page)->lru, &active_list); \ nr_active_pages++; \ -} +}) -#define add_page_to_inactive_dirty_list(page) { \ +#define add_page_to_inactive_dirty_list(page) ({ \ DEBUG_ADD_PAGE \ ZERO_PAGE_BUG \ SetPageInactiveDirty(page); \ list_add(&(page)->lru, &inactive_dirty_list); \ nr_inactive_dirty_pages++; \ page->zone->inactive_dirty_pages++; \ -} +}) -#define add_page_to_inactive_clean_list(page) { \ +#define add_page_to_inactive_clean_list(page) ({ \ DEBUG_ADD_PAGE \ @@ -248,30 +251,30 @@ page->zone->inactive_clean_pages++; \ -} +}) -#define del_page_from_active_list(page) { \ +#define del_page_from_active_list(page) ({ \ list_del(&(page)->lru); \ ClearPageActive(page); \ nr_active_pages--; \ DEBUG_ADD_PAGE \ ZERO_PAGE_BUG \ -} +}) -#define del_page_from_inactive_dirty_list(page) { \ +#define del_page_from_inactive_dirty_list(page) ({ \ list_del(&(page)->lru); \ ClearPageInactiveDirty(page); \ nr_inactive_dirty_pages--; \ page->zone->inactive_dirty_pages--; \ DEBUG_ADD_PAGE \ ZERO_PAGE_BUG \ -} +}) -#define del_page_from_inactive_clean_list(page) { \ +#define del_page_from_inactive_clean_list(page) ({ \ list_del(&(page)->lru); \ ClearPageInactiveClean(page); \ page->zone->inactive_clean_pages--; \ DEBUG_ADD_PAGE \ ZERO_PAGE_BUG \ -} +}) /* * In mm/swap.c::recalculate_vm_stats(), we substract --- linux/include/linux/errno.h.orig Wed Mar 29 03:51:39 2000 +++ linux/include/linux/errno.h Tue Mar 13 12:48:48 2001 @@ -21,6 +21,9 @@ #define EBADTYPE 527 /* Type not supported by server */ #define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */ +/* Defined for TUX async IO */ +#define EWOULDBLOCKIO 530 /* 
Would block due to block-IO */ + #endif #endif --- linux/include/linux/fs.h.orig Tue Mar 13 12:48:44 2001 +++ linux/include/linux/fs.h Tue Mar 13 12:48:48 2001 @@ -377,6 +377,8 @@ struct list_head dirty_pages; /* list of dirty pages */ struct list_head locked_pages; /* list of locked pages */ unsigned long nrpages; /* number of total pages */ + spinlock_t page_lock; /* and spinlock protecting them */ + struct address_space_operations *a_ops; /* methods */ struct inode *host; /* owner: inode, block_device */ struct vm_area_struct *i_mmap; /* list of private mappings */ @@ -1302,6 +1304,7 @@ extern int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*, unsigned long *); extern int block_sync_page(struct page *); +extern void flush_inode_pages (struct inode * inode); int generic_block_bmap(struct address_space *, long, get_block_t *); int generic_commit_write(struct file *, struct page *, unsigned, unsigned); @@ -1311,7 +1314,7 @@ extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); extern ssize_t generic_file_read(struct file *, char *, size_t, loff_t *); extern ssize_t generic_file_write(struct file *, const char *, size_t, loff_t *); -extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t); +extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t, int); extern ssize_t generic_read_dir(struct file *, char *, size_t, loff_t *); extern int generic_file_open(struct inode *, struct file *); --- linux/include/linux/pagemap.h.orig Tue Mar 13 12:48:44 2001 +++ linux/include/linux/pagemap.h Tue Mar 13 12:48:48 2001 @@ -42,12 +42,17 @@ */ #define page_cache_entry(x) virt_to_page(x) +struct page_cache_bucket { + spinlock_t lock; + struct page *chain; +} __attribute__((__aligned__(8))); + extern unsigned int page_hash_bits; #define PAGE_HASH_BITS (page_hash_bits) #define PAGE_HASH_SIZE (1 << PAGE_HASH_BITS) extern atomic_t 
page_cache_size; /* # of pages currently in the hash table */ -extern struct page **page_hash_table; +extern struct page_cache_bucket *page_hash_table; extern void page_cache_init(unsigned long); @@ -69,7 +74,13 @@ #undef s } -#define page_hash(mapping,index) (page_hash_table+_page_hashfn(mapping,index)) +/* page_hash() is the address of a bucket's chain pointer, __PAGECACHE_LOCK() the address of the same bucket's lock; PAGECACHE_LOCK() picks the bucket from page->mapping and page->index. Expansions are fully parenthesized so postfix operators apply to the resulting pointer. */ +#define page_hash(mapping,index) \ + (&((page_hash_table+_page_hashfn(mapping,index))->chain)) +#define __PAGECACHE_LOCK(mapping,index) \ + (&((page_hash_table+_page_hashfn(mapping,index))->lock)) +#define PAGECACHE_LOCK(page) \ + __PAGECACHE_LOCK((page)->mapping, (page)->index) extern struct page * __find_get_page(struct address_space *mapping, unsigned long offset, struct page **hash); --- linux/net/khttpd/datasending.c.orig Fri Nov 17 20:36:27 2000 +++ linux/net/khttpd/datasending.c Tue Mar 13 12:48:48 2001 @@ -127,7 +127,7 @@ desc.count = ReadSize; desc.buf = (char *) CurrentRequest->sock; desc.error = 0; - do_generic_file_read(CurrentRequest->filp, ppos, &desc, sock_send_actor); + do_generic_file_read(CurrentRequest->filp, ppos, &desc, sock_send_actor, 0); if (desc.written>0) { CurrentRequest->BytesSent += desc.written; --- linux/drivers/block/loop.c.orig Tue Mar 13 12:48:40 2001 +++ linux/drivers/block/loop.c Tue Mar 13 12:48:48 2001 @@ -268,7 +268,7 @@ spin_lock_irq(&lo->lo_lock); file = lo->lo_backing_file; spin_unlock_irq(&lo->lo_lock); - do_generic_file_read(file, &pos, &desc, lo_read_actor); + do_generic_file_read(file, &pos, &desc, lo_read_actor, 0); return desc.error; }