===== arch/ppc64/kernel/htab.c 1.11 vs edited =====
--- 1.11/arch/ppc64/kernel/htab.c	Thu Dec 18 16:13:25 2003
+++ edited/arch/ppc64/kernel/htab.c	Thu Feb 12 14:57:57 2004
@@ -48,6 +48,29 @@
 #include
 #include
 
+#define HPTE_LOCK_BIT 3
+
+static inline void pSeries_lock_hpte(HPTE *hptep)
+{
+	unsigned long *word = &hptep->dw0.dword0;
+
+	while (1) {
+		if (!test_and_set_bit(HPTE_LOCK_BIT, word))
+			break;
+		while(test_bit(HPTE_LOCK_BIT, word))
+			cpu_relax();
+	}
+}
+
+static inline void pSeries_unlock_hpte(HPTE *hptep)
+{
+	unsigned long *word = &hptep->dw0.dword0;
+
+	asm volatile("lwsync":::"memory");
+	clear_bit(HPTE_LOCK_BIT, word);
+}
+
+
 /*
  * Note: pte  --> Linux PTE
  *       HPTE --> PowerPC Hashed Page Table Entry
@@ -64,6 +87,7 @@
 
 extern unsigned long _SDR1;
 extern unsigned long klimit;
+extern rwlock_t pte_hash_lock[] __cacheline_aligned_in_smp;
 
 void make_pte(HPTE *htab, unsigned long va, unsigned long pa,
 	      int mode, unsigned long hash_mask, int large);
@@ -303,11 +327,11 @@
 
 static inline unsigned long computeHptePP(unsigned long pte)
 {
-	return (pte & _PAGE_USER) |
+	return ((pte & _PAGE_USER) |
 		(((pte & _PAGE_USER) >> 1) &
 		 ((~((pte >> 2) &	/* _PAGE_RW */
 		   (pte >> 7))) &	/* _PAGE_DIRTY */
-		  1));
+		  1))) & 0x7;
 }
 
 /*
@@ -320,51 +344,73 @@
 	unsigned long va, vpn;
 	unsigned long newpp, prpn;
 	unsigned long hpteflags, lock_slot;
+	unsigned long access_ok, tmp;
 	long slot;
 	pte_t old_pte, new_pte;
+	int ret = 0;
 
 	/* Search the Linux page table for a match with va */
 	va = (vsid << 28) | (ea & 0x0fffffff);
 	vpn = va >> PAGE_SHIFT;
 	lock_slot = get_lock_slot(vpn);
 
-	/* Acquire the hash table lock to guarantee that the linux
-	 * pte we fetch will not change
+	/*
+	 * Check the user's access rights to the page.  If access should be
+	 * prevented then send the problem up to do_page_fault.
 	 */
-	spin_lock(&hash_table_lock[lock_slot].lock);
-	
+
 	/*
 	 * Check the user's access rights to the page.  If access should be
 	 * prevented then send the problem up to do_page_fault.
 	 */
-#ifdef CONFIG_SHARED_MEMORY_ADDRESSING
+
 	access |= _PAGE_PRESENT;
-	if (unlikely(access & ~(pte_val(*ptep)))) {
+
+	/* We'll do access checking and _PAGE_BUSY setting in assembly, since
+	 * it needs to be atomic.
+	 */
+
+	__asm__ __volatile__ ("\n
+	1:	ldarx	%0,0,%3\n
+		# Check if PTE is busy\n
+		andi.	%1,%0,%4\n
+		bne-	1b\n
+		ori	%0,%0,%4\n
+		# Write the linux PTE atomically (setting busy)\n
+		stdcx.	%0,0,%3\n
+		bne-	1b\n
+		# Check access rights (access & ~(pte_val(*ptep)))\n
+		andc.	%1,%2,%0\n
+		bne-	2f\n
+		li	%1,1\n
+		b	3f\n
+	2:	li	%1,0\n
+	3:	isync"
+	: "=r" (old_pte), "=r" (access_ok)
+	: "r" (access), "r" (ptep), "i" (_PAGE_BUSY)
+	: "cr0", "memory");
+
+#ifdef CONFIG_SHARED_MEMORY_ADDRESSING
+	if (unlikely(!access_ok)) {
 		if(!(((ea >> SMALLOC_EA_SHIFT) ==
 		      (SMALLOC_START >> SMALLOC_EA_SHIFT)) &&
 		     ((current->thread.flags) & PPC_FLAG_SHARED))) {
-			spin_unlock(&hash_table_lock[lock_slot].lock);
-			return 1;
+			ret = 1;
+			goto out_unlock;
 		}
 	}
 #else
-	access |= _PAGE_PRESENT;
-	if (unlikely(access & ~(pte_val(*ptep)))) {
-		spin_unlock(&hash_table_lock[lock_slot].lock);
-		return 1;
+	if (unlikely(!access_ok)) {
+		ret = 1;
+		goto out_unlock;
 	}
 #endif
 
 	/*
-	 * We have found a pte (which was present).
-	 * The spinlocks prevent this status from changing
-	 * The hash_table_lock prevents the _PAGE_HASHPTE status
-	 * from changing (RPN, DIRTY and ACCESSED too)
-	 * The page_table_lock prevents the pte from being
-	 * invalidated or modified
-	 */
-
-	/*
+	 * We have found a proper pte.  The hash_table_lock protects
+	 * the pte from deallocation and the _PAGE_BUSY bit protects
+	 * the contents of the PTE from changing.
+	 *
 	 * At this point, we have a pte (old_pte) which can be used to build
 	 * or update an HPTE. There are 2 cases:
 	 *
@@ -385,7 +431,7 @@
 	else
 		pte_val(new_pte) |= _PAGE_ACCESSED;
 
-	newpp = computeHptePP(pte_val(new_pte));
+	newpp = computeHptePP(pte_val(new_pte) & ~_PAGE_BUSY);
 
 	/* Check if pte already has an hpte (case 2) */
 	if (unlikely(pte_val(old_pte) & _PAGE_HASHPTE)) {
@@ -400,12 +446,13 @@
 		slot = (hash & htab_data.htab_hash_mask) * HPTES_PER_GROUP;
 		slot += (pte_val(old_pte) & _PAGE_GROUP_IX) >> 12;
 
-		/* XXX fix large pte flag */
+		/* XXX fix large pte flag */
 		if (ppc_md.hpte_updatepp(slot, secondary,
 					 newpp, va, 0) == -1) {
 			pte_val(old_pte) &= ~_PAGE_HPTEFLAGS;
 		} else {
 			if (!pte_same(old_pte, new_pte)) {
+				/* _PAGE_BUSY is still set in new_pte */
 				*ptep = new_pte;
 			}
 		}
@@ -425,12 +472,19 @@
 		pte_val(new_pte) |= ((slot<<12) &
 				     (_PAGE_GROUP_IX | _PAGE_SECONDARY));
 
+		smp_wmb();
+		/* _PAGE_BUSY is not set in new_pte */
 		*ptep = new_pte;
+
+		return 0;
 	}
 
-	spin_unlock(&hash_table_lock[lock_slot].lock);
+out_unlock:
+	smp_wmb();
+
+	pte_val(*ptep) &= ~_PAGE_BUSY;
 
-	return 0;
+	return ret;
 }
 
 /*
@@ -497,11 +551,14 @@
 	pgdir = mm->pgd;
 	if (pgdir == NULL) return 1;
 
-	/*
-	 * Lock the Linux page table to prevent mmap and kswapd
-	 * from modifying entries while we search and update
+	/* The pte_hash_lock is used to block any PTE deallocations
+	 * while we walk the tree and use the entry. While technically
+	 * we both read and write the PTE entry while holding the read
+	 * lock, the _PAGE_BUSY bit will block pte_update()s to the
+	 * specific entry.
 	 */
-	spin_lock(&mm->page_table_lock);
+
+	read_lock(&pte_hash_lock[smp_processor_id()]);
 	ptep = find_linux_pte(pgdir, ea);
 
 	/*
@@ -514,8 +571,7 @@
 		/* If no pte, send the problem up to do_page_fault */
 		ret = 1;
 	}
-
-	spin_unlock(&mm->page_table_lock);
+	read_unlock(&pte_hash_lock[smp_processor_id()]);
 
 	return ret;
 }
@@ -540,8 +596,6 @@
 	lock_slot = get_lock_slot(vpn);
 	hash = hpt_hash(vpn, large);
 
-	spin_lock_irqsave(&hash_table_lock[lock_slot].lock, flags);
-
 	pte = __pte(pte_update(ptep, _PAGE_HPTEFLAGS, 0));
 	secondary = (pte_val(pte) & _PAGE_SECONDARY) >> 15;
 	if (secondary) hash = ~hash;
@@ -551,8 +605,6 @@
 	if (pte_val(pte) & _PAGE_HASHPTE) {
 		ppc_md.hpte_invalidate(slot, secondary, va, large, local);
 	}
-
-	spin_unlock_irqrestore(&hash_table_lock[lock_slot].lock, flags);
 }
 
 long plpar_pte_enter(unsigned long flags,
@@ -787,6 +839,8 @@
 
 	avpn = vpn >> 11;
 
+	pSeries_lock_hpte(hptep);
+
 	dw0 = hptep->dw0.dw0;
 
 	/*
@@ -794,9 +848,13 @@
 	 * the AVPN, hash group, and valid bits.  By doing it this way,
 	 * it is common with the pSeries LPAR optimal path.
 	 */
-	if (dw0.bolted) return;
+	if (dw0.bolted) {
+		pSeries_unlock_hpte(hptep);
+
+		return;
+	}
 
-	/* Invalidate the hpte. */
+	/* Invalidate the hpte. This clears the lock as well. */
 	hptep->dw0.dword0 = 0;
 
 	/* Invalidate the tlb */
@@ -875,9 +933,17 @@
 
 	avpn = vpn >> 11;
 
+	pSeries_lock_hpte(hptep);
+
 	dw0 = hptep->dw0.dw0;
 	if ((dw0.avpn == avpn) &&
 	    (dw0.v) && (dw0.h == secondary)) {
+
+		if(hptep->dw1.dw1.pp == newpp) {
+			pSeries_unlock_hpte(hptep);
+			return 0;
+		}
+
 		/* Turn off valid bit in HPTE */
 		dw0.v = 0;
 		hptep->dw0.dw0 = dw0;
@@ -900,10 +966,14 @@
 
 		hptep->dw0.dw0 = dw0;
 
 		__asm__ __volatile__ ("ptesync" : : : "memory");
+
+		pSeries_unlock_hpte(hptep);
 
 		return 0;
 	}
+
+	pSeries_unlock_hpte(hptep);
 
 	return -1;
 }
@@ -1062,9 +1132,11 @@
 		dw0 = hptep->dw0.dw0;
 		if (!dw0.v) {
 			/* retry with lock held */
+			pSeries_lock_hpte(hptep);
 			dw0 = hptep->dw0.dw0;
 			if (!dw0.v)
 				break;
+			pSeries_unlock_hpte(hptep);
 		}
 		hptep++;
 	}
@@ -1079,9 +1151,11 @@
 		dw0 = hptep->dw0.dw0;
 		if (!dw0.v) {
 			/* retry with lock held */
+			pSeries_lock_hpte(hptep);
 			dw0 = hptep->dw0.dw0;
 			if (!dw0.v)
 				break;
+			pSeries_unlock_hpte(hptep);
 		}
 		hptep++;
 	}
@@ -1304,9 +1378,11 @@
 
 		if (dw0.v && !dw0.bolted) {
 			/* retry with lock held */
+			pSeries_lock_hpte(hptep);
 			dw0 = hptep->dw0.dw0;
 			if (dw0.v && !dw0.bolted)
 				break;
+			pSeries_unlock_hpte(hptep);
 		}
 		slot_offset++;
===== arch/ppc64/mm/init.c 1.8 vs edited =====
--- 1.8/arch/ppc64/mm/init.c	Tue Jan  6 17:54:44 2004
+++ edited/arch/ppc64/mm/init.c	Thu Feb 12 14:59:11 2004
@@ -104,16 +104,97 @@
  */
 mmu_gather_t     mmu_gathers[NR_CPUS];
 
+/* PTE free batching structures. We need a lock since not all
+ * operations take place under page_table_lock. Keep it per-CPU
+ * to avoid bottlenecks.
+ */
+
+struct pte_freelist_batch ____cacheline_aligned pte_freelist_cur[NR_CPUS] __cacheline_aligned_in_smp;
+rwlock_t pte_hash_lock[NR_CPUS] __cacheline_aligned_in_smp = { [0 ... NR_CPUS-1] = RW_LOCK_UNLOCKED };
+
+unsigned long pte_freelist_forced_free;
+
+static inline void pte_free_sync(void)
+{
+	unsigned long flags;
+	int i;
+
+	/* Make sure that all PTE/PMD updates are seen by other processors. */
+	smp_mb();
+
+	/* All we need to know is that we can get the write lock if
+	 * we wanted to, i.e. that no hash_page()s are holding it for reading.
+	 * If none are reading, that means there's no currently executing
+	 * hash_page() that might be working on one of the PTEs that will
+	 * be deleted. Likewise, if there is a reader, we need to get the
+	 * write lock to know when it releases the lock.
+	 */
+
+	for (i = 0; i < smp_num_cpus; i++)
+		if (is_read_locked(&pte_hash_lock[i])) {
+			/* So we don't deadlock with a reader on current cpu */
+			if(i == smp_processor_id())
+				local_irq_save(flags);
+
+			write_lock(&pte_hash_lock[i]);
+			write_unlock(&pte_hash_lock[i]);
+
+			if(i == smp_processor_id())
+				local_irq_restore(flags);
+		}
+}
+
+
+/* This is only called when we are critically out of memory
+ * (and fail to get a page in pte_free_tlb).
+ */
+void pte_free_now(pte_t *pte)
+{
+	pte_freelist_forced_free++;
+
+	pte_free_sync();
+
+	pte_free_kernel(pte);
+}
+
+/* Deallocates the pte-free batch after synchronizing with readers of
+ * any page tables.
+ */
+void pte_free_batch(void **batch, int size)
+{
+	unsigned int i;
+
+	pte_free_sync();
+
+	for (i = 0; i < size; i++)
+		pte_free_kernel(batch[i]);
+
+	free_page((unsigned long)batch);
+}
+
+
 int do_check_pgt_cache(int low, int high)
 {
 	int freed = 0;
+	struct pte_freelist_batch *batch;
+
+	/* We use this function to push the current pte free batch to be
+	 * deallocated, since do_check_pgt_cache() is called at the end of each
+	 * free_one_pgd() and other parts of the VM rely on all PTEs being
+	 * properly freed upon return from that function.
+	 */
+
+	batch = &pte_freelist_cur[smp_processor_id()];
+
+	if(batch->entry) {
+		pte_free_batch(batch->entry, batch->index);
+		batch->entry = NULL;
+	}
 
 	if (pgtable_cache_size > high) {
 		do {
 			if (pgd_quicklist)
 				free_page((unsigned long)pgd_alloc_one_fast(0)), ++freed;
-			if (pmd_quicklist)
-				free_page((unsigned long)pmd_alloc_one_fast(0, 0)), ++freed;
 			if (pte_quicklist)
 				free_page((unsigned long)pte_alloc_one_fast(0, 0)), ++freed;
 		} while (pgtable_cache_size > low);
===== include/asm-ppc64/pgalloc.h 1.2 vs edited =====
--- 1.2/include/asm-ppc64/pgalloc.h	Tue Apr  9 06:31:08 2002
+++ edited/include/asm-ppc64/pgalloc.h	Wed Feb 11 17:37:12 2004
@@ -15,7 +15,6 @@
 
 #define quicklists	get_paca()
 #define pgd_quicklist		(quicklists->pgd_cache)
-#define pmd_quicklist		(quicklists->pmd_cache)
 #define pte_quicklist		(quicklists->pte_cache)
 #define pgtable_cache_size	(quicklists->pgtable_cache_sz)
 
@@ -60,10 +59,10 @@
 static inline pmd_t*
 pmd_alloc_one_fast (struct mm_struct *mm, unsigned long addr)
 {
-	unsigned long *ret = (unsigned long *)pmd_quicklist;
+	unsigned long *ret = (unsigned long *)pte_quicklist;
 
 	if (ret != NULL) {
-		pmd_quicklist = (unsigned long *)(*ret);
+		pte_quicklist = (unsigned long *)(*ret);
 		ret[0] = 0;
 		--pgtable_cache_size;
 	}
@@ -80,14 +79,6 @@
 	return pmd;
 }
 
-static inline void
-pmd_free (pmd_t *pmd)
-{
-	*(unsigned long *)pmd = (unsigned long) pmd_quicklist;
-	pmd_quicklist = (unsigned long *) pmd;
-	++pgtable_cache_size;
-}
-
 #define pmd_populate(MM, PMD, PTE)	pmd_set(PMD, PTE)
 
 static inline pte_t*
@@ -115,12 +106,54 @@
 }
 
 static inline void
-pte_free (pte_t *pte)
+pte_free_kernel (pte_t *pte)
 {
 	*(unsigned long *)pte = (unsigned long) pte_quicklist;
 	pte_quicklist = (unsigned long *) pte;
 	++pgtable_cache_size;
 }
+
+
+/* Use the PTE functions for freeing PMD as well, since the same
+ * problem with tree traversals applies. Since pmd pointers are always
+ * virtual, no need for a page_address() translation.
+ */
+
+#define pte_free(pte_page)	__pte_free(pte_page)
+#define pmd_free(pmd)		__pte_free(pmd)
+
+struct pte_freelist_batch
+{
+	unsigned int	index;
+	void	      **entry;
+};
+
+#define PTE_FREELIST_SIZE	(PAGE_SIZE / sizeof(void *))
+
+extern void pte_free_now(pte_t *pte);
+extern void pte_free_batch(void **batch, int size);
+extern struct ____cacheline_aligned pte_freelist_batch pte_freelist_cur[] __cacheline_aligned_in_smp;
+
+static inline void __pte_free(pte_t *pte)
+{
+	struct pte_freelist_batch *batchp = &pte_freelist_cur[smp_processor_id()];
+
+	if (batchp->entry == NULL) {
+		batchp->entry = (void **)__get_free_page(GFP_ATOMIC);
+		if (batchp->entry == NULL) {
+			pte_free_now(pte);
+			return;
+		}
+		batchp->index = 0;
+	}
+
+	batchp->entry[batchp->index++] = pte;
+	if (batchp->index == PTE_FREELIST_SIZE) {
+		pte_free_batch(batchp->entry, batchp->index);
+		batchp->entry = NULL;
+	}
+}
+
 extern int do_check_pgt_cache(int, int);
===== include/asm-ppc64/pgtable.h 1.7 vs edited =====
--- 1.7/include/asm-ppc64/pgtable.h	Mon Aug 25 23:47:52 2003
+++ edited/include/asm-ppc64/pgtable.h	Wed Feb 11 16:46:28 2004
@@ -88,22 +88,22 @@
  * Bits in a linux-style PTE.  These match the bits in the
  * (hardware-defined) PowerPC PTE as closely as possible.
  */
-#define _PAGE_PRESENT	0x001UL	/* software: pte contains a translation */
-#define _PAGE_USER	0x002UL	/* matches one of the PP bits */
-#define _PAGE_RW	0x004UL	/* software: user write access allowed */
-#define _PAGE_GUARDED	0x008UL
-#define _PAGE_COHERENT	0x010UL	/* M: enforce memory coherence (SMP systems) */
-#define _PAGE_NO_CACHE	0x020UL	/* I: cache inhibit */
-#define _PAGE_WRITETHRU	0x040UL	/* W: cache write-through */
-#define _PAGE_DIRTY	0x080UL	/* C: page changed */
-#define _PAGE_ACCESSED	0x100UL	/* R: page referenced */
-#define _PAGE_HPTENOIX	0x200UL	/* software: pte HPTE slot unknown */
-#define _PAGE_HASHPTE	0x400UL	/* software: pte has an associated HPTE */
-#define _PAGE_EXEC	0x800UL	/* software: i-cache coherence required */
-#define _PAGE_SECONDARY	0x8000UL /* software: HPTE is in secondary group */
-#define _PAGE_GROUP_IX	0x7000UL /* software: HPTE index within group */
+#define _PAGE_PRESENT	0x0001	/* software: pte contains a translation */
+#define _PAGE_USER	0x0002	/* matches one of the PP bits */
+#define _PAGE_RW	0x0004	/* software: user write access allowed */
+#define _PAGE_GUARDED	0x0008
+#define _PAGE_COHERENT	0x0010	/* M: enforce memory coherence (SMP systems) */
+#define _PAGE_NO_CACHE	0x0020	/* I: cache inhibit */
+#define _PAGE_WRITETHRU	0x0040	/* W: cache write-through */
+#define _PAGE_DIRTY	0x0080	/* C: page changed */
+#define _PAGE_ACCESSED	0x0100	/* R: page referenced */
+#define _PAGE_BUSY	0x0200	/* software: pte & hash are busy */
+#define _PAGE_HASHPTE	0x0400	/* software: pte has an associated HPTE */
+#define _PAGE_EXEC	0x0800	/* software: i-cache coherence required */
+#define _PAGE_GROUP_IX	0x7000	/* software: HPTE index within group */
+#define _PAGE_SECONDARY	0x8000	/* software: HPTE is in secondary group */
 /* Bits 0x7000 identify the index within an HPT Group */
-#define _PAGE_HPTEFLAGS (_PAGE_HASHPTE | _PAGE_HPTENOIX | _PAGE_SECONDARY | _PAGE_GROUP_IX)
+#define _PAGE_HPTEFLAGS (_PAGE_BUSY | _PAGE_HASHPTE | _PAGE_SECONDARY | _PAGE_GROUP_IX)
 /* PAGE_MASK gives the right answer below, but only by accident */
 /* It should be preserving the high 48 bits and then specifically */
 /* preserving _PAGE_SECONDARY | _PAGE_GROUP_IX */
@@ -281,13 +281,15 @@
 	unsigned long old, tmp;
 
 	__asm__ __volatile__("\n\
-1:	ldarx	%0,0,%3 \n\
+1:	ldarx	%0,0,%3 \n\
+	andi.	%1,%0,%7	# loop on _PAGE_BUSY set\n\
+	bne-	1b \n\
 	andc	%1,%0,%4 \n\
 	or	%1,%1,%5 \n\
 	stdcx.	%1,0,%3 \n\
 	bne-	1b"
 	: "=&r" (old), "=&r" (tmp), "=m" (*p)
-	: "r" (p), "r" (clr), "r" (set), "m" (*p)
+	: "r" (p), "r" (clr), "r" (set), "m" (*p), "i" (_PAGE_BUSY)
 	: "cc" );
 
 	return old;
 }
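
For readers following the pgalloc.h/init.c changes: pte_free()/pmd_free() no longer hand pages straight back to the quicklists. __pte_free() queues them in a per-CPU pte_freelist_batch, and pte_free_batch() only releases them after pte_free_sync() has observed (by taking and dropping the write lock wherever a read lock is held) that no concurrent hash_page() is still walking a page table that could point at them. The user-space sketch below is meant only to illustrate that deferred-free pattern, not the kernel code itself: a single pthread rwlock stands in for the per-CPU pte_hash_lock[] array, malloc()/free() stand in for the quicklists, and the names (deferred_free, free_batch, FREELIST_SIZE, and so on) are invented for the example rather than taken from the patch.

/*
 * Illustrative user-space sketch of the deferred-free batching scheme.
 * This is NOT kernel code; build with: cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define FREELIST_SIZE 64	/* the patch uses PAGE_SIZE / sizeof(void *) */

struct freelist_batch {
	unsigned int index;
	void **entry;
};

static struct freelist_batch batch_cur;		/* the patch keeps one per CPU */
static pthread_rwlock_t hash_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Readers (the hash_page() analogue) would hold hash_lock for reading while
 * they walk the tree, so taking and releasing the write lock here proves
 * that no reader can still be using anything queued before this point. */
static void free_sync(void)
{
	pthread_rwlock_wrlock(&hash_lock);
	pthread_rwlock_unlock(&hash_lock);
}

/* Analogue of pte_free_batch(): wait out the readers, then free the lot. */
static void free_batch(void **entry, unsigned int size)
{
	unsigned int i;

	free_sync();
	for (i = 0; i < size; i++)
		free(entry[i]);
	free(entry);
}

/* Analogue of __pte_free(): queue the object and flush when the batch is
 * full, falling back to an immediate (but still synchronized) free when no
 * batch array can be allocated. */
static void deferred_free(void *obj)
{
	struct freelist_batch *b = &batch_cur;

	if (b->entry == NULL) {
		b->entry = malloc(FREELIST_SIZE * sizeof(void *));
		if (b->entry == NULL) {
			free_sync();
			free(obj);
			return;
		}
		b->index = 0;
	}

	b->entry[b->index++] = obj;
	if (b->index == FREELIST_SIZE) {
		free_batch(b->entry, b->index);
		b->entry = NULL;
	}
}

int main(void)
{
	int i;

	for (i = 0; i < 1000; i++)
		deferred_free(malloc(128));

	/* Analogue of do_check_pgt_cache(): flush whatever is still queued. */
	if (batch_cur.entry) {
		free_batch(batch_cur.entry, batch_cur.index);
		batch_cur.entry = NULL;
	}

	printf("done\n");
	return 0;
}

The write-lock/unlock pair in free_sync() is the whole trick: it protects no data of its own, it simply cannot complete until every current reader has dropped its read lock, which is exactly the guarantee pte_free_sync() needs before the queued pages can be reused.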