diff -pru -X /root/.diffexclude linux-2.4.21-6.EL-just_tools/arch/ppc64/kernel/htab.c hashpage/arch/ppc64/kernel/htab.c
--- linux-2.4.21-6.EL-just_tools/arch/ppc64/kernel/htab.c	2004-01-06 20:35:23.000000000 -0600
+++ hashpage/arch/ppc64/kernel/htab.c	2004-01-18 23:13:51.000000000 -0600
@@ -48,6 +48,29 @@
 #include
 #include
 
+#define HPTE_LOCK_BIT 3
+
+static inline void pSeries_lock_hpte(HPTE *hptep)
+{
+	unsigned long *word = &hptep->dw0.dword0;
+
+	while (1) {
+		if (!test_and_set_bit(HPTE_LOCK_BIT, word))
+			break;
+		while(test_bit(HPTE_LOCK_BIT, word))
+			cpu_relax();
+	}
+}
+
+static inline void pSeries_unlock_hpte(HPTE *hptep)
+{
+	unsigned long *word = &hptep->dw0.dword0;
+
+	asm volatile("lwsync":::"memory");
+	clear_bit(HPTE_LOCK_BIT, word);
+}

JSD: Other places within the kernel do an smp_mb__before_clear_bit() when
JSD: clearing a bit representing a lock.  Would that be a better choice here
JSD: (it resolves to a sync)?
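For comparison, the variant the comment above is asking about would look
roughly like this (a sketch only, not part of the patch; per the comment,
smp_mb__before_clear_bit() resolves to a full sync, so it is a heavier
barrier than the lwsync used above):

	static inline void pSeries_unlock_hpte(HPTE *hptep)
	{
		unsigned long *word = &hptep->dw0.dword0;

		/* full barrier before dropping the lock bit, as done
		 * for other bit locks elsewhere in the kernel */
		smp_mb__before_clear_bit();
		clear_bit(HPTE_LOCK_BIT, word);
	}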
+
+
 /*
  * Note: pte  --> Linux PTE
  *       HPTE --> PowerPC Hashed Page Table Entry
@@ -64,6 +87,7 @@
 HTAB htab_data = {NULL, 0, 0, 0, 0};
 extern unsigned long _SDR1;
 extern unsigned long klimit;
+extern rwlock_t pte_hash_lock[] __cacheline_aligned_in_smp;
 
 void make_pte(HPTE *htab, unsigned long va, unsigned long pa,
	      int mode, unsigned long hash_mask, int large);
@@ -320,45 +344,73 @@ int __hash_page(unsigned long ea, unsign
 	unsigned long va, vpn;
 	unsigned long newpp, prpn;
 	unsigned long hpteflags, lock_slot;
+	unsigned long access_ok, tmp;
 	long slot;
 	pte_t old_pte, new_pte;
+	int ret = 0;
 
 	/* Search the Linux page table for a match with va */
 	va = (vsid << 28) | (ea & 0x0fffffff);
 	vpn = va >> PAGE_SHIFT;
 	lock_slot = get_lock_slot(vpn);
 
-	/* Acquire the hash table lock to guarantee that the linux
-	 * pte we fetch will not change
+	/*
+	 * Check the user's access rights to the page.  If access should be
+	 * prevented then send the problem up to do_page_fault.
 	 */
-	spin_lock(&hash_table_lock[lock_slot].lock);
-
 	/*
 	 * Check the user's access rights to the page.  If access should be
 	 * prevented then send the problem up to do_page_fault.
 	 */
-#ifdef CONFIG_SHARED_MEMORY_ADDRESSING
+	access |= _PAGE_PRESENT;
-	if (unlikely(access & ~(pte_val(*ptep)))) {
+
+	/* We'll do access checking and _PAGE_BUSY setting in assembly, since
+	 * it needs to be atomic.
+	 */
+
+	__asm__ __volatile__ ("\n
+	1:	ldarx	%0,0,%3\n
+	# Check access rights (access & ~(pte_val(*ptep)))\n
+		andc.	%1,%2,%0\n
+		bne-	2f\n
+	# Check if PTE is busy\n
+		andi.	%1,%0,%4\n
+		bne-	1b\n
+		ori	%0,%0,%4\n
+	# Write the linux PTE atomically (setting busy)\n
+		stdcx.	%0,0,%3\n
+		bne-	1b\n
+		li	%1,1\n
+		b	3f\n
+	2:	stdcx.	%0,0,%3		# to clear the reservation\n
+		li	%1,0\n
+	3:"
+	: "=r" (old_pte), "=r" (access_ok)
+	: "r" (access), "r" (ptep), "i" (_PAGE_BUSY)
+	: "cr0", "memory");
+
+#ifdef CONFIG_SHARED_MEMORY_ADDRESSING
+	if (unlikely(!access_ok)) {
 		if(!(((ea >> SMALLOC_EA_SHIFT) ==
 		      (SMALLOC_START >> SMALLOC_EA_SHIFT)) &&
 		     ((current->thread.flags) & PPC_FLAG_SHARED))) {
-			spin_unlock(&hash_table_lock[lock_slot].lock);
-			return 1;
+			ret = 1;
+			goto out_unlock;
 		}
 	}

JSD: The inline assembly code will not set _PAGE_BUSY if it finds that
JSD: the user's access rights don't allow access to the page.
JSD: So, if access_ok = 0, _PAGE_BUSY is not set, and the code may
JSD: branch to out_unlock where _PAGE_BUSY is unconditionally cleared.
JSD: It could be possible for another processor to coincidentally have
JSD: set _PAGE_BUSY in the PTE but then have this processor clear it
JSD: before the other processor wanted it clear.  Do you agree this can
JSD: happen?
JSD: Furthermore, the "ea" condition check (above) might yield false and
JSD: the code would then continue as though access_ok were true, modifying
JSD: the PTE without the _PAGE_BUSY bit set.  Seems bad.

 #else
-	access |= _PAGE_PRESENT;
-	if (unlikely(access & ~(pte_val(*ptep)))) {
-		spin_unlock(&hash_table_lock[lock_slot].lock);
-		return 1;
+	if (unlikely(!access_ok)) {
+		ret = 1;
+		goto out_unlock;
 	}
 #endif
 
 	/*
 	 * We have found a pte (which was present).
 	 * The spinlocks prevent this status from changing
-	 * The hash_table_lock prevents the _PAGE_HASHPTE status
+	 * The _PAGE_BUSY prevents the _PAGE_HASHPTE status
 	 * from changing (RPN, DIRTY and ACCESSED too)
 	 * The page_table_lock prevents the pte from being
 	 * invalidated or modified
@@ -385,7 +437,7 @@ int __hash_page(unsigned long ea, unsign
 	else
 		pte_val(new_pte) |= _PAGE_ACCESSED;
 
-	newpp = computeHptePP(pte_val(new_pte));
+	newpp = computeHptePP(pte_val(new_pte) & ~_PAGE_BUSY);
 
 	/* Check if pte already has an hpte (case 2) */
 	if (unlikely(pte_val(old_pte) & _PAGE_HASHPTE)) {
@@ -400,7 +452,7 @@ int __hash_page(unsigned long ea, unsign
 			slot = (hash & htab_data.htab_hash_mask) * HPTES_PER_GROUP;
 		slot += (pte_val(old_pte) & _PAGE_GROUP_IX) >> 12;
 
-		/* XXX fix large pte flag */
+		/* XXX fix large pte flag */
 		if (ppc_md.hpte_updatepp(slot, secondary,
 					 newpp, va, 0) == -1) {
 			pte_val(old_pte) &= ~_PAGE_HPTEFLAGS;
@@ -428,9 +480,12 @@ int __hash_page(unsigned long ea, unsign
 		*ptep = new_pte;
 	}

JSD: NOTE: at this point, the *ptep = new_pte just unlocked the PTE by
JSD: clearing _PAGE_BUSY.  The code then goes on to clear it again, thereby
JSD: possibly rendering unsafe updates that some other processor might be
JSD: doing after it thought it had set _PAGE_BUSY.  There is a small window
JSD: here.

-	spin_unlock(&hash_table_lock[lock_slot].lock);
+out_unlock:
+	smp_wmb();

JSD: I believe an smp_mb__before_clear_bit() would be a better choice, or
JSD: at least an lwsync because this is an unlock.

+
+	pte_val(*ptep) &= ~_PAGE_BUSY;
 
-	return 0;
+	return ret;
 }
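For readers of the patch, the locking protocol that the inline assembly and
the out_unlock path above implement is roughly the following C-level sketch
(illustration only; store_conditional() is a hypothetical stand-in for the
ldarx/stdcx. reservation pair, which is why the real code must stay in
assembly):

	again:
		old_pte = *ptep;			/* ldarx */
		if (access & ~pte_val(old_pte)) {
			access_ok = 0;			/* fail: _PAGE_BUSY is NOT taken */
		} else if (pte_val(old_pte) & _PAGE_BUSY) {
			goto again;			/* pte locked by someone else */
		} else if (!store_conditional(ptep,
				pte_val(old_pte) | _PAGE_BUSY)) {
			goto again;			/* lost the reservation */
		} else {
			access_ok = 1;			/* we now hold _PAGE_BUSY */
		}
		...
	out_unlock:
		smp_wmb();
		pte_val(*ptep) &= ~_PAGE_BUSY;		/* drop the pte lock */

Note that, as the review comments above point out, a failed access check can
reach out_unlock without ever having taken _PAGE_BUSY.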
 
 /*
@@ -497,11 +552,14 @@ int hash_page(unsigned long ea, unsigned
 	pgdir = mm->pgd;
 	if (pgdir == NULL) return 1;
 
-	/*
-	 * Lock the Linux page table to prevent mmap and kswapd
-	 * from modifying entries while we search and update
+	/* The pte_hash_lock is used to block any PTE deallocations
+	 * while we walk the tree and use the entry.  While technically
+	 * we both read and write the PTE entry while holding the read
+	 * lock, the _PAGE_BUSY bit will block pte_update()s to the
+	 * specific entry.
 	 */
-	spin_lock(&mm->page_table_lock);
+
+	read_lock(&pte_hash_lock[smp_processor_id()]);
 
 	ptep = find_linux_pte(pgdir, ea);
 
 	/*
@@ -514,8 +572,7 @@ int hash_page(unsigned long ea, unsigned
 		/* If no pte, send the problem up to do_page_fault */
 		ret = 1;
 	}
-
-	spin_unlock(&mm->page_table_lock);
+	read_unlock(&pte_hash_lock[smp_processor_id()]);
 
 	return ret;
 }
@@ -540,8 +597,6 @@ void flush_hash_page(unsigned long conte
 	lock_slot = get_lock_slot(vpn);
 	hash = hpt_hash(vpn, large);
 
-	spin_lock_irqsave(&hash_table_lock[lock_slot].lock, flags);
-
 	pte = __pte(pte_update(ptep, _PAGE_HPTEFLAGS, 0));
 	secondary = (pte_val(pte) & _PAGE_SECONDARY) >> 15;
 	if (secondary) hash = ~hash;
@@ -551,8 +606,6 @@ void flush_hash_page(unsigned long conte
 	if (pte_val(pte) & _PAGE_HASHPTE) {
 		ppc_md.hpte_invalidate(slot, secondary, va, large, local);
 	}
-
-	spin_unlock_irqrestore(&hash_table_lock[lock_slot].lock, flags);
 }
 
 long plpar_pte_enter(unsigned long flags,

JSD: In the above two hunks of code, a pte_update is done which returns
JSD: the old pte value.  This value is checked to determine if the hpte
JSD: should be invalidated.  However, there is no lock held between the
JSD: time the pte value is read and the time the hpte is invalidated.
JSD: The hpte_invalidate routine doesn't check to make sure the va
JSD: passed in is really the one being invalidated in the slot -- it just
JSD: assumes the slot, etc are enough to locate it.  So we might be
JSD: invalidating the wrong thing here.  Probably a don't care, but thought
JSD: I'd ask.

@@ -787,6 +840,8 @@ static void hpte_invalidate(unsigned lon
 
 	avpn = vpn >> 11;
 
+	pSeries_lock_hpte(hptep);
+
 	dw0 = hptep->dw0.dw0;
 
 	/*
@@ -794,7 +849,11 @@ static void hpte_invalidate(unsigned lon
 	 * the AVPN, hash group, and valid bits.  By doing it this way,
 	 * it is common with the pSeries LPAR optimal path.
 	 */
-	if (dw0.bolted) return;
+	if (dw0.bolted) {
+		pSeries_unlock_hpte(hptep);
+
+		return;
+	}
 
 	/* Invalidate the hpte. */
 	hptep->dw0.dword0 = 0;

JSD: It would really be nice to add a comment here that the above
JSD: assignment statement is also unlocking the hpte as well.
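A comment along these lines would do (wording is only a suggestion):

	/* Invalidate the hpte.  Note that clearing dword0 also clears
	 * HPTE_LOCK_BIT, so this drops the per-HPTE lock as well; no
	 * explicit pSeries_unlock_hpte() is needed on this path. */
	hptep->dw0.dword0 = 0;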
@@ -875,6 +934,8 @@ static long hpte_updatepp(unsigned long
 
 	avpn = vpn >> 11;
 
+	pSeries_lock_hpte(hptep);
+
 	dw0 = hptep->dw0.dw0;
 
 	if ((dw0.avpn == avpn) && (dw0.v) && (dw0.h == secondary)) {
@@ -900,10 +961,14 @@ static long hpte_updatepp(unsigned long
 
 		hptep->dw0.dw0 = dw0;
 
 		__asm__ __volatile__ ("ptesync" : : : "memory");
+
+		pSeries_unlock_hpte(hptep);
 
 		return 0;
 	}
 
+	pSeries_unlock_hpte(hptep);
+
 	return -1;
 }
 
@@ -1062,9 +1127,11 @@ repeat:
 		dw0 = hptep->dw0.dw0;
 		if (!dw0.v) {
 			/* retry with lock held */
+			pSeries_lock_hpte(hptep);
 			dw0 = hptep->dw0.dw0;
 			if (!dw0.v)
 				break;
+			pSeries_unlock_hpte(hptep);
 		}
 		hptep++;
 	}
@@ -1079,9 +1146,11 @@ repeat:
 		dw0 = hptep->dw0.dw0;
 		if (!dw0.v) {
 			/* retry with lock held */
+			pSeries_lock_hpte(hptep);
 			dw0 = hptep->dw0.dw0;
 			if (!dw0.v)
 				break;
+			pSeries_unlock_hpte(hptep);
 		}
 		hptep++;
 	}
@@ -1304,9 +1373,11 @@ static long hpte_remove(unsigned long hp
 
 		if (dw0.v && !dw0.bolted) {
 			/* retry with lock held */
+			pSeries_lock_hpte(hptep);
 			dw0 = hptep->dw0.dw0;
 			if (dw0.v && !dw0.bolted)
 				break;
+			pSeries_unlock_hpte(hptep);
 		}
 
 		slot_offset++;
diff -pru -X /root/.diffexclude linux-2.4.21-6.EL-just_tools/arch/ppc64/mm/init.c hashpage/arch/ppc64/mm/init.c
--- linux-2.4.21-6.EL-just_tools/arch/ppc64/mm/init.c	2004-01-06 20:35:23.000000000 -0600
+++ hashpage/arch/ppc64/mm/init.c	2004-01-18 18:11:14.000000000 -0600
@@ -104,9 +104,90 @@ unsigned long __max_memory;
  */
 mmu_gather_t mmu_gathers[NR_CPUS];
 
+/* PTE free batching structures.  We need a lock since not all
+ * operations take place under page_table_lock.  Keep it per-CPU
+ * to avoid bottlenecks.
+ */
+
+struct pte_freelist_batch ____cacheline_aligned pte_freelist_cur[NR_CPUS] __cacheline_aligned_in_smp;
+rwlock_t pte_hash_lock[NR_CPUS] __cacheline_aligned_in_smp = { [0 ... NR_CPUS-1] = RW_LOCK_UNLOCKED };
+
+unsigned long pte_freelist_forced_free;
+
+static inline void pte_free_sync(void)
+{
+	unsigned long flags;
+	int i;
+
+	/* All we need to know is that we can get the write lock if
+	 * we wanted to, i.e. that no hash_page()s are holding it for reading.
+	 * If none are reading, that means there's no currently executing
+	 * hash_page() that might be working on one of the PTE's that will
+	 * be deleted.  Likewise, if there is a reader, we need to get the
+	 * write lock to know when it releases the lock.
+	 */
+
+	for (i = 0; i < smp_num_cpus; i++)
+		if (is_read_locked(&pte_hash_lock[i])) {
+			if(i == smp_processor_id())
+				local_irq_save(flags);
+
+			write_lock(&pte_hash_lock[i]);
+			write_unlock(&pte_hash_lock[i]);
+
+			if(i == smp_processor_id())
+				local_irq_restore(flags);
+		}
+}

JSD: Two questions here.  (1) Shouldn't interrupts be disabled for the
JSD: write_lock/unlock here regardless of what processor we are running
JSD: on?  (2) I don't see how this code is preventing another processor
JSD: from grabbing the read_lock immediately after this processor has
JSD: checked to make sure it isn't held.
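On question (2): one way to close the check-then-lock window would be to
drop the is_read_locked() test and unconditionally bounce through every
CPU's lock, along these lines (a sketch only; it presumes, as the batching
scheme seems to intend, that readers arriving later cannot find the PTEs
being freed because they were already unhooked from the page tables):

	static inline void pte_free_sync(void)
	{
		unsigned long flags;
		int i;

		/* Once we can take each lock for writing, any hash_page()
		 * that was already inside its read-side critical section
		 * has finished with the PTEs we are about to free. */
		for (i = 0; i < smp_num_cpus; i++) {
			if (i == smp_processor_id())
				local_irq_save(flags);

			write_lock(&pte_hash_lock[i]);
			write_unlock(&pte_hash_lock[i]);

			if (i == smp_processor_id())
				local_irq_restore(flags);
		}
	}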
+
+
+/* This is only called when we are critically out of memory
+ * (and fail to get a page in pte_free_tlb).
+ */
+void pte_free_now(pte_t *pte)
+{
+	pte_freelist_forced_free++;
+
+	pte_free_sync();
+
+	pte_free_kernel(pte);
+}
+
+
+void pte_free_batch(void **batch, int size)
+{
+	unsigned int i;
+
+	pte_free_sync();
+
+	for (i = 0; i < size; i++)
+		pte_free_kernel(batch[i]);
+
+	free_page((unsigned long)batch);
+}
+
+
 int do_check_pgt_cache(int low, int high)
 {
 	int freed = 0;
+	struct pte_freelist_batch *batch;
+
+	/* We use this function to push the current pte free batch to be
+	 * deallocated, since do_check_pgt_cache() is called at the end of each
+	 * free_one_pgd() and other parts of the VM rely on all PTE's being
+	 * properly freed upon return from that function.
+	 */
+
+	batch = &pte_freelist_cur[smp_processor_id()];
+
+	spin_lock(&batch->lock);
+
+	if(batch->entry) {
+		pte_free_batch(batch->entry, batch->index);
+		batch->entry = NULL;
+	}
+
+	spin_unlock(&batch->lock);

JSD: Since the pte_freelist_cur is a per-processor structure, I don't
JSD: think you need the batch->lock at all.  What other thing could be
JSD: running on this same processor at the same time?

 
 #if 0
 	if (pgtable_cache_size > high) {
@@ -246,14 +327,14 @@ static void map_io_page(unsigned long ea
 	unsigned long vsid;
 
 	if (mem_init_done) {
-		spin_lock(&ioremap_mm.page_table_lock);
+		spin_lock_irq(&ioremap_mm.page_table_lock);
 		pgdp = pgd_offset_i(ea);
 		pmdp = pmd_alloc(&ioremap_mm, pgdp, ea);
 		ptep = pte_alloc_kernel(&ioremap_mm, pmdp, ea);
 		pa = absolute_to_phys(pa);
 		set_pte(ptep, mk_pte_phys(pa & PAGE_MASK, __pgprot(flags)));
-		spin_unlock(&ioremap_mm.page_table_lock);
+		spin_unlock_irq(&ioremap_mm.page_table_lock);
 	} else {
 		/* If the mm subsystem is not fully up, we cannot create a
 		 * linux page table entry for this mapping.  Simply bolt an

JSD: Why were these spinlocks changed to _irq?  I noticed this change was
JSD: not present in the 2.6 code.

@@ -292,7 +373,9 @@ local_flush_tlb_all(void)
 
 void
 local_flush_tlb_mm(struct mm_struct *mm)
 {
-	spin_lock(&mm->page_table_lock);
+	unsigned long flags;
+
+	spin_lock_irqsave(&mm->page_table_lock, flags);
 
 	if ( mm->map_count ) {
 		struct vm_area_struct *mp;
@@ -300,7 +383,7 @@ local_flush_tlb_mm(struct mm_struct *mm)
 			local_flush_tlb_range( mm, mp->vm_start, mp->vm_end );
 	}
 
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock_irqrestore(&mm->page_table_lock, flags);
 }
 
 /*

JSD: Same question about the _irq addition.  Doesn't seem necessary.  I'm
JSD: probably missing something -- please explain.

diff -pru -X /root/.diffexclude linux-2.4.21-6.EL-just_tools/include/asm-ppc64/pgalloc.h hashpage/include/asm-ppc64/pgalloc.h
--- linux-2.4.21-6.EL-just_tools/include/asm-ppc64/pgalloc.h	2004-01-06 20:35:29.000000000 -0600
+++ hashpage/include/asm-ppc64/pgalloc.h	2004-01-18 18:11:14.000000000 -0600
@@ -93,12 +93,6 @@ pte_free_kernel(pte_t *pte)
 }
 
-static inline void
-pmd_free (pmd_t *pmd)
-{
-	free_page((unsigned long)pmd);
-}
-
 #define pte_alloc_one_fast(mm, address)		(0)
 
 static inline struct page *
@@ -112,7 +106,52 @@ pte_alloc_one(struct mm_struct *mm, unsi
 	return NULL;
 }
 
-#define pte_free(pte_page)	pte_free_kernel(page_address(pte_page))
+/* Use the PTE functions for freeing PMD as well, since the same
+ * problem with tree traversals applies.  Since pmd pointers are always
+ * virtual, no need for a page_address() translation.
+ */
+
+#define pte_free(pte_page)	__pte_free(page_address(pte_page))
+#define pmd_free(pmd)		__pte_free(pmd)
+
+struct pte_freelist_batch
+{
+	unsigned int	index;
+	spinlock_t	lock;
+	void**		entry;
+};
+
+#define PTE_FREELIST_SIZE	(PAGE_SIZE / sizeof(void *))
+
+extern void pte_free_now(pte_t *pte);
+extern void pte_free_batch(void **batch, int size);
+extern struct ____cacheline_aligned pte_freelist_batch pte_freelist_cur[] __cacheline_aligned_in_smp;
+
+static inline void __pte_free(pte_t *pte)
+{
+	struct pte_freelist_batch *batchp = &pte_freelist_cur[smp_processor_id()];
+
+	spin_lock(&batchp->lock);
+
+	if (batchp->entry == NULL) {
+		batchp->entry = (void **)__get_free_page(GFP_ATOMIC);
+		if (batchp->entry == NULL) {
+			spin_unlock(&batchp->lock);
+			pte_free_now(pte);
+			return;
+		}
+		batchp->index = 0;
+	}
+
+	batchp->entry[batchp->index++] = pte;
+	if (batchp->index == PTE_FREELIST_SIZE) {
+		pte_free_batch(batchp->entry, batchp->index);
+		batchp->entry = NULL;
+	}
+
+	spin_unlock(&batchp->lock);
+}
+
 
 extern int do_check_pgt_cache(int, int);
 
diff -pru -X /root/.diffexclude linux-2.4.21-6.EL-just_tools/include/asm-ppc64/pgtable.h hashpage/include/asm-ppc64/pgtable.h
--- linux-2.4.21-6.EL-just_tools/include/asm-ppc64/pgtable.h	2004-01-06 20:35:29.000000000 -0600
+++ hashpage/include/asm-ppc64/pgtable.h	2004-01-19 00:47:33.000000000 -0600
@@ -88,22 +88,22 @@
  * Bits in a linux-style PTE.  These match the bits in the
  * (hardware-defined) PowerPC PTE as closely as possible.
  */
-#define _PAGE_PRESENT	0x001UL		/* software: pte contains a translation */
-#define _PAGE_USER	0x002UL		/* matches one of the PP bits */
-#define _PAGE_RW	0x004UL		/* software: user write access allowed */
-#define _PAGE_GUARDED	0x008UL
-#define _PAGE_COHERENT	0x010UL		/* M: enforce memory coherence (SMP systems) */
-#define _PAGE_NO_CACHE	0x020UL		/* I: cache inhibit */
-#define _PAGE_WRITETHRU	0x040UL		/* W: cache write-through */
-#define _PAGE_DIRTY	0x080UL		/* C: page changed */
-#define _PAGE_ACCESSED	0x100UL		/* R: page referenced */
-#define _PAGE_HPTENOIX	0x200UL		/* software: pte HPTE slot unknown */
-#define _PAGE_HASHPTE	0x400UL		/* software: pte has an associated HPTE */
-#define _PAGE_EXEC	0x800UL		/* software: i-cache coherence required */
-#define _PAGE_SECONDARY	0x8000UL	/* software: HPTE is in secondary group */
-#define _PAGE_GROUP_IX	0x7000UL	/* software: HPTE index within group */
+#define _PAGE_PRESENT	0x0001		/* software: pte contains a translation */
+#define _PAGE_USER	0x0002		/* matches one of the PP bits */
+#define _PAGE_RW	0x0004		/* software: user write access allowed */
+#define _PAGE_GUARDED	0x0008
+#define _PAGE_COHERENT	0x0010		/* M: enforce memory coherence (SMP systems) */
+#define _PAGE_NO_CACHE	0x0020		/* I: cache inhibit */
+#define _PAGE_WRITETHRU	0x0040		/* W: cache write-through */
+#define _PAGE_DIRTY	0x0080		/* C: page changed */
+#define _PAGE_ACCESSED	0x0100		/* R: page referenced */
+#define _PAGE_BUSY	0x0200		/* software: pte & hash are busy */
+#define _PAGE_HASHPTE	0x0400		/* software: pte has an associated HPTE */
+#define _PAGE_EXEC	0x0800		/* software: i-cache coherence required */
+#define _PAGE_GROUP_IX	0x7000		/* software: HPTE index within group */
+#define _PAGE_SECONDARY	0x8000		/* software: HPTE is in secondary group */
 /* Bits 0x7000 identify the index within an HPT Group */
 
-#define _PAGE_HPTEFLAGS (_PAGE_HASHPTE | _PAGE_HPTENOIX | _PAGE_SECONDARY | _PAGE_GROUP_IX)
+#define _PAGE_HPTEFLAGS (_PAGE_BUSY | _PAGE_HASHPTE | _PAGE_SECONDARY | _PAGE_GROUP_IX)
 /* PAGE_MASK gives the right answer below, but only by accident */
 /* It should be preserving the high 48 bits and then specifically */
 /* preserving _PAGE_SECONDARY | _PAGE_GROUP_IX */

JSD:
JSD: Why was the UL (unsigned long) dropped from the bit definitions?
JSD:

@@ -289,13 +289,15 @@ static inline unsigned long pte_update(
 	unsigned long old, tmp;
 
 	__asm__ __volatile__("\n\
-1:	ldarx	%0,0,%3		\n\
+1:	ldarx	%0,0,%3		\n\
+	andi.	%1,%0,%7	# loop on _PAGE_BUSY set\n\
+	bne-	1b		\n\
 	andc	%1,%0,%4	\n\
 	or	%1,%1,%5	\n\
 	stdcx.	%1,0,%3		\n\
 	bne-	1b"
 	: "=&r" (old), "=&r" (tmp), "=m" (*p)
-	: "r" (p), "r" (clr), "r" (set), "m" (*p)
+	: "r" (p), "r" (clr), "r" (set), "m" (*p), "i" (_PAGE_BUSY)
 	: "cc" );
 
 	return old;
 }
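For completeness, the net effect of the two added instructions is that
pte_update() now waits for _PAGE_BUSY to clear (i.e. for __hash_page() to
drop its lock on the pte) before applying the change, roughly as in this
C-level sketch (illustration only; store_conditional() again stands in for
the ldarx/stdcx. reservation pair):

	again:
		old = pte_val(*p);		/* ldarx */
		if (old & _PAGE_BUSY)
			goto again;		/* spin until __hash_page() unlocks the pte */
		if (!store_conditional(p, (old & ~clr) | set))
			goto again;		/* lost the reservation */
		return old;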