diff -pru -X /root/.diffexclude linux-2.4.21-6.EL-just_tools/arch/ppc64/kernel/htab.c hashpage/arch/ppc64/kernel/htab.c
--- linux-2.4.21-6.EL-just_tools/arch/ppc64/kernel/htab.c	2004-01-06 20:35:23.000000000 -0600
+++ hashpage/arch/ppc64/kernel/htab.c	2004-01-18 23:13:51.000000000 -0600
@@ -48,6 +48,29 @@
 #include
 #include
 
+#define HPTE_LOCK_BIT 3
+
+static inline void pSeries_lock_hpte(HPTE *hptep)
+{
+	unsigned long *word = &hptep->dw0.dword0;
+
+	while (1) {
+		if (!test_and_set_bit(HPTE_LOCK_BIT, word))
+			break;
+		while(test_bit(HPTE_LOCK_BIT, word))
+			cpu_relax();
+	}
+}
+
+static inline void pSeries_unlock_hpte(HPTE *hptep)
+{
+	unsigned long *word = &hptep->dw0.dword0;
+
+	asm volatile("lwsync":::"memory");
+	clear_bit(HPTE_LOCK_BIT, word);
+}

JSD: Other places within the kernel do an smp_mb__before_clear_bit() when
JSD: clearing a bit representing a lock.  Would that be a better choice here
JSD: (it resolves to a sync)?
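For comparison, the variant the comment above is asking about would look
roughly like this (a sketch only, not part of the patch; per the comment,
smp_mb__before_clear_bit() resolves to a full sync, so it is a heavier
barrier than the lwsync used above):

	static inline void pSeries_unlock_hpte(HPTE *hptep)
	{
		unsigned long *word = &hptep->dw0.dword0;

		/* full barrier before dropping the lock bit, as done
		 * for other bit locks elsewhere in the kernel */
		smp_mb__before_clear_bit();
		clear_bit(HPTE_LOCK_BIT, word);
	}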
+
+
 /*
  * Note: pte  --> Linux PTE
  *       HPTE --> PowerPC Hashed Page Table Entry
@@ -64,6 +87,7 @@
 HTAB htab_data = {NULL, 0, 0, 0, 0};
 extern unsigned long _SDR1;
 extern unsigned long klimit;
+extern rwlock_t pte_hash_lock[] __cacheline_aligned_in_smp;
 
 void make_pte(HPTE *htab, unsigned long va, unsigned long pa,
	      int mode, unsigned long hash_mask, int large);
@@ -320,45 +344,73 @@ int __hash_page(unsigned long ea, unsign
 	unsigned long va, vpn;
 	unsigned long newpp, prpn;
 	unsigned long hpteflags, lock_slot;
+	unsigned long access_ok, tmp;
 	long slot;
 	pte_t old_pte, new_pte;
+	int ret = 0;
 
 	/* Search the Linux page table for a match with va */
 	va = (vsid << 28) | (ea & 0x0fffffff);
 	vpn = va >> PAGE_SHIFT;
 	lock_slot = get_lock_slot(vpn);
 
-	/* Acquire the hash table lock to guarantee that the linux
-	 * pte we fetch will not change
+	/*
+	 * Check the user's access rights to the page.  If access should be
+	 * prevented then send the problem up to do_page_fault.
 	 */
-	spin_lock(&hash_table_lock[lock_slot].lock);
-
 	/*
 	 * Check the user's access rights to the page.  If access should be
 	 * prevented then send the problem up to do_page_fault.
 	 */
-#ifdef CONFIG_SHARED_MEMORY_ADDRESSING
+	access |= _PAGE_PRESENT;
-	if (unlikely(access & ~(pte_val(*ptep)))) {
+
+	/* We'll do access checking and _PAGE_BUSY setting in assembly, since
+	 * it needs to be atomic.
+	 */
+
+	__asm__ __volatile__ ("\n
+	1:	ldarx	%0,0,%3\n
+	# Check access rights (access & ~(pte_val(*ptep)))\n
+		andc.	%1,%2,%0\n
+		bne-	2f\n
+	# Check if PTE is busy\n
+		andi.	%1,%0,%4\n
+		bne-	1b\n
+		ori	%0,%0,%4\n
+	# Write the linux PTE atomically (setting busy)\n
+		stdcx.	%0,0,%3\n
+		bne-	1b\n
+		li	%1,1\n
+		b	3f\n
+	2:	stdcx.	%0,0,%3		# to clear the reservation\n
+		li	%1,0\n
+	3:"
+	: "=r" (old_pte), "=r" (access_ok)
+	: "r" (access), "r" (ptep), "i" (_PAGE_BUSY)
+	: "cr0", "memory");
+
+#ifdef CONFIG_SHARED_MEMORY_ADDRESSING
+	if (unlikely(!access_ok)) {
 		if(!(((ea >> SMALLOC_EA_SHIFT) ==
 		      (SMALLOC_START >> SMALLOC_EA_SHIFT)) &&
 		     ((current->thread.flags) & PPC_FLAG_SHARED))) {
-			spin_unlock(&hash_table_lock[lock_slot].lock);
-			return 1;
+			ret = 1;
+			goto out_unlock;
 		}
 	}

JSD: The inline assembly code will not set _PAGE_BUSY if it finds that
JSD: the user's access rights don't allow access to the page.
JSD: So, if access_ok = 0, _PAGE_BUSY is not set, and the code may
JSD: branch to out_unlock where _PAGE_BUSY is unconditionally cleared.
JSD: It could be possible for another processor to coincidentally have
JSD: set _PAGE_BUSY in the PTE but then have this processor clear it
JSD: before the other processor wanted it clear.  Do you agree this can
JSD: happen?
JSD: Furthermore, the "ea" condition check (above) might yield false and
JSD: the code would then continue as though access_ok were true, modifying
JSD: the PTE without the _PAGE_BUSY bit set.  Seems bad.

 #else
-	access |= _PAGE_PRESENT;
-	if (unlikely(access & ~(pte_val(*ptep)))) {
-		spin_unlock(&hash_table_lock[lock_slot].lock);
-		return 1;
+	if (unlikely(!access_ok)) {
+		ret = 1;
+		goto out_unlock;
 	}
 #endif
 
 	/*
 	 * We have found a pte (which was present).
 	 * The spinlocks prevent this status from changing
-	 * The hash_table_lock prevents the _PAGE_HASHPTE status
+	 * The _PAGE_BUSY prevents the _PAGE_HASHPTE status
 	 * from changing (RPN, DIRTY and ACCESSED too)
 	 * The page_table_lock prevents the pte from being
 	 * invalidated or modified
@@ -385,7 +437,7 @@ int __hash_page(unsigned long ea, unsign
 	else
 		pte_val(new_pte) |= _PAGE_ACCESSED;
 
-	newpp = computeHptePP(pte_val(new_pte));
+	newpp = computeHptePP(pte_val(new_pte) & ~_PAGE_BUSY);
 
 	/* Check if pte already has an hpte (case 2) */
 	if (unlikely(pte_val(old_pte) & _PAGE_HASHPTE)) {
@@ -400,7 +452,7 @@ int __hash_page(unsigned long ea, unsign
 			slot = (hash & htab_data.htab_hash_mask) * HPTES_PER_GROUP;
 		slot += (pte_val(old_pte) & _PAGE_GROUP_IX) >> 12;
 
-		/* XXX fix large pte flag */
+		/* XXX fix large pte flag */
 		if (ppc_md.hpte_updatepp(slot, secondary,
 					 newpp, va, 0) == -1) {
 			pte_val(old_pte) &= ~_PAGE_HPTEFLAGS;
@@ -428,9 +480,12 @@ int __hash_page(unsigned long ea, unsign
 		*ptep = new_pte;
 	}

JSD: NOTE: at this point, the *ptep = new_pte just unlocked the PTE by
JSD: clearing _PAGE_BUSY.  The code then goes on to clear it again, thereby
JSD: possibly rendering unsafe updates that some other processor might be
JSD: doing after it thought it had set _PAGE_BUSY.  There is a small window
JSD: here.

-	spin_unlock(&hash_table_lock[lock_slot].lock);
+out_unlock:
+	smp_wmb();

JSD: I believe an smp_mb__before_clear_bit() would be a better choice, or
JSD: at least an lwsync because this is an unlock.

+
+	pte_val(*ptep) &= ~_PAGE_BUSY;
 
-	return 0;
+	return ret;
 }
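For readers of the patch, the locking protocol that the inline assembly and
the out_unlock path above implement is roughly the following C-level sketch
(illustration only; store_conditional() is a hypothetical stand-in for the
ldarx/stdcx. reservation pair, which is why the real code must stay in
assembly):

	again:
		old_pte = *ptep;			/* ldarx */
		if (access & ~pte_val(old_pte)) {
			access_ok = 0;			/* fail: _PAGE_BUSY is NOT taken */
		} else if (pte_val(old_pte) & _PAGE_BUSY) {
			goto again;			/* pte locked by someone else */
		} else if (!store_conditional(ptep,
				pte_val(old_pte) | _PAGE_BUSY)) {
			goto again;			/* lost the reservation */
		} else {
			access_ok = 1;			/* we now hold _PAGE_BUSY */
		}
		...
	out_unlock:
		smp_wmb();
		pte_val(*ptep) &= ~_PAGE_BUSY;		/* drop the pte lock */

Note that, as the review comments above point out, a failed access check can
reach out_unlock without ever having taken _PAGE_BUSY.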
 
 /*
@@ -497,11 +552,14 @@ int hash_page(unsigned long ea, unsigned
 	pgdir = mm->pgd;
 	if (pgdir == NULL) return 1;
 
-	/*
-	 * Lock the Linux page table to prevent mmap and kswapd
-	 * from modifying entries while we search and update
+	/* The pte_hash_lock is used to block any PTE deallocations
+	 * while we walk the tree and use the entry.  While technically
+	 * we both read and write the PTE entry while holding the read
+	 * lock, the _PAGE_BUSY bit will block pte_update()s to the
+	 * specific entry.
 	 */
-	spin_lock(&mm->page_table_lock);
+
+	read_lock(&pte_hash_lock[smp_processor_id()]);
 
 	ptep = find_linux_pte(pgdir, ea);
 
 	/*
@@ -514,8 +572,7 @@ int hash_page(unsigned long ea, unsigned
 		/* If no pte, send the problem up to do_page_fault */
 		ret = 1;
 	}
-
-	spin_unlock(&mm->page_table_lock);
+	read_unlock(&pte_hash_lock[smp_processor_id()]);
 
 	return ret;
 }
@@ -540,8 +597,6 @@ void flush_hash_page(unsigned long conte
 	lock_slot = get_lock_slot(vpn);
 	hash = hpt_hash(vpn, large);
 
-	spin_lock_irqsave(&hash_table_lock[lock_slot].lock, flags);
-
 	pte = __pte(pte_update(ptep, _PAGE_HPTEFLAGS, 0));
 	secondary = (pte_val(pte) & _PAGE_SECONDARY) >> 15;
 	if (secondary) hash = ~hash;
@@ -551,8 +606,6 @@ void flush_hash_page(unsigned long conte
 	if (pte_val(pte) & _PAGE_HASHPTE) {
 		ppc_md.hpte_invalidate(slot, secondary, va, large, local);
 	}
-
-	spin_unlock_irqrestore(&hash_table_lock[lock_slot].lock, flags);
 }
 
 long plpar_pte_enter(unsigned long flags,

JSD: In the above two hunks of code, a pte_update is done which returns
JSD: the old pte value.  This value is checked to determine if the hpte
JSD: should be invalidated.  However, there is no lock held between the
JSD: time the pte value is read and the time the hpte is invalidated.
JSD: The hpte_invalidate routine doesn't check to make sure the va
JSD: passed in is really the one being invalidated in the slot -- it just
JSD: assumes the slot, etc are enough to locate it.  So we might be
JSD: invalidating the wrong thing here.  Probably a don't care, but thought
JSD: I'd ask.

@@ -787,6 +840,8 @@ static void hpte_invalidate(unsigned lon
 
 	avpn = vpn >> 11;
 
+	pSeries_lock_hpte(hptep);
+
 	dw0 = hptep->dw0.dw0;
 
 	/*
@@ -794,7 +849,11 @@ static void hpte_invalidate(unsigned lon
 	 * the AVPN, hash group, and valid bits.  By doing it this way,
 	 * it is common with the pSeries LPAR optimal path.
 	 */
-	if (dw0.bolted) return;
+	if (dw0.bolted) {
+		pSeries_unlock_hpte(hptep);
+
+		return;
+	}
 
 	/* Invalidate the hpte. */
 	hptep->dw0.dword0 = 0;

JSD: It would really be nice to add a comment here that the above
JSD: assignment statement is also unlocking the hpte as well.
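A comment along these lines would do (wording is only a suggestion):

	/* Invalidate the hpte.  Note that clearing dword0 also clears
	 * HPTE_LOCK_BIT, so this drops the per-HPTE lock as well; no
	 * explicit pSeries_unlock_hpte() is needed on this path. */
	hptep->dw0.dword0 = 0;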
@@ -875,6 +934,8 @@ static long hpte_updatepp(unsigned long
 
 	avpn = vpn >> 11;
 
+	pSeries_lock_hpte(hptep);
+
 	dw0 = hptep->dw0.dw0;
 
 	if ((dw0.avpn == avpn) && (dw0.v) && (dw0.h == secondary)) {
@@ -900,10 +961,14 @@ static long hpte_updatepp(unsigned long
 
 		hptep->dw0.dw0 = dw0;
 
 		__asm__ __volatile__ ("ptesync" : : : "memory");
+
+		pSeries_unlock_hpte(hptep);
 
 		return 0;
 	}
 
+	pSeries_unlock_hpte(hptep);
+
 	return -1;
 }
 
@@ -1062,9 +1127,11 @@ repeat:
 		dw0 = hptep->dw0.dw0;
 		if (!dw0.v) {
 			/* retry with lock held */
+			pSeries_lock_hpte(hptep);
 			dw0 = hptep->dw0.dw0;
 			if (!dw0.v)
 				break;
+			pSeries_unlock_hpte(hptep);
 		}
 		hptep++;
 	}
@@ -1079,9 +1146,11 @@ repeat:
 		dw0 = hptep->dw0.dw0;
 		if (!dw0.v) {
 			/* retry with lock held */
+			pSeries_lock_hpte(hptep);
 			dw0 = hptep->dw0.dw0;
 			if (!dw0.v)
 				break;
+			pSeries_unlock_hpte(hptep);
 		}
 		hptep++;
 	}
@@ -1304,9 +1373,11 @@ static long hpte_remove(unsigned long hp
 
 		if (dw0.v && !dw0.bolted) {
 			/* retry with lock held */
+			pSeries_lock_hpte(hptep);
 			dw0 = hptep->dw0.dw0;
 			if (dw0.v && !dw0.bolted)
 				break;
+			pSeries_unlock_hpte(hptep);
 		}
 
 		slot_offset++;
diff -pru -X /root/.diffexclude linux-2.4.21-6.EL-just_tools/arch/ppc64/mm/init.c hashpage/arch/ppc64/mm/init.c
--- linux-2.4.21-6.EL-just_tools/arch/ppc64/mm/init.c	2004-01-06 20:35:23.000000000 -0600
+++ hashpage/arch/ppc64/mm/init.c	2004-01-18 18:11:14.000000000 -0600
@@ -104,9 +104,90 @@ unsigned long __max_memory;
  */
 mmu_gather_t mmu_gathers[NR_CPUS];
 
+/* PTE free batching structures.  We need a lock since not all
+ * operations take place under page_table_lock.  Keep it per-CPU
+ * to avoid bottlenecks.
+ */
+
+struct pte_freelist_batch ____cacheline_aligned pte_freelist_cur[NR_CPUS] __cacheline_aligned_in_smp;
+rwlock_t pte_hash_lock[NR_CPUS] __cacheline_aligned_in_smp = { [0 ... NR_CPUS-1] = RW_LOCK_UNLOCKED };
+
+unsigned long pte_freelist_forced_free;
+
+static inline void pte_free_sync(void)
+{
+	unsigned long flags;
+	int i;
+
+	/* All we need to know is that we can get the write lock if
+	 * we wanted to, i.e. that no hash_page()s are holding it for reading.
+	 * If none are reading, that means there's no currently executing
+	 * hash_page() that might be working on one of the PTE's that will
+	 * be deleted.  Likewise, if there is a reader, we need to get the
+	 * write lock to know when it releases the lock.
+	 */
+
+	for (i = 0; i < smp_num_cpus; i++)
+		if (is_read_locked(&pte_hash_lock[i])) {
+			if(i == smp_processor_id())
+				local_irq_save(flags);
+
+			write_lock(&pte_hash_lock[i]);
+			write_unlock(&pte_hash_lock[i]);
+
+			if(i == smp_processor_id())
+				local_irq_restore(flags);
+		}
+}

JSD: Two questions here.  (1) Shouldn't interrupts be disabled for the
JSD: write_lock/unlock here regardless of what processor we are running
JSD: on?  (2) I don't see how this code is preventing another processor
JSD: from grabbing the read_lock immediately after this processor has
JSD: checked to make sure it isn't held.
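On question (2): one way to close the check-then-lock window would be to
drop the is_read_locked() test and unconditionally bounce through every
CPU's lock, along these lines (a sketch only; it presumes, as the batching
scheme seems to intend, that readers arriving later cannot find the PTEs
being freed because they were already unhooked from the page tables):

	static inline void pte_free_sync(void)
	{
		unsigned long flags;
		int i;

		/* Once we can take each lock for writing, any hash_page()
		 * that was already inside its read-side critical section
		 * has finished with the PTEs we are about to free. */
		for (i = 0; i < smp_num_cpus; i++) {
			if (i == smp_processor_id())
				local_irq_save(flags);

			write_lock(&pte_hash_lock[i]);
			write_unlock(&pte_hash_lock[i]);

			if (i == smp_processor_id())
				local_irq_restore(flags);
		}
	}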
+
+
+/* This is only called when we are critically out of memory
+ * (and fail to get a page in pte_free_tlb).
+ */
+void pte_free_now(pte_t *pte)
+{
+	pte_freelist_forced_free++;
+
+	pte_free_sync();
+
+	pte_free_kernel(pte);
+}
+
+
+void pte_free_batch(void **batch, int size)
+{
+	unsigned int i;
+
+	pte_free_sync();
+
+	for (i = 0; i < size; i++)
+		pte_free_kernel(batch[i]);
+
+	free_page((unsigned long)batch);
+}
+
+
 int do_check_pgt_cache(int low, int high)
 {
 	int freed = 0;
+	struct pte_freelist_batch *batch;
+
+	/* We use this function to push the current pte free batch to be
+	 * deallocated, since do_check_pgt_cache() is called at the end of each
+	 * free_one_pgd() and other parts of the VM rely on all PTE's being
+	 * properly freed upon return from that function.
+	 */
+
+	batch = &pte_freelist_cur[smp_processor_id()];
+
+	spin_lock(&batch->lock);
+
+	if(batch->entry) {
+		pte_free_batch(batch->entry, batch->index);
+		batch->entry = NULL;
+	}
+
+	spin_unlock(&batch->lock);

JSD: Since the pte_freelist_cur is a per-processor structure, I don't
JSD: think you need the batch->lock at all.  What other thing could be
JSD: running on this same processor at the same time?

 
 #if 0
 	if (pgtable_cache_size > high) {
@@ -246,14 +327,14 @@ static void map_io_page(unsigned long ea
 	unsigned long vsid;
 
 	if (mem_init_done) {
-		spin_lock(&ioremap_mm.page_table_lock);
+		spin_lock_irq(&ioremap_mm.page_table_lock);
 		pgdp = pgd_offset_i(ea);
 		pmdp = pmd_alloc(&ioremap_mm, pgdp, ea);
 		ptep = pte_alloc_kernel(&ioremap_mm, pmdp, ea);
 		pa = absolute_to_phys(pa);
 		set_pte(ptep, mk_pte_phys(pa & PAGE_MASK, __pgprot(flags)));
-		spin_unlock(&ioremap_mm.page_table_lock);
+		spin_unlock_irq(&ioremap_mm.page_table_lock);
 	} else {
 		/* If the mm subsystem is not fully up, we cannot create a
 		 * linux page table entry for this mapping.  Simply bolt an

JSD: Why were these spinlocks changed to _irq?  I noticed this change was
JSD: not present in the 2.6 code.

@@ -292,7 +373,9 @@ local_flush_tlb_all(void)
 
 void
 local_flush_tlb_mm(struct mm_struct *mm)
 {
-	spin_lock(&mm->page_table_lock);
+	unsigned long flags;
+
+	spin_lock_irqsave(&mm->page_table_lock, flags);
 
 	if ( mm->map_count ) {
 		struct vm_area_struct *mp;
@@ -300,7 +383,7 @@ local_flush_tlb_mm(struct mm_struct *mm)
 			local_flush_tlb_range( mm, mp->vm_start, mp->vm_end );
 	}
 
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock_irqrestore(&mm->page_table_lock, flags);
 }
 
 /*

JSD: Same question about the _irq addition.  Doesn't seem necessary.  I'm
JSD: probably missing something -- please explain.

diff -pru -X /root/.diffexclude linux-2.4.21-6.EL-just_tools/include/asm-ppc64/pgalloc.h hashpage/include/asm-ppc64/pgalloc.h
--- linux-2.4.21-6.EL-just_tools/include/asm-ppc64/pgalloc.h	2004-01-06 20:35:29.000000000 -0600
+++ hashpage/include/asm-ppc64/pgalloc.h	2004-01-18 18:11:14.000000000 -0600
@@ -93,12 +93,6 @@ pte_free_kernel(pte_t *pte)
 }
 
-static inline void
-pmd_free (pmd_t *pmd)
-{
-	free_page((unsigned long)pmd);
-}
-
 #define pte_alloc_one_fast(mm, address)		(0)
 
 static inline struct page *
@@ -112,7 +106,52 @@ pte_alloc_one(struct mm_struct *mm, unsi
 	return NULL;
 }
 
-#define pte_free(pte_page)	pte_free_kernel(page_address(pte_page))
+/* Use the PTE functions for freeing PMD as well, since the same
+ * problem with tree traversals applies.  Since pmd pointers are always
+ * virtual, no need for a page_address() translation.
+ */
+
+#define pte_free(pte_page)	__pte_free(page_address(pte_page))
+#define pmd_free(pmd)		__pte_free(pmd)
+
+struct pte_freelist_batch
+{
+	unsigned int	index;
+	spinlock_t	lock;
+	void**		entry;
+};
+
+#define PTE_FREELIST_SIZE	(PAGE_SIZE / sizeof(void *))
+
+extern void pte_free_now(pte_t *pte);
+extern void pte_free_batch(void **batch, int size);
+extern struct ____cacheline_aligned pte_freelist_batch pte_freelist_cur[] __cacheline_aligned_in_smp;
+
+static inline void __pte_free(pte_t *pte)
+{
+	struct pte_freelist_batch *batchp = &pte_freelist_cur[smp_processor_id()];
+
+	spin_lock(&batchp->lock);
+
+	if (batchp->entry == NULL) {
+		batchp->entry = (void **)__get_free_page(GFP_ATOMIC);
+		if (batchp->entry == NULL) {
+			spin_unlock(&batchp->lock);
+			pte_free_now(pte);
+			return;
+		}
+		batchp->index = 0;
+	}
+
+	batchp->entry[batchp->index++] = pte;
+	if (batchp->index == PTE_FREELIST_SIZE) {
+		pte_free_batch(batchp->entry, batchp->index);
+		batchp->entry = NULL;
+	}
+
+	spin_unlock(&batchp->lock);
+}
+
 
 extern int do_check_pgt_cache(int, int);
 
diff -pru -X /root/.diffexclude linux-2.4.21-6.EL-just_tools/include/asm-ppc64/pgtable.h hashpage/include/asm-ppc64/pgtable.h
--- linux-2.4.21-6.EL-just_tools/include/asm-ppc64/pgtable.h	2004-01-06 20:35:29.000000000 -0600
+++ hashpage/include/asm-ppc64/pgtable.h	2004-01-19 00:47:33.000000000 -0600
@@ -88,22 +88,22 @@
  * Bits in a linux-style PTE.  These match the bits in the
  * (hardware-defined) PowerPC PTE as closely as possible.
  */
-#define _PAGE_PRESENT	0x001UL		/* software: pte contains a translation */
-#define _PAGE_USER	0x002UL		/* matches one of the PP bits */
-#define _PAGE_RW	0x004UL		/* software: user write access allowed */
-#define _PAGE_GUARDED	0x008UL
-#define _PAGE_COHERENT	0x010UL		/* M: enforce memory coherence (SMP systems) */
-#define _PAGE_NO_CACHE	0x020UL		/* I: cache inhibit */
-#define _PAGE_WRITETHRU	0x040UL		/* W: cache write-through */
-#define _PAGE_DIRTY	0x080UL		/* C: page changed */
-#define _PAGE_ACCESSED	0x100UL		/* R: page referenced */
-#define _PAGE_HPTENOIX	0x200UL		/* software: pte HPTE slot unknown */
-#define _PAGE_HASHPTE	0x400UL		/* software: pte has an associated HPTE */
-#define _PAGE_EXEC	0x800UL		/* software: i-cache coherence required */
-#define _PAGE_SECONDARY	0x8000UL	/* software: HPTE is in secondary group */
-#define _PAGE_GROUP_IX	0x7000UL	/* software: HPTE index within group */
+#define _PAGE_PRESENT	0x0001		/* software: pte contains a translation */
+#define _PAGE_USER	0x0002		/* matches one of the PP bits */
+#define _PAGE_RW	0x0004		/* software: user write access allowed */
+#define _PAGE_GUARDED	0x0008
+#define _PAGE_COHERENT	0x0010		/* M: enforce memory coherence (SMP systems) */
+#define _PAGE_NO_CACHE	0x0020		/* I: cache inhibit */
+#define _PAGE_WRITETHRU	0x0040		/* W: cache write-through */
+#define _PAGE_DIRTY	0x0080		/* C: page changed */
+#define _PAGE_ACCESSED	0x0100		/* R: page referenced */
+#define _PAGE_BUSY	0x0200		/* software: pte & hash are busy */
+#define _PAGE_HASHPTE	0x0400		/* software: pte has an associated HPTE */
+#define _PAGE_EXEC	0x0800		/* software: i-cache coherence required */
+#define _PAGE_GROUP_IX	0x7000		/* software: HPTE index within group */
+#define _PAGE_SECONDARY	0x8000		/* software: HPTE is in secondary group */
 /* Bits 0x7000 identify the index within an HPT Group */
 
-#define _PAGE_HPTEFLAGS (_PAGE_HASHPTE | _PAGE_HPTENOIX | _PAGE_SECONDARY | _PAGE_GROUP_IX)
+#define _PAGE_HPTEFLAGS (_PAGE_BUSY | _PAGE_HASHPTE | _PAGE_SECONDARY | _PAGE_GROUP_IX)
 /* PAGE_MASK gives the right answer below, but only by accident */
 /* It should be preserving the high 48 bits and then specifically */
 /* preserving _PAGE_SECONDARY | _PAGE_GROUP_IX */

JSD:
JSD: Why was the UL (unsigned long) dropped from the bit definitions?
JSD:

@@ -289,13 +289,15 @@ static inline unsigned long pte_update(
 	unsigned long old, tmp;
 
 	__asm__ __volatile__("\n\
-1:	ldarx	%0,0,%3		\n\
+1:	ldarx	%0,0,%3		\n\
+	andi.	%1,%0,%7	# loop on _PAGE_BUSY set\n\
+	bne-	1b		\n\
 	andc	%1,%0,%4	\n\
 	or	%1,%1,%5	\n\
 	stdcx.	%1,0,%3		\n\
 	bne-	1b"
 	: "=&r" (old), "=&r" (tmp), "=m" (*p)
-	: "r" (p), "r" (clr), "r" (set), "m" (*p)
+	: "r" (p), "r" (clr), "r" (set), "m" (*p), "i" (_PAGE_BUSY)
 	: "cc" );
 
 	return old;
 }
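For completeness, the net effect of the two added instructions is that
pte_update() now waits for _PAGE_BUSY to clear (i.e. for __hash_page() to
drop its lock on the pte) before applying the change, roughly as in this
C-level sketch (illustration only; store_conditional() again stands in for
the ldarx/stdcx. reservation pair):

	again:
		old = pte_val(*p);		/* ldarx */
		if (old & _PAGE_BUSY)
			goto again;		/* spin until __hash_page() unlocks the pte */
		if (!store_conditional(p, (old & ~clr) | set))
			goto again;		/* lost the reservation */
		return old;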