Index: arch/i386/include/param.h =================================================================== --- arch/i386/include/param.h (revision 790) +++ arch/i386/include/param.h (revision 791) @@ -88,7 +88,7 @@ #define KERNBASE_LOCORE 0xc0000000 /* start of kernel virtual space */ #endif -#define KERNBASE ((u_long)KERNBASE_LOCORE) +#define KERNBASE (1UL * KERNBASE_LOCORE) #define KERNTEXTOFF (KERNBASE_LOCORE + 0x100000) /* start of kernel text */ #define BTOPKERNBASE (KERNBASE >> PGSHIFT) @@ -182,13 +182,14 @@ /* * Mach derived conversion macros */ -#define x86_round_pdr(x) ((((unsigned)(x)) + PDOFSET) & ~PDOFSET) -#define x86_trunc_pdr(x) ((unsigned)(x) & ~PDOFSET) -#define x86_btod(x) ((unsigned)(x) >> PDSHIFT) -#define x86_dtob(x) ((unsigned)(x) << PDSHIFT) -#define x86_round_page(x) ((((unsigned)(x)) + PGOFSET) & ~PGOFSET) -#define x86_trunc_page(x) ((unsigned)(x) & ~PGOFSET) -#define x86_btop(x) ((unsigned)(x) >> PGSHIFT) -#define x86_ptob(x) ((unsigned)(x) << PGSHIFT) +#define x86_round_pdr(x) \ + ((((unsigned long)(x)) + (NBPD_L2 - 1)) & ~(NBPD_L2 - 1)) +#define x86_trunc_pdr(x) ((unsigned long)(x) & ~(NBPD_L2 - 1)) +#define x86_btod(x) ((unsigned long)(x) >> L2_SHIFT) +#define x86_dtob(x) ((unsigned long)(x) << L2_SHIFT) +#define x86_round_page(x) ((((unsigned long)(x)) + PGOFSET) & ~PGOFSET) +#define x86_trunc_page(x) ((unsigned long)(x) & ~PGOFSET) +#define x86_btop(x) ((unsigned long)(x) >> PGSHIFT) +#define x86_ptob(x) ((unsigned long)(x) << PGSHIFT) #endif /* _I386_PARAM_H_ */ Index: arch/i386/include/cpu.h =================================================================== --- arch/i386/include/cpu.h (revision 790) +++ arch/i386/include/cpu.h (revision 791) @@ -444,7 +444,7 @@ void est_init(struct cpu_info *); #define CPU_CONSDEV 1 /* dev_t: console terminal device */ #define CPU_BIOSBASEMEM 2 /* int: bios-reported base mem (K) */ #define CPU_BIOSEXTMEM 3 /* int: bios-reported ext. mem (K) */ -#define CPU_NKPDE 4 /* int: number of kernel PDEs */ +/* CPU_NKPDE 4 obsolete: int: number of kernel PDEs */ #define CPU_BOOTED_KERNEL 5 /* string: booted kernel name */ #define CPU_DISKINFO 6 /* struct disklist *: * disk geometry information */ Index: arch/i386/include/pte.h =================================================================== --- arch/i386/include/pte.h (revision 790) +++ arch/i386/include/pte.h (revision 791) @@ -1,6 +1,41 @@ /* $NetBSD: pte.h,v 1.14 2003/08/24 17:52:33 chs Exp $ */ /* + * Copyright (c) 2001 Wasabi Systems, Inc. + * All rights reserved. + * + * Written by Frank van der Linden for Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed for the NetBSD Project by + * Wasabi Systems, Inc. + * 4. The name of Wasabi Systems, Inc. may not be used to endorse + * or promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. 
``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* * * Copyright (c) 1997 Charles D. Cranor and Washington University. * All rights reserved. @@ -142,16 +177,16 @@ typedef u_int32_t pt_entry_t; /* PTE */ * now we define various for playing with virtual addresses */ -#define PDSHIFT 22 /* offset of PD index in VA */ -#define NBPD (1 << PDSHIFT) /* # bytes mapped by PD (4MB) */ -#define PDOFSET (NBPD-1) /* mask for non-PD part of VA */ -#if 0 /* not used? */ -#define NPTEPD (NBPD / PAGE_SIZE) /* # of PTEs in a PD */ -#else -#define PTES_PER_PTP (NBPD / PAGE_SIZE) /* # of PTEs in a PTP */ -#endif -#define PD_MASK 0xffc00000 /* page directory address bits */ -#define PT_MASK 0x003ff000 /* page table address bits */ +#define L1_SHIFT 12 +#define L2_SHIFT 22 +#define NBPD_L1 (1ULL << L1_SHIFT) /* # bytes mapped by L1 ent (4K) */ +#define NBPD_L2 (1ULL << L2_SHIFT) /* # bytes mapped by L2 ent (4MB) */ + +#define L2_MASK 0xffc00000 +#define L1_MASK 0x003ff000 + +#define L2_FRAME L2_MASK +#define L1_FRAME (L2_FRAME|L1_MASK) /* * here we define the bits of the PDE/PTE, as described above: Index: arch/i386/include/pmap.h =================================================================== --- arch/i386/include/pmap.h (revision 790) +++ arch/i386/include/pmap.h (revision 791) @@ -33,12 +33,48 @@ */ /* + * Copyright (c) 2001 Wasabi Systems, Inc. + * All rights reserved. + * + * Written by Frank van der Linden for Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed for the NetBSD Project by + * Wasabi Systems, Inc. + * 4. The name of Wasabi Systems, Inc. may not be used to endorse + * or promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL WASABI SYSTEMS, INC + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* * pmap.h: see pmap.c for the history of this pmap module. */ #ifndef _I386_PMAP_H_ #define _I386_PMAP_H_ +#ifndef _LOCORE #if defined(_KERNEL_OPT) #include "opt_user_ldt.h" #include "opt_largepages.h" @@ -48,6 +84,7 @@ #include #include #include +#endif /* * see pte.h for a description of i386 MMU terminology and hardware @@ -143,12 +180,22 @@ /* XXX MP should we allocate one APDP_PDE per processor?? */ /* - * the following defines identify the slots used as described above. + * Mask to get rid of the sign-extended part of addresses. + */ +#define VA_SIGN_MASK 0 +#define VA_SIGN_NEG(va) ((va) | VA_SIGN_MASK) +/* + * XXXfvdl this one's not right. */ +#define VA_SIGN_POS(va) ((va) & ~VA_SIGN_MASK) + +#define L2_SLOT_PTE 767 +#define L2_SLOT_KERN (KERNBASE >> L2_SHIFT) /* 768 */ +#define L2_SLOT_APTE 1023 -#define PDSLOT_PTE ((KERNBASE/NBPD)-1) /* 767: for recursive PDP map */ -#define PDSLOT_KERN (KERNBASE/NBPD) /* 768: start of kernel space */ -#define PDSLOT_APTE ((unsigned)1023) /* 1023: alternative recursive slot */ +#define PDIR_SLOT_KERN L2_SLOT_KERN +#define PDIR_SLOT_PTE L2_SLOT_PTE +#define PDIR_SLOT_APTE L2_SLOT_APTE /* * the following defines give the virtual addresses of various MMU @@ -156,34 +203,75 @@ * PTE_BASE and APTE_BASE: the base VA of the linear PTE mappings * PTD_BASE and APTD_BASE: the base VA of the recursive mapping of the PTD * PDP_PDE and APDP_PDE: the VA of the PDE that points back to the PDP/APDP + * */ -#define PTE_BASE ((pt_entry_t *) (PDSLOT_PTE * NBPD) ) -#define APTE_BASE ((pt_entry_t *) (PDSLOT_APTE * NBPD) ) -#define PDP_BASE ((pd_entry_t *)(((char *)PTE_BASE) + (PDSLOT_PTE * PAGE_SIZE))) -#define APDP_BASE ((pd_entry_t *)(((char *)APTE_BASE) + (PDSLOT_APTE * PAGE_SIZE))) -#define PDP_PDE (PDP_BASE + PDSLOT_PTE) -#define APDP_PDE (PDP_BASE + PDSLOT_APTE) - -/* - * the follow define determines how many PTPs should be set up for the - * kernel by locore.s at boot time. this should be large enough to - * get the VM system running. once the VM system is running, the - * pmap module can add more PTPs to the kernel area on demand. 
- */ +#define PTE_BASE ((pt_entry_t *) (L2_SLOT_PTE * NBPD_L2)) +#define APTE_BASE ((pt_entry_t *) (L2_SLOT_APTE * NBPD_L2)) -#ifndef NKPTP -#define NKPTP 4 /* 16MB to start */ -#endif -#define NKPTP_MIN 4 /* smallest value we allow */ -#define NKPTP_MAX (1024 - (KERNBASE/NBPD) - 1) +#define L1_BASE PTE_BASE +#define AL1_BASE APTE_BASE + +#define L2_BASE ((pd_entry_t *)((char *)L1_BASE + L2_SLOT_PTE * NBPD_L1)) + +#define AL2_BASE ((pd_entry_t *)((char *)AL1_BASE + L2_SLOT_PTE * NBPD_L1)) + +#define PDP_PDE (L2_BASE + PDIR_SLOT_PTE) +#define APDP_PDE (L2_BASE + PDIR_SLOT_APTE) + +#define PDP_BASE L2_BASE +#define APDP_BASE AL2_BASE + +#define NKL2_MAX_ENTRIES (NTOPLEVEL_PDES - (KERNBASE/NBPD_L2) - 1) /* largest value (-1 for APTP space) */ +#define NKL1_MAX_ENTRIES (unsigned long)(NKL2_MAX_ENTRIES * NPDPG) + +#define NKL2_KIMG_ENTRIES 4 /* - * pdei/ptei: generate index into PDP/PTP from a VA + * Since kva space is below the kernel in its entirety, we start off + * with zero entries on each level. + * + * if KERNBASE == VM_MIN_KERNEL_ADDRESS, + * pmap_alloc_level() deals with entries allocated by startup code. + */ + +#define NKL2_START_ENTRIES 0 +#define NKL1_START_ENTRIES 0 /* XXX notused */ + +#define NTOPLEVEL_PDES (PAGE_SIZE / (sizeof (pd_entry_t))) + +/* + * XXXyamt what's this? + * #define KERNSPACE (NKL4_ENTRIES * NBPD_L4) + */ + +#define NPDPG (PAGE_SIZE / sizeof (pd_entry_t)) + +#define ptei(VA) (((VA_SIGN_POS(VA)) & L1_MASK) >> L1_SHIFT) + +/* + * pl*_pi: index in the ptp page for a pde mapping a VA. + * (pl*_i below is the index in the virtual array of all pdes per level) */ -#define pdei(VA) (((VA) & PD_MASK) >> PDSHIFT) -#define ptei(VA) (((VA) & PT_MASK) >> PGSHIFT) +#define pl1_pi(VA) (((VA_SIGN_POS(VA)) & L1_MASK) >> L1_SHIFT) +#define pl2_pi(VA) (((VA_SIGN_POS(VA)) & L2_MASK) >> L2_SHIFT) + +/* + * pl*_i: generate index into pde/pte arrays in virtual space + */ +#define pl1_i(VA) (((VA_SIGN_POS(VA)) & L1_FRAME) >> L1_SHIFT) +#define pl2_i(VA) (((VA_SIGN_POS(VA)) & L2_FRAME) >> L2_SHIFT) +#define pl_i(va, lvl) \ + (((VA_SIGN_POS(va)) & ptp_masks[(lvl)-1]) >> ptp_shifts[(lvl)-1]) + +#define PTP_MASK_INITIALIZER { L1_FRAME, L2_FRAME } +#define PTP_SHIFT_INITIALIZER { L1_SHIFT, L2_SHIFT } +#define NKPTP_INITIALIZER { NKL1_START_ENTRIES, NKL2_START_ENTRIES } +#define NKPTPMAX_INITIALIZER { NKL1_MAX_ENTRIES, NKL2_MAX_ENTRIES } +#define NBPD_INITIALIZER { NBPD_L1, NBPD_L2 } +#define PDES_INITIALIZER { L2_BASE } +#define APDES_INITIALIZER { AL2_BASE } /* * PTP macros: @@ -195,10 +283,9 @@ * NBPD == number of bytes a PTP can map (4MB) */ -#define ptp_i2o(I) ((I) * PAGE_SIZE) /* index => offset */ -#define ptp_o2i(O) ((O) / PAGE_SIZE) /* offset => index */ -#define ptp_i2v(I) ((I) * NBPD) /* index => VA */ -#define ptp_v2i(V) ((V) / NBPD) /* VA => index (same as pdei) */ +#define ptp_va2o(va, lvl) (pl_i(va, (lvl)+1) * PAGE_SIZE) + +#define PTP_LEVELS 2 /* * PG_AVAIL usage: we make use of the ignored bits of the PTE @@ -214,7 +301,7 @@ */ #define NPTECL 8 -#ifdef _KERNEL +#if defined(_KERNEL) && !defined(_LOCORE) /* * pmap data structures: see pmap.c for details of locking. */ @@ -234,17 +321,23 @@ LIST_HEAD(pmap_head, pmap); /* struct pm * note that the pm_obj contains the simple_lock, the reference count, * page list, and number of PTPs within the pmap. * + * pm_lock is the same as the spinlock for vm object 0. 
Changes to + * the other objects may only be made if that lock has been taken + * (the other object locks are only used when uvm_pagealloc is called) + * * XXX If we ever support processor numbers higher than 31, we'll have * XXX to rethink the CPU mask. */ struct pmap { - struct uvm_object pm_obj; /* object (lck by object lock) */ -#define pm_lock pm_obj.vmobjlock + struct uvm_object pm_obj[PTP_LEVELS-1]; /* objects for lvl >= 1) */ +#define pm_lock pm_obj[0].vmobjlock +#define pm_obj_l1 pm_obj[0] LIST_ENTRY(pmap) pm_list; /* list (lck by pm_list lock) */ pd_entry_t *pm_pdir; /* VA of PD (lck by object lock) */ - u_int32_t pm_pdirpa; /* PA of PD (read-only after create) */ - struct vm_page *pm_ptphint; /* pointer to a PTP in our pmap */ + paddr_t pm_pdirpa; /* PA of PD (read-only after create) */ + struct vm_page *pm_ptphint[PTP_LEVELS-1]; + /* pointer to a PTP in our pmap */ struct pmap_statistics pm_stats; /* pmap stats (lck by object lock) */ vaddr_t pm_hiexec; /* highest executable mapping */ @@ -312,9 +405,13 @@ struct pv_page { extern u_long PTDpaddr; extern struct pmap kernel_pmap_store; /* kernel pmap */ -extern int nkpde; /* current # of PDEs for kernel */ extern int pmap_pg_g; /* do we support PG_G? */ +extern paddr_t ptp_masks[]; +extern int ptp_shifts[]; +extern long nkptp[], nbpd[], nkptpmax[]; +extern pd_entry_t *pdes[]; + /* * macros */ @@ -356,6 +453,7 @@ vaddr_t reserve_dumppages(vaddr_t); /* X void pmap_tlb_shootdown(pmap_t, vaddr_t, pt_entry_t, int32_t *); void pmap_tlb_shootnow(int32_t); void pmap_do_tlb_shootdown(struct cpu_info *); +void pmap_prealloc_lowmem_ptps(void); #define PMAP_GROWKERNEL /* turn on pmap_growkernel interface */ @@ -466,30 +564,33 @@ static __inline pt_entry_t * __attribute vtopte(vaddr_t va) { - KASSERT(va < (PDSLOT_KERN << PDSHIFT)); + KASSERT(va < (L2_SLOT_KERN * NBPD_L2)); - return (PTE_BASE + x86_btop(va)); + return (PTE_BASE + pl1_i(va)); } static __inline pt_entry_t * __attribute__((__unused__)) kvtopte(vaddr_t va) { - KASSERT(va >= (PDSLOT_KERN << PDSHIFT)); + KASSERT(va >= (L2_SLOT_KERN * NBPD_L2)); #ifdef LARGEPAGES { pd_entry_t *pde; - pde = PDP_BASE + pdei(va); + pde = L2_BASE + pl2_i(va); if (*pde & PG_PS) return ((pt_entry_t *)pde); } #endif - return (PTE_BASE + x86_btop(va)); + return (PTE_BASE + pl1_i(va)); } +#define pmap_pte_set(p, n) x86_atomic_testset_ul(p, n) +#define pmap_pte_clearbits(p, b) x86_atomic_clearbits_l(p, b) +#define pmap_pte_setbits(p, b) x86_atomic_setbits_l(p, b) #define pmap_cpu_has_pg_n() (cpu_class != CPUCLASS_386) #define pmap_cpu_has_invlpg() (cpu_class != CPUCLASS_386) @@ -506,5 +607,5 @@ void pmap_ldt_cleanup(struct lwp *); */ #define POOL_VTOPHYS(va) vtophys((vaddr_t) (va)) -#endif /* _KERNEL */ +#endif /* _KERNEL && !_LOCORE */ #endif /* _I386_PMAP_H_ */ Index: arch/i386/include/vmparam.h =================================================================== --- arch/i386/include/vmparam.h (revision 790) +++ arch/i386/include/vmparam.h (revision 791) @@ -102,11 +102,11 @@ /* user/kernel map constants */ #define VM_MIN_ADDRESS ((vaddr_t)0) -#define VM_MAXUSER_ADDRESS ((vaddr_t)(PDSLOT_PTE << PDSHIFT)) +#define VM_MAXUSER_ADDRESS ((vaddr_t)(PDIR_SLOT_PTE << L2_SHIFT)) #define VM_MAX_ADDRESS \ - ((vaddr_t)((PDSLOT_PTE << PDSHIFT) + (PDSLOT_PTE << PGSHIFT))) -#define VM_MIN_KERNEL_ADDRESS ((vaddr_t)(PDSLOT_KERN << PDSHIFT)) -#define VM_MAX_KERNEL_ADDRESS ((vaddr_t)(PDSLOT_APTE << PDSHIFT)) + ((vaddr_t)((PDIR_SLOT_PTE << L2_SHIFT) + (PDIR_SLOT_PTE << L1_SHIFT))) +#define VM_MIN_KERNEL_ADDRESS ((PDIR_SLOT_KERN 
<< L2_SHIFT)) +#define VM_MAX_KERNEL_ADDRESS ((vaddr_t)(PDIR_SLOT_APTE << L2_SHIFT)) /* * The address to which unspecified mapping requests default Index: arch/i386/i386/db_memrw.c =================================================================== --- arch/i386/i386/db_memrw.c (revision 790) +++ arch/i386/i386/db_memrw.c (revision 791) @@ -140,7 +140,7 @@ db_write_text(vaddr_t addr, size_t size, */ #ifdef LARGEPAGES if (oldpte & PG_PS) - limit = NBPD - ((vaddr_t)dst & (NBPD - 1)); + limit = NBPD_L2 - ((vaddr_t)dst & (NBPD_L2 - 1)); else #endif limit = PAGE_SIZE - ((vaddr_t)dst & PGOFSET); Index: arch/i386/i386/pmap.c =================================================================== --- arch/i386/i386/pmap.c (revision 790) +++ arch/i386/i386/pmap.c (revision 791) @@ -33,6 +33,41 @@ */ /* + * Copyright 2001 (c) Wasabi Systems, Inc. + * All rights reserved. + * + * Written by Frank van der Linden for Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed for the NetBSD Project by + * Wasabi Systems, Inc. + * 4. The name of Wasabi Systems, Inc. may not be used to endorse + * or promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* * pmap.c: i386 pmap module rewrite * Chuck Cranor * 11-Aug-97 @@ -62,12 +97,14 @@ #include __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.173 2004/05/13 12:24:05 yamt Exp $"); +#ifndef __x86_64__ #include "opt_cputype.h" -#include "opt_user_ldt.h" -#include "opt_largepages.h" +#endif #include "opt_lockdebug.h" #include "opt_multiprocessor.h" #include "opt_kstack_dr0.h" +#include "opt_user_ldt.h" +#include "opt_largepages.h" #include #include @@ -185,6 +222,7 @@ __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.1 * => failure: kmem_object locked, no free vm_pages, etc. * save VA for later call to [a], go to plan 3. * If we fail, we simply let pmap_enter() tell UVM about it. + * */ /* @@ -229,18 +267,28 @@ __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.1 * this lock protects the list of active pmaps (headed by "pmaps"). * we lock it when adding or removing pmaps from this list. 
* + * XXX: would be nice to have per-CPU VAs for the above 4 */ /* * locking data structures */ +vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER; +int ptp_shifts[] = PTP_SHIFT_INITIALIZER; +long nkptp[] = NKPTP_INITIALIZER; +long nkptpmax[] = NKPTPMAX_INITIALIZER; +long nbpd[] = NBPD_INITIALIZER; +pd_entry_t *normal_pdes[] = PDES_INITIALIZER; +pd_entry_t *alternate_pdes[] = APDES_INITIALIZER; + +/* int nkpde = NKPTP; */ + static struct simplelock pvalloc_lock; static struct simplelock pmaps_lock; #if defined(MULTIPROCESSOR) || defined(LOCKDEBUG) static struct lock pmap_main_lock; - #define PMAP_MAP_TO_HEAD_LOCK() \ (void) spinlockmgr(&pmap_main_lock, LK_SHARED, NULL) #define PMAP_MAP_TO_HEAD_UNLOCK() \ @@ -249,7 +297,7 @@ static struct lock pmap_main_lock; #define PMAP_HEAD_TO_MAP_LOCK() \ (void) spinlockmgr(&pmap_main_lock, LK_EXCLUSIVE, NULL) #define PMAP_HEAD_TO_MAP_UNLOCK() \ - spinlockmgr(&pmap_main_lock, LK_RELEASE, (void *) 0) + (void) spinlockmgr(&pmap_main_lock, LK_RELEASE, NULL) #else @@ -319,18 +367,6 @@ union pmap_tlb_shootdown_job_al *pj_page struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */ /* - * nkpde is the number of kernel PTPs allocated for the kernel at - * boot time (NKPTP is a compile time override). this number can - * grow dynamically as needed (but once allocated, we never free - * kernel PTPs). - */ - -int nkpde = NKPTP; -#ifdef NKPDE -#error "obsolete NKPDE: use NKPTP" -#endif - -/* * pmap_pg_g: if our processor supports PG_G in the PTE then we * set pmap_pg_g to PG_G (otherwise it is zero). */ @@ -418,6 +454,7 @@ static struct pmap_head pmaps; struct pool pmap_pmap_pool; + /* * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a * X86_MAXPROCS*NPTECL array of PTE's, to avoid cache line thrashing @@ -435,8 +472,8 @@ struct pool pmap_pmap_pool; /* * special VAs and the PTEs that map them */ -static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte; -static caddr_t csrcp, cdstp, zerop, ptpp; +static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte; +static caddr_t csrcp, cdstp, zerop, ptpp, early_zerop; /* * pool and cache that PDPs are allocated from @@ -456,18 +493,23 @@ extern paddr_t msgbuf_paddr; extern vaddr_t idt_vaddr; /* we allocate IDT early */ extern paddr_t idt_paddr; +#ifdef _LP64 +extern vaddr_t lo32_vaddr; +extern vaddr_t lo32_paddr; +#endif + +extern int end; + #if defined(I586_CPU) /* stuff to fix the pentium f00f bug */ extern vaddr_t pentium_idt_vaddr; #endif - /* * local prototypes */ static struct pv_entry *pmap_add_pvpage(struct pv_page *, boolean_t); -static struct vm_page *pmap_alloc_ptp(struct pmap *, int); static struct pv_entry *pmap_alloc_pv(struct pmap *, int); /* see codes below */ #define ALLOCPV_NEED 0 /* need PV now */ #define ALLOCPV_TRY 1 /* just try to allocate, don't steal */ @@ -480,10 +522,18 @@ static void pmap_free_pv(struct pmap * static void pmap_free_pvs(struct pmap *, struct pv_entry *); static void pmap_free_pv_doit(struct pv_entry *); static void pmap_free_pvpage(void); -static struct vm_page *pmap_get_ptp(struct pmap *, int); +static struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t, pd_entry_t **); +static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int); +static void pmap_freepage(struct pmap *, struct vm_page *, int, + struct pglist *); +static void pmap_free_ptp(struct pmap *, struct vm_page *, + vaddr_t, pt_entry_t *, + pd_entry_t **, int32_t *, + struct pglist *); static boolean_t pmap_is_curpmap(struct pmap *); static boolean_t 
pmap_is_active(struct pmap *, int); -static pt_entry_t *pmap_map_ptes(struct pmap *); +static void pmap_map_ptes(struct pmap *, pt_entry_t **, + pd_entry_t ***); static struct pv_entry *pmap_remove_pv(struct pv_head *, struct pmap *, vaddr_t); static void pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int); @@ -500,6 +550,9 @@ static pt_entry_t *pmap_tmpmap_pvepte(st static void pmap_tmpunmap_pa(void); static void pmap_tmpunmap_pvepte(struct pv_entry *); static void pmap_unmap_ptes(struct pmap *); +static boolean_t pmap_get_physpage(vaddr_t, int, paddr_t *); +static boolean_t pmap_pdes_valid(vaddr_t, pd_entry_t **, pd_entry_t *); +static void pmap_alloc_level(pd_entry_t **, vaddr_t, int, long *); static boolean_t pmap_reactivate(struct pmap *); @@ -601,7 +654,7 @@ pmap_tmpmap_pvepte(pve) return(vtopte(pve->pv_va)); return(((pt_entry_t *)pmap_tmpmap_pa(VM_PAGE_TO_PHYS(pve->pv_ptp))) - + ptei((unsigned)pve->pv_va)); + + ptei((unsigned long)pve->pv_va)); } /* @@ -661,17 +714,21 @@ pmap_apte_flush(struct pmap *pmap) * => must be undone with pmap_unmap_ptes before returning */ -__inline static pt_entry_t * -pmap_map_ptes(pmap) +__inline static void +pmap_map_ptes(pmap, ptepp, pdeppp) struct pmap *pmap; + pt_entry_t **ptepp; + pd_entry_t ***pdeppp; { - pd_entry_t opde; + pd_entry_t opde, npde; struct pmap *ourpmap; struct cpu_info *ci; /* the kernel's pmap is always accessible */ if (pmap == pmap_kernel()) { - return(PTE_BASE); + *ptepp = PTE_BASE; + *pdeppp = normal_pdes; + return; } ci = curcpu(); @@ -681,30 +738,34 @@ pmap_map_ptes(pmap) /* if curpmap then we are always mapped */ if (pmap_is_curpmap(pmap)) { - simple_lock(&pmap->pm_obj.vmobjlock); - return(PTE_BASE); + simple_lock(&pmap->pm_lock); + *ptepp = PTE_BASE; + *pdeppp = normal_pdes; + return; } ourpmap = ci->ci_pmap; /* need to lock both curpmap and pmap: use ordered locking */ if ((unsigned) pmap < (unsigned) ourpmap) { - simple_lock(&pmap->pm_obj.vmobjlock); - simple_lock(&ourpmap->pm_obj.vmobjlock); + simple_lock(&pmap->pm_lock); + simple_lock(&ourpmap->pm_lock); } else { - simple_lock(&ourpmap->pm_obj.vmobjlock); - simple_lock(&pmap->pm_obj.vmobjlock); + simple_lock(&ourpmap->pm_lock); + simple_lock(&pmap->pm_lock); } /* need to load a new alternate pt space into curpmap? */ COUNT(apdp_pde_map); opde = *APDP_PDE; if (!pmap_valid_entry(opde) || (opde & PG_FRAME) != pmap->pm_pdirpa) { - *APDP_PDE = (pd_entry_t) (pmap->pm_pdirpa | PG_RW | PG_V); + npde = (pd_entry_t) (pmap->pm_pdirpa | PG_RW | PG_V); + *APDP_PDE = npde; if (pmap_valid_entry(opde)) pmap_apte_flush(ourpmap); } - return(APTE_BASE); + *ptepp = APTE_BASE; + *pdeppp = alternate_pdes; } /* @@ -720,7 +781,7 @@ pmap_unmap_ptes(pmap) return; } if (pmap_is_curpmap(pmap)) { - simple_unlock(&pmap->pm_obj.vmobjlock); + simple_unlock(&pmap->pm_lock); } else { struct pmap *ourpmap = curcpu()->ci_pmap; @@ -729,8 +790,8 @@ pmap_unmap_ptes(pmap) pmap_apte_flush(ourpmap); #endif COUNT(apdp_pde_unmap); - simple_unlock(&pmap->pm_obj.vmobjlock); - simple_unlock(&ourpmap->pm_obj.vmobjlock); + simple_unlock(&pmap->pm_lock); + simple_unlock(&ourpmap->pm_lock); } } @@ -828,7 +889,7 @@ pmap_kenter_pa(va, pa, prot) npte = pa | ((prot & VM_PROT_WRITE) ? PG_RW : PG_RO) | PG_V | pmap_pg_g; - opte = x86_atomic_testset_ul(pte, npte); /* zap! */ + opte = pmap_pte_set(pte, npte); /* zap! */ #ifdef LARGEPAGES /* XXX For now... 
*/ if (opte & PG_PS) @@ -845,6 +906,18 @@ pmap_kenter_pa(va, pa, prot) pmap_update_pg(va); #endif } +#if defined(DEBUG) + { + paddr_t newpa; + boolean_t ret; + + ret = pmap_extract(pmap_kernel(), va, &newpa); + if (!ret) + panic("%s: no mapping", __func__); + if (newpa != pa) + panic("%s: different pa", __func__); + } +#endif /* defined(DEBUG) */ } /* @@ -866,13 +939,18 @@ pmap_kremove(va, len) pt_entry_t *pte, opte; int32_t cpumask = 0; + KASSERT((va & PAGE_MASK) == 0); + KASSERT((len & PAGE_MASK) == 0); + len >>= PAGE_SHIFT; for ( /* null */ ; len ; len--, va += PAGE_SIZE) { + KASSERT(pl_i(va, PTP_LEVELS) != PDIR_SLOT_PTE); + if (va < VM_MIN_KERNEL_ADDRESS) pte = vtopte(va); else pte = kvtopte(va); - opte = x86_atomic_testset_ul(pte, 0); /* zap! */ + opte = pmap_pte_set(pte, 0); /* zap! */ #ifdef LARGEPAGES /* XXX For now... */ if (opte & PG_PS) @@ -911,10 +989,14 @@ void pmap_bootstrap(kva_start) vaddr_t kva_start; { - struct pmap *kpm; vaddr_t kva; + struct pmap *kpm; pt_entry_t *pte; int i; + unsigned long p1i; +#if (VM_MIN_KERNEL_ADDRESS != KERNBASE) || defined(LARGEPAGES) + vaddr_t kva_end; +#endif /* * set up our local static global vars that keep track of the @@ -951,11 +1033,14 @@ pmap_bootstrap(kva_start) */ kpm = pmap_kernel(); - simple_lock_init(&kpm->pm_obj.vmobjlock); - kpm->pm_obj.pgops = NULL; - TAILQ_INIT(&kpm->pm_obj.memq); - kpm->pm_obj.uo_npages = 0; - kpm->pm_obj.uo_refs = 1; + for (i = 0; i < PTP_LEVELS - 1; i++) { + simple_lock_init(&kpm->pm_obj[i].vmobjlock); + kpm->pm_obj[i].pgops = NULL; + TAILQ_INIT(&kpm->pm_obj[i].memq); + kpm->pm_obj[i].uo_npages = 0; + kpm->pm_obj[i].uo_refs = 1; + kpm->pm_ptphint[i] = NULL; + } memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ kpm->pm_pdir = (pd_entry_t *)(lwp0.l_addr->u_pcb.pcb_cr3 + KERNBASE); kpm->pm_pdirpa = (u_int32_t) lwp0.l_addr->u_pcb.pcb_cr3; @@ -978,10 +1063,17 @@ pmap_bootstrap(kva_start) pmap_pg_g = PG_G; /* enable software */ /* add PG_G attribute to already mapped kernel pages */ +#if KERNBASE == VM_MIN_KERNEL_ADDRESS for (kva = VM_MIN_KERNEL_ADDRESS ; kva < virtual_avail ; - kva += PAGE_SIZE) - if (pmap_valid_entry(PTE_BASE[x86_btop(kva)])) - PTE_BASE[x86_btop(kva)] |= PG_G; +#else + kva_end = roundup((vaddr_t)&end, PAGE_SIZE); + for (kva = KERNBASE; kva < kva_end ; +#endif + kva += PAGE_SIZE) { + p1i = pl1_i(kva); + if (pmap_valid_entry(PTE_BASE[p1i])) + PTE_BASE[p1i] |= PG_G; + } } #ifdef LARGEPAGES @@ -991,7 +1083,6 @@ pmap_bootstrap(kva_start) if (cpu_feature & CPUID_PSE) { paddr_t pa; - vaddr_t kva_end; pd_entry_t *pde; extern char _etext; @@ -1011,10 +1102,10 @@ pmap_bootstrap(kva_start) * assume that the linker has properly aligned the * .data segment to a 4MB boundary. */ - kva_end = roundup((vaddr_t)&_etext, NBPD); + kva_end = roundup((vaddr_t)&_etext, NBPD_L2); for (pa = 0, kva = KERNBASE; kva < kva_end; - kva += NBPD, pa += NBPD) { - pde = &kpm->pm_pdir[pdei(kva)]; + kva += NBPD_L2, pa += NBPD_L2) { + pde = &kpm->pm_pdir[pl_i(kva, PTP_LEVELS)]; *pde = pa | pmap_pg_g | PG_PS | PG_KR | PG_V; /* zap! */ tlbflush(); @@ -1022,6 +1113,19 @@ pmap_bootstrap(kva_start) } #endif /* LARGEPAGES */ +#if VM_MIN_KERNEL_ADDRESS != KERNBASE + /* + * zero_pte is stuck at the end of mapped space for the kernel + * image (disjunct from kva space). This is done so that it + * can safely be used in pmap_growkernel (pmap_get_physpage), + * when it's called for the first time. + * XXXfvdl fix this for MULTIPROCESSOR later. 
+ */ + + early_zerop = (caddr_t)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2); + early_zero_pte = PTE_BASE + pl1_i((unsigned long)early_zerop); +#endif + /* * now we allocate the "special" VAs which are used for tmp mappings * by the pmap (and other modules). we allocate the VAs by advancing @@ -1030,7 +1134,7 @@ pmap_bootstrap(kva_start) * mapping. */ - pte = PTE_BASE + x86_btop(virtual_avail); + pte = PTE_BASE + pl1_i(virtual_avail); #ifdef MULTIPROCESSOR /* @@ -1052,8 +1156,8 @@ pmap_bootstrap(kva_start) virtual_avail += PAGE_SIZE * X86_MAXPROCS * NPTECL; pte += X86_MAXPROCS * NPTECL; #else - csrcp = (caddr_t) virtual_avail; csrc_pte = pte; /* allocate */ - virtual_avail += PAGE_SIZE; pte++; /* advance */ + csrcp = (caddr_t) virtual_avail; csrc_pte = pte; /* allocate */ + virtual_avail += PAGE_SIZE; pte++; /* advance */ cdstp = (caddr_t) virtual_avail; cdst_pte = pte; virtual_avail += PAGE_SIZE; pte++; @@ -1065,27 +1169,53 @@ pmap_bootstrap(kva_start) virtual_avail += PAGE_SIZE; pte++; #endif +#if VM_MIN_KERNEL_ADDRESS == KERNBASE + early_zerop = zerop; + early_zero_pte = zero_pte; +#endif + /* * Nothing after this point actually needs pte; */ pte = (void *)0xdeadbeef; /* XXX: vmmap used by mem.c... should be uvm_map_reserve */ + /* XXXfvdl PTEs not needed here */ vmmap = (char *)virtual_avail; /* don't need pte */ - virtual_avail += PAGE_SIZE; + virtual_avail += PAGE_SIZE; pte++; msgbuf_vaddr = virtual_avail; /* don't need pte */ virtual_avail += round_page(MSGBUFSIZE); + pte += x86_btop(round_page(MSGBUFSIZE)); idt_vaddr = virtual_avail; /* don't need pte */ - virtual_avail += PAGE_SIZE; +#ifdef __x86_64__ + virtual_avail += 2 * PAGE_SIZE; pte += 2; +#else + virtual_avail += PAGE_SIZE; pte++; +#endif idt_paddr = avail_start; /* steal a page */ +#ifdef __x86_64__ + avail_start += 2 * PAGE_SIZE; +#else avail_start += PAGE_SIZE; +#endif #if defined(I586_CPU) /* pentium f00f bug stuff */ pentium_idt_vaddr = virtual_avail; /* don't need pte */ - virtual_avail += PAGE_SIZE; + virtual_avail += PAGE_SIZE; pte++; +#endif + +#ifdef _LP64 + /* + * Grab a page below 4G for things that need it (i.e. + * having an initial %cr3 for the MP trampoline). + */ + lo32_vaddr = virtual_avail; + virtual_avail += PAGE_SIZE; pte++; + lo32_paddr = avail_start; + avail_start += PAGE_SIZE; #endif /* @@ -1128,6 +1258,7 @@ pmap_bootstrap(kva_start) /* * initialize the PDE pool and cache. */ + pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, 0, "pdppl", &pool_allocator_nointr); pool_cache_init(&pmap_pdp_cache, &pmap_pdp_pool, @@ -1141,6 +1272,36 @@ pmap_bootstrap(kva_start) } /* + * Pre-allocate PTPs for low memory, so that 1:1 mappings for various + * trampoline code can be entered. + */ +void +pmap_prealloc_lowmem_ptps(void) +{ + pd_entry_t *pdes; + int level; + paddr_t newp; + + pdes = pmap_kernel()->pm_pdir; + level = PTP_LEVELS; + for (;;) { + newp = avail_start; + avail_start += PAGE_SIZE; + *early_zero_pte = (newp & PG_FRAME) | PG_V | PG_RW; + pmap_update_pg((vaddr_t)early_zerop); + memset(early_zerop, 0, PAGE_SIZE); +#if defined(DIAGNOSTIC) + *early_zero_pte = 0; +#endif /* defined(DIAGNOSTIC) */ + pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW; + level--; + if (level <= 1) + break; + pdes = normal_pdes[level - 2]; + } +} + +/* * pmap_init: called from uvm_init, our job is to get the pmap * system ready to manage mappings... this mainly means initing * the pv_entry stuff. 
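Aside on the new index macros: the pte.h and pmap.h hunks above replace the old single-level pdei()/ptei() macros with per-level pl*_i()/pl*_pi() indexes. The difference between pl*_i() (index into the flat, recursively mapped PTE/PDE arrays at PTE_BASE/L2_BASE) and pl*_pi() (index within one PTP page) is easy to check outside the kernel. The constants below are copied from the patch; the program itself is only an illustrative sketch, not part of the tree, and it omits VA_SIGN_POS(), which is a no-op on i386 (VA_SIGN_MASK is 0).

#include <stdio.h>

#define L1_SHIFT	12
#define L2_SHIFT	22
#define L1_MASK		0x003ff000UL
#define L2_MASK		0xffc00000UL
#define L2_FRAME	L2_MASK
#define L1_FRAME	(L2_FRAME|L1_MASK)

/* index within a single PTP / within the PDP page */
#define pl1_pi(VA)	(((VA) & L1_MASK) >> L1_SHIFT)
#define pl2_pi(VA)	(((VA) & L2_MASK) >> L2_SHIFT)

/* index into the flat virtual arrays at PTE_BASE / L2_BASE */
#define pl1_i(VA)	(((VA) & L1_FRAME) >> L1_SHIFT)
#define pl2_i(VA)	(((VA) & L2_FRAME) >> L2_SHIFT)

int
main(void)
{
	unsigned long kernbase = 0xc0000000UL;	/* KERNBASE from param.h */
	unsigned long va = 0xc0101234UL;	/* an arbitrary kernel VA */

	printf("L2_SLOT_KERN = %lu\n", kernbase >> L2_SHIFT); /* 768 */
	printf("pl2_i  = %lu\n", pl2_i(va));	/* PDE slot for va: 768 */
	printf("pl1_i  = %lu\n", pl1_i(va));	/* slot in the full recursive PTE array */
	printf("pl1_pi = %lu\n", pl1_pi(va));	/* slot within that one PTP */
	return 0;
}

For KERNBASE == 0xc0000000 the kernel slot comes out as 768, matching the "/* 768 */" comment next to L2_SLOT_KERN in the pmap.h hunk.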
@@ -1248,6 +1409,7 @@ pmap_alloc_pv(pmap, mode) else (void) pmap_alloc_pvpage(pmap, ALLOCPV_NONEED); } + simple_unlock(&pvalloc_lock); return(pv); } @@ -1309,10 +1471,14 @@ pmap_alloc_pvpage(pmap, mode) } } + /* + * we have a VA, now let's try and allocate a page. + */ + pg = uvm_pagealloc(NULL, pv_cachedva - vm_map_min(kernel_map), NULL, UVM_PGA_USERESERVE); if (pg == NULL) - return (NULL); + return NULL; pg->flags &= ~PG_BUSY; /* never busy */ /* @@ -1596,35 +1762,87 @@ pmap_remove_pv(pvh, pmap, va) * p t p f u n c t i o n s */ -/* - * pmap_alloc_ptp: allocate a PTP for a PMAP - * - * => pmap should already be locked by caller - * => we use the ptp's wire_count to count the number of active mappings - * in the PTP (we start it at one to prevent any chance this PTP - * will ever leak onto the active/inactive queues) - */ +static __inline struct vm_page * +pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level) +{ + int lidx = level - 1; + struct vm_page *pg; -__inline static struct vm_page * -pmap_alloc_ptp(pmap, pde_index) - struct pmap *pmap; - int pde_index; + KASSERT(pmap != pmap_kernel()); + + if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] && + pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) { + return (pmap->pm_ptphint[lidx]); + } + if (lidx == 0) + pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level)); + else { + simple_lock(&pmap->pm_obj[lidx].vmobjlock); + pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level)); + simple_unlock(&pmap->pm_obj[lidx].vmobjlock); + } + return pg; +} + +static __inline void +pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level, + struct pglist *tofree) { - struct vm_page *ptp; + int lidx; + struct uvm_object *obj; - ptp = uvm_pagealloc(&pmap->pm_obj, ptp_i2o(pde_index), NULL, - UVM_PGA_USERESERVE|UVM_PGA_ZERO); - if (ptp == NULL) - return(NULL); - - /* got one! */ - ptp->flags &= ~PG_BUSY; /* never busy */ - ptp->wire_count = 1; /* no mappings yet */ - pmap->pm_pdir[pde_index] = - (pd_entry_t) (VM_PAGE_TO_PHYS(ptp) | PG_u | PG_RW | PG_V); - pmap->pm_stats.resident_count++; /* count PTP as resident */ - pmap->pm_ptphint = ptp; - return(ptp); + lidx = level - 1; + + obj = &pmap->pm_obj[lidx]; + pmap->pm_stats.resident_count--; + if (lidx != 0) + simple_lock(&obj->vmobjlock); + if (pmap->pm_ptphint[lidx] == ptp) + pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq); + + ptp->wire_count = 0; + ptp->flags |= PG_ZERO; + uvm_pagerealloc(ptp, NULL, 0); + TAILQ_INSERT_TAIL(tofree, ptp, listq); + + if (lidx != 0) + simple_unlock(&obj->vmobjlock); +} + +static void +pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, + pt_entry_t *ptes, pd_entry_t **pdes, int32_t *cpumaskp, + struct pglist *tofree) +{ + unsigned long index; + int level; + vaddr_t invaladdr; + pd_entry_t opde; + + KASSERT(pmap != pmap_kernel()); + + level = 1; + do { + pmap_freepage(pmap, ptp, level, tofree); + index = pl_i(va, level + 1); + opde = pmap_pte_set(&pdes[level - 1][index], 0); + invaladdr = level == 1 ? (vaddr_t)ptes : + (vaddr_t)pdes[level - 2]; + pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, + opde, cpumaskp); +#if defined(MULTIPROCESSOR) + invaladdr = level == 1 ? 
(vaddr_t)PTE_BASE : + (vaddr_t)normal_pdes[level - 2]; + pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, opde, + cpumaskp); +#endif + if (level < PTP_LEVELS - 1) { + ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1); + ptp->wire_count--; + if (ptp->wire_count > 1) + break; + } + } while (++level < PTP_LEVELS); } /* @@ -1634,32 +1852,95 @@ pmap_alloc_ptp(pmap, pde_index) * => pmap should be locked */ + static struct vm_page * -pmap_get_ptp(pmap, pde_index) - struct pmap *pmap; - int pde_index; +pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t **pdes) { - struct vm_page *ptp; + struct vm_page *ptp, *pptp; + int i; + unsigned long index; + pd_entry_t *pva; + paddr_t ppa, pa; + struct uvm_object *obj; - if (pmap_valid_entry(pmap->pm_pdir[pde_index])) { + KASSERT(pmap != pmap_kernel()); + LOCK_ASSERT(simple_lock_held(&pmap->pm_lock)); - /* valid... check hint (saves us a PA->PG lookup) */ - if (pmap->pm_ptphint && - (pmap->pm_pdir[pde_index] & PG_FRAME) == - VM_PAGE_TO_PHYS(pmap->pm_ptphint)) - return(pmap->pm_ptphint); + ptp = NULL; + pa = (paddr_t)-1; + + /* + * Loop through all page table levels seeing if we need to + * add a new page to that level. + */ + for (i = PTP_LEVELS; i > 1; i--) { + /* + * Save values from previous round. + */ + pptp = ptp; + ppa = pa; + + index = pl_i(va, i); + pva = pdes[i - 2]; + + if (pmap_valid_entry(pva[index])) { + ppa = pva[index] & PG_FRAME; + ptp = NULL; + continue; + } + + obj = &pmap->pm_obj[i-2]; + /* + * XXX pm_obj[0] is pm_lock, which is already locked. + */ + if (i != 2) + simple_lock(&obj->vmobjlock); + ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL, + UVM_PGA_USERESERVE|UVM_PGA_ZERO); + if (i != 2) + simple_unlock(&obj->vmobjlock); - ptp = uvm_pagelookup(&pmap->pm_obj, ptp_i2o(pde_index)); -#ifdef DIAGNOSTIC if (ptp == NULL) + return NULL; + + ptp->flags &= ~PG_BUSY; /* never busy */ + ptp->wire_count = 1; + pmap->pm_ptphint[i - 2] = ptp; + pa = VM_PAGE_TO_PHYS(ptp); + pva[index] = (pd_entry_t) (pa | PG_u | PG_RW | PG_V); + pmap->pm_stats.resident_count++; + /* + * If we're not in the top level, increase the + * wire count of the parent page. + */ + if (i < PTP_LEVELS) { + if (pptp == NULL) + pptp = pmap_find_ptp(pmap, va, ppa, i); +#ifdef DIAGNOSTIC + if (pptp == NULL) + panic("pde page disappeared"); +#endif + pptp->wire_count++; + } + } + + /* + * ptp is not NULL if we just allocated a new ptp. If it's + * still NULL, we must look up the existing one. + */ + if (ptp == NULL) { + ptp = pmap_find_ptp(pmap, va, ppa, 1); +#ifdef DIAGNOSTIC + if (ptp == NULL) { + printf("va %lx ppa %lx\n", (unsigned long)va, + (unsigned long)ppa); panic("pmap_get_ptp: unmanaged user PTP"); + } #endif - pmap->pm_ptphint = ptp; - return(ptp); } - /* allocate a new PTP (updates ptphint) */ - return(pmap_alloc_ptp(pmap, pde_index)); + pmap->pm_ptphint[0] = ptp; + return(ptp); } /* @@ -1675,6 +1956,7 @@ pmap_pdp_ctor(void *arg, void *object, i { pd_entry_t *pdir = object; paddr_t pdirpa; + int npde; /* * NOTE: The `pmap_lock' is held when the PDP is allocated. 
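Aside on the level walk: pmap_get_ptp() and pmap_free_ptp() above now step through the paging hierarchy one level at a time, indexing each directory with pl_i(va, level) and using each PTP's wire_count to track how many entries in it are live; pmap_pdes_valid(), added further down, uses the same top-down walk. The stand-alone sketch below (its types, constants and pl_i() are simplified stand-ins, not the kernel definitions) shows that on i386, where PTP_LEVELS is 2, the loop body runs exactly once and reduces to the old pmap_valid_entry(pm_pdir[pdei(va)]) test.

#include <stdbool.h>
#include <stdint.h>

#define PTP_LEVELS	2		/* i386: one L2 directory + L1 page tables */
#define PG_V		0x01

typedef uint32_t pd_entry_t;

/* stand-in for pl_i(): index of va in the level-'lvl' directory */
static unsigned long
pl_i(unsigned long va, int lvl)
{
	static const unsigned long frame[] = { 0xfffff000UL, 0xffc00000UL };
	static const int shift[] = { 12, 22 };

	return (va & frame[lvl - 1]) >> shift[lvl - 1];
}

/*
 * Walk the directories from the top level down; pdes[0] plays the role
 * of the L2 directory (pm_pdir).  With two levels this is equivalent to
 * checking a single L2 PDE for PG_V.
 */
static bool
pdes_valid(unsigned long va, pd_entry_t **pdes, pd_entry_t *lastpde)
{
	pd_entry_t pde = 0;
	int i;

	for (i = PTP_LEVELS; i > 1; i--) {
		pde = pdes[i - 2][pl_i(va, i)];
		if ((pde & PG_V) == 0)
			return false;
	}
	if (lastpde != NULL)
		*lastpde = pde;
	return true;
}

int
main(void)
{
	static pd_entry_t l2[1024];
	pd_entry_t *pdes[] = { l2 };
	pd_entry_t pde;

	l2[768] = 0x00400000 | PG_V;	/* pretend KERNBASE's PDE is mapped */
	return pdes_valid(0xc0001000UL, pdes, &pde) ? 0 : 1;
}

The point of keeping the loop generic rather than hard-coding the single check is that the same source can then serve the 64-bit port this patch is visibly preparing for (note the __x86_64__ and _LP64 blocks), where PTP_LEVELS is larger and the extra iterations validate the higher-level directory entries.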
@@ -1685,18 +1967,24 @@ pmap_pdp_ctor(void *arg, void *object, i (void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa); /* zero init area */ - memset(pdir, 0, PDSLOT_PTE * sizeof(pd_entry_t)); + memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t)); /* put in recursive PDE to map the PTEs */ - pdir[PDSLOT_PTE] = pdirpa | PG_V | PG_KW; + pdir[PDIR_SLOT_PTE] = pdirpa | PG_V | PG_KW; + + npde = nkptp[PTP_LEVELS - 1]; /* put in kernel VM PDEs */ - memcpy(&pdir[PDSLOT_KERN], &PDP_BASE[PDSLOT_KERN], - nkpde * sizeof(pd_entry_t)); + memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], + npde * sizeof(pd_entry_t)); /* zero the rest */ - memset(&pdir[PDSLOT_KERN + nkpde], 0, - PAGE_SIZE - ((PDSLOT_KERN + nkpde) * sizeof(pd_entry_t))); + memset(&pdir[PDIR_SLOT_KERN + npde], 0, + (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t)); + +#if VM_MIN_KERNEL_ADDRESS != KERNBASE + pdir[pl2_pi(KERNBASE)] = PDP_BASE[pl2_pi(KERNBASE)]; +#endif return (0); } @@ -1712,19 +2000,22 @@ struct pmap * pmap_create() { struct pmap *pmap; + int i; u_int gen; pmap = pool_get(&pmap_pmap_pool, PR_WAITOK); /* init uvm_object */ - simple_lock_init(&pmap->pm_obj.vmobjlock); - pmap->pm_obj.pgops = NULL; /* currently not a mappable object */ - TAILQ_INIT(&pmap->pm_obj.memq); - pmap->pm_obj.uo_npages = 0; - pmap->pm_obj.uo_refs = 1; + for (i = 0; i < PTP_LEVELS - 1; i++) { + simple_lock_init(&pmap->pm_obj[i].vmobjlock); + pmap->pm_obj[i].pgops = NULL; /* not a mappable object */ + TAILQ_INIT(&pmap->pm_obj[i].memq); + pmap->pm_obj[i].uo_npages = 0; + pmap->pm_obj[i].uo_refs = 1; + pmap->pm_ptphint[i] = NULL; + } pmap->pm_stats.wired_count = 0; pmap->pm_stats.resident_count = 1; /* count the PDP allocd below */ - pmap->pm_ptphint = NULL; pmap->pm_hiexec = 0; pmap->pm_flags = 0; pmap->pm_cpus = 0; @@ -1758,7 +2049,7 @@ pmap_create() goto try_again; } - pmap->pm_pdirpa = pmap->pm_pdir[PDSLOT_PTE] & PG_FRAME; + pmap->pm_pdirpa = pmap->pm_pdir[PDIR_SLOT_PTE] & PG_FRAME; LIST_INSERT_HEAD(&pmaps, pmap, pm_list); @@ -1780,15 +2071,16 @@ pmap_destroy(pmap) #ifdef DIAGNOSTIC struct cpu_info *ci; CPU_INFO_ITERATOR cii; + int i; #endif /* DIAGNOSTIC */ /* * drop reference count */ - simple_lock(&pmap->pm_obj.vmobjlock); - refs = --pmap->pm_obj.uo_refs; - simple_unlock(&pmap->pm_obj.vmobjlock); + simple_lock(&pmap->pm_lock); + refs = --pmap->pm_obj[0].uo_refs; + simple_unlock(&pmap->pm_lock); if (refs > 0) { return; } @@ -1815,13 +2107,24 @@ pmap_destroy(pmap) * destroyed pmap shouldn't have remaining PTPs */ - KASSERT(pmap->pm_obj.uo_npages == 0); - KASSERT(TAILQ_EMPTY(&pmap->pm_obj.memq)); + for (i = 0; i < PTP_LEVELS - 1; i++) { + struct vm_page *pg; + + KASSERT(pmap->pm_obj[i].uo_npages == 0); + KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq)); + while ((pg = TAILQ_FIRST(&pmap->pm_obj[i].memq)) != NULL) { + KASSERT((pg->flags & PG_BUSY) == 0); + + pg->wire_count = 0; + uvm_pagefree(pg); + } + } /* * MULTIPROCESSOR -- no need to flush out of other processors' * APTE space because we do that in pmap_unmap_ptes(). */ + /* XXX: need to flush it out of other processor's APTE space? 
*/ pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir); #ifdef USER_LDT @@ -1850,9 +2153,9 @@ void pmap_reference(pmap) struct pmap *pmap; { - simple_lock(&pmap->pm_obj.vmobjlock); - pmap->pm_obj.uo_refs++; - simple_unlock(&pmap->pm_obj.vmobjlock); + simple_lock(&pmap->pm_lock); + pmap->pm_obj[0].uo_refs++; + simple_unlock(&pmap->pm_lock); } #if defined(PMAP_FORK) @@ -1865,8 +2168,8 @@ void pmap_fork(pmap1, pmap2) struct pmap *pmap1, *pmap2; { - simple_lock(&pmap1->pm_obj.vmobjlock); - simple_lock(&pmap2->pm_obj.vmobjlock); + simple_lock(&pmap1->pm_lock); + simple_lock(&pmap2->pm_lock); #ifdef USER_LDT /* Copy the LDT, if necessary. */ @@ -1884,8 +2187,8 @@ pmap_fork(pmap1, pmap2) } #endif /* USER_LDT */ - simple_unlock(&pmap2->pm_obj.vmobjlock); - simple_unlock(&pmap1->pm_obj.vmobjlock); + simple_unlock(&pmap2->pm_lock); + simple_unlock(&pmap1->pm_lock); } #endif /* PMAP_FORK */ @@ -1904,7 +2207,7 @@ pmap_ldt_cleanup(l) union descriptor *old_ldt = NULL; size_t len = 0; - simple_lock(&pmap->pm_obj.vmobjlock); + simple_lock(&pmap->pm_lock); if (pmap->pm_flags & PMF_USER_LDT) { ldt_free(pmap); @@ -1919,7 +2222,7 @@ pmap_ldt_cleanup(l) pmap->pm_flags &= ~PMF_USER_LDT; } - simple_unlock(&pmap->pm_obj.vmobjlock); + simple_unlock(&pmap->pm_lock); if (old_ldt != NULL) uvm_km_free(kernel_map, (vaddr_t)old_ldt, len); @@ -2165,6 +2468,24 @@ pmap_deactivate2(l) * some misc. functions */ +static boolean_t +pmap_pdes_valid(vaddr_t va, pd_entry_t **pdes, pd_entry_t *lastpde) +{ + int i; + unsigned long index; + pd_entry_t pde; + + for (i = PTP_LEVELS; i > 1; i--) { + index = pl_i(va, i); + pde = pdes[i - 2][index]; + if ((pde & PG_V) == 0) + return FALSE; + } + if (lastpde != NULL) + *lastpde = pde; + return TRUE; +} + /* * pmap_extract: extract a PA for the given VA */ @@ -2176,28 +2497,33 @@ pmap_extract(pmap, va, pap) paddr_t *pap; { pt_entry_t *ptes, pte; - pd_entry_t pde; + pd_entry_t pde, **pdes; + + pmap_map_ptes(pmap, &ptes, &pdes); + if (pmap_pdes_valid(va, pdes, &pde) == FALSE) { + pmap_unmap_ptes(pmap); + return FALSE; + } + pte = ptes[pl1_i(va)]; + pmap_unmap_ptes(pmap); - if (__predict_true((pde = pmap->pm_pdir[pdei(va)]) != 0)) { #ifdef LARGEPAGES - if (pde & PG_PS) { - if (pap != NULL) - *pap = (pde & PG_LGFRAME) | (va & ~PG_LGFRAME); - return (TRUE); - } + if (pde & PG_PS) { + if (pap != NULL) + *pap = + (pde & PG_LGFRAME) | (VA_SIGN_POS(va) & ~PG_LGFRAME); + return (TRUE); + } #endif - ptes = pmap_map_ptes(pmap); - pte = ptes[x86_btop(va)]; - pmap_unmap_ptes(pmap); - if (__predict_true((pte & PG_V) != 0)) { - if (pap != NULL) - *pap = (pte & PG_FRAME) | (va & ~PG_FRAME); - return (TRUE); - } + if (__predict_true((pte & PG_V) != 0)) { + if (pap != NULL) + *pap = (pte & PG_FRAME) | (VA_SIGN_POS(va) & ~PG_FRAME); + return (TRUE); } - return (FALSE); + + return FALSE; } @@ -2254,6 +2580,7 @@ pmap_map(va, spa, epa, prot) return va; } + /* * pmap_zero_page: zero a page */ @@ -2408,7 +2735,7 @@ pmap_remove_ptes(pmap, ptp, ptpva, start } /* atomically save the old PTE and zap! it */ - opte = x86_atomic_testset_ul(pte, 0); + opte = pmap_pte_set(pte, 0); pmap_exec_account(pmap, startva, opte, 0); if (opte & PG_W) @@ -2497,7 +2824,7 @@ pmap_remove_pte(pmap, ptp, pte, va, cpum } /* atomically save the old PTE and zap! 
it */ - opte = x86_atomic_testset_ul(pte, 0); + opte = pmap_pte_set(pte, 0); pmap_exec_account(pmap, va, opte, 0); if (opte & PG_W) @@ -2573,13 +2900,14 @@ pmap_do_remove(pmap, sva, eva, flags) vaddr_t sva, eva; int flags; { - pt_entry_t *ptes, opte; + pt_entry_t *ptes; + pd_entry_t **pdes, pde; boolean_t result; paddr_t ptppa; vaddr_t blkendva; struct vm_page *ptp; int32_t cpumask = 0; - TAILQ_HEAD(, vm_page) empty_ptps; + struct pglist empty_ptps; struct cpu_info *ci; struct pmap *curpmap; @@ -2591,83 +2919,49 @@ pmap_do_remove(pmap, sva, eva, flags) PMAP_MAP_TO_HEAD_LOCK(); - ptes = pmap_map_ptes(pmap); /* locks pmap */ - ci = curcpu(); curpmap = ci->ci_pmap; + pmap_map_ptes(pmap, &ptes, &pdes); /* locks pmap */ /* * removing one page? take shortcut function. */ if (sva + PAGE_SIZE == eva) { - if (pmap_valid_entry(pmap->pm_pdir[pdei(sva)])) { + KASSERT(pl_i(sva, PTP_LEVELS) != PDIR_SLOT_PTE); + if (pmap_pdes_valid(sva, pdes, &pde)) { /* PA of the PTP */ - ptppa = pmap->pm_pdir[pdei(sva)] & PG_FRAME; + ptppa = pde & PG_FRAME; /* get PTP if non-kernel mapping */ + if (pmap == pmap_kernel()) { /* we never free kernel PTPs */ ptp = NULL; } else { - if (pmap->pm_ptphint && - VM_PAGE_TO_PHYS(pmap->pm_ptphint) == - ptppa) { - ptp = pmap->pm_ptphint; - } else { - ptp = PHYS_TO_VM_PAGE(ptppa); -#ifdef DIAGNOSTIC - if (ptp == NULL) - panic("pmap_remove: unmanaged " - "PTP detected"); + ptp = pmap_find_ptp(pmap, sva, ptppa, 1); +#ifdef DIAGNOSTIC + if (ptp == NULL) + panic("pmap_remove: unmanaged " + "PTP detected"); #endif - } } /* do it! */ result = pmap_remove_pte(pmap, ptp, - &ptes[x86_btop(sva)], sva, &cpumask, flags); + &ptes[pl1_i(sva)], sva, &cpumask, flags); /* * if mapping removed and the PTP is no longer * being used, free it! */ - if (result && ptp && ptp->wire_count <= 1) { - /* zap! */ - opte = x86_atomic_testset_ul( - &pmap->pm_pdir[pdei(sva)], 0); -#if defined(MULTIPROCESSOR) - /* - * XXXthorpej Redundant shootdown can happen - * here if we're using APTE space. - */ -#endif - pmap_tlb_shootdown(curpmap, - ((vaddr_t)ptes) + ptp->offset, opte, - &cpumask); -#if defined(MULTIPROCESSOR) - /* - * Always shoot down the pmap's self-mapping - * of the PTP. - * XXXthorpej Redundant shootdown can happen - * here if pmap == curpmap (not APTE space). - */ - pmap_tlb_shootdown(pmap, - ((vaddr_t)PTE_BASE) + ptp->offset, opte, - &cpumask); -#endif - pmap->pm_stats.resident_count--; - if (pmap->pm_ptphint == ptp) - pmap->pm_ptphint = - TAILQ_FIRST(&pmap->pm_obj.memq); - ptp->wire_count = 0; - ptp->flags |= PG_ZERO; - uvm_pagerealloc(ptp, NULL, 0); - TAILQ_INSERT_TAIL(&empty_ptps, ptp, listq); - } + if (result && ptp && ptp->wire_count <= 1) + pmap_free_ptp(pmap, ptp, sva, ptes, pdes, + &cpumask, &empty_ptps); } + pmap_tlb_shootnow(cpumask); pmap_unmap_ptes(pmap); /* unlock pmap */ PMAP_MAP_TO_HEAD_UNLOCK(); @@ -2700,68 +2994,35 @@ pmap_do_remove(pmap, sva, eva, flags) * be VM_MAX_ADDRESS. */ - if (pdei(sva) == PDSLOT_PTE) + if (pl_i(sva, PTP_LEVELS) == PDIR_SLOT_PTE) /* XXXCDC: ugly hack to avoid freeing PDP here */ continue; - if (!pmap_valid_entry(pmap->pm_pdir[pdei(sva)])) - /* valid block? 
*/ + if (!pmap_pdes_valid(sva, pdes, &pde)) continue; /* PA of the PTP */ - ptppa = (pmap->pm_pdir[pdei(sva)] & PG_FRAME); + ptppa = pde & PG_FRAME; /* get PTP if non-kernel mapping */ if (pmap == pmap_kernel()) { /* we never free kernel PTPs */ ptp = NULL; } else { - if (pmap->pm_ptphint && - VM_PAGE_TO_PHYS(pmap->pm_ptphint) == ptppa) { - ptp = pmap->pm_ptphint; - } else { - ptp = PHYS_TO_VM_PAGE(ptppa); + ptp = pmap_find_ptp(pmap, sva, ptppa, 1); #ifdef DIAGNOSTIC - if (ptp == NULL) - panic("pmap_remove: unmanaged PTP " - "detected"); + if (ptp == NULL) + panic("pmap_remove: unmanaged PTP " + "detected"); #endif - } } pmap_remove_ptes(pmap, ptp, - (vaddr_t)&ptes[x86_btop(sva)], sva, blkendva, &cpumask, flags); + (vaddr_t)&ptes[pl1_i(sva)], sva, blkendva, &cpumask, flags); /* if PTP is no longer being used, free it! */ if (ptp && ptp->wire_count <= 1) { - /* zap! */ - opte = x86_atomic_testset_ul( - &pmap->pm_pdir[pdei(sva)], 0); -#if defined(MULTIPROCESSOR) - /* - * XXXthorpej Redundant shootdown can happen here - * if we're using APTE space. - */ -#endif - pmap_tlb_shootdown(curpmap, - ((vaddr_t)ptes) + ptp->offset, opte, &cpumask); -#if defined(MULTIPROCESSOR) - /* - * Always shoot down the pmap's self-mapping - * of the PTP. - * XXXthorpej Redundant shootdown can happen here - * if pmap == curpmap (not APTE space). - */ - pmap_tlb_shootdown(pmap, - ((vaddr_t)PTE_BASE) + ptp->offset, opte, &cpumask); -#endif - pmap->pm_stats.resident_count--; - if (pmap->pm_ptphint == ptp) /* update hint? */ - pmap->pm_ptphint = pmap->pm_obj.memq.tqh_first; - ptp->wire_count = 0; - ptp->flags |= PG_ZERO; - /* Postpone free to shootdown */ - uvm_pagerealloc(ptp, NULL, 0); - TAILQ_INSERT_TAIL(&empty_ptps, ptp, listq); + pmap_free_ptp(pmap, ptp, sva, ptes, pdes, + &cpumask, &empty_ptps); } } @@ -2787,13 +3048,15 @@ pmap_page_remove(pg) struct pv_head *pvh; struct pv_entry *pve, *npve, *killlist = NULL; pt_entry_t *ptes, opte; - int32_t cpumask = 0; - TAILQ_HEAD(, vm_page) empty_ptps; + struct pglist empty_ptps; struct vm_page *ptp; struct cpu_info *ci; struct pmap *curpmap; + pd_entry_t **pdes; + int32_t cpumask = 0; #ifdef DIAGNOSTIC + pd_entry_t pde; int bank, off; bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off); @@ -2819,25 +3082,24 @@ pmap_page_remove(pg) for (pve = SPLAY_MIN(pvtree, &pvh->pvh_root); pve != NULL; pve = npve) { npve = SPLAY_NEXT(pvtree, &pvh->pvh_root, pve); - ptes = pmap_map_ptes(pve->pv_pmap); /* locks pmap */ + pmap_map_ptes(pve->pv_pmap, &ptes, &pdes); /* locks pmap */ #ifdef DIAGNOSTIC - if (pve->pv_ptp && (pve->pv_pmap->pm_pdir[pdei(pve->pv_va)] & - PG_FRAME) - != VM_PAGE_TO_PHYS(pve->pv_ptp)) { + if (pve->pv_ptp && pmap_pdes_valid(pve->pv_va, pdes, &pde) && + (pde & PG_FRAME) != VM_PAGE_TO_PHYS(pve->pv_ptp)) { printf("pmap_page_remove: pg=%p: va=%lx, pv_ptp=%p\n", pg, pve->pv_va, pve->pv_ptp); printf("pmap_page_remove: PTP's phys addr: " - "actual=%x, recorded=%lx\n", - (pve->pv_pmap->pm_pdir[pdei(pve->pv_va)] & - PG_FRAME), VM_PAGE_TO_PHYS(pve->pv_ptp)); + "actual=%lx, recorded=%lx\n", + (unsigned long)(pde & PG_FRAME), + VM_PAGE_TO_PHYS(pve->pv_ptp)); panic("pmap_page_remove: mapped managed page has " "invalid pv_ptp field"); } #endif /* atomically save the old PTE and zap! 
it */ - opte = x86_atomic_testset_ul(&ptes[x86_btop(pve->pv_va)], 0); + opte = pmap_pte_set(&ptes[pl1_i(pve->pv_va)], 0); if (opte & PG_W) pve->pv_pmap->pm_stats.wired_count--; @@ -2855,41 +3117,9 @@ pmap_page_remove(pg) if (pve->pv_ptp) { pve->pv_ptp->wire_count--; if (pve->pv_ptp->wire_count <= 1) { - /* - * Do we have to shootdown the page just to - * get the pte out of the TLB ? - */ - if(!(opte & PG_U)) - pmap_tlb_shootdown(pve->pv_pmap, - pve->pv_va, opte, &cpumask); - - /* zap! */ - opte = x86_atomic_testset_ul( - &pve->pv_pmap->pm_pdir[pdei(pve->pv_va)], - 0); - pmap_tlb_shootdown(curpmap, - ((vaddr_t)ptes) + pve->pv_ptp->offset, - opte, &cpumask); -#if defined(MULTIPROCESSOR) - /* - * Always shoot down the other pmap's - * self-mapping of the PTP. - */ - pmap_tlb_shootdown(pve->pv_pmap, - ((vaddr_t)PTE_BASE) + pve->pv_ptp->offset, - opte, &cpumask); -#endif - pve->pv_pmap->pm_stats.resident_count--; - /* update hint? */ - if (pve->pv_pmap->pm_ptphint == pve->pv_ptp) - pve->pv_pmap->pm_ptphint = - pve->pv_pmap->pm_obj.memq.tqh_first; - pve->pv_ptp->wire_count = 0; - pve->pv_ptp->flags |= PG_ZERO; - /* Free only after the shootdown */ - uvm_pagerealloc(pve->pv_ptp, NULL, 0); - TAILQ_INSERT_TAIL(&empty_ptps, pve->pv_ptp, - listq); + pmap_free_ptp(pve->pv_pmap, pve->pv_ptp, + pve->pv_va, ptes, pdes, &cpumask, + &empty_ptps); } } pmap_unmap_ptes(pve->pv_pmap); /* unlocks pmap */ @@ -2929,8 +3159,8 @@ pmap_test_attrs(pg, testbits) int *myattrs; struct pv_head *pvh; struct pv_entry *pve; - volatile pt_entry_t *ptes; - pt_entry_t pte; + pt_entry_t *ptes, pte; + pd_entry_t **pdes; #if DIAGNOSTIC int bank, off; @@ -2964,8 +3194,8 @@ pmap_test_attrs(pg, testbits) for (pve = SPLAY_MIN(pvtree, &pvh->pvh_root); pve != NULL && (*myattrs & testbits) == 0; pve = SPLAY_NEXT(pvtree, &pvh->pvh_root, pve)) { - ptes = pmap_map_ptes(pve->pv_pmap); - pte = ptes[x86_btop(pve->pv_va)]; + pmap_map_ptes(pve->pv_pmap, &ptes, &pdes); + pte = ptes[pl1_i(pve->pv_va)]; pmap_unmap_ptes(pve->pv_pmap); *myattrs |= pte; } @@ -2997,6 +3227,7 @@ pmap_clear_attrs(pg, clearbits) struct pv_head *pvh; struct pv_entry *pve; pt_entry_t *ptes, opte; + pd_entry_t **pdes; int *myattrs; int32_t cpumask = 0; @@ -3019,14 +3250,14 @@ pmap_clear_attrs(pg, clearbits) *myattrs &= ~clearbits; SPLAY_FOREACH(pve, pvtree, &pvh->pvh_root) { + pmap_map_ptes(pve->pv_pmap, &ptes, &pdes); /* locks pmap */ #ifdef DIAGNOSTIC - if (!pmap_valid_entry(pve->pv_pmap->pm_pdir[pdei(pve->pv_va)])) + if (!pmap_pdes_valid(pve->pv_va, pdes, NULL)) panic("pmap_change_attrs: mapping without PTP " "detected"); #endif - ptes = pmap_map_ptes(pve->pv_pmap); /* locks pmap */ - opte = ptes[x86_btop(pve->pv_va)]; + opte = ptes[pl1_i(pve->pv_va)]; if (opte & clearbits) { /* We need to do something */ if (clearbits == PG_RW) { @@ -3038,7 +3269,7 @@ pmap_clear_attrs(pg, clearbits) */ /* First zap the RW bit! */ - x86_atomic_clearbits_l( + pmap_pte_clearbits( &ptes[x86_btop(pve->pv_va)], PG_RW); opte = ptes[x86_btop(pve->pv_va)]; @@ -3055,8 +3286,7 @@ pmap_clear_attrs(pg, clearbits) */ /* zap! 
*/ - opte = x86_atomic_testset_ul( - &ptes[x86_btop(pve->pv_va)], + opte = pmap_pte_set(&ptes[x86_btop(pve->pv_va)], (opte & ~(PG_U | PG_M))); result |= (opte & clearbits); @@ -3073,10 +3303,10 @@ no_tlb_shootdown: PMAP_HEAD_TO_MAP_UNLOCK(); pmap_tlb_shootnow(cpumask); + return(result != 0); } - /* * p m a p p r o t e c t i o n f u n c t i o n s */ @@ -3108,12 +3338,12 @@ pmap_write_protect(pmap, sva, eva, prot) vaddr_t sva, eva; vm_prot_t prot; { - pt_entry_t *ptes, *epte; - volatile pt_entry_t *spte; + pt_entry_t *ptes, *spte, *epte; + pd_entry_t **pdes; vaddr_t blockend; int32_t cpumask = 0; - ptes = pmap_map_ptes(pmap); /* locks pmap */ + pmap_map_ptes(pmap, &ptes, &pdes); /* locks pmap */ /* should be ok, but just in case ... */ sva &= PG_FRAME; @@ -3121,7 +3351,7 @@ pmap_write_protect(pmap, sva, eva, prot) for (/* null */ ; sva < eva ; sva = blockend) { - blockend = (sva & PD_MASK) + NBPD; + blockend = (sva & L2_FRAME) + NBPD_L2; if (blockend > eva) blockend = eva; @@ -3135,11 +3365,11 @@ pmap_write_protect(pmap, sva, eva, prot) */ /* XXXCDC: ugly hack to avoid freeing PDP here */ - if (pdei(sva) == PDSLOT_PTE) + if (pl_i(sva, PTP_LEVELS) == PDIR_SLOT_PTE) continue; /* empty block? */ - if (!pmap_valid_entry(pmap->pm_pdir[pdei(sva)])) + if (!pmap_pdes_valid(sva, pdes, NULL)) continue; #ifdef DIAGNOSTIC @@ -3148,12 +3378,12 @@ pmap_write_protect(pmap, sva, eva, prot) panic("pmap_write_protect: PTE space"); #endif - spte = &ptes[x86_btop(sva)]; - epte = &ptes[x86_btop(blockend)]; + spte = &ptes[pl1_i(sva)]; + epte = &ptes[pl1_i(blockend)]; for (/*null */; spte < epte ; spte++) { if ((*spte & (PG_RW|PG_V)) == (PG_RW|PG_V)) { - x86_atomic_clearbits_l(spte, PG_RW); /* zap! */ + pmap_pte_clearbits(spte, PG_RW); if (*spte & PG_M) pmap_tlb_shootdown(pmap, x86_ptob(spte - ptes), @@ -3186,16 +3416,18 @@ pmap_unwire(pmap, va) vaddr_t va; { pt_entry_t *ptes; + pd_entry_t **pdes; - if (pmap_valid_entry(pmap->pm_pdir[pdei(va)])) { - ptes = pmap_map_ptes(pmap); /* locks pmap */ + pmap_map_ptes(pmap, &ptes, &pdes); /* locks pmap */ + + if (pmap_pdes_valid(va, pdes, NULL)) { #ifdef DIAGNOSTIC - if (!pmap_valid_entry(ptes[x86_btop(va)])) + if (!pmap_valid_entry(ptes[pl1_i(va)])) panic("pmap_unwire: invalid (unmapped) va 0x%lx", va); #endif - if ((ptes[x86_btop(va)] & PG_W) != 0) { - x86_atomic_clearbits_l(&ptes[x86_btop(va)], PG_W); + if ((ptes[pl1_i(va)] & PG_W) != 0) { + pmap_pte_clearbits(&ptes[pl1_i(va)], PG_W); pmap->pm_stats.wired_count--; } #ifdef DIAGNOSTIC @@ -3260,6 +3492,7 @@ pmap_enter(pmap, va, pa, prot, flags) int flags; { pt_entry_t *ptes, opte, npte; + pd_entry_t **pdes; struct vm_page *ptp, *pg; struct vm_page_md *mdpg; struct pv_head *old_pvh, *new_pvh; @@ -3277,8 +3510,8 @@ pmap_enter(pmap, va, pa, prot, flags) /* sanity check: kernel PTPs should already have been pre-allocated */ if (va >= VM_MIN_KERNEL_ADDRESS && - !pmap_valid_entry(pmap->pm_pdir[pdei(va)])) - panic("pmap_enter: missing kernel PTP!"); + !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)])) + panic("pmap_enter: missing kernel PTP for va %lx!", va); #endif npte = pa | protection_codes[prot] | PG_V; @@ -3296,11 +3529,15 @@ pmap_enter(pmap, va, pa, prot, flags) /* get lock */ PMAP_MAP_TO_HEAD_LOCK(); - ptes = pmap_map_ptes(pmap); /* locks pmap */ + /* + * map in ptes and get a pointer to our PTP (unless we are the kernel) + */ + + pmap_map_ptes(pmap, &ptes, &pdes); /* locks pmap */ if (pmap == pmap_kernel()) { ptp = NULL; } else { - ptp = pmap_get_ptp(pmap, pdei(va)); + ptp = pmap_get_ptp(pmap, va, pdes); if (ptp == 
 		if (ptp == NULL) {
 			if (flags & PMAP_CANFAIL) {
 				error = ENOMEM;
@@ -3315,7 +3552,7 @@ pmap_enter(pmap, va, pa, prot, flags)
 	 * on SMP the PTE might gain PG_U and PG_M flags
 	 * before we zap it later
 	 */
-	opte = ptes[x86_btop(va)];		/* old PTE */
+	opte = ptes[pl1_i(va)];			/* old PTE */

 	/*
 	 * is there currently a valid mapping at our VA and does it
@@ -3335,7 +3572,7 @@ pmap_enter(pmap, va, pa, prot, flags)
 			npte |= (opte & PG_PVLIST);

 			/* zap! */
-			opte = x86_atomic_testset_ul(&ptes[x86_btop(va)], npte);
+			opte = pmap_pte_set(&ptes[pl1_i(va)], npte);

 			/*
 			 * Any change in the protection level that the CPU
@@ -3347,7 +3584,7 @@ pmap_enter(pmap, va, pa, prot, flags)
 				 * No need to flush the TLB.
 				 * Just add old PG_M, ... flags in new entry.
 				 */
-				x86_atomic_setbits_l(&ptes[x86_btop(va)],
+				pmap_pte_setbits(&ptes[pl1_i(va)],
 				    opte & (PG_M | PG_U));
 				goto out_ok;
 			}
@@ -3428,7 +3665,7 @@ pmap_enter(pmap, va, pa, prot, flags)
 		pmap_lock_pvhs(old_pvh, new_pvh);

 		/* zap! */
-		opte = x86_atomic_testset_ul(&ptes[x86_btop(va)], npte);
+		opte = pmap_pte_set(&ptes[pl1_i(va)], npte);

 		pve = pmap_remove_pv(old_pvh, pmap, va);
 		KASSERT(pve != 0);
@@ -3457,7 +3694,7 @@ pmap_enter(pmap, va, pa, prot, flags)
 		simple_unlock(&new_pvh->pvh_lock);
 	}

-	opte = x86_atomic_testset_ul(&ptes[x86_btop(va)], npte);	/* zap! */
+	opte = pmap_pte_set(&ptes[pl1_i(va)], npte);	/* zap! */

 shootdown_test:
 	/* Update page attributes if needed */
@@ -3483,9 +3720,118 @@ out:
 	pmap_unmap_ptes(pmap);
 	PMAP_MAP_TO_HEAD_UNLOCK();

+#if defined(DEBUG)
+	if (error == 0 && pmap == pmap_kernel()) {
+		paddr_t newpa;
+		boolean_t ret;
+
+		ret = pmap_extract(pmap, va, &newpa);
+		if (!ret)
+			panic("%s: no mapping", __func__);
+		if (newpa != pa)
+			panic("%s: different pa", __func__);
+	}
+#endif /* defined(DEBUG) */
+
 	return error;
 }

+static boolean_t
+pmap_get_physpage(va, level, paddrp)
+	vaddr_t va;
+	int level;
+	paddr_t *paddrp;
+{
+	struct vm_page *ptp;
+	struct pmap *kpm = pmap_kernel();
+
+	if (uvm.page_init_done == FALSE) {
+		/*
+		 * we're growing the kernel pmap early (from
+		 * uvm_pageboot_alloc()).  this case must be
+		 * handled a little differently.
+		 */
+
+		if (uvm_page_physget(paddrp) == FALSE)
+			panic("pmap_get_physpage: out of memory");
+		*early_zero_pte = (*paddrp & PG_FRAME) | PG_V | PG_RW;
+		pmap_update_pg((vaddr_t)early_zerop);
+		memset(early_zerop, 0, PAGE_SIZE);
+#if defined(DIAGNOSTIC)
+		*early_zero_pte = 0;
+#endif /* defined(DIAGNOSTIC) */
+	} else {
+		/* XXX */
+		if (level != 1)
+			simple_lock(&kpm->pm_obj[level - 1].vmobjlock);
+		ptp = uvm_pagealloc(&kpm->pm_obj[level - 1],
+		    ptp_va2o(va, level), NULL,
+		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
+		if (level != 1)
+			simple_unlock(&kpm->pm_obj[level - 1].vmobjlock);
+		if (ptp == NULL)
+			panic("pmap_get_physpage: out of memory");
+		ptp->flags &= ~PG_BUSY;
+		ptp->wire_count = 1;
+		*paddrp = VM_PAGE_TO_PHYS(ptp);
+	}
+	kpm->pm_stats.resident_count++;
+	return TRUE;
+}
+
+/*
+ * Allocate the amount of specified ptps for a ptp level, and populate
+ * all levels below accordingly, mapping virtual addresses starting at
+ * kva.
+ *
+ * Used by pmap_growkernel.
+ */
+static void
+pmap_alloc_level(pdes, kva, lvl, needed_ptps)
+	pd_entry_t **pdes;
+	vaddr_t kva;
+	int lvl;
+	long *needed_ptps;
+{
+	unsigned long i;
+	vaddr_t va;
+	paddr_t pa;
+	unsigned long index, endindex;
+	int level;
+	pd_entry_t *pdep;
+
+	for (level = lvl; level > 1; level--) {
+		if (level == PTP_LEVELS)
+			pdep = pmap_kernel()->pm_pdir;
+		else
+			pdep = pdes[level - 2];
+		va = kva;
+		index = pl_i(kva, level);
+		endindex = index + needed_ptps[level - 1] - 1;
+
+		for (i = index; i <= endindex; i++) {
+			if (!pmap_valid_entry(pdep[i])) {
+				pmap_get_physpage(va, level - 1, &pa);
+				pdep[i] = pa | PG_RW | PG_V;
+			} else {
+				/*
+				 * already allocated by bootstrap code.
+				 *
+				 * XXXyamt register to uvm object?
+				 */
+#if defined(DEBUG)
+				printf("%s: skip entry %ld (0x%x)\n",
+				    __func__, i, pdep[i]);
+#endif /* defined(DEBUG) */
+			}
+			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
+			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
+			nkptp[level - 1]++;
+			va += nbpd[level - 1];
+		}
+	}
+}
+
 /*
  * pmap_growkernel: increase usage of KVM space
  *
@@ -3493,68 +3839,54 @@ out:
  * the pmaps on the system.
  */

+static vaddr_t pmap_maxkvaddr = VM_MIN_KERNEL_ADDRESS;
+
 vaddr_t
 pmap_growkernel(maxkvaddr)
 	vaddr_t maxkvaddr;
 {
 	struct pmap *kpm = pmap_kernel(), *pm;
-	int needed_kpde;   /* needed number of kernel PTPs */
-	int s;
-	paddr_t ptaddr;
-
-	needed_kpde = (u_int)(maxkvaddr - VM_MIN_KERNEL_ADDRESS + (NBPD-1))
-		/ NBPD;
-	if (needed_kpde <= nkpde)
-		goto out;		/* we are OK */
-
-	/*
-	 * whoops!   we need to add kernel PTPs
-	 */
-
-	s = splhigh();	/* to be safe */
-	simple_lock(&kpm->pm_obj.vmobjlock);
-
-	for (/*null*/ ; nkpde < needed_kpde ; nkpde++) {
-
-		if (uvm.page_init_done == FALSE) {
-
-			/*
-			 * we're growing the kernel pmap early (from
-			 * uvm_pageboot_alloc()).  this case must be
-			 * handled a little differently.
-			 */
-
-			if (uvm_page_physget(&ptaddr) == FALSE)
-				panic("pmap_growkernel: out of memory");
-			pmap_zero_page(ptaddr);
-
-			kpm->pm_pdir[PDSLOT_KERN + nkpde] =
-				ptaddr | PG_RW | PG_V;
-
-			/* count PTP as resident */
-			kpm->pm_stats.resident_count++;
-			continue;
-		}
-
+	int s, i;
+	unsigned newpdes;
+	long needed_kptp[PTP_LEVELS], target_nptp, old;
+
+	if (maxkvaddr <= pmap_maxkvaddr)
+		return pmap_maxkvaddr;
+
+	maxkvaddr = x86_round_pdr(maxkvaddr);
+	old = nkptp[PTP_LEVELS - 1];
+	/*
+	 * This loop could be optimized more, but pmap_growkernel()
+	 * is called infrequently.
+	 */
+	for (i = PTP_LEVELS - 1; i >= 1; i--) {
+		target_nptp = pl_i(maxkvaddr - 1, i + 1) -
+		    pl_i(VM_MIN_KERNEL_ADDRESS, i + 1);
 		/*
-		 * THIS *MUST* BE CODED SO AS TO WORK IN THE
-		 * pmap_initialized == FALSE CASE!  WE MAY BE
-		 * INVOKED WHILE pmap_init() IS RUNNING!
+		 * XXX only need to check toplevel.
 		 */
+		if (target_nptp > nkptpmax[i])
+			panic("out of KVA space");
+		needed_kptp[i] = target_nptp - nkptp[i] + 1;
+	}

-		if (pmap_alloc_ptp(kpm, PDSLOT_KERN + nkpde) == NULL) {
-			panic("pmap_growkernel: alloc ptp failed");
-		}
-		/* PG_u not for kernel */
-		kpm->pm_pdir[PDSLOT_KERN + nkpde] &= ~PG_u;
+	s = splhigh();	/* to be safe */
+	simple_lock(&kpm->pm_lock);
+	pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS,
+	    needed_kptp);

-		/* distribute new kernel PTP to all active pmaps */
+	/*
+	 * If the number of top level entries changed, update all
+	 * pmaps.
+	 */
+	if (needed_kptp[PTP_LEVELS - 1] != 0) {
+		newpdes = nkptp[PTP_LEVELS - 1] - old;
 		simple_lock(&pmaps_lock);
-		for (pm = pmaps.lh_first; pm != NULL;
-		     pm = pm->pm_list.le_next) {
-			pm->pm_pdir[PDSLOT_KERN + nkpde] =
-			    kpm->pm_pdir[PDSLOT_KERN + nkpde];
+		LIST_FOREACH(pm, &pmaps, pm_list) {
+			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
+			    &kpm->pm_pdir[PDIR_SLOT_KERN + old],
+			    newpdes * sizeof (pd_entry_t));
 		}

 		/* Invalidate the PDP cache. */
@@ -3563,12 +3895,11 @@ pmap_growkernel(maxkvaddr)
 		simple_unlock(&pmaps_lock);
 	}
-
-	simple_unlock(&kpm->pm_obj.vmobjlock);
+	pmap_maxkvaddr = maxkvaddr;
+	simple_unlock(&kpm->pm_lock);
 	splx(s);
-out:
-	return (VM_MIN_KERNEL_ADDRESS + (nkpde * NBPD));
+	return maxkvaddr;
 }

 #ifdef DEBUG
@@ -3586,6 +3917,7 @@ pmap_dump(pmap, sva, eva)
 	vaddr_t sva, eva;
 {
 	pt_entry_t *ptes, *pte;
+	pd_entry_t **pdes;
 	vaddr_t blkendva;

 	/*
@@ -3601,7 +3933,7 @@ pmap_dump(pmap, sva, eva)
 	 */

 	PMAP_MAP_TO_HEAD_LOCK();
-	ptes = pmap_map_ptes(pmap);	/* locks pmap */
+	pmap_map_ptes(pmap, &ptes, &pdes);	/* locks pmap */

 	/*
 	 * dumping a range of pages: we dump in PTP sized blocks (4MB)
@@ -3615,15 +3947,16 @@ pmap_dump(pmap, sva, eva)
 			blkendva = eva;

 		/* valid block? */
-		if (!pmap_valid_entry(pmap->pm_pdir[pdei(sva)]))
+		if (!pmap_pdes_valid(sva, pdes, NULL))
 			continue;

-		pte = &ptes[x86_btop(sva)];
+		pte = &ptes[pl1_i(sva)];
 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
 			if (!pmap_valid_entry(*pte))
 				continue;
-			printf("va %#lx -> pa %#x (pte=%#x)\n",
-			    sva, *pte, *pte & PG_FRAME);
+			printf("va %#lx -> pa %#lx (pte=%#lx)\n",
+			    (unsigned long)sva, (unsigned long)*pte,
+			    (unsigned long)*pte & PG_FRAME);
 		}
 	}
 	pmap_unmap_ptes(pmap);
@@ -3730,7 +4063,9 @@ pmap_tlb_shootdown(pmap, va, pte, cpumas
 		if (ci != self && !(ci->ci_flags & CPUF_RUNNING))
 			continue;
 		pq = &pmap_tlb_shootdown_q[ci->ci_cpuid];
+#if defined(MULTIPROCESSOR)
 		__cpu_simple_lock(&pq->pq_slock);
+#endif

 		/*
 		 * If there's a global flush already queued, or a
@@ -3739,7 +4074,9 @@ pmap_tlb_shootdown(pmap, va, pte, cpumas
 		 */
 		if (pq->pq_flushg > 0 ||
 		    (pq->pq_flushu > 0 && (pte & pmap_pg_g) == 0)) {
+#if defined(MULTIPROCESSOR)
 			__cpu_simple_unlock(&pq->pq_slock);
+#endif
 			continue;
 		}

@@ -3756,7 +4093,9 @@ pmap_tlb_shootdown(pmap, va, pte, cpumas
 		if (cpu_class == CPUCLASS_386) {
 			pq->pq_flushu++;
 			*cpumaskp |= 1U << ci->ci_cpuid;
+#if defined(MULTIPROCESSOR)
 			__cpu_simple_unlock(&pq->pq_slock);
+#endif
 			continue;
 		}
 #endif
@@ -3772,7 +4111,9 @@ pmap_tlb_shootdown(pmap, va, pte, cpumas
 		 */
 		if (ci == self && pq->pq_count < PMAP_TLB_MAXJOBS) {
 			pmap_update_pg(va);
+#if defined(MULTIPROCESSOR)
 			__cpu_simple_unlock(&pq->pq_slock);
+#endif
 			continue;
 		} else {
 			if (pq->pq_pte & pmap_pg_g)
@@ -3794,7 +4135,9 @@ pmap_tlb_shootdown(pmap, va, pte, cpumas
 			TAILQ_INSERT_TAIL(&pq->pq_head, pj, pj_list);
 			*cpumaskp |= 1U << ci->ci_cpuid;
 		}
+#if defined(MULTIPROCESSOR)
 		__cpu_simple_unlock(&pq->pq_slock);
+#endif
 	}
 	splx(s);
 }
@@ -3859,7 +4202,9 @@ pmap_do_tlb_shootdown(struct cpu_info *s
 	s = splvm();
 #endif /* MULTIPROCESSOR */

+#ifdef MULTIPROCESSOR
 	__cpu_simple_lock(&pq->pq_slock);
+#endif

 	if (pq->pq_flushg) {
 		COUNT(flushg);
@@ -3900,8 +4245,8 @@ pmap_do_tlb_shootdown(struct cpu_info *s
 	for (CPU_INFO_FOREACH(cii, ci))
 		x86_atomic_clearbits_l(&ci->ci_tlb_ipi_mask,
 		    (1U << cpu_id));
-#endif /* MULTIPROCESSOR */
 	__cpu_simple_unlock(&pq->pq_slock);
+#endif /* MULTIPROCESSOR */

 	splx(s);
 }
@@ -3946,15 +4291,21 @@ pmap_tlb_shootdown_job_get(pq)
 	if (pq->pq_count >= PMAP_TLB_MAXJOBS)
 		return (NULL);

+#ifdef MULTIPROCESSOR
 	__cpu_simple_lock(&pmap_tlb_shootdown_job_lock);
+#endif
 	if (pj_free == NULL) {
+#ifdef MULTIPROCESSOR
 		__cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
+#endif
 		return NULL;
 	}
 	pj = &pj_free->pja_job;
 	pj_free = (union pmap_tlb_shootdown_job_al *)
 	    pj_free->pja_job.pj_nextfree;
+#ifdef MULTIPROCESSOR
 	__cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
+#endif

 	pq->pq_count++;
 	return (pj);
@@ -3977,10 +4328,14 @@ pmap_tlb_shootdown_job_put(pq, pj)
 	if (pq->pq_count == 0)
 		panic("pmap_tlb_shootdown_job_put: queue length inconsistency");
 #endif
+#ifdef MULTIPROCESSOR
 	__cpu_simple_lock(&pmap_tlb_shootdown_job_lock);
+#endif
 	pj->pj_nextfree = &pj_free->pja_job;
 	pj_free = (union pmap_tlb_shootdown_job_al *)pj;
+#ifdef MULTIPROCESSOR
 	__cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
+#endif

 	pq->pq_count--;
 }
Index: arch/i386/i386/machdep.c
===================================================================
--- arch/i386/i386/machdep.c	(revision 790)
+++ arch/i386/i386/machdep.c	(revision 791)
@@ -505,11 +505,6 @@ SYSCTL_SETUP(sysctl_machdep_setup, "sysc
 		       CTL_MACHDEP, CPU_BIOSEXTMEM, CTL_EOL);
 	sysctl_createv(clog, 0, NULL, NULL,
 		       CTLFLAG_PERMANENT,
-		       CTLTYPE_INT, "nkpde", NULL,
-		       NULL, 0, &nkpde, 0,
-		       CTL_MACHDEP, CPU_NKPDE, CTL_EOL);
-	sysctl_createv(clog, 0, NULL, NULL,
-		       CTLFLAG_PERMANENT,
 		       CTLTYPE_STRING, "booted_kernel", NULL,
 		       sysctl_machdep_booted_kernel, 0, NULL, 0,
 		       CTL_MACHDEP, CPU_BOOTED_KERNEL, CTL_EOL);
Index: arch/i386/i386/locore.S
===================================================================
--- arch/i386/i386/locore.S	(revision 790)
+++ arch/i386/i386/locore.S	(revision 791)
@@ -1,5 +1,45 @@
 /*	$NetBSD: locore.S,v 1.28 2004/05/13 12:15:01 yamt Exp $	*/

+/*
+ * Copyright-o-rama!
+ */
+
+/*
+ * Copyright (c) 2001 Wasabi Systems, Inc.
+ * All rights reserved.
+ *
+ * Written by Frank van der Linden for Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed for the NetBSD Project by
+ *      Wasabi Systems, Inc.
+ * 4. The name of Wasabi Systems, Inc. may not be used to endorse
+ *    or promote products derived from this software without specific prior
+ *    written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
 /*-
  * Copyright (c) 1998, 2000 The NetBSD Foundation, Inc.
  * All rights reserved.
@@ -152,27 +192,6 @@
 #ifdef MULTIPROCESSOR
 #include
 #endif
-
-/*
- * PTmap is recursive pagemap at top of virtual address space.
- * Within PTmap, the page directory can be found (third indirection).
- *
- * XXX 4 == sizeof pde
- */
-	.set	_C_LABEL(PTmap),(PDSLOT_PTE << PDSHIFT)
-	.set	_C_LABEL(PTD),(_C_LABEL(PTmap) + PDSLOT_PTE * PAGE_SIZE)
-	.set	_C_LABEL(PTDpde),(_C_LABEL(PTD) + PDSLOT_PTE * 4)
-
-/*
- * APTmap, APTD is the alternate recursive pagemap.
- * It's used when modifying another process's page tables.
- *
- * XXX 4 == sizeof pde
- */
-	.set	_C_LABEL(APTmap),(PDSLOT_APTE << PDSHIFT)
-	.set	_C_LABEL(APTD),(_C_LABEL(APTmap) + PDSLOT_APTE * PAGE_SIZE)
-	.set	_C_LABEL(APTDpde),(_C_LABEL(PTD) + PDSLOT_APTE * 4)
-
 /*
  * Initialization
@@ -500,10 +519,26 @@ try586:	/* Use the `cpuid' instruction.
  *	text | data | bss | [syms] | page dir | proc0 kstack
  *	0		1	2	3
  */
-#define	PROC0PDIR	((0)		* PAGE_SIZE)
-#define	PROC0STACK	((1)		* PAGE_SIZE)
-#define	SYSMAP		((1+UPAGES)	* PAGE_SIZE)
-#define	TABLESIZE	((1+UPAGES) * PAGE_SIZE) /* + nkpde * PAGE_SIZE */
+
+#define	PROC0_PDIR_OFF	0
+#define	PROC0_STK_OFF	(PROC0_PDIR_OFF + NBPG)
+#define	PROC0_PTP1_OFF	(PROC0_STK_OFF + UPAGES * NBPG)
+
+#define	TABLESIZE	((NKL2_KIMG_ENTRIES + 1 + UPAGES) * NBPG)
+
+/*
+ * fillkpt
+ *	eax = pte (page frame | control | status)
+ *	ebx = page table address
+ *	ecx = number of pages to map
+ */
+
+#define	fillkpt \
+1:	movl	%eax,(%ebx)	;	/* store phys addr */ \
+	addl	$4,%ebx		;	/* next pte/pde */ \
+	addl	$NBPG,%eax	;	/* next phys page */ \
+	loop	1b		;	\
+
 	/* Find end of kernel image. */
 	movl	$RELOC(end),%edi
@@ -517,60 +552,40 @@ try586:	/* Use the `cpuid' instruction.
 1:
 #endif

-	/* Calculate where to start the bootstrap tables. */
+	/* Clear tables */
 	movl	%edi,%esi		# edi = esym ? esym : end
 	addl	$PGOFSET,%esi		# page align up
 	andl	$~PGOFSET,%esi

-	/*
-	 * Calculate the size of the kernel page table directory, and
-	 * how many entries it will have.
-	 */
-	movl	RELOC(nkpde),%ecx	# get nkpde
-	cmpl	$NKPTP_MIN,%ecx		# larger than min?
-	jge	1f
-	movl	$NKPTP_MIN,%ecx		# set at min
-	jmp	2f
-1:	cmpl	$NKPTP_MAX,%ecx		# larger than max?
-	jle	2f
-	movl	$NKPTP_MAX,%ecx
-2:
-
-	/* Clear memory for bootstrap tables. */
-	shll	$PGSHIFT,%ecx
-	addl	$TABLESIZE,%ecx
-	addl	%esi,%ecx		# end of tables
-	subl	%edi,%ecx		# size of tables
-	shrl	$2,%ecx
+	movl	%esi,%edi
 	xorl	%eax,%eax
 	cld
+	movl	$TABLESIZE,%ecx
+	shrl	$2,%ecx
 	rep
 	stosl

-/*
- * fillkpt
- *	eax = pte (page frame | control | status)
- *	ebx = page table address
- *	ecx = number of pages to map
- */
-#define	fillkpt \
-1:	movl	%eax,(%ebx)	; \
-	addl	$PAGE_SIZE,%eax	;	/* increment physical address */ \
-	addl	$4,%ebx		;	/* next pte */ \
-	loop	1b		;
+	leal	(PROC0_PTP1_OFF)(%esi), %ebx

 /*
  * Build initial page tables.
  */
-	/* Calculate end of text segment, rounded to a page. */
-	leal	(RELOC(etext)+PGOFSET),%edx
+	/*
+	 * Compute etext - KERNBASE. This can't be > 4G, or we can't deal
+	 * with it anyway, since we can't load it in 32 bit mode. So use
+	 * the bottom 32 bits.
+	 */
+	movl	$RELOC(etext),%edx
+	addl	$PGOFSET,%edx
 	andl	$~PGOFSET,%edx

-	/* Skip over the first 1MB. */
+	/*
+	 * Skip the first MB.
+	 */
 	movl	$_RELOC(KERNTEXTOFF),%eax
 	movl	%eax,%ecx
-	shrl	$PGSHIFT,%ecx
-	leal	(SYSMAP)(%esi,%ecx,4),%ebx
+	shrl	$(PGSHIFT-2),%ecx	/* ((n >> PGSHIFT) << 2) for # pdes */
+	addl	%ecx,%ebx

 	/* Map the kernel text read-only. */
 	movl	%edx,%ecx
@@ -581,15 +596,13 @@ try586:	/* Use the `cpuid' instruction.

 	/* Map the data, BSS, and bootstrap tables read-write. */
 	leal	(PG_V|PG_KW)(%edx),%eax
-	movl	RELOC(nkpde),%ecx
-	shll	$PGSHIFT,%ecx
-	addl	$TABLESIZE,%ecx
+	movl	$TABLESIZE,%ecx
 	addl	%esi,%ecx		# end of tables
 	subl	%edx,%ecx		# subtract end of text
 	shrl	$PGSHIFT,%ecx
 	fillkpt

-	/* Map ISA I/O memory. */
+	/* Map ISA I/O mem (later atdevbase) */
 	movl	$(IOM_BEGIN|PG_V|PG_KW/*|PG_N*/),%eax	# having these bits set
 	movl	$(IOM_SIZE>>PGSHIFT),%ecx		# for this many pte's,
 	fillkpt
@@ -597,28 +610,40 @@ try586:	/* Use the `cpuid' instruction.
 /*
  * Construct a page table directory.
  */
-	/* Install PDEs for temporary double map of kernel. */
-	movl	RELOC(nkpde),%ecx			# for this many pde's,
-	leal	(PROC0PDIR+0*4)(%esi),%ebx		# which is where temp maps!
-	leal	(SYSMAP+PG_V|PG_KW)(%esi),%eax		# pte for KPT in proc 0,
+	/* Set up top level entries for identity mapping */
+	leal	(PROC0_PDIR_OFF)(%esi),%ebx
+	leal	(PROC0_PTP1_OFF)(%esi),%eax
+	orl	$(PG_V|PG_KW), %eax
+	movl	$NKL2_KIMG_ENTRIES,%ecx
 	fillkpt

-	/* Map kernel PDEs. */
-	movl	RELOC(nkpde),%ecx			# for this many pde's,
-	leal	(PROC0PDIR+PDSLOT_KERN*4)(%esi),%ebx	# kernel pde offset
-	leal	(SYSMAP+PG_V|PG_KW)(%esi),%eax		# pte for KPT in proc 0,
+	/* Set up top level entries for actual kernel mapping */
+	leal	(PROC0_PDIR_OFF + L2_SLOT_KERNBASE*4)(%esi),%ebx
+	leal	(PROC0_PTP1_OFF)(%esi),%eax
+	orl	$(PG_V|PG_KW), %eax
+	movl	$NKL2_KIMG_ENTRIES,%ecx
 	fillkpt

 	/* Install a PDE recursively mapping page directory as a page table! */
-	leal	(PROC0PDIR+PG_V|PG_KW)(%esi),%eax	# pte for ptd
-	movl	%eax,(PROC0PDIR+PDSLOT_PTE*4)(%esi)	# recursive PD slot
+	leal	(PROC0_PDIR_OFF + PDIR_SLOT_PTE*4)(%esi),%ebx
+	leal	(PROC0_PDIR_OFF)(%esi),%eax
+	orl	$(PG_V|PG_KW),%eax
+	movl	%eax,(%ebx)
+
 	/* Save phys. addr of PTD, for libkvm. */
 	movl	%esi,RELOC(PTDpaddr)

-	/* Load base of page directory and enable mapping. */
+	/*
+	 * Startup checklist:
+	 * 1. Load %cr3 with pointer to PDIR.
+	 */
 	movl	%esi,%eax		# phys address of ptd in proc 0
 	movl	%eax,%cr3		# load ptd addr into mmu
+
+	/*
+	 * 2. Enable paging and the rest of it.
+	 */
 	movl	%cr0,%eax		# get control word
					# enable paging & NPX emulation
 	orl	$(CR0_PE|CR0_PG|CR0_NE|CR0_TS|CR0_EM|CR0_MP),%eax
@@ -628,23 +653,25 @@ try586:	/* Use the `cpuid' instruction.
 	ret

 begin:
-	/* Now running relocated at KERNBASE_LOCORE.  Remove double mapping. */
-	movl	_C_LABEL(nkpde),%ecx		# for this many pde's,
-	leal	(PROC0PDIR+0*4)(%esi),%ebx	# which is where temp maps!
-	addl	$(KERNBASE_LOCORE), %ebx	# now use relocated address
+	/*
+	 * We have arrived.
+	 * There's no need anymore for the identity mapping in low
+	 * memory, remove it.
+	 */
+	movl	$NKL2_KIMG_ENTRIES,%ecx
+	leal	(PROC0_PDIR_OFF)(%esi),%ebx	# old, phys address of PDIR
+	addl	$(KERNBASE_LOCORE), %ebx	# new, virtual address of PDIR
 1:	movl	$0,(%ebx)
-	addl	$4,%ebx				# next pde
+	addl	$4,%ebx
 	loop	1b

 	/* Relocate atdevbase. */
-	movl	_C_LABEL(nkpde),%edx
-	shll	$PGSHIFT,%edx
-	addl	$(TABLESIZE+KERNBASE_LOCORE),%edx
+	movl	$(TABLESIZE+KERNBASE_LOCORE),%edx
 	addl	%esi,%edx
 	movl	%edx,_C_LABEL(atdevbase)

 	/* Set up bootstrap stack. */
-	leal	(PROC0STACK+KERNBASE_LOCORE)(%esi),%eax
+	leal	(PROC0_STK_OFF+KERNBASE_LOCORE)(%esi),%eax
 	movl	%eax,_C_LABEL(proc0paddr)
 	leal	(USPACE-FRAMESIZE)(%eax),%esp
 	movl	%esi,PCB_CR3(%eax)	# pcb->pcb_cr3
@@ -655,9 +682,7 @@ begin:
 	call	_C_LABEL(initgdt)
 	addl	$4,%esp

-	movl	_C_LABEL(nkpde),%eax
-	shll	$PGSHIFT,%eax
-	addl	$TABLESIZE,%eax
+	movl	$TABLESIZE,%eax
 	addl	%esi,%eax		# skip past stack and page tables
 	pushl	%eax
@@ -915,7 +940,7 @@ ENTRY(i386_copyout)
 	movl	$2f,PCB_ONFAULT(%edx)

1:	/* Check PTE for each page. */
-	testb	$PG_RW,_C_LABEL(PTmap)(,%edi,4)
+	testb	$PG_RW,PTE_BASE(,%edi,4)
 	jz	2f

4:	incl	%edi
@@ -1125,7 +1150,7 @@ ENTRY(copyoutstr)
 	movl	%edi,%eax
 	shrl	$PGSHIFT,%eax		# calculate pte address
-	testb	$PG_RW,_C_LABEL(PTmap)(,%eax,4)
+	testb	$PG_RW,PTE_BASE(,%eax,4)
 	jnz	2f
6:	/* Simulate a trap. */
@@ -1441,7 +1466,7 @@ ENTRY(suword)
 	movl	%edx,%eax
 	shrl	$PGSHIFT,%eax		# calculate pte address
-	testb	$PG_RW,_C_LABEL(PTmap)(,%eax,4)
+	testb	$PG_RW,PTE_BASE(,%eax,4)
 	jnz	1f
3:	/* Simulate a trap. */
@@ -1489,7 +1514,7 @@ ENTRY(susword)
 	movl	%edx,%eax
 	shrl	$PGSHIFT,%eax		# calculate pte address
-	testb	$PG_RW,_C_LABEL(PTmap)(,%eax,4)
+	testb	$PG_RW,PTE_BASE(,%eax,4)
 	jnz	1f
3:	/* Simulate a trap. */
@@ -1539,7 +1564,7 @@ ENTRY(suswintr)
 	movl	%edx,%eax
 	shrl	$PGSHIFT,%eax		# calculate pte address
-	testb	$PG_RW,_C_LABEL(PTmap)(,%eax,4)
+	testb	$PG_RW,PTE_BASE(,%eax,4)
 	jnz	1f

 	/* Simulate a trap. */
@@ -1577,7 +1602,7 @@ ENTRY(subyte)
 	movl	%edx,%eax
 	shrl	$PGSHIFT,%eax		# calculate pte address
-	testb	$PG_RW,_C_LABEL(PTmap)(,%eax,4)
+	testb	$PG_RW,PTE_BASE(,%eax,4)
 	jnz	1f
3:	/* Simulate a trap. */
Index: arch/i386/i386/genassym.cf
===================================================================
--- arch/i386/i386/genassym.cf	(revision 790)
+++ arch/i386/i386/genassym.cf	(revision 791)
@@ -146,11 +146,12 @@ define PAGE_SIZE PAGE_SIZE
 define	LSRUN			LSRUN
 define	LSONPROC		LSONPROC

-define	PDSLOT_PTE		PDSLOT_PTE
-define	PDSLOT_APTE		PDSLOT_APTE
-define	PDSLOT_KERN		PDSLOT_KERN
-define	NKPTP_MIN		NKPTP_MIN
-define	NKPTP_MAX		NKPTP_MAX
+define	L2_SLOT_KERNBASE	pl2_pi(KERNBASE)
+define	L1_SLOT_KERNBASE	pl1_pi(KERNBASE)
+
+define	PDIR_SLOT_PTE		PDIR_SLOT_PTE
+define	PTE_BASE		PTE_BASE
+define	NKL2_KIMG_ENTRIES	NKL2_KIMG_ENTRIES

 define	VM_MAXUSER_ADDRESS	(int)VM_MAXUSER_ADDRESS
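
The pmap.c hunks above replace the old single-level helpers (pdei(), PD_MASK, NBPD) with level-aware index math (pl_i(), pl1_i(), L2_FRAME, NBPD_L2). A minimal sketch of that arithmetic, assuming the classic non-PAE i386 layout (4 KB pages, 4 MB per page-directory entry); it is illustrative only and not part of the patch, and the EX_-prefixed names are stand-ins rather than the macros the patch defines:

	#include <stdio.h>

	#define EX_L1_SHIFT	12			/* 4 KB mapped per L1 (PTE) entry */
	#define EX_L2_SHIFT	22			/* 4 MB mapped per L2 (PDE) entry */
	#define EX_NBPD_L2	(1UL << EX_L2_SHIFT)
	#define EX_L2_FRAME	(~(EX_NBPD_L2 - 1))	/* round a va down to its 4 MB block */

	/* linear index of the L1 entry for va, counted from the start of the VA space */
	static unsigned long ex_pl1_i(unsigned long va) { return va >> EX_L1_SHIFT; }
	/* index of the L2 (page directory) slot for va */
	static unsigned long ex_pl2_i(unsigned long va) { return va >> EX_L2_SHIFT; }

	int main(void)
	{
		unsigned long va = 0xc0123456UL;

		/* blockend as in pmap_write_protect(): end of the 4 MB block holding va */
		unsigned long blockend = (va & EX_L2_FRAME) + EX_NBPD_L2;

		printf("va %#lx: L2 slot %lu, L1 index %lu, block ends at %#lx\n",
		    va, ex_pl2_i(va), ex_pl1_i(va), blockend);
		return 0;
	}

For 0xc0123456 this prints L2 slot 768 and block end 0xc0400000, which is why pmap_write_protect() and pmap_dump() can walk a range one 4 MB block (one PTP) at a time.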
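The reworked pmap_growkernel() no longer counts a single nkpde; it sizes the kernel page tables per level and hands the per-level counts to pmap_alloc_level(). A rough worked example of that sizing for the single interesting level on a two-level i386, as a sketch only: the VM_MIN_KERNEL_ADDRESS value and the EX_ helper names below are made up for illustration, not taken from the patch.

	#include <stdio.h>

	#define EX_L2_SHIFT		22
	#define EX_NBPD_L2		(1UL << EX_L2_SHIFT)
	#define EX_VM_MIN_KERNEL	0xc0400000UL	/* example value only */

	/* round va up to the next 4 MB boundary, as x86_round_pdr() does */
	static unsigned long
	ex_round_pdr(unsigned long va)
	{
		return (va + (EX_NBPD_L2 - 1)) & ~(EX_NBPD_L2 - 1);
	}

	int
	main(void)
	{
		unsigned long maxkva = 0xc5a00000UL;	/* requested new KVA ceiling */
		unsigned long slots;

		maxkva = ex_round_pdr(maxkva);
		/*
		 * Span of page-directory slots from the start of kernel VA to the
		 * rounded-up target, cf. pl_i(maxkvaddr - 1, i + 1) -
		 * pl_i(VM_MIN_KERNEL_ADDRESS, i + 1) in the patch.
		 */
		slots = ((maxkva - 1) >> EX_L2_SHIFT) -
		    (EX_VM_MIN_KERNEL >> EX_L2_SHIFT) + 1;

		printf("%lu kernel page tables cover %#lx-%#lx\n",
		    slots, EX_VM_MIN_KERNEL, maxkva);
		return 0;
	}

With these example numbers the answer is 22 page tables for 88 MB of kernel VA; the real code then allocates only the difference from what is already there and memcpy()s the new top-level entries into every pmap on the system.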
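In locore.S, the rewritten fillkpt macro is the workhorse that builds the bootstrap mappings: caller loads %eax with a physical address already or'ed with the protection bits, %ebx with the address of the first table entry, and %ecx with a count. A C rendering of the same loop, purely as a sketch (constants and names below are illustrative, not the kernel's definitions):

	#include <stdint.h>
	#include <stdio.h>

	#define EX_NBPG	4096u	/* bytes per page, cf. NBPG */

	/*
	 * pte: first entry to fill (%ebx); pa_bits: physical address or'ed
	 * with the PG_V/PG_KW-style bits (%eax); npages: entry count (%ecx).
	 */
	static void
	ex_fillkpt(uint32_t *pte, uint32_t pa_bits, uint32_t npages)
	{
		while (npages-- > 0) {
			*pte++ = pa_bits;	/* store phys addr, advance to next pte/pde */
			pa_bits += EX_NBPG;	/* next phys page */
		}
	}

	int
	main(void)
	{
		uint32_t pt[4] = { 0 };

		/* map 4 pages starting at phys 0x00100000, marked valid (bit 0) */
		ex_fillkpt(pt, 0x00100000u | 0x1u, 4);
		printf("%#x %#x %#x %#x\n", pt[0], pt[1], pt[2], pt[3]);
		return 0;
	}

Because the entry size (4 bytes) and the page size are fixed, the same macro serves both for filling page tables and, with NKL2_KIMG_ENTRIES as the count, for filling the top-level directory slots in the identity and KERNBASE mappings.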