在进程中用户态地址如何得到物理地址

100 阅读 0 评论 66 点赞

我是靠谱客的博主怕黑蜡烛，最近开发中收集的这篇文章主要介绍在进程中用户态地址如何得到物理地址，觉得挺不错的，现在分享给大家，希望可以做个参考。

概述

在3.9内核里drivers/staging/tidspbridge/core/tiomap3430.c中发现一个有意思的函数：

/*
 * ======== user_va2_pa ========
 * Purpose:
 *   Walk the page tables of @mm (PGD -> PUD -> PMD -> PTE) to convert a
 *   userland virtual address to a physical address.
 *
 * Parameters:
 *   mm      - address space whose page tables are walked
 *   address - user virtual address to translate
 *
 * Returns:
 *   The PTE value masked with PAGE_MASK, i.e. the page-aligned physical
 *   address of the backing page (the offset inside the page is NOT added),
 *   or 0 when any level of the walk is empty/bad or the page is not present.
 *
 * Note:
 *   Compared with the text found in the 3.9 staging driver, this version
 *   reads the entry through pte_val() instead of using pte_t as a plain
 *   integer, and releases the pte_offset_map() mapping with pte_unmap().
 */
static u32 user_va2_pa(struct mm_struct *mm, u32 address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;
	u32 pa = 0;

	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd) || pgd_bad(*pgd))
		return 0;

	pud = pud_offset(pgd, address);
	if (pud_none(*pud) || pud_bad(*pud))
		return 0;

	pmd = pmd_offset(pud, address);
	if (pmd_none(*pmd) || pmd_bad(*pmd))
		return 0;

	ptep = pte_offset_map(pmd, address);
	if (!ptep)
		return 0;

	/* Take a snapshot of the entry, then work on the copy only. */
	pte = *ptep;
	if (pte_present(pte))
		pa = pte_val(pte) & PAGE_MASK;

	/* Every pte_offset_map() must be paired with pte_unmap(). */
	pte_unmap(ptep);

	return pa;
}

这个函数从进程的PGD（一级页表）开始，逐级遍历各级页表，根据指定的用户态虚拟地址得到对应的物理地址。注意返回值是 pte & PAGE_MASK，即页对齐的物理页基地址，并不包含页内偏移。

分析：

对于32位ARM来说：

arch/arm/include/asm/pgtable.h中定义了

/*
 * to find an entry in a page-table-directory
 *
 * pgd_index(addr): index of the first-level (PGD) entry covering addr,
 * obtained by shifting addr right by PGDIR_SHIFT.
 */
#define pgd_index(addr) ((addr) >> PGDIR_SHIFT)

/* pgd_offset(mm, addr): pointer to that entry inside the table at mm->pgd. */
#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))

而在arch/arm/include/asm/pgtable-2level.h中定义了

/*
* PMD_SHIFT determines the size of the area a second-level page table can map
* PGDIR_SHIFT determines what a third-level page table entry can map
*
* Both shifts are 21 here, so one entry at either level covers
* 1 << 21 bytes = 2MB of virtual address space.
*/
#define PMD_SHIFT 21
#define PGDIR_SHIFT 21

这里ARM的一级页表是经过改造的：硬件标准要求一级页表的SHIFT为20位，即每个一级页表项映射1MB；而Linux在这里使用21位，即实际上每个一级页表项映射2MB。

#define PTRS_PER_PTE  512
#define PTRS_PER_PMD  1
#define PTRS_PER_PGD  2048

相应地，一级和二级页表的表项数也改成了2048和512个，而不是硬件原始的4096和256个。从头文件的注释来看，主要原因是ARM的硬件页表不支持“脏”（dirty）和“young”（accessed）属性，需要用Linux页表来辅助ARM的硬件页表，才能完整支持Linux系统。如果页大小是4KB，一个页正好可以放1024个4字节的表项，把它分成上“半”页和下“半”页，各512个表项：上“半”页放的是Linux页表项，下“半”页放的是硬件页表项。所以我们会看到Linux创建ARM页表时，填充完Linux页表项后，会加上2048个字节的偏移（半个页的大小），然后填充硬件页表项。同时请参考arch/arm/include/asm/pgtable-2level.h中的注释，里面的图画得比较清晰。

/*
* Hardware-wise, we have a two level page table structure, where the first
* level has 4096 entries, and the second level has 256 entries. Each entry
* is one 32-bit word. Most of the bits in the second level entry are used
* by hardware, and there aren't any "accessed" and "dirty" bits.
*
* Linux on the other hand has a three level page table structure, which can
* be wrapped to fit a two level page table structure easily - using the PGD
* and PTE only. However, Linux also expects one "PTE" table per page, and
* at least a "dirty" bit.
*
* Therefore, we tweak the implementation slightly - we tell Linux that we
* have 2048 entries in the first level, each of which is 8 bytes (iow, two
* hardware pointers to the second level.) The second level contains two
* hardware PTE tables arranged contiguously, preceded by Linux versions
* which contain the state information Linux needs. We, therefore, end up
* with 512 entries in the "PTE" level.
*
* This leads to the page tables having the following layout:
*
*    pgd             pte
* |        |
* +--------+
* |        |       +------------+ +0
* +- - - - +       | Linux pt 0 |
* |        |       +------------+ +1024
* +--------+ +0    | Linux pt 1 |
* |        |-----> +------------+ +2048
* +- - - - + +4    | h/w pt 0 |
* |        |-----> +------------+ +3072
* +--------+ +8    | h/w pt 1 |
* |        |       +------------+ +4096
*
* See L_PTE_xxx below for definitions of bits in the "Linux pt", and
* PTE_xxx for definitions of bits appearing in the "h/w pt".
*
* PMD_xxx definitions refer to bits in the first level page table.
*
* The "dirty" bit is emulated by only granting hardware write permission
* iff the page is marked "writable" and "dirty" in the Linux PTE. This
* means that a write to a clean page will cause a permission fault, and
* the Linux MM layer will mark the page dirty via handle_pte_fault().
* For the hardware to notice the permission change, the TLB entry must
* be flushed, and ptep_set_access_flags() does that for us.
*
* The "accessed" or "young" bit is emulated by a similar method; we only
* allow accesses to the page if the "young" bit is set. Accesses to the
* page will cause a fault, and handle_pte_fault() will set the young bit
* for us as long as the page is marked present in the corresponding Linux
* PTE entry. Again, ptep_set_access_flags() will ensure that the TLB is
* up to date.
*
* However, when the "young" bit is cleared, we deny access to the page
* by clearing the hardware PTE. Currently Linux does not flush the TLB
* for us in this case, which means the TLB will retain the translation
* until either the TLB entry is evicted under pressure, or a context
* switch which changes the user space mapping occurs.
*/
#define PTRS_PER_PTE  512
#define PTRS_PER_PMD  1
#define PTRS_PER_PGD  2048

我们以Cortex-A9为例，跟踪create_mapping函数（arch/arm/mm/mmu.c），会发现最终填充页表项时调用的是下面这个函数：

/*
* cpu_v7_set_pte_ext(ptep, pte, ext)
*
* Set a level 2 translation table entry: the Linux PTE is stored at ptep
* and a hardware PTE derived from it is stored 2048 bytes above.
*
* - ptep - pointer to level 2 translation table entry
*    (hardware version is stored at +2048 bytes)
* - pte   - PTE value to store
* - ext - value for extended PTE bits
*
* Register usage as seen in the body: r0 = ptep, r1 = pte, r2 = ext,
* r3 = hardware PTE being built. r1, r3 and the condition flags are
* modified; the final store writes back to r0, leaving it pointing at
* the hardware entry. Without CONFIG_MMU the routine only returns.
*/
ENTRY(cpu_v7_set_pte_ext)
#ifdef CONFIG_MMU
str r1, [r0]   @ linux version

@ Start the hardware entry from the Linux value: drop bits 4-9 and the
@ type field, merge in the caller-supplied ext bits, then set AP0 and
@ the value 2 in the low bits (presumably the small-page descriptor
@ type - confirm against PTE_TYPE_* definitions).
bic r3, r1, #0x000003f0
bic r3, r3, #PTE_TYPE_MASK
orr r3, r3, r2
orr r3, r3, #PTE_EXT_AP0 | 2

@ Bit 4 of the Linux PTE selects TEX(1) in the hardware entry
@ (NOTE(review): bit 4 is presumably part of the Linux memory-type
@ field - confirm).
tst r1, #1 << 4
orrne r3, r3, #PTE_EXT_TEX(1)

@ Invert DIRTY in r1 so that the test below is non-zero when the page
@ is read-only OR not dirty; in either case set APX to withhold
@ hardware write permission (this is how the dirty bit is emulated).
eor r1, r1, #L_PTE_DIRTY
tst r1, #L_PTE_RDONLY | L_PTE_DIRTY
orrne r3, r3, #PTE_EXT_APX

@ User-accessible page: set AP1.
tst r1, #L_PTE_USER
orrne r3, r3, #PTE_EXT_AP1
#ifdef CONFIG_CPU_USE_DOMAINS
@ allow kernel read/write access to read-only user pages
@ (runs only when L_PTE_USER was set; if APX is also set, clear both
@ APX and AP0)
tstne r3, #PTE_EXT_APX
bicne r3, r3, #PTE_EXT_APX | PTE_EXT_AP0
#endif

@ Propagate execute-never.
tst r1, #L_PTE_XN
orrne r3, r3, #PTE_EXT_XN

@ The hardware entry is valid only if the page is both YOUNG and
@ PRESENT; otherwise write 0 so that any access faults.
tst r1, #L_PTE_YOUNG
tstne r1, #L_PTE_PRESENT
moveq r3, #0

@ Store the hardware entry at ptep + 2048; the pre-indexed writeback
@ leaves r0 holding that address for the cache operation below.
ARM( str r3, [r0, #2048]! )
mcr p15, 0, r0, c7, c10, 1 @ flush_pte
#endif
mov pc, lr
ENDPROC(cpu_v7_set_pte_ext)

以上代码参考的是linux-3.7内核。如果你手上有2.6.11内核的源码（注意这个版本的内核不支持Cortex-A9），会发现该版本中二级页表里Linux页表和硬件页表的放置顺序与3.7内核相反：上半页是硬件页表，下半页是Linux页表。请参考文件include/asm-arm/pgtable.h：

/*
* Hardware-wise, we have a two level page table structure, where the first
* level has 4096 entries, and the second level has 256 entries. Each entry
* is one 32-bit word. Most of the bits in the second level entry are used
* by hardware, and there aren't any "accessed" and "dirty" bits.
*
* Linux on the other hand has a three level page table structure, which can
* be wrapped to fit a two level page table structure easily - using the PGD
* and PTE only. However, Linux also expects one "PTE" table per page, and
* at least a "dirty" bit.
*
* Therefore, we tweak the implementation slightly - we tell Linux that we
* have 2048 entries in the first level, each of which is 8 bytes (iow, two
* hardware pointers to the second level.) The second level contains two
* hardware PTE tables arranged contiguously, followed by Linux versions
* which contain the state information Linux needs. We, therefore, end up
* with 512 entries in the "PTE" level.
*
* This leads to the page tables having the following layout:
*
*    pgd             pte
* |        |
* +--------+ +0
* |        |-----> +------------+ +0
* +- - - - + +4    | h/w pt 0 |
* |        |-----> +------------+ +1024
* +--------+ +8    | h/w pt 1 |
* |        |       +------------+ +2048
* +- - - - +       | Linux pt 0 |
* |        |       +------------+ +3072
* +--------+       | Linux pt 1 |
* |        |       +------------+ +4096
*
* See L_PTE_xxx below for definitions of bits in the "Linux pt", and
* PTE_xxx for definitions of bits appearing in the "h/w pt".
*
* PMD_xxx definitions refer to bits in the first level page table.
*
* The "dirty" bit is emulated by only granting hardware write permission
* iff the page is marked "writable" and "dirty" in the Linux PTE. This
* means that a write to a clean page will cause a permission fault, and
* the Linux MM layer will mark the page dirty via handle_pte_fault().
* For the hardware to notice the permission change, the TLB entry must
* be flushed, and ptep_establish() does that for us.
*
* The "accessed" or "young" bit is emulated by a similar method; we only
* allow accesses to the page if the "young" bit is set. Accesses to the
* page will cause a fault, and handle_pte_fault() will set the young bit
* for us as long as the page is marked present in the corresponding Linux
* PTE entry. Again, ptep_establish() will ensure that the TLB is up to
* date.
*
* However, when the "young" bit is cleared, we deny access to the page
* by clearing the hardware PTE. Currently Linux does not flush the TLB
* for us in this case, which means the TLB will retain the translation
* until either the TLB entry is evicted under pressure, or a context
* switch which changes the user space mapping occurs.
*/
#define PTRS_PER_PTE  512
#define PTRS_PER_PMD  1
#define PTRS_PER_PGD  2048

以ARM920t(s3c2410)为例，proc-arm920.S(arch/arm/mm)中cpu_arm920_set_pte函数：

/*
* cpu_arm920_set_pte(ptep, pte)
*
* Set a PTE and flush it out
*
* Register usage as seen in the body: r0 = ptep, r1 = pte value,
* r2 = hardware PTE being built, r3 = scratch (write-through case only).
* The Linux value is stored at ptep and the hardware value 2048 bytes
* below it; r0 is left pointing at the hardware entry. r1, r2, r3 and
* the condition flags are modified.
*/
.align 5
ENTRY(cpu_arm920_set_pte)
@ Store the Linux entry, then post-decrement r0 by 2048 so that it
@ addresses the matching hardware entry.
str r1, [r0], #-2048 @ linux version

@ Invert the four state bits: from here on a tst of any of them gives
@ zero (eq) only when every tested bit was originally SET.
eor r1, r1, #L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_WRITE | L_PTE_DIRTY

@ Build the hardware entry: clear the access-permission and type
@ fields, then mark it as a small page.
bic r2, r1, #PTE_SMALL_AP_MASK
bic r2, r2, #PTE_TYPE_MASK
orr r2, r2, #PTE_TYPE_SMALL

tst r1, #L_PTE_USER @ User?
orrne r2, r2, #PTE_SMALL_AP_URO_SRW

@ eq here means the page was both writable and dirty; only then are
@ the extra permission bits ORed in (NOTE(review): combined with
@ URO_SRW this presumably yields user read/write - confirm).
tst r1, #L_PTE_WRITE | L_PTE_DIRTY @ Write and Dirty?
orreq r2, r2, #PTE_SMALL_AP_UNO_SRW

@ ne here means PRESENT or YOUNG was originally clear: write an
@ all-zero hardware entry so that any access faults.
tst r1, #L_PTE_PRESENT | L_PTE_YOUNG @ Present and Young?
movne r2, #0

#ifdef CONFIG_CPU_DCACHE_WRITETHROUGH
@ If (r2 & 0x0b) == 0x0a, clear bit 2 of the entry
@ (NOTE(review): bit 2 is presumably the bufferable bit - confirm).
eor r3, r2, #0x0a   @ C & small page?
tst r3, #0x0b
biceq r2, r2, #4
#endif
str r2, [r0]   @ hardware version
@ r0 is moved onto itself: no architectural effect (purpose not shown here).
mov r0, r0
mcr p15, 0, r0, c7, c10, 1  @ clean D entry
mcr p15, 0, r0, c7, c10, 4  @ drain WB
mov pc, lr

很有趣。

后续分析待续。。。