正文
Linux用户空间内存区域的匿名映射
1
在调用mmap系统调用时,可以指定的标志(flag)参数:
/*
 * mmap() flag bits (from include/uapi/asm-generic/mman-common.h).
 * MAP_TYPE masks the mutually exclusive mapping type (shared/private).
 */
#define MAP_SHARED	0x01		/* Share changes */
#define MAP_PRIVATE	0x02		/* Changes are private */
#define MAP_TYPE	0x0f		/* Mask for type of mapping */
#define MAP_FIXED	0x10		/* Interpret addr exactly */
#define MAP_ANONYMOUS	0x20		/* don't use a file */
#ifdef CONFIG_MMAP_ALLOW_UNINITIALIZED
# define MAP_UNINITIALIZED 0x4000000	/* For anonymous mmap, memory could be uninitialized */
#else
# define MAP_UNINITIALIZED 0x0		/* Don't support this flag */
#endif
MAP_SHARED
用于多个进程共享对一个文件的访问
MAP_PRIVATE
用于创建一个与数据源分离的私有映射,对区域的写入操作不影响数据源文件中的内容
MAP_FIXED
用于在指定的目标线性地址创建一个映射,不允许调整到其他地址
MAP_ANONYMOUS
用于创建与文件无关的映射,或者说没有数据源的映射
do_anonymous_page会调用alloc_zeroed_user_highpage_movable分配一个初始化为全0的内存页。
2
在vm_area_struct数据结构定义中,有一个双链表结点:anon_vma_chain
struct vm_area_struct {
	......
	/*
	 * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
	 * list, after a COW of one of the file pages. A MAP_SHARED vma
	 * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack
	 * or brk vma (with NULL file) can only be in an anon_vma list.
	 */
	struct list_head anon_vma_chain; /* Serialized by mmap_sem &
					  * page_table_lock */
	struct anon_vma *anon_vma;	/* Serialized by page_table_lock */
	......
}
其中,struct anon_vma定义:
1: /*
CRLF
2: * The anon_vma heads a list of private "related" vmas, to scan if
CRLF
3: * an anonymous page pointing to this anon_vma needs to be unmapped:
CRLF
4: * the vmas on the list will be related by forking, or by splitting.
CRLF
5: *
CRLF
6: * Since vmas come and go as they are split and merged (particularly
CRLF
7: * in mprotect), the mapping field of an anonymous page cannot point
CRLF
8: * directly to a vma: instead it points to an anon_vma, on whose list
CRLF
9: * the related vmas can be easily linked or unlinked.
CRLF
10: *
CRLF
11: * After unlinking the last vma on the list, we must garbage collect
CRLF
12: * the anon_vma object itself: we're guaranteed no page can be
CRLF
13: * pointing to this anon_vma once its vma list is empty.
CRLF
14: */
CRLF
15: struct anon_vma {
CRLF
16: struct anon_vma *root; /* Root of this anon_vma tree */
CRLF
17: struct mutex mutex; /* Serialize access to vma list */
CRLF
18: /*
CRLF
19: * The refcount is taken on an anon_vma when there is no
CRLF
20: * guarantee that the vma of page tables will exist for
CRLF
21: * the duration of the operation. A caller that takes
CRLF
22: * the reference is responsible for clearing up the
CRLF
23: * anon_vma if they are the last user on release
CRLF
24: */
CRLF
25: atomic_t refcount;
CRLF
26:
CRLF
27: /*
CRLF
28: * NOTE: the LSB of the head.next is set by
CRLF
29: * mm_take_all_locks() _after_ taking the above lock. So the
CRLF
30: * head must only be read/written after taking the above lock
CRLF
31: * to be sure to see a valid next pointer. The LSB bit itself
CRLF
32: * is serialized by a system wide lock only visible to
CRLF
33: * mm_take_all_locks() (mm_all_locks_mutex).
CRLF
34: */
CRLF
35: struct list_head head; /* Chain of private "related" vmas */
CRLF
36: };
CRLF
3
do_mmap
1: static inline unsigned long do_mmap(struct file *file, unsigned long addr,
CRLF
2: unsigned long len, unsigned long prot,
CRLF
3: unsigned long flag, unsigned long offset)
CRLF
4: {
CRLF
5: unsigned long ret = -EINVAL;
CRLF
6: if ((offset + PAGE_ALIGN(len)) < offset)
CRLF
7: goto out;
CRLF
8: if (!(offset & ~PAGE_MASK))
CRLF
9: ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
CRLF
10: out:
CRLF
11: return ret;
CRLF
12: }
CRLF
if ((offset + PAGE_ALIGN(len)) < offset)
/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE)
/*
* 'kernel.h' contains some often-used function prototypes etc
*/
#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1)
#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))
即
if ((offset + (((len) + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1))) < offset)
表示如果len太长,再进行align to page boundary操作就会溢出了,那么没有那么多的线性地址空间可以给它映射,因此失败。
if (!(offset & ~PAGE_MASK))
如果offset是位于页的边界处,则继续操作
ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
其中最后一个参数代表了映射区域在文件中的页序号。
1: /*
CRLF
2: * The caller must hold down_write(¤t->mm->mmap_sem).
CRLF
3: */
CRLF
4:
CRLF
5: unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
CRLF
6: unsigned long len, unsigned long prot,
CRLF
7: unsigned long flags, unsigned long pgoff)
CRLF
8: {
CRLF
9: struct mm_struct * mm = current->mm;
CRLF
10: struct inode *inode;
CRLF
11: vm_flags_t vm_flags;
CRLF
12: int error;
CRLF
13: unsigned long reqprot = prot;
CRLF
14:
CRLF
15: /*
CRLF
16: * Does the application expect PROT_READ to imply PROT_EXEC?
CRLF
17: *
CRLF
18: * (the exception is when the underlying filesystem is noexec
CRLF
19: * mounted, in which case we dont add PROT_EXEC.)
CRLF
20: */
CRLF
21: if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
CRLF
22: if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
CRLF
23: prot |= PROT_EXEC;
CRLF
24:
CRLF
25: if (!len)
CRLF
26: return -EINVAL;
CRLF
27:
CRLF
28: if (!(flags & MAP_FIXED))
CRLF
29: addr = round_hint_to_min(addr);
CRLF
30:
CRLF
31: /* Careful about overflows.. */
CRLF
32: len = PAGE_ALIGN(len);
CRLF
33: if (!len)
CRLF
34: return -ENOMEM;
CRLF
35:
CRLF
36: /* offset overflow? */
CRLF
37: if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
CRLF
38: return -EOVERFLOW;
CRLF
39:
CRLF
40: /* Too many mappings? */
CRLF
41: if (mm->map_count > sysctl_max_map_count)
CRLF
42: return -ENOMEM;
CRLF
43:
CRLF
44: /* Obtain the address to map to. we verify (or select) it and ensure
CRLF
45: * that it represents a valid section of the address space.
CRLF
46: */
CRLF
47: addr = get_unmapped_area(file, addr, len, pgoff, flags);
CRLF
48: if (addr & ~PAGE_MASK)
CRLF
49: return addr;
CRLF
50:
CRLF
51: /* Do simple checking here so the lower-level routines won't have
CRLF
52: * to. we assume access permissions have been handled by the open
CRLF
53: * of the memory object, so we don't do any here.
CRLF
54: */
CRLF
55: vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
CRLF
56: mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
CRLF
57:
CRLF
58: if (flags & MAP_LOCKED)
CRLF
59: if (!can_do_mlock())
CRLF
60: return -EPERM;
CRLF
61:
CRLF
62: /* mlock MCL_FUTURE? */
CRLF
63: if (vm_flags & VM_LOCKED) {
CRLF
64: unsigned long locked, lock_limit;
CRLF
65: locked = len >> PAGE_SHIFT;
CRLF
66: locked += mm->locked_vm;
CRLF
67: lock_limit = rlimit(RLIMIT_MEMLOCK);
CRLF
68: lock_limit >>= PAGE_SHIFT;
CRLF
69: if (locked > lock_limit && !capable(CAP_IPC_LOCK))
CRLF
70: return -EAGAIN;
CRLF
71: }
CRLF
72:
CRLF
73: inode = file ? file->f_path.dentry->d_inode : NULL;
CRLF
74:
CRLF
75: if (file) {
CRLF
76: switch (flags & MAP_TYPE) {
CRLF
77: case MAP_SHARED:
CRLF
78: if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
CRLF
79: return -EACCES;
CRLF
80:
CRLF
81: /*
CRLF
82: * Make sure we don't allow writing to an append-only
CRLF
83: * file..
CRLF
84: */
CRLF
85: if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
CRLF
86: return -EACCES;
CRLF
87:
CRLF
88: /*
CRLF
89: * Make sure there are no mandatory locks on the file.
CRLF
90: */
CRLF
91: if (locks_verify_locked(inode))
CRLF
92: return -EAGAIN;
CRLF
93:
CRLF
94: vm_flags |= VM_SHARED | VM_MAYSHARE;
CRLF
95: if (!(file->f_mode & FMODE_WRITE))
CRLF
96: vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
CRLF
97:
CRLF
98: /* fall through */
CRLF
99: case MAP_PRIVATE:
CRLF
100: if (!(file->f_mode & FMODE_READ))
CRLF
101: return -EACCES;
CRLF
102: if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
CRLF
103: if (vm_flags & VM_EXEC)
CRLF
104: return -EPERM;
CRLF
105: vm_flags &= ~VM_MAYEXEC;
CRLF
106: }
CRLF
107:
CRLF
108: if (!file->f_op || !file->f_op->mmap)
CRLF
109: return -ENODEV;
CRLF
110: break;
CRLF
111:
CRLF
112: default:
CRLF
113: return -EINVAL;
CRLF
114: }
CRLF
115: } else {
CRLF
116: switch (flags & MAP_TYPE) {
CRLF
117: case MAP_SHARED:
CRLF
118: /*
CRLF
119: * Ignore pgoff.
CRLF
120: */
CRLF
121: pgoff = 0;
CRLF
122: vm_flags |= VM_SHARED | VM_MAYSHARE;
CRLF
123: break;
CRLF
124: case MAP_PRIVATE:
CRLF
125: /*
CRLF
126: * Set pgoff according to addr for anon_vma.
CRLF
127: */
CRLF
128: pgoff = addr >> PAGE_SHIFT;
CRLF
129: break;
CRLF
130: default:
CRLF
131: return -EINVAL;
CRLF
132: }
CRLF
133: }
CRLF
134:
CRLF
135: error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
CRLF
136: if (error)
CRLF
137: return error;
CRLF
138:
CRLF
139: return mmap_region(file, addr, len, flags, vm_flags, pgoff);
CRLF
140: }
CRLF
141: EXPORT_SYMBOL(do_mmap_pgoff);
CRLF
/* Obtain the address to map to. we verify (or select) it and ensure
* that it represents a valid section of the address space.
*/
addr = get_unmapped_area(file, addr, len, pgoff, flags);
if (addr & ~PAGE_MASK)
return addr;
get_unmapped_area函数用于查找到一个可以安放请求的这么长的一个vma的线性地址范围,返回这个范围的起始地址。如果这个起始地址不是从页对齐处开始的,代表找到的这个地址是不符合要求的,因此也不再往下走了,直接返回。
但问题是,如果直接返回了,调用者会不会不做检查,直接认为内核已经完成了mmap操作,从而尝试去读写这块还没有与文件建立起关联的内存区域,进而发生不可预知的事情?(注意:失败时返回的是负的错误码,它必然不是页对齐的值,调用方可以据此判断映射是否成功——待确认具体由哪一层做该检查。)
【根据http://www.cnblogs.com/long123king/p/3502170.html中的思想,当进程真正需要访问页时,会触发Page Fault,那么这一步关键是设置好相应的Page Fault handler以及相应struct的指针成员】