2005-04-17 00:20:36 +02:00
|
|
|
#ifndef _LINUX_HUGETLB_H
|
|
|
|
#define _LINUX_HUGETLB_H
|
|
|
|
|
|
|
|
#ifdef CONFIG_HUGETLB_PAGE
|
|
|
|
|
|
|
|
#include <linux/mempolicy.h>
|
2005-06-22 02:14:44 +02:00
|
|
|
#include <asm/tlbflush.h>
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
struct ctl_table;
|
|
|
|
|
|
|
|
static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
return vma->vm_flags & VM_HUGETLB;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Generic hugetlb entry points.  Declarations only; the definitions are
 * not in this header.  Where a parameter is unnamed here, the role given
 * below is taken from the matching !CONFIG_HUGETLB_PAGE stub macro.
 */

/* NOTE(review): presumably the sysctl handler for the hugepage pool - confirm. */
int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);

/*
 * NOTE(review): the two mm_struct arguments are unnamed; check the
 * definition for which one is the source and which the destination.
 */
int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);

/* NOTE(review): argument roles are not visible from this prototype - see the definition. */
int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int);

/* Arguments: vma, start, end. */
void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);

/* Arguments: mapping, vma. */
int hugetlb_prefault(struct address_space *, struct vm_area_struct *);

/* Argument: output buffer. */
int hugetlb_report_meminfo(char *);

/* Arguments: node number, output buffer. */
int hugetlb_report_node_meminfo(int, char *);

unsigned long hugetlb_total_pages(void);

/* Fault handler for @address within @vma of @mm; @write_access is an int flag. */
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		  unsigned long address, int write_access);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
/*
 * Hugetlb globals defined elsewhere.
 * NOTE(review): max_huge_pages presumably holds the configured pool size,
 * hugetlb_zero/hugetlb_infinity look like sysctl range bounds, and
 * sysctl_hugetlb_shm_group a group id - confirm against the definitions.
 */
extern unsigned long max_huge_pages;
extern const unsigned long hugetlb_zero, hugetlb_infinity;
extern int sysctl_hugetlb_shm_group;
|
|
|
|
|
2005-06-22 02:14:44 +02:00
|
|
|
/* arch callbacks */
|
|
|
|
|
|
|
|
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr);
|
|
|
|
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr);
|
|
|
|
struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
|
|
|
|
int write);
|
|
|
|
struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
|
|
|
|
pmd_t *pmd, int write);
|
|
|
|
int pmd_huge(pmd_t pmd);
|
2006-03-22 09:08:50 +01:00
|
|
|
void hugetlb_change_protection(struct vm_area_struct *vma,
|
|
|
|
unsigned long address, unsigned long end, pgprot_t newprot);
|
2005-06-22 02:14:44 +02:00
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
 * is_hugepage_only_range() is meant to return true if the given range
 * has any intersection with a hugepage-only address region; the range
 * itself need not be hugepage aligned.  Architectures without such
 * regions (no ARCH_HAS_HUGEPAGE_ONLY_RANGE) get a constant 0.
 */
#ifndef ARCH_HAS_HUGEPAGE_ONLY_RANGE
#define is_hugepage_only_range(mm, addr, len)	0
#endif
|
|
|
|
|
|
|
|
/*
 * Unless the architecture defines ARCH_HAS_HUGETLB_FREE_PGD_RANGE and
 * supplies its own implementation, hugetlb_free_pgd_range is simply
 * another name for free_pgd_range.
 */
#ifndef ARCH_HAS_HUGETLB_FREE_PGD_RANGE
#define hugetlb_free_pgd_range	free_pgd_range
#else
void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
			    unsigned long end, unsigned long floor,
			    unsigned long ceiling);
#endif
|
|
|
|
|
|
|
|
#ifndef ARCH_HAS_PREPARE_HUGEPAGE_RANGE
|
2006-03-22 09:09:01 +01:00
|
|
|
/*
|
|
|
|
* If the arch doesn't supply something else, assume that hugepage
|
|
|
|
* size aligned regions are ok without further preparation.
|
|
|
|
*/
|
|
|
|
static inline int prepare_hugepage_range(unsigned long addr, unsigned long len)
|
|
|
|
{
|
|
|
|
if (len & ~HPAGE_MASK)
|
|
|
|
return -EINVAL;
|
|
|
|
if (addr & ~HPAGE_MASK)
|
|
|
|
return -EINVAL;
|
|
|
|
return 0;
|
|
|
|
}
|
2005-04-17 00:20:36 +02:00
|
|
|
#else
|
|
|
|
int prepare_hugepage_range(unsigned long addr, unsigned long len);
|
|
|
|
#endif
|
|
|
|
|
2005-06-22 02:14:44 +02:00
|
|
|
/*
 * Huge-PTE accessors.  Unless the architecture defines
 * ARCH_HAS_SETCLEAR_HUGE_PTE and provides its own functions, these
 * forward unchanged to set_pte_at() and ptep_get_and_clear().
 */
#ifndef ARCH_HAS_SETCLEAR_HUGE_PTE
#define set_huge_pte_at(mm, addr, ptep, pte)	set_pte_at(mm, addr, ptep, pte)
#define huge_ptep_get_and_clear(mm, addr, ptep)	ptep_get_and_clear(mm, addr, ptep)
#else
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t pte);
pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep);
#endif
|
|
|
|
|
|
|
|
/*
 * Optional per-architecture prefault hook.  It is a no-op statement
 * unless the architecture defines ARCH_HAS_HUGETLB_PREFAULT_HOOK and
 * provides the function itself.
 */
#ifndef ARCH_HAS_HUGETLB_PREFAULT_HOOK
#define hugetlb_prefault_arch_hook(mm)		do { } while (0)
#else
void hugetlb_prefault_arch_hook(struct mm_struct *mm);
#endif
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
#else /* !CONFIG_HUGETLB_PAGE */
|
|
|
|
|
|
|
|
/*
 * !CONFIG_HUGETLB_PAGE stub: no VMA is ever a hugetlb VMA, so this
 * always returns 0 and never looks at @vma.
 */
static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
{
	return 0;
}
|
|
|
|
/* !CONFIG_HUGETLB_PAGE stub: always reports zero pages. */
static inline unsigned long hugetlb_total_pages(void)
{
	return 0;
}
|
|
|
|
|
|
|
|
/*
 * !CONFIG_HUGETLB_PAGE stand-ins for the hugetlb entry points.
 *
 * Each macro either expands to BUG() (optionally yielding 0 through a
 * statement expression) or to a constant result: 0, NULL, an
 * ERR_PTR(-EINVAL) or (-EINVAL).  Arguments are never evaluated.
 */
#define follow_hugetlb_page(m,v,p,vs,a,b,i)	({ BUG(); 0; })
#define follow_huge_addr(mm, addr, write)	ERR_PTR(-EINVAL)
#define copy_hugetlb_page_range(src, dst, vma)	({ BUG(); 0; })
#define hugetlb_prefault(mapping, vma)		({ BUG(); 0; })
#define unmap_hugepage_range(vma, start, end)	BUG()
#define hugetlb_report_meminfo(buf)		0
#define hugetlb_report_node_meminfo(n, buf)	0
#define follow_huge_pmd(mm, addr, pmd, write)	NULL
#define prepare_hugepage_range(addr, len)	(-EINVAL)
#define pmd_huge(x)	0
#define is_hugepage_only_range(mm, addr, len)	0
|
[PATCH] hugepage: Fix hugepage logic in free_pgtables()
free_pgtables() has special logic to call hugetlb_free_pgd_range() instead
of the normal free_pgd_range() on hugepage VMAs. However, the test it uses
to do so is incorrect: it calls is_hugepage_only_range on a hugepage sized
range at the start of the vma. is_hugepage_only_range() will return true
if the given range has any intersection with a hugepage address region, and
in this case the given region need not be hugepage aligned. So, for
example, this test can return true if called on, say, a 4k VMA immediately
preceding a (nicely aligned) hugepage VMA.
At present we get away with this because the powerpc version of
hugetlb_free_pgd_range() is just a call to free_pgd_range(). On ia64 (the
only other arch with a non-trivial is_hugepage_only_range()) we get away
with it for a different reason; the hugepage area is not contiguous with
the rest of the user address space, and VMAs are not permitted in between,
so the test can't return a false positive there.
Nonetheless this should be fixed. We do that in the patch below by
replacing the is_hugepage_only_range() test with an explicit test of the
VMA using is_vm_hugetlb_page().
This in turn changes behaviour for platforms where is_hugepage_only_range()
returns false always (everything except powerpc and ia64). We address this
by ensuring that hugetlb_free_pgd_range() is defined to be identical to
free_pgd_range() (instead of a no-op) on everything except ia64. Even so,
it will prevent some otherwise possible coalescing of calls down to
free_pgd_range(). Since this only happens for hugepage VMAs, removing this
small optimization seems unlikely to cause any trouble.
This patch causes no regressions on the libhugetlbfs testsuite - ppc64
POWER5 (8-way), ppc64 G5 (2-way) and i386 Pentium M (UP).
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-22 09:08:57 +01:00
|
|
|
/*
 * Further !CONFIG_HUGETLB_PAGE stand-ins.  The first two expand to
 * BUG() and yield 0; hugetlb_change_protection() is a no-op, written as
 * do { } while (0) so it always forms exactly one statement (same
 * convention as hugetlb_prefault_arch_hook in this header).
 */
#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({ BUG(); 0; })
#define hugetlb_fault(mm, vma, addr, write)	({ BUG(); 0; })
#define hugetlb_change_protection(vma, address, end, newprot)	do { } while (0)
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
 * With hugetlb support compiled out, fall back to the normal page
 * geometry so that code mentioning HPAGE_MASK/HPAGE_SIZE still builds.
 */
#ifndef HPAGE_MASK
#define HPAGE_MASK	PAGE_MASK		/* Keep the compiler happy */
#define HPAGE_SIZE	PAGE_SIZE
#endif
|
|
|
|
|
|
|
|
#endif /* !CONFIG_HUGETLB_PAGE */
|
|
|
|
|
|
|
|
#ifdef CONFIG_HUGETLBFS
|
|
|
|
struct hugetlbfs_config {
|
|
|
|
uid_t uid;
|
|
|
|
gid_t gid;
|
|
|
|
umode_t mode;
|
|
|
|
long nr_blocks;
|
|
|
|
long nr_inodes;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct hugetlbfs_sb_info {
|
|
|
|
long max_blocks; /* blocks allowed */
|
|
|
|
long free_blocks; /* blocks free */
|
|
|
|
long max_inodes; /* inodes allowed */
|
|
|
|
long free_inodes; /* inodes free */
|
|
|
|
spinlock_t stat_lock;
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
struct hugetlbfs_inode_info {
|
|
|
|
struct shared_policy policy;
|
[PATCH] hugepage: Strict page reservation for hugepage inodes
These days, hugepages are demand-allocated at first fault time. There's a
somewhat dubious (and racy) heuristic when making a new mmap() to check if
there are enough available hugepages to fully satisfy that mapping.
A particularly obvious case where the heuristic breaks down is where a
process maps its hugepages not as a single chunk, but as a bunch of
individually mmap()ed (or shmat()ed) blocks without touching and
instantiating the pages in between allocations. In this case the size of
each block is compared against the total number of available hugepages.
It's thus easy for the process to become overcommitted, because each block
mapping will succeed, although the total number of hugepages required by
all blocks exceeds the number available. In particular, this defeats such
a program which will detect a mapping failure and adjust its hugepage usage
downward accordingly.
The patch below addresses this problem, by strictly reserving a number of
physical hugepages for hugepage inodes which have been mapped, but not
instatiated. MAP_SHARED mappings are thus "safe" - they will fail on
mmap(), not later with an OOM SIGKILL. MAP_PRIVATE mappings can still
trigger an OOM. (Actually SHARED mappings can technically still OOM, but
only if the sysadmin explicitly reduces the hugepage pool between mapping
and instantiation)
This patch appears to address the problem at hand - it allows DB2 to start
correctly, for instance, which previously suffered the failure described
above.
This patch causes no regressions on the libhugetblfs testsuite, and makes a
test (designed to catch this problem) pass which previously failed (ppc64,
POWER5).
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-22 09:08:55 +01:00
|
|
|
/* Protected by the (global) hugetlb_lock */
|
|
|
|
unsigned long prereserved_hpages;
|
2005-04-17 00:20:36 +02:00
|
|
|
struct inode vfs_inode;
|
|
|
|
};
|
|
|
|
|
|
|
|
static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
|
|
|
|
{
|
|
|
|
return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
|
|
|
|
{
|
|
|
|
return sb->s_fs_info;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * hugetlbfs objects defined elsewhere.  hugetlbfs_file_operations is
 * the f_op table that is_file_hugepages()/set_file_hugepages() compare
 * against and install.
 */
extern struct file_operations hugetlbfs_file_operations;
extern struct vm_operations_struct hugetlb_vm_ops;

/* Argument: size.  Returns a struct file pointer. */
struct file *hugetlb_zero_setup(size_t);
|
[PATCH] hugepage: Strict page reservation for hugepage inodes
These days, hugepages are demand-allocated at first fault time. There's a
somewhat dubious (and racy) heuristic when making a new mmap() to check if
there are enough available hugepages to fully satisfy that mapping.
A particularly obvious case where the heuristic breaks down is where a
process maps its hugepages not as a single chunk, but as a bunch of
individually mmap()ed (or shmat()ed) blocks without touching and
instantiating the pages in between allocations. In this case the size of
each block is compared against the total number of available hugepages.
It's thus easy for the process to become overcommitted, because each block
mapping will succeed, although the total number of hugepages required by
all blocks exceeds the number available. In particular, this defeats such
a program which will detect a mapping failure and adjust its hugepage usage
downward accordingly.
The patch below addresses this problem, by strictly reserving a number of
physical hugepages for hugepage inodes which have been mapped, but not
instantiated. MAP_SHARED mappings are thus "safe" - they will fail on
mmap(), not later with an OOM SIGKILL. MAP_PRIVATE mappings can still
trigger an OOM. (Actually SHARED mappings can technically still OOM, but
only if the sysadmin explicitly reduces the hugepage pool between mapping
and instantiation)
This patch appears to address the problem at hand - it allows DB2 to start
correctly, for instance, which previously suffered the failure described
above.
This patch causes no regressions on the libhugetlbfs testsuite, and makes a
test (designed to catch this problem) pass which previously failed (ppc64,
POWER5).
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-22 09:08:55 +01:00
|
|
|
/*
 * Hugepage reservation and quota helpers; declarations only.
 *
 * NOTE(review): going by the parameter names, the reservation calls
 * grow an inode's reservation to at least @atleast_hpages and shrink it
 * to at most @atmost_hpages respectively - confirm against the
 * definitions.
 */
int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
			       unsigned long atleast_hpages);
void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
				  unsigned long atmost_hpages);

/* NOTE(review): presumably charge / release one unit of quota for @mapping - confirm. */
int hugetlb_get_quota(struct address_space *mapping);
void hugetlb_put_quota(struct address_space *mapping);
|
|
|
|
|
|
|
|
static inline int is_file_hugepages(struct file *file)
|
|
|
|
{
|
|
|
|
return file->f_op == &hugetlbfs_file_operations;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void set_file_hugepages(struct file *file)
|
|
|
|
{
|
|
|
|
file->f_op = &hugetlbfs_file_operations;
|
|
|
|
}
|
|
|
|
#else /* !CONFIG_HUGETLBFS */
|
|
|
|
|
|
|
|
/*
 * !CONFIG_HUGETLBFS stand-ins: no file is ever a hugepage file, marking
 * one expands to BUG(), and hugetlb_zero_setup() yields
 * ERR_PTR(-ENOSYS).  Arguments are never evaluated.
 */
#define is_file_hugepages(file)		0
#define set_file_hugepages(file)	BUG()
#define hugetlb_zero_setup(size)	ERR_PTR(-ENOSYS)
|
|
|
|
|
|
|
|
#endif /* !CONFIG_HUGETLBFS */
|
|
|
|
|
|
|
|
#endif /* _LINUX_HUGETLB_H */
|