#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
+#include <linux/backing-dev.h>
#include <asm/tlbflush.h>
anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
if (anon_vma) {
atomic_set(&anon_vma->refcount, 1);
+ anon_vma->degree = 1; /* Reference for first vma */
+ anon_vma->parent = anon_vma;
/*
* Initialise the anon_vma root to point to itself. If called
* from fork, the root will be reset to the parents anon_vma.
* LOCK should suffice since the actual taking of the lock must
* happen _before_ what follows.
*/
+ might_sleep();
if (mutex_is_locked(&anon_vma->root->mutex)) {
anon_vma_lock(anon_vma);
anon_vma_unlock(anon_vma);
avc->vma = vma;
list_add(&avc->same_vma, &vma->anon_vma_chain);
list_add_tail(&avc->same_anon_vma, &anon_vma->head);
+ /* vma reference or self-parent link for new root */
+ anon_vma->degree++;
allocated = NULL;
avc = NULL;
}
/*
* Attach the anon_vmas from src to dst.
* Returns 0 on success, -ENOMEM on failure.
+ *
+ * If dst->anon_vma is NULL this function tries to find and reuse existing
+ * anon_vma which has no vmas and only one child anon_vma. This prevents
+ * degradation of anon_vma hierarchy to endless linear chain in case of
+ * constantly forking task. On the other hand, an anon_vma with more than one
+ * child isn't reused even if there was no alive vma, thus rmap walker has a
+ * good chance of avoiding scanning the whole hierarchy when it searches where
+ * page is mapped.
*/
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
anon_vma = pavc->anon_vma;
root = lock_anon_vma_root(root, anon_vma);
anon_vma_chain_link(dst, avc, anon_vma);
+
+ /*
+ * Reuse existing anon_vma if its degree lower than two,
+ * that means it has no vma and only one anon_vma child.
+ *
+ * Do not chose parent anon_vma, otherwise first child
+ * will always reuse it. Root anon_vma is never reused:
+ * it has self-parent reference and at least one child.
+ */
+ if (!dst->anon_vma && anon_vma != src->anon_vma &&
+ anon_vma->degree < 2)
+ dst->anon_vma = anon_vma;
}
+ if (dst->anon_vma)
+ dst->anon_vma->degree++;
unlock_anon_vma_root(root);
return 0;
enomem_failure:
+ /*
+ * dst->anon_vma is dropped here otherwise its degree can be incorrectly
+ * decremented in unlink_anon_vmas().
+ * We can safely do this because callers of anon_vma_clone() don't care
+ * about dst->anon_vma if anon_vma_clone() failed.
+ */
+ dst->anon_vma = NULL;
unlink_anon_vmas(dst);
return -ENOMEM;
}
if (!pvma->anon_vma)
return 0;
+ /* Drop inherited anon_vma, we'll reuse existing or allocate new. */
+ vma->anon_vma = NULL;
+
/*
* First, attach the new VMA to the parent VMA's anon_vmas,
* so rmap can find non-COWed pages in child processes.
if (anon_vma_clone(vma, pvma))
return -ENOMEM;
+ /* An existing anon_vma has been reused, all done then. */
+ if (vma->anon_vma)
+ return 0;
+
/* Then add our own anon_vma. */
anon_vma = anon_vma_alloc();
if (!anon_vma)
* lock any of the anon_vmas in this anon_vma tree.
*/
anon_vma->root = pvma->anon_vma->root;
+ anon_vma->parent = pvma->anon_vma;
/*
* With refcounts, an anon_vma can stay around longer than the
* process it belongs to. The root anon_vma needs to be pinned until
vma->anon_vma = anon_vma;
anon_vma_lock(anon_vma);
anon_vma_chain_link(vma, avc, anon_vma);
+ anon_vma->parent->degree++;
anon_vma_unlock(anon_vma);
return 0;
* Leave empty anon_vmas on the list - we'll need
* to free them outside the lock.
*/
- if (list_empty(&anon_vma->head))
+ if (list_empty(&anon_vma->head)) {
+ anon_vma->parent->degree--;
continue;
+ }
list_del(&avc->same_vma);
anon_vma_chain_free(avc);
}
+ if (vma->anon_vma)
+ vma->anon_vma->degree--;
unlock_anon_vma_root(root);
/*
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma = avc->anon_vma;
+ BUG_ON(anon_vma->degree);
put_anon_vma(anon_vma);
list_del(&avc->same_vma);
* above cannot corrupt).
*/
if (!page_mapped(page)) {
+ rcu_read_unlock();
put_anon_vma(anon_vma);
- anon_vma = NULL;
+ return NULL;
}
out:
rcu_read_unlock();
}
if (!page_mapped(page)) {
+ rcu_read_unlock();
put_anon_vma(anon_vma);
- anon_vma = NULL;
- goto out;
+ return NULL;
}
/* we pinned the anon_vma, its safe to sleep */
spinlock_t *ptl;
if (unlikely(PageHuge(page))) {
+ /* when pud is not present, pte will be NULL */
pte = huge_pte_offset(mm, address);
+ if (!pte)
+ return NULL;
+
ptl = &mm->page_table_lock;
goto check;
}
if (page_mapped(page)) {
struct address_space *mapping = page_mapping(page);
- if (mapping) {
+ if (mapping)
ret = page_mkclean_file(mapping, page);
- if (page_test_and_clear_dirty(page_to_pfn(page), 1))
- ret = 1;
- }
}
return ret;
*/
void page_remove_rmap(struct page *page)
{
+ struct address_space *mapping = page_mapping(page);
+
/* page still mapped by someone else? */
if (!atomic_add_negative(-1, &page->_mapcount))
return;
* this if the page is anon, so about to be freed; but perhaps
* not if it's in swapcache - there might be another pte slot
* containing the swap entry, but page not yet written to swap.
+ *
+ * And we can skip it on file pages, so long as the filesystem
+ * participates in dirty tracking; but need to catch shm and tmpfs
+ * and ramfs pages which have been modified since creation by read
+ * fault.
+ *
+ * Note that mapping must be decided above, before decrementing
+ * mapcount (which luckily provides a barrier): once page is unmapped,
+ * it could be truncated and page->mapping reset to NULL at any moment.
+ * Note also that we are relying on page_mapping(page) to set mapping
+ * to &swapper_space when PageSwapCache(page).
*/
- if ((!PageAnon(page) || PageSwapCache(page)) &&
+ if (mapping && !mapping_cap_account_dirty(mapping) &&
page_test_and_clear_dirty(page_to_pfn(page), 1))
set_page_dirty(page);
/*
BUG_ON(!page || PageAnon(page));
if (locked_vma) {
- mlock_vma_page(page); /* no-op if already mlocked */
- if (page == check_page)
+ if (page == check_page) {
+ /* we know we have check_page locked */
+ mlock_vma_page(page);
ret = SWAP_MLOCK;
+ } else if (trylock_page(page)) {
+ /*
+ * If we can lock the page, perform mlock.
+ * Otherwise leave the page alone, it will be
+ * eventually encountered again later.
+ */
+ mlock_vma_page(page);
+ unlock_page(page);
+ }
continue; /* don't unmap */
}
{
struct anon_vma *root = anon_vma->root;
+ anon_vma_free(anon_vma);
if (root != anon_vma && atomic_dec_and_test(&root->refcount))
anon_vma_free(root);
-
- anon_vma_free(anon_vma);
}
#ifdef CONFIG_MIGRATION