mm/migrate.c

   1 /*
   2  * Memory Migration functionality - linux/mm/migration.c
   3  *
   4  * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
   5  *
   6  * Page migration was first developed in the context of the memory hotplug
   7  * project. The main authors of the migration code are:
   8  *
   9  * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
  10  * Hirokazu Takahashi <taka@valinux.co.jp>
  11  * Dave Hansen <haveblue@us.ibm.com>
  12  * Christoph Lameter <clameter@sgi.com>
  13  */
  14
  15 #include <linux/migrate.h>
  16 #include <linux/module.h>
  17 #include <linux/swap.h>
  18 #include <linux/pagemap.h>
  19 #include <linux/buffer_head.h>  /* for try_to_release_page(),
  20                                         buffer_heads_over_limit */
  21 #include <linux/mm_inline.h>
  22 #include <linux/pagevec.h>
  23 #include <linux/rmap.h>
  24 #include <linux/topology.h>
  25 #include <linux/cpu.h>
  26 #include <linux/cpuset.h>
  27 #include <linux/swapops.h>
  28
  29 #include "internal.h"
  30
  31 #include "internal.h"
  32
/*
 * The maximum number of pages to take off the LRU for migration.
 * Used by migrate_pages_to() to bound how many target pages are
 * allocated before each call to migrate_pages().
 */
#define MIGRATE_CHUNK_SIZE 256

/*
 * Return the struct page that is the last entry (head->prev) of a list
 * linked through page->lru.
 */
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
  37
  38 /*
  39  * Isolate one page from the LRU lists. If successful put it onto
  40  * the indicated list with elevated page count.
  41  *
  42  * Result:
  43  *  -EBUSY: page not on LRU list
  44  *  0: page removed from LRU list and added to the specified list.
  45  */
  46 int isolate_lru_page(struct page *page, struct list_head *pagelist)
  47 {
  48         int ret = -EBUSY;
  49
  50         if (PageLRU(page)) {
  51                 struct zone *zone = page_zone(page);
  52
  53                 spin_lock_irq(&zone->lru_lock);
  54                 if (PageLRU(page)) {
  55                         ret = 0;
  56                         get_page(page);
  57                         ClearPageLRU(page);
  58                         if (PageActive(page))
  59                                 del_page_from_active_list(zone, page);
  60                         else
  61                                 del_page_from_inactive_list(zone, page);
  62                         list_add_tail(&page->lru, pagelist);
  63                 }
  64                 spin_unlock_irq(&zone->lru_lock);
  65         }
  66         return ret;
  67 }
  68
  69 /*
  70  * migrate_prep() needs to be called after we have compiled the list of pages
  71  * to be migrated using isolate_lru_page() but before we begin a series of calls
  72  * to migrate_pages().
  73  */
  74 int migrate_prep(void)
  75 {
  76         /* Must have swap device for migration */
  77         if (nr_swap_pages <= 0)
  78                 return -ENODEV;
  79
  80         /*
  81          * Clear the LRU lists so pages can be isolated.
  82          * Note that pages may be moved off the LRU after we have
  83          * drained them. Those pages will fail to migrate like other
  84          * pages that may be busy.
  85          */
  86         lru_add_drain_all();
  87
  88         return 0;
  89 }
  90
  91 static inline void move_to_lru(struct page *page)
  92 {
  93         list_del(&page->lru);
  94         if (PageActive(page)) {
  95                 /*
  96                  * lru_cache_add_active checks that
  97                  * the PG_active bit is off.
  98                  */
  99                 ClearPageActive(page);
 100                 lru_cache_add_active(page);
 101         } else {
 102                 lru_cache_add(page);
 103         }
 104         put_page(page);
 105 }
 106
 107 /*
 108  * Add isolated pages on the list back to the LRU.
 109  *
 110  * returns the number of pages put back.
 111  */
 112 int putback_lru_pages(struct list_head *l)
 113 {
 114         struct page *page;
 115         struct page *page2;
 116         int count = 0;
 117
 118         list_for_each_entry_safe(page, page2, l, lru) {
 119                 move_to_lru(page);
 120                 count++;
 121         }
 122         return count;
 123 }
 124
 125 /*
 126  * Non migratable page
 127  */
 128 int fail_migrate_page(struct page *newpage, struct page *page)
 129 {
 130         return -EIO;
 131 }
 132 EXPORT_SYMBOL(fail_migrate_page);
 133
 134 /*
 135  * swapout a single page
 136  * page is locked upon entry, unlocked on exit
 137  */
 138 static int swap_page(struct page *page)
 139 {
 140         struct address_space *mapping = page_mapping(page);
 141
 142         if (page_mapped(page) && mapping)
 143                 if (try_to_unmap(page, 1) != SWAP_SUCCESS)
 144                         goto unlock_retry;
 145
 146         if (PageDirty(page)) {
 147                 /* Page is dirty, try to write it out here */
 148                 switch(pageout(page, mapping)) {
 149                 case PAGE_KEEP:
 150                 case PAGE_ACTIVATE:
 151                         goto unlock_retry;
 152
 153                 case PAGE_SUCCESS:
 154                         goto retry;
 155
 156                 case PAGE_CLEAN:
 157                         ; /* try to free the page below */
 158                 }
 159         }
 160
 161         if (PagePrivate(page)) {
 162                 if (!try_to_release_page(page, GFP_KERNEL) ||
 163                     (!mapping && page_count(page) == 1))
 164                         goto unlock_retry;
 165         }
 166
 167         if (remove_mapping(mapping, page)) {
 168                 /* Success */
 169                 unlock_page(page);
 170                 return 0;
 171         }
 172
 173 unlock_retry:
 174         unlock_page(page);
 175
 176 retry:
 177         return -EAGAIN;
 178 }
 179 EXPORT_SYMBOL(swap_page);
 180
 181 /*
 182  * Remove references for a page and establish the new page with the correct
 183  * basic settings to be able to stop accesses to the page.
 184  */
 185 int migrate_page_remove_references(struct page *newpage,
 186                                 struct page *page, int nr_refs)
 187 {
 188         struct address_space *mapping = page_mapping(page);
 189         struct page **radix_pointer;
 190
 191         /*
 192          * Avoid doing any of the following work if the page count
 193          * indicates that the page is in use or truncate has removed
 194          * the page.
 195          */
 196         if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
 197                 return -EAGAIN;
 198
 199         /*
 200          * Establish swap ptes for anonymous pages or destroy pte
 201          * maps for files.
 202          *
 203          * In order to reestablish file backed mappings the fault handlers
 204          * will take the radix tree_lock which may then be used to stop
 205          * processses from accessing this page until the new page is ready.
 206          *
 207          * A process accessing via a swap pte (an anonymous page) will take a
 208          * page_lock on the old page which will block the process until the
 209          * migration attempt is complete. At that time the PageSwapCache bit
 210          * will be examined. If the page was migrated then the PageSwapCache
 211          * bit will be clear and the operation to retrieve the page will be
 212          * retried which will find the new page in the radix tree. Then a new
 213          * direct mapping may be generated based on the radix tree contents.
 214          *
 215          * If the page was not migrated then the PageSwapCache bit
 216          * is still set and the operation may continue.
 217          */
 218         if (try_to_unmap(page, 1) == SWAP_FAIL)
 219                 /* A vma has VM_LOCKED set -> permanent failure */
 220                 return -EPERM;
 221
 222         /*
 223          * Give up if we were unable to remove all mappings.
 224          */
 225         if (page_mapcount(page))
 226                 return -EAGAIN;
 227
 228         write_lock_irq(&mapping->tree_lock);
 229
 230         radix_pointer = (struct page **)radix_tree_lookup_slot(
 231                                                 &mapping->page_tree,
 232                                                 page_index(page));
 233
 234         if (!page_mapping(page) || page_count(page) != nr_refs ||
 235                         *radix_pointer != page) {
 236                 write_unlock_irq(&mapping->tree_lock);
 237                 return 1;
 238         }
 239
 240         /*
 241          * Now we know that no one else is looking at the page.
 242          *
 243          * Certain minimal information about a page must be available
 244          * in order for other subsystems to properly handle the page if they
 245          * find it through the radix tree update before we are finished
 246          * copying the page.
 247          */
 248         get_page(newpage);
 249         newpage->index = page->index;
 250         newpage->mapping = page->mapping;
 251         if (PageSwapCache(page)) {
 252                 SetPageSwapCache(newpage);
 253                 set_page_private(newpage, page_private(page));
 254         }
 255
 256         *radix_pointer = newpage;
 257         __put_page(page);
 258         write_unlock_irq(&mapping->tree_lock);
 259
 260         return 0;
 261 }
 262 EXPORT_SYMBOL(migrate_page_remove_references);
 263
/*
 * Copy the page to its new location.
 *
 * Copies the contents of @page into @newpage, carries over the Error,
 * Referenced, Uptodate, Active, Checked and MappedToDisk flags that are
 * set on @page, transfers the dirty state, and then strips the old page
 * of its SwapCache/Active/Private flags, private data and mapping.
 */
void migrate_page_copy(struct page *newpage, struct page *page)
{
        copy_highpage(newpage, page);

        /* Propagate only the flags that are set on the old page. */
        if (PageError(page))
                SetPageError(newpage);
        if (PageReferenced(page))
                SetPageReferenced(newpage);
        if (PageUptodate(page))
                SetPageUptodate(newpage);
        if (PageActive(page))
                SetPageActive(newpage);
        if (PageChecked(page))
                SetPageChecked(newpage);
        if (PageMappedToDisk(page))
                SetPageMappedToDisk(newpage);

        /* Move the dirty state: clear it on the old page, set it on the new. */
        if (PageDirty(page)) {
                clear_page_dirty_for_io(page);
                set_page_dirty(newpage);
        }

        /* The old page no longer belongs to any mapping. */
        ClearPageSwapCache(page);
        ClearPageActive(page);
        ClearPagePrivate(page);
        set_page_private(page, 0);
        page->mapping = NULL;

        /*
         * If any waiters have accumulated on the new page then
         * wake them up.
         */
        if (PageWriteback(newpage))
                end_page_writeback(newpage);
}
EXPORT_SYMBOL(migrate_page_copy);
 303
 304 /*
 305  * Common logic to directly migrate a single page suitable for
 306  * pages that do not use PagePrivate.
 307  *
 308  * Pages are locked upon entry and exit.
 309  */
 310 int migrate_page(struct page *newpage, struct page *page)
 311 {
 312         int rc;
 313
 314         BUG_ON(PageWriteback(page));    /* Writeback must be complete */
 315
 316         rc = migrate_page_remove_references(newpage, page, 2);
 317
 318         if (rc)
 319                 return rc;
 320
 321         migrate_page_copy(newpage, page);
 322
 323         /*
 324          * Remove auxiliary swap entries and replace
 325          * them with real ptes.
 326          *
 327          * Note that a real pte entry will allow processes that are not
 328          * waiting on the page lock to use the new page via the page tables
 329          * before the new page is unlocked.
 330          */
 331         remove_from_swap(newpage);
 332         return 0;
 333 }
 334 EXPORT_SYMBOL(migrate_page);
 335
/*
 * migrate_pages
 *
 * Two lists are passed to this function. The first list
 * contains the pages isolated from the LRU to be migrated.
 * The second list contains new pages that the pages isolated
 * can be moved to. If the second list is NULL then all
 * pages are swapped out.
 *
 * @from:   isolated pages to migrate; pages that still need a retry (or
 *          were never reached because @to ran empty) stay on this list
 * @to:     target pages, or NULL to swap everything out
 * @moved:  receives the old pages that were handled successfully
 * @failed: receives pages that failed permanently
 *
 * The list is rescanned while retryable pages remain; the redo test is
 * "pass++ < 10", so at most eleven passes are made. A scan stops early
 * when @to is non-NULL and has become empty.
 *
 * PF_SWAPWRITE is set on the current task for the duration of the call
 * if it was not already set, and cleared again on exit in that case.
 *
 * Return: number of permanent failures accumulated over all passes plus
 * the number of pages still marked for retry in the last pass. Pages not
 * reached because @to ran empty are not included in this count.
 */
int migrate_pages(struct list_head *from, struct list_head *to,
                  struct list_head *moved, struct list_head *failed)
{
        int retry;
        int nr_failed = 0;
        int pass = 0;
        struct page *page;
        struct page *page2;
        int swapwrite = current->flags & PF_SWAPWRITE;
        int rc;

        if (!swapwrite)
                current->flags |= PF_SWAPWRITE;

redo:
        retry = 0;

        list_for_each_entry_safe(page, page2, from, lru) {
                struct page *newpage = NULL;
                struct address_space *mapping;

                cond_resched();

                rc = 0;
                if (page_count(page) == 1)
                        /* page was freed from under us. So we are done. */
                        goto next;

                /* Out of target pages: stop this scan. */
                if (to && list_empty(to))
                        break;

                /*
                 * Skip locked pages during the first two passes to give the
                 * functions holding the lock time to release the page. Later we
                 * use lock_page() to have a higher chance of acquiring the
                 * lock.
                 */
                rc = -EAGAIN;
                if (pass > 2)
                        lock_page(page);
                else
                        if (TestSetPageLocked(page))
                                goto next;

                /*
                 * Only wait on writeback if we have already done a pass where
                 * we may have triggered writeouts for lots of pages.
                 */
                if (pass > 0) {
                        wait_on_page_writeback(page);
                } else {
                        if (PageWriteback(page))
                                goto unlock_page;
                }

                /*
                 * Anonymous pages must have swap cache references otherwise
                 * the information contained in the page maps cannot be
                 * preserved.
                 */
                if (PageAnon(page) && !PageSwapCache(page)) {
                        if (!add_to_swap(page, GFP_KERNEL)) {
                                rc = -ENOMEM;
                                goto unlock_page;
                        }
                }

                /* No target list: swap the page out; swap_page() handles the page lock. */
                if (!to) {
                        rc = swap_page(page);
                        goto next;
                }

                /* Take the last page on the target list as destination. */
                newpage = lru_to_page(to);
                lock_page(newpage);

                /*
                 * Pages are properly locked and writeback is complete.
                 * Try to migrate the page.
                 */
                mapping = page_mapping(page);
                if (!mapping)
                        goto unlock_both;

                if (mapping->a_ops->migratepage) {
                        /*
                         * Most pages have a mapping and most filesystems
                         * should provide a migration function. Anonymous
                         * pages are part of swap space which also has its
                         * own migration function. This is the most common
                         * path for page migration.
                         */
                        rc = mapping->a_ops->migratepage(newpage, page);
                        goto unlock_both;
                }

                /*
                 * Default handling if a filesystem does not provide
                 * a migration function. We can only migrate clean
                 * pages so try to write out any dirty pages first.
                 */
                if (PageDirty(page)) {
                        switch (pageout(page, mapping)) {
                        case PAGE_KEEP:
                        case PAGE_ACTIVATE:
                                goto unlock_both;

                        case PAGE_SUCCESS:
                                /* Only the new page is unlocked on this path. */
                                unlock_page(newpage);
                                goto next;

                        case PAGE_CLEAN:
                                ; /* try to migrate the page below */
                        }
                }

                /*
                 * Buffers are managed in a filesystem specific way.
                 * We must have no buffers or drop them.
                 */
                if (!page_has_buffers(page) ||
                    try_to_release_page(page, GFP_KERNEL)) {
                        rc = migrate_page(newpage, page);
                        goto unlock_both;
                }

                /*
                 * On early passes with mapped pages simply
                 * retry. There may be a lock held for some
                 * buffers that may go away. Later
                 * swap them out.
                 */
                if (pass > 4) {
                        /*
                         * Persistently unable to drop buffers..... As a
                         * measure of last resort we fall back to
                         * swap_page().
                         */
                        unlock_page(newpage);
                        /* Forget newpage so it is not put on the LRU below. */
                        newpage = NULL;
                        rc = swap_page(page);
                        goto next;
                }

unlock_both:
                unlock_page(newpage);

unlock_page:
                unlock_page(page);

next:
                if (rc == -EAGAIN) {
                        /* Leave the page on "from" for the next pass. */
                        retry++;
                } else if (rc) {
                        /* Permanent failure */
                        list_move(&page->lru, failed);
                        nr_failed++;
                } else {
                        if (newpage) {
                                /* Successful migration. Return page to LRU */
                                move_to_lru(newpage);
                        }
                        list_move(&page->lru, moved);
                }
        }
        if (retry && pass++ < 10)
                goto redo;

        /* Restore PF_SWAPWRITE only if this function set it. */
        if (!swapwrite)
                current->flags &= ~PF_SWAPWRITE;

        return nr_failed + retry;
}
 523
/*
 * Migration function for pages with buffers. This function can only be used
 * if the underlying filesystem guarantees that no other references to "page"
 * exist.
 *
 * Returns -EAGAIN if the page has no mapping, the result of migrate_page()
 * if the page has no buffers, a non-zero result of
 * migrate_page_remove_references() if that step fails, and 0 once the
 * buffers and the page contents have been moved to @newpage.
 */
int buffer_migrate_page(struct page *newpage, struct page *page)
{
        struct address_space *mapping = page->mapping;
        struct buffer_head *bh, *head;
        int rc;

        if (!mapping)
                return -EAGAIN;

        /* Without buffers the generic path is sufficient. */
        if (!page_has_buffers(page))
                return migrate_page(newpage, page);

        head = page_buffers(page);

        /*
         * Three references are expected here rather than the two used by
         * migrate_page() (presumably the extra one is held on behalf of
         * the buffers - confirm).
         */
        rc = migrate_page_remove_references(newpage, page, 3);

        if (rc)
                return rc;

        /* Pin and lock every buffer head on the page's circular list. */
        bh = head;
        do {
                get_bh(bh);
                lock_buffer(bh);
                bh = bh->b_this_page;

        } while (bh != head);

        /* Hand the private data and one page reference over to newpage. */
        ClearPagePrivate(page);
        set_page_private(newpage, page_private(page));
        set_page_private(page, 0);
        put_page(page);
        get_page(newpage);

        /* Repoint each buffer head at the new page, keeping its offset. */
        bh = head;
        do {
                set_bh_page(bh, newpage, bh_offset(bh));
                bh = bh->b_this_page;

        } while (bh != head);

        SetPagePrivate(newpage);

        migrate_page_copy(newpage, page);

        /* Release the buffer locks and references taken above. */
        bh = head;
        do {
                unlock_buffer(bh);
                put_bh(bh);
                bh = bh->b_this_page;

        } while (bh != head);

        return 0;
}
EXPORT_SYMBOL(buffer_migrate_page);
 584
 585 /*
 586  * Migrate the list 'pagelist' of pages to a certain destination.
 587  *
 588  * Specify destination with either non-NULL vma or dest_node >= 0
 589  * Return the number of pages not migrated or error code
 590  */
 591 int migrate_pages_to(struct list_head *pagelist,
 592                         struct vm_area_struct *vma, int dest)
 593 {
 594         LIST_HEAD(newlist);
 595         LIST_HEAD(moved);
 596         LIST_HEAD(failed);
 597         int err = 0;
 598         unsigned long offset = 0;
 599         int nr_pages;
 600         struct page *page;
 601         struct list_head *p;
 602
 603 redo:
 604         nr_pages = 0;
 605         list_for_each(p, pagelist) {
 606                 if (vma) {
 607                         /*
 608                          * The address passed to alloc_page_vma is used to
 609                          * generate the proper interleave behavior. We fake
 610                          * the address here by an increasing offset in order
 611                          * to get the proper distribution of pages.
 612                          *
 613                          * No decision has been made as to which page
 614                          * a certain old page is moved to so we cannot
 615                          * specify the correct address.
 616                          */
 617                         page = alloc_page_vma(GFP_HIGHUSER, vma,
 618                                         offset + vma->vm_start);
 619                         offset += PAGE_SIZE;
 620                 }
 621                 else
 622                         page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
 623
 624                 if (!page) {
 625                         err = -ENOMEM;
 626                         goto out;
 627                 }
 628                 list_add_tail(&page->lru, &newlist);
 629                 nr_pages++;
 630                 if (nr_pages > MIGRATE_CHUNK_SIZE)
 631                         break;
 632         }
 633         err = migrate_pages(pagelist, &newlist, &moved, &failed);
 634
 635         putback_lru_pages(&moved);      /* Call release pages instead ?? */
 636
 637         if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
 638                 goto redo;
 639 out:
 640         /* Return leftover allocated pages */
 641         while (!list_empty(&newlist)) {
 642                 page = list_entry(newlist.next, struct page, lru);
 643                 list_del(&page->lru);
 644                 __free_page(page);
 645         }
 646         list_splice(&failed, pagelist);
 647         if (err < 0)
 648                 return err;
 649
 650         /* Calculate number of leftover pages */
 651         nr_pages = 0;
 652         list_for_each(p, pagelist)
 653                 nr_pages++;
 654         return nr_pages;
 655 }