TOMOYO Linux Cross Reference
Linux/mm/swap_state.c

  1 // SPDX-License-Identifier: GPL-2.0
  2 /*
  3  *  linux/mm/swap_state.c
  4  *
  5  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
  6  *  Swap reorganised 29.12.95, Stephen Tweedie
  7  *
  8  *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
  9  */
 10 #include <linux/mm.h>
 11 #include <linux/gfp.h>
 12 #include <linux/kernel_stat.h>
 13 #include <linux/mempolicy.h>
 14 #include <linux/swap.h>
 15 #include <linux/swapops.h>
 16 #include <linux/init.h>
 17 #include <linux/pagemap.h>
 18 #include <linux/pagevec.h>
 19 #include <linux/backing-dev.h>
 20 #include <linux/blkdev.h>
 21 #include <linux/migrate.h>
 22 #include <linux/vmalloc.h>
 23 #include <linux/swap_slots.h>
 24 #include <linux/huge_mm.h>
 25 #include <linux/shmem_fs.h>
 26 #include "internal.h"
 27 #include "swap.h"
 28 
 29 /*
 30  * swapper_space is a fiction, retained to simplify the path through
 31  * vmscan's shrink_folio_list.
 32  */
 33 static const struct address_space_operations swap_aops = {
 34         .writepage      = swap_writepage,
 35         .dirty_folio    = noop_dirty_folio,
 36 #ifdef CONFIG_MIGRATION
 37         .migrate_folio  = migrate_folio,
 38 #endif
 39 };
 40 
 41 struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
 42 static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
 43 static bool enable_vma_readahead __read_mostly = true;
 44 
 45 #define SWAP_RA_ORDER_CEILING   5
 46 
 47 #define SWAP_RA_WIN_SHIFT       (PAGE_SHIFT / 2)
 48 #define SWAP_RA_HITS_MASK       ((1UL << SWAP_RA_WIN_SHIFT) - 1)
 49 #define SWAP_RA_HITS_MAX        SWAP_RA_HITS_MASK
 50 #define SWAP_RA_WIN_MASK        (~PAGE_MASK & ~SWAP_RA_HITS_MASK)
 51 
 52 #define SWAP_RA_HITS(v)         ((v) & SWAP_RA_HITS_MASK)
 53 #define SWAP_RA_WIN(v)          (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
 54 #define SWAP_RA_ADDR(v)         ((v) & PAGE_MASK)
 55 
 56 #define SWAP_RA_VAL(addr, win, hits)                            \
 57         (((addr) & PAGE_MASK) |                                 \
 58          (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |    \
 59          ((hits) & SWAP_RA_HITS_MASK))
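     /*
      * swap_readahead_info packs three fields into a single long.  With
      * 4KB pages (PAGE_SHIFT == 12, so SWAP_RA_WIN_SHIFT == 6) the layout is:
      *   bits 12 and up: page-aligned fault address
      *   bits 6..11:     readahead window size
      *   bits 0..5:      readahead hit count
      * For example, SWAP_RA_VAL(0x7f1234567000, 4, 3) == 0x7f1234567103.
      */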
 60 
 61 /* Initial readahead hits is 4 to start up with a small window */
 62 #define GET_SWAP_RA_VAL(vma)                                    \
 63         (atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
 64 
 65 static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
 66 
 67 void show_swap_cache_info(void)
 68 {
 69         printk("%lu pages in swap cache\n", total_swapcache_pages());
 70         printk("Free swap  = %ldkB\n", K(get_nr_swap_pages()));
 71         printk("Total swap = %lukB\n", K(total_swap_pages));
 72 }
 73 
 74 void *get_shadow_from_swap_cache(swp_entry_t entry)
 75 {
 76         struct address_space *address_space = swap_address_space(entry);
 77         pgoff_t idx = swap_cache_index(entry);
 78         void *shadow;
 79 
 80         shadow = xa_load(&address_space->i_pages, idx);
 81         if (xa_is_value(shadow))
 82                 return shadow;
 83         return NULL;
 84 }
 85 
 86 /*
 87  * add_to_swap_cache resembles filemap_add_folio on swapper_space,
 88  * but sets SwapCache flag and folio->swap instead of mapping and index.
 89  */
 90 int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
 91                         gfp_t gfp, void **shadowp)
 92 {
 93         struct address_space *address_space = swap_address_space(entry);
 94         pgoff_t idx = swap_cache_index(entry);
 95         XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio));
 96         unsigned long i, nr = folio_nr_pages(folio);
 97         void *old;
 98 
 99         xas_set_update(&xas, workingset_update_node);
100 
101         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
102         VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
103         VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
104 
105         folio_ref_add(folio, nr);
106         folio_set_swapcache(folio);
107         folio->swap = entry;
108 
109         do {
110                 xas_lock_irq(&xas);
111                 xas_create_range(&xas);
112                 if (xas_error(&xas))
113                         goto unlock;
114                 for (i = 0; i < nr; i++) {
115                         VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio);
116                         if (shadowp) {
117                                 old = xas_load(&xas);
118                                 if (xa_is_value(old))
119                                         *shadowp = old;
120                         }
121                         xas_store(&xas, folio);
122                         xas_next(&xas);
123                 }
124                 address_space->nrpages += nr;
125                 __node_stat_mod_folio(folio, NR_FILE_PAGES, nr);
126                 __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr);
127 unlock:
128                 xas_unlock_irq(&xas);
129         } while (xas_nomem(&xas, gfp));
130 
131         if (!xas_error(&xas))
132                 return 0;
133 
134         folio_clear_swapcache(folio);
135         folio_ref_sub(folio, nr);
136         return xas_error(&xas);
137 }
138 
139 /*
140  * This must be called only on folios that have
141  * been verified to be in the swap cache.
142  */
143 void __delete_from_swap_cache(struct folio *folio,
144                         swp_entry_t entry, void *shadow)
145 {
146         struct address_space *address_space = swap_address_space(entry);
147         int i;
148         long nr = folio_nr_pages(folio);
149         pgoff_t idx = swap_cache_index(entry);
150         XA_STATE(xas, &address_space->i_pages, idx);
151 
152         xas_set_update(&xas, workingset_update_node);
153 
154         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
155         VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
156         VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
157 
158         for (i = 0; i < nr; i++) {
159                 void *entry = xas_store(&xas, shadow);
160                 VM_BUG_ON_PAGE(entry != folio, entry);
161                 xas_next(&xas);
162         }
163         folio->swap.val = 0;
164         folio_clear_swapcache(folio);
165         address_space->nrpages -= nr;
166         __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
167         __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr);
168 }
169 
170 /**
171  * add_to_swap - allocate swap space for a folio
172  * @folio: folio we want to move to swap
173  *
174  * Allocate swap space for the folio and add the folio to the
175  * swap cache.
176  *
177  * Context: Caller needs to hold the folio lock.
178  * Return: Whether the folio was added to the swap cache.
179  */
180 bool add_to_swap(struct folio *folio)
181 {
182         swp_entry_t entry;
183         int err;
184 
185         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
186         VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);
187 
188         entry = folio_alloc_swap(folio);
189         if (!entry.val)
190                 return false;
191 
192         /*
193          * XArray node allocations from PF_MEMALLOC contexts could
194          * completely exhaust the page allocator. __GFP_NOMEMALLOC
195          * stops emergency reserves from being allocated.
196          *
197          * TODO: this could cause a theoretical memory reclaim
198          * deadlock in the swap out path.
199          */
200         /*
201          * Add it to the swap cache.
202          */
203         err = add_to_swap_cache(folio, entry,
204                         __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
205         if (err)
206                 /*
207                  * add_to_swap_cache() doesn't return -EEXIST, so we can safely
208                  * clear SWAP_HAS_CACHE flag.
209                  */
210                 goto fail;
211         /*
 212          * Normally the folio will be dirtied in unmap because its
 213          * pte should be dirty. A special case is MADV_FREE pages. Such
 214          * a page's pte could have the dirty bit cleared while the folio's
 215          * SwapBacked flag is still set, because clearing the dirty bit
 216          * and the SwapBacked flag is not protected by any lock. For such
 217          * a folio, unmap will not set the dirty bit, so folio reclaim
 218          * will not write the folio out. This can cause data corruption
 219          * when the folio is swapped in later. Always setting the dirty
 220          * flag for the folio solves the problem.
221          */
222         folio_mark_dirty(folio);
223 
224         return true;
225 
226 fail:
227         put_swap_folio(folio, entry);
228         return false;
229 }
230 
231 /*
232  * This must be called only on folios that have
233  * been verified to be in the swap cache and locked.
 234  * It will never put the folio onto the free list:
 235  * the caller holds a reference on the folio.
236  */
237 void delete_from_swap_cache(struct folio *folio)
238 {
239         swp_entry_t entry = folio->swap;
240         struct address_space *address_space = swap_address_space(entry);
241 
242         xa_lock_irq(&address_space->i_pages);
243         __delete_from_swap_cache(folio, entry, NULL);
244         xa_unlock_irq(&address_space->i_pages);
245 
246         put_swap_folio(folio, entry);
247         folio_ref_sub(folio, folio_nr_pages(folio));
248 }
249 
250 void clear_shadow_from_swap_cache(int type, unsigned long begin,
251                                 unsigned long end)
252 {
253         unsigned long curr = begin;
254         void *old;
255 
256         for (;;) {
257                 swp_entry_t entry = swp_entry(type, curr);
258                 unsigned long index = curr & SWAP_ADDRESS_SPACE_MASK;
259                 struct address_space *address_space = swap_address_space(entry);
260                 XA_STATE(xas, &address_space->i_pages, index);
261 
262                 xas_set_update(&xas, workingset_update_node);
263 
264                 xa_lock_irq(&address_space->i_pages);
265                 xas_for_each(&xas, old, min(index + (end - curr), SWAP_ADDRESS_SPACE_PAGES)) {
266                         if (!xa_is_value(old))
267                                 continue;
268                         xas_store(&xas, NULL);
269                 }
270                 xa_unlock_irq(&address_space->i_pages);
271 
272                 /* search the next swapcache until we meet end */
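                     /*
                      * Each swap address space covers SWAP_ADDRESS_SPACE_PAGES
                      * slots (1 << SWAP_ADDRESS_SPACE_SHIFT, currently 1 << 14),
                      * so the shifts below round curr up to the first slot of
                      * the next address space: e.g. with a shift of 14,
                      * curr == 20000 becomes 32768.
                      */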
273                 curr >>= SWAP_ADDRESS_SPACE_SHIFT;
274                 curr++;
275                 curr <<= SWAP_ADDRESS_SPACE_SHIFT;
276                 if (curr > end)
277                         break;
278         }
279 }
280 
281 /*
282  * If we are the only user, then try to free up the swap cache.
283  *
 284  * It's OK to check the swapcache flag without the folio lock
285  * here because we are going to recheck again inside
286  * folio_free_swap() _with_ the lock.
287  *                                      - Marcelo
288  */
289 void free_swap_cache(struct folio *folio)
290 {
291         if (folio_test_swapcache(folio) && !folio_mapped(folio) &&
292             folio_trylock(folio)) {
293                 folio_free_swap(folio);
294                 folio_unlock(folio);
295         }
296 }
297 
298 /*
299  * Perform a free_page(), also freeing any swap cache associated with
300  * this page if it is the last user of the page.
301  */
302 void free_page_and_swap_cache(struct page *page)
303 {
304         struct folio *folio = page_folio(page);
305 
306         free_swap_cache(folio);
307         if (!is_huge_zero_folio(folio))
308                 folio_put(folio);
309 }
310 
311 /*
312  * Passed an array of pages, drop them all from swapcache and then release
313  * them.  They are removed from the LRU and freed if this is their last use.
314  */
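     /*
      * Entries in @pages may be encoded: when ENCODED_PAGE_BIT_NR_PAGES_NEXT
      * is set on an entry, the next array slot holds the number of references
      * to drop for that folio rather than another page pointer.
      */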
315 void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
316 {
317         struct folio_batch folios;
318         unsigned int refs[PAGEVEC_SIZE];
319 
320         lru_add_drain();
321         folio_batch_init(&folios);
322         for (int i = 0; i < nr; i++) {
323                 struct folio *folio = page_folio(encoded_page_ptr(pages[i]));
324 
325                 free_swap_cache(folio);
326                 refs[folios.nr] = 1;
327                 if (unlikely(encoded_page_flags(pages[i]) &
328                              ENCODED_PAGE_BIT_NR_PAGES_NEXT))
329                         refs[folios.nr] = encoded_nr_pages(pages[++i]);
330 
331                 if (folio_batch_add(&folios, folio) == 0)
332                         folios_put_refs(&folios, refs);
333         }
334         if (folios.nr)
335                 folios_put_refs(&folios, refs);
336 }
337 
338 static inline bool swap_use_vma_readahead(void)
339 {
340         return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
341 }
342 
343 /*
344  * Lookup a swap entry in the swap cache. A found folio will be returned
345  * unlocked and with its refcount incremented - we rely on the kernel
 346  * lock to keep page table operations atomic even if we drop the folio
347  * lock before returning.
348  *
349  * Caller must lock the swap device or hold a reference to keep it valid.
350  */
351 struct folio *swap_cache_get_folio(swp_entry_t entry,
352                 struct vm_area_struct *vma, unsigned long addr)
353 {
354         struct folio *folio;
355 
356         folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
357         if (!IS_ERR(folio)) {
358                 bool vma_ra = swap_use_vma_readahead();
359                 bool readahead;
360 
361                 /*
362                  * At the moment, we don't support PG_readahead for anon THP
363                  * so let's bail out rather than confusing the readahead stat.
364                  */
365                 if (unlikely(folio_test_large(folio)))
366                         return folio;
367 
368                 readahead = folio_test_clear_readahead(folio);
369                 if (vma && vma_ra) {
370                         unsigned long ra_val;
371                         int win, hits;
372 
373                         ra_val = GET_SWAP_RA_VAL(vma);
374                         win = SWAP_RA_WIN(ra_val);
375                         hits = SWAP_RA_HITS(ra_val);
376                         if (readahead)
377                                 hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
378                         atomic_long_set(&vma->swap_readahead_info,
379                                         SWAP_RA_VAL(addr, win, hits));
380                 }
381 
382                 if (readahead) {
383                         count_vm_event(SWAP_RA_HIT);
384                         if (!vma || !vma_ra)
385                                 atomic_inc(&swapin_readahead_hits);
386                 }
387         } else {
388                 folio = NULL;
389         }
390 
391         return folio;
392 }
393 
394 /**
395  * filemap_get_incore_folio - Find and get a folio from the page or swap caches.
396  * @mapping: The address_space to search.
397  * @index: The page cache index.
398  *
399  * This differs from filemap_get_folio() in that it will also look for the
400  * folio in the swap cache.
401  *
 402  * Return: The found folio or an ERR_PTR() if not found.
403  */
404 struct folio *filemap_get_incore_folio(struct address_space *mapping,
405                 pgoff_t index)
406 {
407         swp_entry_t swp;
408         struct swap_info_struct *si;
409         struct folio *folio = filemap_get_entry(mapping, index);
410 
411         if (!folio)
412                 return ERR_PTR(-ENOENT);
413         if (!xa_is_value(folio))
414                 return folio;
415         if (!shmem_mapping(mapping))
416                 return ERR_PTR(-ENOENT);
417 
418         swp = radix_to_swp_entry(folio);
419         /* There might be swapin error entries in shmem mapping. */
420         if (non_swap_entry(swp))
421                 return ERR_PTR(-ENOENT);
422         /* Prevent swapoff from happening to us */
423         si = get_swap_device(swp);
424         if (!si)
425                 return ERR_PTR(-ENOENT);
426         index = swap_cache_index(swp);
427         folio = filemap_get_folio(swap_address_space(swp), index);
428         put_swap_device(si);
429         return folio;
430 }
431 
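     /*
      * Sketch of the flow below: look the entry up in the swap cache first;
      * if it is missing, allocate a folio, claim the entry with
      * swapcache_prepare() (which sets SWAP_HAS_CACHE), charge the folio to
      * the memcg, insert it into the swap cache and onto the LRU, and return
      * it locked so the caller can start the read.  -EEXIST from
      * swapcache_prepare() means another task is racing to do the same, so
      * wait briefly and retry (or bail out if skip_if_exists is set).
      */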
432 struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
433                 struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated,
434                 bool skip_if_exists)
435 {
436         struct swap_info_struct *si;
437         struct folio *folio;
438         void *shadow = NULL;
439 
440         *new_page_allocated = false;
441         si = get_swap_device(entry);
442         if (!si)
443                 return NULL;
444 
445         for (;;) {
446                 int err;
447                 /*
448                  * First check the swap cache.  Since this is normally
449                  * called after swap_cache_get_folio() failed, re-calling
450                  * that would confuse statistics.
451                  */
452                 folio = filemap_get_folio(swap_address_space(entry),
453                                           swap_cache_index(entry));
454                 if (!IS_ERR(folio))
455                         goto got_folio;
456 
457                 /*
 458                  * Just skip readahead for an unused swap slot.
 459                  * During swapoff, when swap_slot_cache is disabled,
 460                  * we have to handle the race between putting the
 461                  * swap entry into the swap cache and marking the swap
 462                  * slot as SWAP_HAS_CACHE.  That's done later in this
 463                  * function, or else swapoff would be aborted if we returned NULL.
464                  */
465                 if (!swap_swapcount(si, entry) && swap_slot_cache_enabled)
466                         goto fail_put_swap;
467 
468                 /*
469                  * Get a new folio to read into from swap.  Allocate it now,
470                  * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will
471                  * cause any racers to loop around until we add it to cache.
472                  */
473                 folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id());
474                 if (!folio)
475                         goto fail_put_swap;
476 
477                 /*
478                  * Swap entry may have been freed since our caller observed it.
479                  */
480                 err = swapcache_prepare(entry);
481                 if (!err)
482                         break;
483 
484                 folio_put(folio);
485                 if (err != -EEXIST)
486                         goto fail_put_swap;
487 
488                 /*
489                  * Protect against a recursive call to __read_swap_cache_async()
490                  * on the same entry waiting forever here because SWAP_HAS_CACHE
 491                  * is set but the folio is not in the swap cache yet. This can
492                  * happen today if mem_cgroup_swapin_charge_folio() below
493                  * triggers reclaim through zswap, which may call
494                  * __read_swap_cache_async() in the writeback path.
495                  */
496                 if (skip_if_exists)
497                         goto fail_put_swap;
498 
499                 /*
500                  * We might race against __delete_from_swap_cache(), and
501                  * stumble across a swap_map entry whose SWAP_HAS_CACHE
502                  * has not yet been cleared.  Or race against another
503                  * __read_swap_cache_async(), which has set SWAP_HAS_CACHE
504                  * in swap_map, but not yet added its folio to swap cache.
505                  */
506                 schedule_timeout_uninterruptible(1);
507         }
508 
509         /*
510          * The swap entry is ours to swap in. Prepare the new folio.
511          */
512 
513         __folio_set_locked(folio);
514         __folio_set_swapbacked(folio);
515 
516         if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry))
517                 goto fail_unlock;
518 
519         /* May fail (-ENOMEM) if XArray node allocation failed. */
520         if (add_to_swap_cache(folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
521                 goto fail_unlock;
522 
523         mem_cgroup_swapin_uncharge_swap(entry);
524 
525         if (shadow)
526                 workingset_refault(folio, shadow);
527 
528         /* Caller will initiate read into locked folio */
529         folio_add_lru(folio);
530         *new_page_allocated = true;
531 got_folio:
532         put_swap_device(si);
533         return folio;
534 
535 fail_unlock:
536         put_swap_folio(folio, entry);
537         folio_unlock(folio);
538         folio_put(folio);
539 fail_put_swap:
540         put_swap_device(si);
541         return NULL;
542 }
543 
544 /*
545  * Locate a page of swap in physical memory, reserving swap cache space
546  * and reading the disk if it is not already cached.
547  * A failure return means that either the page allocation failed or that
548  * the swap entry is no longer in use.
549  *
550  * get/put_swap_device() aren't needed to call this function, because
 551  * __read_swap_cache_async() calls them and swap_read_folio() holds the
552  * swap cache folio lock.
553  */
554 struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
555                 struct vm_area_struct *vma, unsigned long addr,
556                 struct swap_iocb **plug)
557 {
558         bool page_allocated;
559         struct mempolicy *mpol;
560         pgoff_t ilx;
561         struct folio *folio;
562 
563         mpol = get_vma_policy(vma, addr, 0, &ilx);
564         folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
565                                         &page_allocated, false);
566         mpol_cond_put(mpol);
567 
568         if (page_allocated)
569                 swap_read_folio(folio, plug);
570         return folio;
571 }
572 
573 static unsigned int __swapin_nr_pages(unsigned long prev_offset,
574                                       unsigned long offset,
575                                       int hits,
576                                       int max_pages,
577                                       int prev_win)
578 {
579         unsigned int pages, last_ra;
580 
581         /*
582          * This heuristic has been found to work well on both sequential and
583          * random loads, swapping to hard disk or to SSD: please don't ask
584          * what the "+ 2" means, it just happens to work well, that's all.
585          */
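             /*
              * Worked example: hits == 5 gives pages == 7, rounded up to 8
              * below, then clamped to max_pages and kept from dropping below
              * half of the previous window.
              */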
586         pages = hits + 2;
587         if (pages == 2) {
588                 /*
589                  * We can have no readahead hits to judge by: but must not get
590                  * stuck here forever, so check for an adjacent offset instead
591                  * (and don't even bother to check whether swap type is same).
592                  */
593                 if (offset != prev_offset + 1 && offset != prev_offset - 1)
594                         pages = 1;
595         } else {
596                 unsigned int roundup = 4;
597                 while (roundup < pages)
598                         roundup <<= 1;
599                 pages = roundup;
600         }
601 
602         if (pages > max_pages)
603                 pages = max_pages;
604 
605         /* Don't shrink readahead too fast */
606         last_ra = prev_win / 2;
607         if (pages < last_ra)
608                 pages = last_ra;
609 
610         return pages;
611 }
612 
613 static unsigned long swapin_nr_pages(unsigned long offset)
614 {
615         static unsigned long prev_offset;
616         unsigned int hits, pages, max_pages;
617         static atomic_t last_readahead_pages;
618 
619         max_pages = 1 << READ_ONCE(page_cluster);
620         if (max_pages <= 1)
621                 return 1;
622 
623         hits = atomic_xchg(&swapin_readahead_hits, 0);
624         pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits,
625                                   max_pages,
626                                   atomic_read(&last_readahead_pages));
627         if (!hits)
628                 WRITE_ONCE(prev_offset, offset);
629         atomic_set(&last_readahead_pages, pages);
630 
631         return pages;
632 }
633 
634 /**
635  * swap_cluster_readahead - swap in pages in hope we need them soon
636  * @entry: swap entry of this memory
637  * @gfp_mask: memory allocation flags
638  * @mpol: NUMA memory allocation policy to be applied
639  * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
640  *
641  * Returns the struct folio for entry and addr, after queueing swapin.
642  *
643  * Primitive swap readahead code. We simply read an aligned block of
644  * (1 << page_cluster) entries in the swap area. This method is chosen
645  * because it doesn't cost us any seek time.  We also make sure to queue
646  * the 'original' request together with the readahead ones...
647  *
648  * Note: it is intentional that the same NUMA policy and interleave index
649  * are used for every page of the readahead: neighbouring pages on swap
650  * are fairly likely to have been swapped out from the same node.
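     *
     * For example, with page_cluster == 3 and a full readahead window, a
     * fault at swap offset 0x2c5 queues reads for offsets 0x2c0..0x2c7
     * (skipping the swap header and clamping to the end of the swap area).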
651  */
652 struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
653                                     struct mempolicy *mpol, pgoff_t ilx)
654 {
655         struct folio *folio;
656         unsigned long entry_offset = swp_offset(entry);
657         unsigned long offset = entry_offset;
658         unsigned long start_offset, end_offset;
659         unsigned long mask;
660         struct swap_info_struct *si = swp_swap_info(entry);
661         struct blk_plug plug;
662         struct swap_iocb *splug = NULL;
663         bool page_allocated;
664 
665         mask = swapin_nr_pages(offset) - 1;
666         if (!mask)
667                 goto skip;
668 
669         /* Read a page_cluster sized and aligned cluster around offset. */
670         start_offset = offset & ~mask;
671         end_offset = offset | mask;
672         if (!start_offset)      /* First page is swap header. */
673                 start_offset++;
674         if (end_offset >= si->max)
675                 end_offset = si->max - 1;
676 
677         blk_start_plug(&plug);
678         for (offset = start_offset; offset <= end_offset ; offset++) {
679                 /* Ok, do the async read-ahead now */
680                 folio = __read_swap_cache_async(
681                                 swp_entry(swp_type(entry), offset),
682                                 gfp_mask, mpol, ilx, &page_allocated, false);
683                 if (!folio)
684                         continue;
685                 if (page_allocated) {
686                         swap_read_folio(folio, &splug);
687                         if (offset != entry_offset) {
688                                 folio_set_readahead(folio);
689                                 count_vm_event(SWAP_RA);
690                         }
691                 }
692                 folio_put(folio);
693         }
694         blk_finish_plug(&plug);
695         swap_read_unplug(splug);
696         lru_add_drain();        /* Push any new pages onto the LRU now */
697 skip:
 698         /* The folio was likely read above, so no need for plugging here */
699         folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
700                                         &page_allocated, false);
701         if (unlikely(page_allocated)) {
702                 zswap_folio_swapin(folio);
703                 swap_read_folio(folio, NULL);
704         }
705         return folio;
706 }
707 
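     /*
      * One address_space is created per SWAP_ADDRESS_SPACE_PAGES swap slots
      * (currently 1 << 14, i.e. 64MB of swap with 4KB pages), so e.g. a 1GB
      * swap device gets 16 of them.  Splitting the swap cache this way
      * reduces contention on the per-space xarray lock for large swap devices.
      */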
708 int init_swap_address_space(unsigned int type, unsigned long nr_pages)
709 {
710         struct address_space *spaces, *space;
711         unsigned int i, nr;
712 
713         nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
714         spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL);
715         if (!spaces)
716                 return -ENOMEM;
717         for (i = 0; i < nr; i++) {
718                 space = spaces + i;
719                 xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ);
720                 atomic_set(&space->i_mmap_writable, 0);
721                 space->a_ops = &swap_aops;
722                 /* swap cache doesn't use writeback related tags */
723                 mapping_set_no_writeback_tags(space);
724         }
725         nr_swapper_spaces[type] = nr;
726         swapper_spaces[type] = spaces;
727 
728         return 0;
729 }
730 
731 void exit_swap_address_space(unsigned int type)
732 {
733         int i;
734         struct address_space *spaces = swapper_spaces[type];
735 
736         for (i = 0; i < nr_swapper_spaces[type]; i++)
737                 VM_WARN_ON_ONCE(!mapping_empty(&spaces[i]));
738         kvfree(spaces);
739         nr_swapper_spaces[type] = 0;
740         swapper_spaces[type] = NULL;
741 }
742 
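     /*
      * Choose a readahead window of up to 1 << SWAP_RA_ORDER_CEILING pages
      * around the fault address: extending forward from the fault for a
      * forward sequential stream, ending at the fault for a backward stream,
      * and centered on the fault otherwise, then clamped to the VMA and to
      * the PMD that contains the fault address.
      */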
743 static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start,
744                            unsigned long *end)
745 {
746         struct vm_area_struct *vma = vmf->vma;
747         unsigned long ra_val;
748         unsigned long faddr, prev_faddr, left, right;
749         unsigned int max_win, hits, prev_win, win;
750 
751         max_win = 1 << min(READ_ONCE(page_cluster), SWAP_RA_ORDER_CEILING);
752         if (max_win == 1)
753                 return 1;
754 
755         faddr = vmf->address;
756         ra_val = GET_SWAP_RA_VAL(vma);
757         prev_faddr = SWAP_RA_ADDR(ra_val);
758         prev_win = SWAP_RA_WIN(ra_val);
759         hits = SWAP_RA_HITS(ra_val);
760         win = __swapin_nr_pages(PFN_DOWN(prev_faddr), PFN_DOWN(faddr), hits,
761                                 max_win, prev_win);
762         atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(faddr, win, 0));
763         if (win == 1)
764                 return 1;
765 
766         if (faddr == prev_faddr + PAGE_SIZE)
767                 left = faddr;
768         else if (prev_faddr == faddr + PAGE_SIZE)
769                 left = faddr - (win << PAGE_SHIFT) + PAGE_SIZE;
770         else
771                 left = faddr - (((win - 1) / 2) << PAGE_SHIFT);
772         right = left + (win << PAGE_SHIFT);
773         if ((long)left < 0)
774                 left = 0;
775         *start = max3(left, vma->vm_start, faddr & PMD_MASK);
776         *end = min3(right, vma->vm_end, (faddr & PMD_MASK) + PMD_SIZE);
777 
778         return win;
779 }
780 
781 /**
782  * swap_vma_readahead - swap in pages in hope we need them soon
783  * @targ_entry: swap entry of the targeted memory
784  * @gfp_mask: memory allocation flags
785  * @mpol: NUMA memory allocation policy to be applied
786  * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
787  * @vmf: fault information
788  *
789  * Returns the struct folio for entry and addr, after queueing swapin.
790  *
791  * Primitive swap readahead code. We simply read in a few pages whose
792  * virtual addresses are around the fault address in the same vma.
793  *
794  * Caller must hold read mmap_lock if vmf->vma is not NULL.
795  *
796  */
797 static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
798                 struct mempolicy *mpol, pgoff_t targ_ilx, struct vm_fault *vmf)
799 {
800         struct blk_plug plug;
801         struct swap_iocb *splug = NULL;
802         struct folio *folio;
803         pte_t *pte = NULL, pentry;
804         int win;
805         unsigned long start, end, addr;
806         swp_entry_t entry;
807         pgoff_t ilx;
808         bool page_allocated;
809 
810         win = swap_vma_ra_win(vmf, &start, &end);
811         if (win == 1)
812                 goto skip;
813 
814         ilx = targ_ilx - PFN_DOWN(vmf->address - start);
815 
816         blk_start_plug(&plug);
817         for (addr = start; addr < end; ilx++, addr += PAGE_SIZE) {
818                 if (!pte++) {
819                         pte = pte_offset_map(vmf->pmd, addr);
820                         if (!pte)
821                                 break;
822                 }
823                 pentry = ptep_get_lockless(pte);
824                 if (!is_swap_pte(pentry))
825                         continue;
826                 entry = pte_to_swp_entry(pentry);
827                 if (unlikely(non_swap_entry(entry)))
828                         continue;
829                 pte_unmap(pte);
830                 pte = NULL;
831                 folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
832                                                 &page_allocated, false);
833                 if (!folio)
834                         continue;
835                 if (page_allocated) {
836                         swap_read_folio(folio, &splug);
837                         if (addr != vmf->address) {
838                                 folio_set_readahead(folio);
839                                 count_vm_event(SWAP_RA);
840                         }
841                 }
842                 folio_put(folio);
843         }
844         if (pte)
845                 pte_unmap(pte);
846         blk_finish_plug(&plug);
847         swap_read_unplug(splug);
848         lru_add_drain();
849 skip:
850         /* The folio was likely read above, so no need for plugging here */
851         folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx,
852                                         &page_allocated, false);
853         if (unlikely(page_allocated)) {
854                 zswap_folio_swapin(folio);
855                 swap_read_folio(folio, NULL);
856         }
857         return folio;
858 }
859 
860 /**
861  * swapin_readahead - swap in pages in hope we need them soon
862  * @entry: swap entry of this memory
863  * @gfp_mask: memory allocation flags
864  * @vmf: fault information
865  *
866  * Returns the struct page for entry and addr, after queueing swapin.
867  *
 868  * This is the main entry point for swap readahead. Depending on the
 869  * configuration, it reads ahead either cluster-based (i.e. by physical
 870  * disk offset) or vma-based (i.e. by virtual address around the fault address).
871  */
872 struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
873                                 struct vm_fault *vmf)
874 {
875         struct mempolicy *mpol;
876         pgoff_t ilx;
877         struct folio *folio;
878 
879         mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
880         folio = swap_use_vma_readahead() ?
881                 swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) :
882                 swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
883         mpol_cond_put(mpol);
884 
885         if (!folio)
886                 return NULL;
887         return folio_file_page(folio, swp_offset(entry));
888 }
889 
890 #ifdef CONFIG_SYSFS
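     /*
      * Runtime knob for VMA-based readahead.  With mm_kobj this shows up as
      * /sys/kernel/mm/swap/vma_ra_enabled; writes are parsed by kstrtobool(),
      * so values such as "1"/"0" or "y"/"n" are accepted.
      */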
891 static ssize_t vma_ra_enabled_show(struct kobject *kobj,
892                                      struct kobj_attribute *attr, char *buf)
893 {
894         return sysfs_emit(buf, "%s\n",
895                           enable_vma_readahead ? "true" : "false");
896 }
897 static ssize_t vma_ra_enabled_store(struct kobject *kobj,
898                                       struct kobj_attribute *attr,
899                                       const char *buf, size_t count)
900 {
901         ssize_t ret;
902 
903         ret = kstrtobool(buf, &enable_vma_readahead);
904         if (ret)
905                 return ret;
906 
907         return count;
908 }
909 static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled);
910 
911 static struct attribute *swap_attrs[] = {
912         &vma_ra_enabled_attr.attr,
913         NULL,
914 };
915 
916 static const struct attribute_group swap_attr_group = {
917         .attrs = swap_attrs,
918 };
919 
920 static int __init swap_init_sysfs(void)
921 {
922         int err;
923         struct kobject *swap_kobj;
924 
925         swap_kobj = kobject_create_and_add("swap", mm_kobj);
926         if (!swap_kobj) {
927                 pr_err("failed to create swap kobject\n");
928                 return -ENOMEM;
929         }
930         err = sysfs_create_group(swap_kobj, &swap_attr_group);
931         if (err) {
932                 pr_err("failed to register swap group\n");
933                 goto delete_obj;
934         }
935         return 0;
936 
937 delete_obj:
938         kobject_put(swap_kobj);
939         return err;
940 }
941 subsys_initcall(swap_init_sysfs);
942 #endif
943 
