~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~
swapfile.c

Version: ~ [ linux-6.12-rc7 ] ~ [ linux-6.11.7 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.60 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.116 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.171 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.229 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.285 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.323 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.12 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~
Diff markup

Differences between /mm/swapfile.c (Architecture i386) and /mm/swapfile.c (Architecture sparc64)

  1 // SPDX-License-Identifier: GPL-2.0-only            1 // SPDX-License-Identifier: GPL-2.0-only
  2 /*                                                  2 /*
  3  *  linux/mm/swapfile.c                             3  *  linux/mm/swapfile.c
  4  *                                                  4  *
  5  *  Copyright (C) 1991, 1992, 1993, 1994  Linu      5  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
  6  *  Swap reorganised 29.12.95, Stephen Tweedie      6  *  Swap reorganised 29.12.95, Stephen Tweedie
  7  */                                                 7  */
  8                                                     8 
  9 #include <linux/blkdev.h>                           9 #include <linux/blkdev.h>
 10 #include <linux/mm.h>                              10 #include <linux/mm.h>
 11 #include <linux/sched/mm.h>                        11 #include <linux/sched/mm.h>
 12 #include <linux/sched/task.h>                      12 #include <linux/sched/task.h>
 13 #include <linux/hugetlb.h>                         13 #include <linux/hugetlb.h>
 14 #include <linux/mman.h>                            14 #include <linux/mman.h>
 15 #include <linux/slab.h>                            15 #include <linux/slab.h>
 16 #include <linux/kernel_stat.h>                     16 #include <linux/kernel_stat.h>
 17 #include <linux/swap.h>                            17 #include <linux/swap.h>
 18 #include <linux/vmalloc.h>                         18 #include <linux/vmalloc.h>
 19 #include <linux/pagemap.h>                         19 #include <linux/pagemap.h>
 20 #include <linux/namei.h>                           20 #include <linux/namei.h>
 21 #include <linux/shmem_fs.h>                        21 #include <linux/shmem_fs.h>
 22 #include <linux/blk-cgroup.h>                      22 #include <linux/blk-cgroup.h>
 23 #include <linux/random.h>                          23 #include <linux/random.h>
 24 #include <linux/writeback.h>                       24 #include <linux/writeback.h>
 25 #include <linux/proc_fs.h>                         25 #include <linux/proc_fs.h>
 26 #include <linux/seq_file.h>                        26 #include <linux/seq_file.h>
 27 #include <linux/init.h>                            27 #include <linux/init.h>
 28 #include <linux/ksm.h>                             28 #include <linux/ksm.h>
 29 #include <linux/rmap.h>                            29 #include <linux/rmap.h>
 30 #include <linux/security.h>                        30 #include <linux/security.h>
 31 #include <linux/backing-dev.h>                     31 #include <linux/backing-dev.h>
 32 #include <linux/mutex.h>                           32 #include <linux/mutex.h>
 33 #include <linux/capability.h>                      33 #include <linux/capability.h>
 34 #include <linux/syscalls.h>                        34 #include <linux/syscalls.h>
 35 #include <linux/memcontrol.h>                      35 #include <linux/memcontrol.h>
 36 #include <linux/poll.h>                            36 #include <linux/poll.h>
 37 #include <linux/oom.h>                             37 #include <linux/oom.h>
 38 #include <linux/swapfile.h>                        38 #include <linux/swapfile.h>
 39 #include <linux/export.h>                          39 #include <linux/export.h>
 40 #include <linux/swap_slots.h>                      40 #include <linux/swap_slots.h>
 41 #include <linux/sort.h>                            41 #include <linux/sort.h>
 42 #include <linux/completion.h>                      42 #include <linux/completion.h>
 43 #include <linux/suspend.h>                         43 #include <linux/suspend.h>
 44 #include <linux/zswap.h>                           44 #include <linux/zswap.h>
 45 #include <linux/plist.h>                           45 #include <linux/plist.h>
 46                                                    46 
 47 #include <asm/tlbflush.h>                          47 #include <asm/tlbflush.h>
 48 #include <linux/swapops.h>                         48 #include <linux/swapops.h>
 49 #include <linux/swap_cgroup.h>                     49 #include <linux/swap_cgroup.h>
 50 #include "internal.h"                              50 #include "internal.h"
 51 #include "swap.h"                                  51 #include "swap.h"
 52                                                    52 
 53 static bool swap_count_continued(struct swap_i     53 static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
 54                                  unsigned char     54                                  unsigned char);
 55 static void free_swap_count_continuations(stru     55 static void free_swap_count_continuations(struct swap_info_struct *);
 56 static void swap_entry_range_free(struct swap_     56 static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry,
 57                                   unsigned int     57                                   unsigned int nr_pages);
 58 static void swap_range_alloc(struct swap_info_     58 static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
 59                              unsigned int nr_e     59                              unsigned int nr_entries);
 60 static bool folio_swapcache_freeable(struct fo     60 static bool folio_swapcache_freeable(struct folio *folio);
 61 static struct swap_cluster_info *lock_cluster_     61 static struct swap_cluster_info *lock_cluster_or_swap_info(
 62                 struct swap_info_struct *si, u     62                 struct swap_info_struct *si, unsigned long offset);
 63 static void unlock_cluster_or_swap_info(struct     63 static void unlock_cluster_or_swap_info(struct swap_info_struct *si,
 64                                         struct     64                                         struct swap_cluster_info *ci);
 65                                                    65 
 66 static DEFINE_SPINLOCK(swap_lock);                 66 static DEFINE_SPINLOCK(swap_lock);
 67 static unsigned int nr_swapfiles;                  67 static unsigned int nr_swapfiles;
 68 atomic_long_t nr_swap_pages;                       68 atomic_long_t nr_swap_pages;
 69 /*                                                 69 /*
 70  * Some modules use swappable objects and may      70  * Some modules use swappable objects and may try to swap them out under
 71  * memory pressure (via the shrinker). Before      71  * memory pressure (via the shrinker). Before doing so, they may wish to
 72  * check to see if any swap space is available     72  * check to see if any swap space is available.
 73  */                                                73  */
 74 EXPORT_SYMBOL_GPL(nr_swap_pages);                  74 EXPORT_SYMBOL_GPL(nr_swap_pages);
 75 /* protected with swap_lock. reading in vm_swa     75 /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
 76 long total_swap_pages;                             76 long total_swap_pages;
 77 static int least_priority = -1;                    77 static int least_priority = -1;
 78 unsigned long swapfile_maximum_size;               78 unsigned long swapfile_maximum_size;
 79 #ifdef CONFIG_MIGRATION                            79 #ifdef CONFIG_MIGRATION
 80 bool swap_migration_ad_supported;                  80 bool swap_migration_ad_supported;
 81 #endif  /* CONFIG_MIGRATION */                     81 #endif  /* CONFIG_MIGRATION */
 82                                                    82 
 83 static const char Bad_file[] = "Bad swap file      83 static const char Bad_file[] = "Bad swap file entry ";
 84 static const char Unused_file[] = "Unused swap     84 static const char Unused_file[] = "Unused swap file entry ";
 85 static const char Bad_offset[] = "Bad swap off     85 static const char Bad_offset[] = "Bad swap offset entry ";
 86 static const char Unused_offset[] = "Unused sw     86 static const char Unused_offset[] = "Unused swap offset entry ";
 87                                                    87 
 88 /*                                                 88 /*
 89  * all active swap_info_structs                    89  * all active swap_info_structs
 90  * protected with swap_lock, and ordered by pr     90  * protected with swap_lock, and ordered by priority.
 91  */                                                91  */
 92 static PLIST_HEAD(swap_active_head);               92 static PLIST_HEAD(swap_active_head);
 93                                                    93 
 94 /*                                                 94 /*
 95  * all available (active, not full) swap_info_     95  * all available (active, not full) swap_info_structs
 96  * protected with swap_avail_lock, ordered by      96  * protected with swap_avail_lock, ordered by priority.
 97  * This is used by folio_alloc_swap() instead      97  * This is used by folio_alloc_swap() instead of swap_active_head
 98  * because swap_active_head includes all swap_     98  * because swap_active_head includes all swap_info_structs,
 99  * but folio_alloc_swap() doesn't need to look     99  * but folio_alloc_swap() doesn't need to look at full ones.
100  * This uses its own lock instead of swap_lock    100  * This uses its own lock instead of swap_lock because when a
101  * swap_info_struct changes between not-full/f    101  * swap_info_struct changes between not-full/full, it needs to
102  * add/remove itself to/from this list, but th    102  * add/remove itself to/from this list, but the swap_info_struct->lock
103  * is held and the locking order requires swap    103  * is held and the locking order requires swap_lock to be taken
104  * before any swap_info_struct->lock.             104  * before any swap_info_struct->lock.
105  */                                               105  */
106 static struct plist_head *swap_avail_heads;       106 static struct plist_head *swap_avail_heads;
107 static DEFINE_SPINLOCK(swap_avail_lock);          107 static DEFINE_SPINLOCK(swap_avail_lock);
108                                                   108 
109 static struct swap_info_struct *swap_info[MAX_    109 static struct swap_info_struct *swap_info[MAX_SWAPFILES];
110                                                   110 
111 static DEFINE_MUTEX(swapon_mutex);                111 static DEFINE_MUTEX(swapon_mutex);
112                                                   112 
113 static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait)    113 static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
114 /* Activity counter to indicate that a swapon     114 /* Activity counter to indicate that a swapon or swapoff has occurred */
115 static atomic_t proc_poll_event = ATOMIC_INIT(    115 static atomic_t proc_poll_event = ATOMIC_INIT(0);
116                                                   116 
117 atomic_t nr_rotate_swap = ATOMIC_INIT(0);         117 atomic_t nr_rotate_swap = ATOMIC_INIT(0);
118                                                   118 
119 static struct swap_info_struct *swap_type_to_s    119 static struct swap_info_struct *swap_type_to_swap_info(int type)
120 {                                                 120 {
121         if (type >= MAX_SWAPFILES)                121         if (type >= MAX_SWAPFILES)
122                 return NULL;                      122                 return NULL;
123                                                   123 
124         return READ_ONCE(swap_info[type]); /*     124         return READ_ONCE(swap_info[type]); /* rcu_dereference() */
125 }                                                 125 }
126                                                   126 
127 static inline unsigned char swap_count(unsigne    127 static inline unsigned char swap_count(unsigned char ent)
128 {                                                 128 {
129         return ent & ~SWAP_HAS_CACHE;   /* may    129         return ent & ~SWAP_HAS_CACHE;   /* may include COUNT_CONTINUED flag */
130 }                                                 130 }
131                                                   131 
132 /* Reclaim the swap entry anyway if possible *    132 /* Reclaim the swap entry anyway if possible */
133 #define TTRS_ANYWAY             0x1               133 #define TTRS_ANYWAY             0x1
134 /*                                                134 /*
135  * Reclaim the swap entry if there are no more    135  * Reclaim the swap entry if there are no more mappings of the
136  * corresponding page                             136  * corresponding page
137  */                                               137  */
138 #define TTRS_UNMAPPED           0x2               138 #define TTRS_UNMAPPED           0x2
139 /* Reclaim the swap entry if swap is getting f    139 /* Reclaim the swap entry if swap is getting full */
140 #define TTRS_FULL               0x4               140 #define TTRS_FULL               0x4
141 /* Reclaim directly, bypass the slot cache and    141 /* Reclaim directly, bypass the slot cache and don't touch device lock */
142 #define TTRS_DIRECT             0x8               142 #define TTRS_DIRECT             0x8
143                                                   143 
144 static bool swap_is_has_cache(struct swap_info    144 static bool swap_is_has_cache(struct swap_info_struct *si,
145                               unsigned long of    145                               unsigned long offset, int nr_pages)
146 {                                                 146 {
147         unsigned char *map = si->swap_map + of    147         unsigned char *map = si->swap_map + offset;
148         unsigned char *map_end = map + nr_page    148         unsigned char *map_end = map + nr_pages;
149                                                   149 
150         do {                                      150         do {
151                 VM_BUG_ON(!(*map & SWAP_HAS_CA    151                 VM_BUG_ON(!(*map & SWAP_HAS_CACHE));
152                 if (*map != SWAP_HAS_CACHE)       152                 if (*map != SWAP_HAS_CACHE)
153                         return false;             153                         return false;
154         } while (++map < map_end);                154         } while (++map < map_end);
155                                                   155 
156         return true;                              156         return true;
157 }                                                 157 }
158                                                   158 
159 static bool swap_is_last_map(struct swap_info_    159 static bool swap_is_last_map(struct swap_info_struct *si,
160                 unsigned long offset, int nr_p    160                 unsigned long offset, int nr_pages, bool *has_cache)
161 {                                                 161 {
162         unsigned char *map = si->swap_map + of    162         unsigned char *map = si->swap_map + offset;
163         unsigned char *map_end = map + nr_page    163         unsigned char *map_end = map + nr_pages;
164         unsigned char count = *map;               164         unsigned char count = *map;
165                                                   165 
166         if (swap_count(count) != 1)               166         if (swap_count(count) != 1)
167                 return false;                     167                 return false;
168                                                   168 
169         while (++map < map_end) {                 169         while (++map < map_end) {
170                 if (*map != count)                170                 if (*map != count)
171                         return false;             171                         return false;
172         }                                         172         }
173                                                   173 
174         *has_cache = !!(count & SWAP_HAS_CACHE    174         *has_cache = !!(count & SWAP_HAS_CACHE);
175         return true;                              175         return true;
176 }                                                 176 }
177                                                   177 
178 /*                                                178 /*
179  * returns number of pages in the folio that b    179  * returns number of pages in the folio that backs the swap entry. If positive,
180  * the folio was reclaimed. If negative, the f    180  * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
181  * folio was associated with the swap entry.      181  * folio was associated with the swap entry.
182  */                                               182  */
183 static int __try_to_reclaim_swap(struct swap_i    183 static int __try_to_reclaim_swap(struct swap_info_struct *si,
184                                  unsigned long    184                                  unsigned long offset, unsigned long flags)
185 {                                                 185 {
186         swp_entry_t entry = swp_entry(si->type    186         swp_entry_t entry = swp_entry(si->type, offset);
187         struct address_space *address_space =     187         struct address_space *address_space = swap_address_space(entry);
188         struct swap_cluster_info *ci;             188         struct swap_cluster_info *ci;
189         struct folio *folio;                      189         struct folio *folio;
190         int ret, nr_pages;                        190         int ret, nr_pages;
191         bool need_reclaim;                        191         bool need_reclaim;
192                                                   192 
193         folio = filemap_get_folio(address_spac    193         folio = filemap_get_folio(address_space, swap_cache_index(entry));
194         if (IS_ERR(folio))                        194         if (IS_ERR(folio))
195                 return 0;                         195                 return 0;
196                                                   196 
197         nr_pages = folio_nr_pages(folio);         197         nr_pages = folio_nr_pages(folio);
198         ret = -nr_pages;                          198         ret = -nr_pages;
199                                                   199 
200         /*                                        200         /*
201          * This function can be called from sc    201          * This function can be called from scan_swap_map_slots() when vmscan.c
202          * is reclaiming folios, in which case    202          * is reclaiming folios, in which case we already hold a folio lock
203          * here. We have to use trylock to avo    203          * here. We have to use trylock to avoid deadlock. This is a special
204          * case; you should use folio_free_swa    204          * case; you should use folio_free_swap() with an explicit folio_lock()
205          * in usual operations.                   205          * in usual operations.
206          */                                       206          */
207         if (!folio_trylock(folio))                207         if (!folio_trylock(folio))
208                 goto out;                         208                 goto out;
209                                                   209 
210         /* offset could point to the middle of    210         /* offset could point to the middle of a large folio */
211         entry = folio->swap;                      211         entry = folio->swap;
212         offset = swp_offset(entry);               212         offset = swp_offset(entry);
213                                                   213 
214         need_reclaim = ((flags & TTRS_ANYWAY)     214         need_reclaim = ((flags & TTRS_ANYWAY) ||
215                         ((flags & TTRS_UNMAPPE    215                         ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
216                         ((flags & TTRS_FULL) &    216                         ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)));
217         if (!need_reclaim || !folio_swapcache_    217         if (!need_reclaim || !folio_swapcache_freeable(folio))
218                 goto out_unlock;                  218                 goto out_unlock;
219                                                   219 
220         /*                                        220         /*
221          * It's safe to delete the folio from     221          * It's safe to delete the folio from swap cache only if the folio's
222          * swap_map is HAS_CACHE only, which m    222          * swap_map is HAS_CACHE only, which means the slots have no page table
223          * reference or pending writeback, and    223          * reference or pending writeback, and can't be allocated to others.
224          */                                       224          */
225         ci = lock_cluster_or_swap_info(si, off    225         ci = lock_cluster_or_swap_info(si, offset);
226         need_reclaim = swap_is_has_cache(si, o    226         need_reclaim = swap_is_has_cache(si, offset, nr_pages);
227         unlock_cluster_or_swap_info(si, ci);      227         unlock_cluster_or_swap_info(si, ci);
228         if (!need_reclaim)                        228         if (!need_reclaim)
229                 goto out_unlock;                  229                 goto out_unlock;
230                                                   230 
231         if (!(flags & TTRS_DIRECT)) {             231         if (!(flags & TTRS_DIRECT)) {
232                 /* Free through slot cache */     232                 /* Free through slot cache */
233                 delete_from_swap_cache(folio);    233                 delete_from_swap_cache(folio);
234                 folio_set_dirty(folio);           234                 folio_set_dirty(folio);
235                 ret = nr_pages;                   235                 ret = nr_pages;
236                 goto out_unlock;                  236                 goto out_unlock;
237         }                                         237         }
238                                                   238 
239         xa_lock_irq(&address_space->i_pages);     239         xa_lock_irq(&address_space->i_pages);
240         __delete_from_swap_cache(folio, entry,    240         __delete_from_swap_cache(folio, entry, NULL);
241         xa_unlock_irq(&address_space->i_pages)    241         xa_unlock_irq(&address_space->i_pages);
242         folio_ref_sub(folio, nr_pages);           242         folio_ref_sub(folio, nr_pages);
243         folio_set_dirty(folio);                   243         folio_set_dirty(folio);
244                                                   244 
245         spin_lock(&si->lock);                     245         spin_lock(&si->lock);
246         /* Only a single-page folio can be bac    246         /* Only a single-page folio can be backed by zswap */
247         if (nr_pages == 1)                        247         if (nr_pages == 1)
248                 zswap_invalidate(entry);          248                 zswap_invalidate(entry);
249         swap_entry_range_free(si, entry, nr_pa    249         swap_entry_range_free(si, entry, nr_pages);
250         spin_unlock(&si->lock);                   250         spin_unlock(&si->lock);
251         ret = nr_pages;                           251         ret = nr_pages;
252 out_unlock:                                       252 out_unlock:
253         folio_unlock(folio);                      253         folio_unlock(folio);
254 out:                                              254 out:
255         folio_put(folio);                         255         folio_put(folio);
256         return ret;                               256         return ret;
257 }                                                 257 }
258                                                   258 
259 static inline struct swap_extent *first_se(str    259 static inline struct swap_extent *first_se(struct swap_info_struct *sis)
260 {                                                 260 {
261         struct rb_node *rb = rb_first(&sis->sw    261         struct rb_node *rb = rb_first(&sis->swap_extent_root);
262         return rb_entry(rb, struct swap_extent    262         return rb_entry(rb, struct swap_extent, rb_node);
263 }                                                 263 }
264                                                   264 
265 static inline struct swap_extent *next_se(stru    265 static inline struct swap_extent *next_se(struct swap_extent *se)
266 {                                                 266 {
267         struct rb_node *rb = rb_next(&se->rb_n    267         struct rb_node *rb = rb_next(&se->rb_node);
268         return rb ? rb_entry(rb, struct swap_e    268         return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
269 }                                                 269 }
270                                                   270 
271 /*                                                271 /*
272  * swapon tells the device that all the old sw    272  * swapon tells the device that all the old swap contents can be discarded,
273  * to allow the swap device to optimize its we    273  * to allow the swap device to optimize its wear-levelling.
274  */                                               274  */
275 static int discard_swap(struct swap_info_struc    275 static int discard_swap(struct swap_info_struct *si)
276 {                                                 276 {
277         struct swap_extent *se;                   277         struct swap_extent *se;
278         sector_t start_block;                     278         sector_t start_block;
279         sector_t nr_blocks;                       279         sector_t nr_blocks;
280         int err = 0;                              280         int err = 0;
281                                                   281 
282         /* Do not discard the swap header page    282         /* Do not discard the swap header page! */
283         se = first_se(si);                        283         se = first_se(si);
284         start_block = (se->start_block + 1) <<    284         start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
285         nr_blocks = ((sector_t)se->nr_pages -     285         nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
286         if (nr_blocks) {                          286         if (nr_blocks) {
287                 err = blkdev_issue_discard(si-    287                 err = blkdev_issue_discard(si->bdev, start_block,
288                                 nr_blocks, GFP    288                                 nr_blocks, GFP_KERNEL);
289                 if (err)                          289                 if (err)
290                         return err;               290                         return err;
291                 cond_resched();                   291                 cond_resched();
292         }                                         292         }
293                                                   293 
294         for (se = next_se(se); se; se = next_s    294         for (se = next_se(se); se; se = next_se(se)) {
295                 start_block = se->start_block     295                 start_block = se->start_block << (PAGE_SHIFT - 9);
296                 nr_blocks = (sector_t)se->nr_p    296                 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
297                                                   297 
298                 err = blkdev_issue_discard(si-    298                 err = blkdev_issue_discard(si->bdev, start_block,
299                                 nr_blocks, GFP    299                                 nr_blocks, GFP_KERNEL);
300                 if (err)                          300                 if (err)
301                         break;                    301                         break;
302                                                   302 
303                 cond_resched();                   303                 cond_resched();
304         }                                         304         }
305         return err;             /* That will o    305         return err;             /* That will often be -EOPNOTSUPP */
306 }                                                 306 }
307                                                   307 
308 static struct swap_extent *                       308 static struct swap_extent *
309 offset_to_swap_extent(struct swap_info_struct     309 offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
310 {                                                 310 {
311         struct swap_extent *se;                   311         struct swap_extent *se;
312         struct rb_node *rb;                       312         struct rb_node *rb;
313                                                   313 
314         rb = sis->swap_extent_root.rb_node;       314         rb = sis->swap_extent_root.rb_node;
315         while (rb) {                              315         while (rb) {
316                 se = rb_entry(rb, struct swap_    316                 se = rb_entry(rb, struct swap_extent, rb_node);
317                 if (offset < se->start_page)      317                 if (offset < se->start_page)
318                         rb = rb->rb_left;         318                         rb = rb->rb_left;
319                 else if (offset >= se->start_p    319                 else if (offset >= se->start_page + se->nr_pages)
320                         rb = rb->rb_right;        320                         rb = rb->rb_right;
321                 else                              321                 else
322                         return se;                322                         return se;
323         }                                         323         }
324         /* It *must* be present */                324         /* It *must* be present */
325         BUG();                                    325         BUG();
326 }                                                 326 }
327                                                   327 
328 sector_t swap_folio_sector(struct folio *folio    328 sector_t swap_folio_sector(struct folio *folio)
329 {                                                 329 {
330         struct swap_info_struct *sis = swp_swa    330         struct swap_info_struct *sis = swp_swap_info(folio->swap);
331         struct swap_extent *se;                   331         struct swap_extent *se;
332         sector_t sector;                          332         sector_t sector;
333         pgoff_t offset;                           333         pgoff_t offset;
334                                                   334 
335         offset = swp_offset(folio->swap);         335         offset = swp_offset(folio->swap);
336         se = offset_to_swap_extent(sis, offset    336         se = offset_to_swap_extent(sis, offset);
337         sector = se->start_block + (offset - s    337         sector = se->start_block + (offset - se->start_page);
338         return sector << (PAGE_SHIFT - 9);        338         return sector << (PAGE_SHIFT - 9);
339 }                                                 339 }
340                                                   340 
341 /*                                                341 /*
342  * swap allocation tell device that a cluster     342  * swap allocation tell device that a cluster of swap can now be discarded,
343  * to allow the swap device to optimize its we    343  * to allow the swap device to optimize its wear-levelling.
344  */                                               344  */
345 static void discard_swap_cluster(struct swap_i    345 static void discard_swap_cluster(struct swap_info_struct *si,
346                                  pgoff_t start    346                                  pgoff_t start_page, pgoff_t nr_pages)
347 {                                                 347 {
348         struct swap_extent *se = offset_to_swa    348         struct swap_extent *se = offset_to_swap_extent(si, start_page);
349                                                   349 
350         while (nr_pages) {                        350         while (nr_pages) {
351                 pgoff_t offset = start_page -     351                 pgoff_t offset = start_page - se->start_page;
352                 sector_t start_block = se->sta    352                 sector_t start_block = se->start_block + offset;
353                 sector_t nr_blocks = se->nr_pa    353                 sector_t nr_blocks = se->nr_pages - offset;
354                                                   354 
355                 if (nr_blocks > nr_pages)         355                 if (nr_blocks > nr_pages)
356                         nr_blocks = nr_pages;     356                         nr_blocks = nr_pages;
357                 start_page += nr_blocks;          357                 start_page += nr_blocks;
358                 nr_pages -= nr_blocks;            358                 nr_pages -= nr_blocks;
359                                                   359 
360                 start_block <<= PAGE_SHIFT - 9    360                 start_block <<= PAGE_SHIFT - 9;
361                 nr_blocks <<= PAGE_SHIFT - 9;     361                 nr_blocks <<= PAGE_SHIFT - 9;
362                 if (blkdev_issue_discard(si->b    362                 if (blkdev_issue_discard(si->bdev, start_block,
363                                         nr_blo    363                                         nr_blocks, GFP_NOIO))
364                         break;                    364                         break;
365                                                   365 
366                 se = next_se(se);                 366                 se = next_se(se);
367         }                                         367         }
368 }                                                 368 }
369                                                   369 
#ifndef CONFIG_THP_SWAP
/* Without THP swap a cluster covers a fixed 256 swap slots. */
#define SWAPFILE_CLUSTER	256

/*
 * Define swap_entry_order() as a constant so the compiler can optimize
 * out some code if !CONFIG_THP_SWAP
 */
#define swap_entry_order(order)	0
#else
/* With THP swap a cluster covers HPAGE_PMD_NR swap slots. */
#define SWAPFILE_CLUSTER	HPAGE_PMD_NR

/* The requested order is used as-is. */
#define swap_entry_order(order)	(order)
#endif
#define LATENCY_LIMIT		256
384                                                   384 
385 static inline bool cluster_is_free(struct swap    385 static inline bool cluster_is_free(struct swap_cluster_info *info)
386 {                                                 386 {
387         return info->flags & CLUSTER_FLAG_FREE    387         return info->flags & CLUSTER_FLAG_FREE;
388 }                                                 388 }
389                                                   389 
390 static inline unsigned int cluster_index(struc    390 static inline unsigned int cluster_index(struct swap_info_struct *si,
391                                          struc    391                                          struct swap_cluster_info *ci)
392 {                                                 392 {
393         return ci - si->cluster_info;             393         return ci - si->cluster_info;
394 }                                                 394 }
395                                                   395 
396 static inline unsigned int cluster_offset(stru    396 static inline unsigned int cluster_offset(struct swap_info_struct *si,
397                                           stru    397                                           struct swap_cluster_info *ci)
398 {                                                 398 {
399         return cluster_index(si, ci) * SWAPFIL    399         return cluster_index(si, ci) * SWAPFILE_CLUSTER;
400 }                                                 400 }
401                                                   401 
402 static inline struct swap_cluster_info *lock_c    402 static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
403                                                   403                                                      unsigned long offset)
404 {                                                 404 {
405         struct swap_cluster_info *ci;             405         struct swap_cluster_info *ci;
406                                                   406 
407         ci = si->cluster_info;                    407         ci = si->cluster_info;
408         if (ci) {                                 408         if (ci) {
409                 ci += offset / SWAPFILE_CLUSTE    409                 ci += offset / SWAPFILE_CLUSTER;
410                 spin_lock(&ci->lock);             410                 spin_lock(&ci->lock);
411         }                                         411         }
412         return ci;                                412         return ci;
413 }                                                 413 }
414                                                   414 
415 static inline void unlock_cluster(struct swap_    415 static inline void unlock_cluster(struct swap_cluster_info *ci)
416 {                                                 416 {
417         if (ci)                                   417         if (ci)
418                 spin_unlock(&ci->lock);           418                 spin_unlock(&ci->lock);
419 }                                                 419 }
420                                                   420 
421 /*                                                421 /*
422  * Determine the locking method in use for thi    422  * Determine the locking method in use for this device.  Return
423  * swap_cluster_info if SSD-style cluster-base    423  * swap_cluster_info if SSD-style cluster-based locking is in place.
424  */                                               424  */
425 static inline struct swap_cluster_info *lock_c    425 static inline struct swap_cluster_info *lock_cluster_or_swap_info(
426                 struct swap_info_struct *si, u    426                 struct swap_info_struct *si, unsigned long offset)
427 {                                                 427 {
428         struct swap_cluster_info *ci;             428         struct swap_cluster_info *ci;
429                                                   429 
430         /* Try to use fine-grained SSD-style l    430         /* Try to use fine-grained SSD-style locking if available: */
431         ci = lock_cluster(si, offset);            431         ci = lock_cluster(si, offset);
432         /* Otherwise, fall back to traditional    432         /* Otherwise, fall back to traditional, coarse locking: */
433         if (!ci)                                  433         if (!ci)
434                 spin_lock(&si->lock);             434                 spin_lock(&si->lock);
435                                                   435 
436         return ci;                                436         return ci;
437 }                                                 437 }
438                                                   438 
439 static inline void unlock_cluster_or_swap_info    439 static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
440                                                   440                                                struct swap_cluster_info *ci)
441 {                                                 441 {
442         if (ci)                                   442         if (ci)
443                 unlock_cluster(ci);               443                 unlock_cluster(ci);
444         else                                      444         else
445                 spin_unlock(&si->lock);           445                 spin_unlock(&si->lock);
446 }                                                 446 }
447                                                   447 
448 /* Add a cluster to discard list and schedule     448 /* Add a cluster to discard list and schedule it to do discard */
449 static void swap_cluster_schedule_discard(stru    449 static void swap_cluster_schedule_discard(struct swap_info_struct *si,
450                 struct swap_cluster_info *ci)     450                 struct swap_cluster_info *ci)
451 {                                                 451 {
452         unsigned int idx = cluster_index(si, c    452         unsigned int idx = cluster_index(si, ci);
453         /*                                        453         /*
454          * If scan_swap_map_slots() can't find    454          * If scan_swap_map_slots() can't find a free cluster, it will check
455          * si->swap_map directly. To make sure    455          * si->swap_map directly. To make sure the discarding cluster isn't
456          * taken by scan_swap_map_slots(), mar    456          * taken by scan_swap_map_slots(), mark the swap entries bad (occupied).
457          * It will be cleared after discard       457          * It will be cleared after discard
458          */                                       458          */
459         memset(si->swap_map + idx * SWAPFILE_C    459         memset(si->swap_map + idx * SWAPFILE_CLUSTER,
460                         SWAP_MAP_BAD, SWAPFILE    460                         SWAP_MAP_BAD, SWAPFILE_CLUSTER);
461                                                   461 
462         VM_BUG_ON(ci->flags & CLUSTER_FLAG_FRE    462         VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE);
463         list_move_tail(&ci->list, &si->discard    463         list_move_tail(&ci->list, &si->discard_clusters);
464         ci->flags = 0;                            464         ci->flags = 0;
465         schedule_work(&si->discard_work);         465         schedule_work(&si->discard_work);
466 }                                                 466 }
467                                                   467 
468 static void __free_cluster(struct swap_info_st    468 static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
469 {                                                 469 {
470         lockdep_assert_held(&si->lock);           470         lockdep_assert_held(&si->lock);
471         lockdep_assert_held(&ci->lock);           471         lockdep_assert_held(&ci->lock);
472                                                   472 
473         if (ci->flags)                            473         if (ci->flags)
474                 list_move_tail(&ci->list, &si-    474                 list_move_tail(&ci->list, &si->free_clusters);
475         else                                      475         else
476                 list_add_tail(&ci->list, &si->    476                 list_add_tail(&ci->list, &si->free_clusters);
477         ci->flags = CLUSTER_FLAG_FREE;            477         ci->flags = CLUSTER_FLAG_FREE;
478         ci->order = 0;                            478         ci->order = 0;
479 }                                                 479 }
480                                                   480 
481 /*                                                481 /*
482  * Doing discard actually. After a cluster dis    482  * Doing discard actually. After a cluster discard is finished, the cluster
483  * will be added to free cluster list. caller     483  * will be added to free cluster list. caller should hold si->lock.
484 */                                                484 */
485 static void swap_do_scheduled_discard(struct s    485 static void swap_do_scheduled_discard(struct swap_info_struct *si)
486 {                                                 486 {
487         struct swap_cluster_info *ci;             487         struct swap_cluster_info *ci;
488         unsigned int idx;                         488         unsigned int idx;
489                                                   489 
490         while (!list_empty(&si->discard_cluste    490         while (!list_empty(&si->discard_clusters)) {
491                 ci = list_first_entry(&si->dis    491                 ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list);
492                 list_del(&ci->list);              492                 list_del(&ci->list);
493                 idx = cluster_index(si, ci);      493                 idx = cluster_index(si, ci);
494                 spin_unlock(&si->lock);           494                 spin_unlock(&si->lock);
495                                                   495 
496                 discard_swap_cluster(si, idx *    496                 discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
497                                 SWAPFILE_CLUST    497                                 SWAPFILE_CLUSTER);
498                                                   498 
499                 spin_lock(&si->lock);             499                 spin_lock(&si->lock);
500                 spin_lock(&ci->lock);             500                 spin_lock(&ci->lock);
501                 __free_cluster(si, ci);           501                 __free_cluster(si, ci);
502                 memset(si->swap_map + idx * SW    502                 memset(si->swap_map + idx * SWAPFILE_CLUSTER,
503                                 0, SWAPFILE_CL    503                                 0, SWAPFILE_CLUSTER);
504                 spin_unlock(&ci->lock);           504                 spin_unlock(&ci->lock);
505         }                                         505         }
506 }                                                 506 }
507                                                   507 
508 static void swap_discard_work(struct work_stru    508 static void swap_discard_work(struct work_struct *work)
509 {                                                 509 {
510         struct swap_info_struct *si;              510         struct swap_info_struct *si;
511                                                   511 
512         si = container_of(work, struct swap_in    512         si = container_of(work, struct swap_info_struct, discard_work);
513                                                   513 
514         spin_lock(&si->lock);                     514         spin_lock(&si->lock);
515         swap_do_scheduled_discard(si);            515         swap_do_scheduled_discard(si);
516         spin_unlock(&si->lock);                   516         spin_unlock(&si->lock);
517 }                                                 517 }
518                                                   518 
519 static void swap_users_ref_free(struct percpu_    519 static void swap_users_ref_free(struct percpu_ref *ref)
520 {                                                 520 {
521         struct swap_info_struct *si;              521         struct swap_info_struct *si;
522                                                   522 
523         si = container_of(ref, struct swap_inf    523         si = container_of(ref, struct swap_info_struct, users);
524         complete(&si->comp);                      524         complete(&si->comp);
525 }                                                 525 }
526                                                   526 
527 static void free_cluster(struct swap_info_stru    527 static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
528 {                                                 528 {
529         VM_BUG_ON(ci->count != 0);                529         VM_BUG_ON(ci->count != 0);
530         lockdep_assert_held(&si->lock);           530         lockdep_assert_held(&si->lock);
531         lockdep_assert_held(&ci->lock);           531         lockdep_assert_held(&ci->lock);
532                                                   532 
533         if (ci->flags & CLUSTER_FLAG_FRAG)        533         if (ci->flags & CLUSTER_FLAG_FRAG)
534                 si->frag_cluster_nr[ci->order]    534                 si->frag_cluster_nr[ci->order]--;
535                                                   535 
536         /*                                        536         /*
537          * If the swap is discardable, prepare    537          * If the swap is discardable, prepare discard the cluster
538          * instead of free it immediately. The    538          * instead of free it immediately. The cluster will be freed
539          * after discard.                         539          * after discard.
540          */                                       540          */
541         if ((si->flags & (SWP_WRITEOK | SWP_PA    541         if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
542             (SWP_WRITEOK | SWP_PAGE_DISCARD))     542             (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
543                 swap_cluster_schedule_discard(    543                 swap_cluster_schedule_discard(si, ci);
544                 return;                           544                 return;
545         }                                         545         }
546                                                   546 
547         __free_cluster(si, ci);                   547         __free_cluster(si, ci);
548 }                                                 548 }
549                                                   549 
550 /*                                                550 /*
551  * The cluster corresponding to page_nr will b    551  * The cluster corresponding to page_nr will be used. The cluster will not be
552  * added to free cluster list and its usage co    552  * added to free cluster list and its usage counter will be increased by 1.
553  * Only used for initialization.                  553  * Only used for initialization.
554  */                                               554  */
555 static void inc_cluster_info_page(struct swap_    555 static void inc_cluster_info_page(struct swap_info_struct *si,
556         struct swap_cluster_info *cluster_info    556         struct swap_cluster_info *cluster_info, unsigned long page_nr)
557 {                                                 557 {
558         unsigned long idx = page_nr / SWAPFILE    558         unsigned long idx = page_nr / SWAPFILE_CLUSTER;
559         struct swap_cluster_info *ci;             559         struct swap_cluster_info *ci;
560                                                   560 
561         if (!cluster_info)                        561         if (!cluster_info)
562                 return;                           562                 return;
563                                                   563 
564         ci = cluster_info + idx;                  564         ci = cluster_info + idx;
565         ci->count++;                              565         ci->count++;
566                                                   566 
567         VM_BUG_ON(ci->count > SWAPFILE_CLUSTER    567         VM_BUG_ON(ci->count > SWAPFILE_CLUSTER);
568         VM_BUG_ON(ci->flags);                     568         VM_BUG_ON(ci->flags);
569 }                                                 569 }
570                                                   570 
571 /*                                                571 /*
572  * The cluster ci decreases @nr_pages usage. I    572  * The cluster ci decreases @nr_pages usage. If the usage counter becomes 0,
573  * which means no page in the cluster is in us    573  * which means no page in the cluster is in use, we can optionally discard
574  * the cluster and add it to free cluster list    574  * the cluster and add it to free cluster list.
575  */                                               575  */
576 static void dec_cluster_info_page(struct swap_    576 static void dec_cluster_info_page(struct swap_info_struct *si,
577                                   struct swap_    577                                   struct swap_cluster_info *ci, int nr_pages)
578 {                                                 578 {
579         if (!si->cluster_info)                    579         if (!si->cluster_info)
580                 return;                           580                 return;
581                                                   581 
582         VM_BUG_ON(ci->count < nr_pages);          582         VM_BUG_ON(ci->count < nr_pages);
583         VM_BUG_ON(cluster_is_free(ci));           583         VM_BUG_ON(cluster_is_free(ci));
584         lockdep_assert_held(&si->lock);           584         lockdep_assert_held(&si->lock);
585         lockdep_assert_held(&ci->lock);           585         lockdep_assert_held(&ci->lock);
586         ci->count -= nr_pages;                    586         ci->count -= nr_pages;
587                                                   587 
588         if (!ci->count) {                         588         if (!ci->count) {
589                 free_cluster(si, ci);             589                 free_cluster(si, ci);
590                 return;                           590                 return;
591         }                                         591         }
592                                                   592 
593         if (!(ci->flags & CLUSTER_FLAG_NONFULL    593         if (!(ci->flags & CLUSTER_FLAG_NONFULL)) {
594                 VM_BUG_ON(ci->flags & CLUSTER_    594                 VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE);
595                 if (ci->flags & CLUSTER_FLAG_F    595                 if (ci->flags & CLUSTER_FLAG_FRAG)
596                         si->frag_cluster_nr[ci    596                         si->frag_cluster_nr[ci->order]--;
597                 list_move_tail(&ci->list, &si-    597                 list_move_tail(&ci->list, &si->nonfull_clusters[ci->order]);
598                 ci->flags = CLUSTER_FLAG_NONFU    598                 ci->flags = CLUSTER_FLAG_NONFULL;
599         }                                         599         }
600 }                                                 600 }
601                                                   601 
602 static bool cluster_reclaim_range(struct swap_    602 static bool cluster_reclaim_range(struct swap_info_struct *si,
603                                   struct swap_    603                                   struct swap_cluster_info *ci,
604                                   unsigned lon    604                                   unsigned long start, unsigned long end)
605 {                                                 605 {
606         unsigned char *map = si->swap_map;        606         unsigned char *map = si->swap_map;
607         unsigned long offset;                     607         unsigned long offset;
608                                                   608 
609         spin_unlock(&ci->lock);                   609         spin_unlock(&ci->lock);
610         spin_unlock(&si->lock);                   610         spin_unlock(&si->lock);
611                                                   611 
612         for (offset = start; offset < end; off    612         for (offset = start; offset < end; offset++) {
613                 switch (READ_ONCE(map[offset])    613                 switch (READ_ONCE(map[offset])) {
614                 case 0:                           614                 case 0:
615                         continue;                 615                         continue;
616                 case SWAP_HAS_CACHE:              616                 case SWAP_HAS_CACHE:
617                         if (__try_to_reclaim_s    617                         if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0)
618                                 continue;         618                                 continue;
619                         goto out;                 619                         goto out;
620                 default:                          620                 default:
621                         goto out;                 621                         goto out;
622                 }                                 622                 }
623         }                                         623         }
624 out:                                              624 out:
625         spin_lock(&si->lock);                     625         spin_lock(&si->lock);
626         spin_lock(&ci->lock);                     626         spin_lock(&ci->lock);
627                                                   627 
628         /*                                        628         /*
629          * Recheck the range no matter reclaim    629          * Recheck the range no matter reclaim succeeded or not, the slot
630          * could have been be freed while we a    630          * could have been be freed while we are not holding the lock.
631          */                                       631          */
632         for (offset = start; offset < end; off    632         for (offset = start; offset < end; offset++)
633                 if (READ_ONCE(map[offset]))       633                 if (READ_ONCE(map[offset]))
634                         return false;             634                         return false;
635                                                   635 
636         return true;                              636         return true;
637 }                                                 637 }
638                                                   638 
639 static bool cluster_scan_range(struct swap_inf    639 static bool cluster_scan_range(struct swap_info_struct *si,
640                                struct swap_clu    640                                struct swap_cluster_info *ci,
641                                unsigned long s    641                                unsigned long start, unsigned int nr_pages)
642 {                                                 642 {
643         unsigned long offset, end = start + nr    643         unsigned long offset, end = start + nr_pages;
644         unsigned char *map = si->swap_map;        644         unsigned char *map = si->swap_map;
645         bool need_reclaim = false;                645         bool need_reclaim = false;
646                                                   646 
647         for (offset = start; offset < end; off    647         for (offset = start; offset < end; offset++) {
648                 switch (READ_ONCE(map[offset])    648                 switch (READ_ONCE(map[offset])) {
649                 case 0:                           649                 case 0:
650                         continue;                 650                         continue;
651                 case SWAP_HAS_CACHE:              651                 case SWAP_HAS_CACHE:
652                         if (!vm_swap_full())      652                         if (!vm_swap_full())
653                                 return false;     653                                 return false;
654                         need_reclaim = true;      654                         need_reclaim = true;
655                         continue;                 655                         continue;
656                 default:                          656                 default:
657                         return false;             657                         return false;
658                 }                                 658                 }
659         }                                         659         }
660                                                   660 
661         if (need_reclaim)                         661         if (need_reclaim)
662                 return cluster_reclaim_range(s    662                 return cluster_reclaim_range(si, ci, start, end);
663                                                   663 
664         return true;                              664         return true;
665 }                                                 665 }
666                                                   666 
667 static void cluster_alloc_range(struct swap_in    667 static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
668                                 unsigned int s    668                                 unsigned int start, unsigned char usage,
669                                 unsigned int o    669                                 unsigned int order)
670 {                                                 670 {
671         unsigned int nr_pages = 1 << order;       671         unsigned int nr_pages = 1 << order;
672                                                   672 
673         if (cluster_is_free(ci)) {                673         if (cluster_is_free(ci)) {
674                 if (nr_pages < SWAPFILE_CLUSTE    674                 if (nr_pages < SWAPFILE_CLUSTER) {
675                         list_move_tail(&ci->li    675                         list_move_tail(&ci->list, &si->nonfull_clusters[order]);
676                         ci->flags = CLUSTER_FL    676                         ci->flags = CLUSTER_FLAG_NONFULL;
677                 }                                 677                 }
678                 ci->order = order;                678                 ci->order = order;
679         }                                         679         }
680                                                   680 
681         memset(si->swap_map + start, usage, nr    681         memset(si->swap_map + start, usage, nr_pages);
682         swap_range_alloc(si, start, nr_pages);    682         swap_range_alloc(si, start, nr_pages);
683         ci->count += nr_pages;                    683         ci->count += nr_pages;
684                                                   684 
685         if (ci->count == SWAPFILE_CLUSTER) {      685         if (ci->count == SWAPFILE_CLUSTER) {
686                 VM_BUG_ON(!(ci->flags &           686                 VM_BUG_ON(!(ci->flags &
687                           (CLUSTER_FLAG_FREE |    687                           (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG)));
688                 if (ci->flags & CLUSTER_FLAG_F    688                 if (ci->flags & CLUSTER_FLAG_FRAG)
689                         si->frag_cluster_nr[ci    689                         si->frag_cluster_nr[ci->order]--;
690                 list_move_tail(&ci->list, &si-    690                 list_move_tail(&ci->list, &si->full_clusters);
691                 ci->flags = CLUSTER_FLAG_FULL;    691                 ci->flags = CLUSTER_FLAG_FULL;
692         }                                         692         }
693 }                                                 693 }
694                                                   694 
/*
 * Scan the cluster that contains @offset for a run of (1 << @order) slots
 * and allocate the first run that cluster_scan_range() accepts.
 *
 * The scan starts at @offset and advances in steps of nr_pages; it is
 * bounded by the cluster's end and by si->max.  The cluster is locked with
 * lock_cluster() for the duration of the scan and unlocked before returning.
 *
 * On success the allocated offset is stored in *@foundp; *@foundp is left
 * untouched when nothing is allocated.
 *
 * Returns the offset following the allocated range, or SWAP_NEXT_INVALID
 * when the cluster cannot take nr_pages more entries, when the allocation
 * filled the cluster, or when the scan position moved past the last offset
 * at which a range still fits.
 */
695 static unsigned int alloc_swap_scan_cluster(st    695 static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset,
696                                             un    696                                             unsigned int *foundp, unsigned int order,
697                                             un    697                                             unsigned char usage)
698 {                                                 698 {
699         unsigned long start = offset & ~(SWAPF    699         unsigned long start = offset & ~(SWAPFILE_CLUSTER - 1);
700         unsigned long end = min(start + SWAPFI    700         unsigned long end = min(start + SWAPFILE_CLUSTER, si->max);
701         unsigned int nr_pages = 1 << order;       701         unsigned int nr_pages = 1 << order;
702         struct swap_cluster_info *ci;             702         struct swap_cluster_info *ci;
703                                                   703 
            /*
             * Turn end into the last offset at which an nr_pages-long range
             * still fits; the check keeps the unsigned subtraction from wrapping.
             */
704         if (end < nr_pages)                       704         if (end < nr_pages)
705                 return SWAP_NEXT_INVALID;         705                 return SWAP_NEXT_INVALID;
706         end -= nr_pages;                          706         end -= nr_pages;
707                                                   707 
708         ci = lock_cluster(si, offset);            708         ci = lock_cluster(si, offset);
            /* The cluster cannot take nr_pages more entries. */
709         if (ci->count + nr_pages > SWAPFILE_CL    709         if (ci->count + nr_pages > SWAPFILE_CLUSTER) {
710                 offset = SWAP_NEXT_INVALID;       710                 offset = SWAP_NEXT_INVALID;
711                 goto done;                        711                 goto done;
712         }                                         712         }
713                                                   713 
714         while (offset <= end) {                   714         while (offset <= end) {
715                 if (cluster_scan_range(si, ci,    715                 if (cluster_scan_range(si, ci, offset, nr_pages)) {
716                         cluster_alloc_range(si    716                         cluster_alloc_range(si, ci, offset, usage, order);
717                         *foundp = offset;         717                         *foundp = offset;
                            /* That allocation filled the cluster. */
718                         if (ci->count == SWAPF    718                         if (ci->count == SWAPFILE_CLUSTER) {
719                                 offset = SWAP_    719                                 offset = SWAP_NEXT_INVALID;
720                                 goto done;        720                                 goto done;
721                         }                         721                         }
                            /* Hand back the slot following the allocated range. */
722                         offset += nr_pages;       722                         offset += nr_pages;
723                         break;                    723                         break;
724                 }                                 724                 }
725                 offset += nr_pages;               725                 offset += nr_pages;
726         }                                         726         }
            /* Past the last usable start position: nothing left to try here. */
727         if (offset > end)                         727         if (offset > end)
728                 offset = SWAP_NEXT_INVALID;       728                 offset = SWAP_NEXT_INVALID;
729 done:                                             729 done:
730         unlock_cluster(ci);                       730         unlock_cluster(ci);
731         return offset;                            731         return offset;
732 }                                                 732 }
733                                                   733 
734 /* Try to reclaim slots in full clusters */       734 /* Try to reclaim slots in full clusters */
/*
 * Visits clusters from the head of si->full_clusters, rotating each visited
 * cluster to the tail of that list.  For every visited cluster, each slot
 * whose swap_map byte equals SWAP_HAS_CACHE is handed to
 * __try_to_reclaim_swap() with TTRS_ANYWAY | TTRS_DIRECT.
 *
 * si->lock is released around the per-slot scan and re-acquired afterwards,
 * so the function expects to be entered with si->lock held and returns with
 * it held.
 *
 * @force: when false, one cluster is scanned; when true, the budget is
 *         si->inuse_pages / SWAPFILE_CLUSTER clusters.  The budget is
 *         checked after each scan, so a non-empty list always has at least
 *         one cluster scanned.
 *
 * The function is void: it reports nothing back to the caller.
 */
735 static void swap_reclaim_full_clusters(struct     735 static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
736 {                                                 736 {
737         long to_scan = 1;                         737         long to_scan = 1;
738         unsigned long offset, end;                738         unsigned long offset, end;
739         struct swap_cluster_info *ci;             739         struct swap_cluster_info *ci;
740         unsigned char *map = si->swap_map;        740         unsigned char *map = si->swap_map;
741         int nr_reclaim;                           741         int nr_reclaim;
742                                                   742 
743         if (force)                                743         if (force)
744                 to_scan = si->inuse_pages / SW    744                 to_scan = si->inuse_pages / SWAPFILE_CLUSTER;
745                                                   745 
746         while (!list_empty(&si->full_clusters)    746         while (!list_empty(&si->full_clusters)) {
747                 ci = list_first_entry(&si->ful    747                 ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list);
                    /* Rotate to the tail so the next pass starts with a different cluster. */
748                 list_move_tail(&ci->list, &si-    748                 list_move_tail(&ci->list, &si->full_clusters);
749                 offset = cluster_offset(si, ci    749                 offset = cluster_offset(si, ci);
750                 end = min(si->max, offset + SW    750                 end = min(si->max, offset + SWAPFILE_CLUSTER);
751                 to_scan--;                        751                 to_scan--;
752                                                   752 
                    /* The slot scan below runs without si->lock; map[] is read with READ_ONCE(). */
753                 spin_unlock(&si->lock);           753                 spin_unlock(&si->lock);
754                 while (offset < end) {            754                 while (offset < end) {
755                         if (READ_ONCE(map[offs    755                         if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) {
756                                 nr_reclaim = _    756                                 nr_reclaim = __try_to_reclaim_swap(si, offset,
757                                                   757                                                                    TTRS_ANYWAY | TTRS_DIRECT);
                                    /* A non-zero result (either sign) gives the number of slots to skip. */
758                                 if (nr_reclaim    758                                 if (nr_reclaim) {
759                                         offset    759                                         offset += abs(nr_reclaim);
760                                         contin    760                                         continue;
761                                 }                 761                                 }
762                         }                         762                         }
763                         offset++;                 763                         offset++;
764                 }                                 764                 }
765                 spin_lock(&si->lock);             765                 spin_lock(&si->lock);
766                                                   766 
767                 if (to_scan <= 0)                 767                 if (to_scan <= 0)
768                         break;                    768                         break;
769         }                                         769         }
770 }                                                 770 }
771                                                   771 
/*
 * Work item callback: recovers the swap_info_struct that embeds @work as its
 * reclaim_work member, then runs a forced swap_reclaim_full_clusters() pass
 * with si->lock held.
 *
 * NOTE(review): presumably installed as the handler of si->reclaim_work,
 * which swap_range_alloc() schedules; the registration is not in this chunk.
 */
772 static void swap_reclaim_work(struct work_stru    772 static void swap_reclaim_work(struct work_struct *work)
773 {                                                 773 {
774         struct swap_info_struct *si;              774         struct swap_info_struct *si;
775                                                   775 
776         si = container_of(work, struct swap_in    776         si = container_of(work, struct swap_info_struct, reclaim_work);
777                                                   777 
778         spin_lock(&si->lock);                     778         spin_lock(&si->lock);
779         swap_reclaim_full_clusters(si, true);     779         swap_reclaim_full_clusters(si, true);
780         spin_unlock(&si->lock);                   780         spin_unlock(&si->lock);
781 }                                                 781 }
782                                                   782 
783 /*                                                783 /*
784  * Try to get swap entries with specified orde    784  * Try to get swap entries with specified order from current cpu's swap entry
785  * pool (a cluster). This might involve alloca    785  * pool (a cluster). This might involve allocating a new cluster for current CPU
786  * too.                                           786  * too.
787  */                                               787  */
/*
 * Must be called with si->lock held (see lockdep_assert_held() below).
 *
 * Sources are tried in this order, stopping at the first success:
 *   1. the offset remembered in this CPU's cluster->next[order];
 *   2. the first cluster on si->free_clusters;
 *   3. (after an optional non-forced reclaim of full clusters when
 *      vm_swap_full()) for order < PMD_ORDER, the clusters on
 *      si->nonfull_clusters[order], then those on si->frag_clusters[order];
 *   4. if si->discard_clusters is not empty, run the scheduled discard and
 *      start over from step 1;
 *   5. for order 0 only, the frag and nonfull lists of every higher order.
 *
 * Before returning, cluster->next[order] is set to the value returned by the
 * last alloc_swap_scan_cluster() call (or to the initial hint when no scan
 * ran).
 *
 * Returns the allocated offset, or 0 when nothing could be allocated.
 */
788 static unsigned long cluster_alloc_swap_entry(    788 static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
789                                                   789                                               unsigned char usage)
790 {                                                 790 {
791         struct percpu_cluster *cluster;           791         struct percpu_cluster *cluster;
792         struct swap_cluster_info *ci;             792         struct swap_cluster_info *ci;
793         unsigned int offset, found = 0;           793         unsigned int offset, found = 0;
794                                                   794 
795 new_cluster:                                      795 new_cluster:
796         lockdep_assert_held(&si->lock);           796         lockdep_assert_held(&si->lock);
797         cluster = this_cpu_ptr(si->percpu_clus    797         cluster = this_cpu_ptr(si->percpu_cluster);
            /* Step 1: a non-zero hint means there is a cluster to continue in. */
798         offset = cluster->next[order];            798         offset = cluster->next[order];
799         if (offset) {                             799         if (offset) {
800                 offset = alloc_swap_scan_clust    800                 offset = alloc_swap_scan_cluster(si, offset, &found, order, usage);
801                 if (found)                        801                 if (found)
802                         goto done;                802                         goto done;
803         }                                         803         }
804                                                   804 
            /* Step 2: allocating from a cluster on the free list is asserted to succeed. */
805         if (!list_empty(&si->free_clusters)) {    805         if (!list_empty(&si->free_clusters)) {
806                 ci = list_first_entry(&si->fre    806                 ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
807                 offset = alloc_swap_scan_clust    807                 offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage);
808                 VM_BUG_ON(!found);                808                 VM_BUG_ON(!found);
809                 goto done;                        809                 goto done;
810         }                                         810         }
811                                                   811 
812         /* Try reclaim from full clusters if f    812         /* Try reclaim from full clusters if free clusters list is drained */
813         if (vm_swap_full())                       813         if (vm_swap_full())
814                 swap_reclaim_full_clusters(si,    814                 swap_reclaim_full_clusters(si, false);
815                                                   815 
            /* Step 3: partially used clusters of this order; frags counts clusters scanned. */
816         if (order < PMD_ORDER) {                  816         if (order < PMD_ORDER) {
817                 unsigned int frags = 0;           817                 unsigned int frags = 0;
818                                                   818 
819                 while (!list_empty(&si->nonful    819                 while (!list_empty(&si->nonfull_clusters[order])) {
820                         ci = list_first_entry(    820                         ci = list_first_entry(&si->nonfull_clusters[order],
821                                                   821                                               struct swap_cluster_info, list);
                            /* The cluster is moved to the frag list before it is scanned. */
822                         list_move_tail(&ci->li    822                         list_move_tail(&ci->list, &si->frag_clusters[order]);
823                         ci->flags = CLUSTER_FL    823                         ci->flags = CLUSTER_FLAG_FRAG;
824                         si->frag_cluster_nr[or    824                         si->frag_cluster_nr[order]++;
825                         offset = alloc_swap_sc    825                         offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
826                                                   826                                                          &found, order, usage);
827                         frags++;                  827                         frags++;
828                         if (found)                828                         if (found)
829                                 break;            829                                 break;
830                 }                                 830                 }
831                                                   831 
832                 if (!found) {                     832                 if (!found) {
833                         /*                        833                         /*
834                          * Nonfull clusters ar    834                          * Nonfull clusters are moved to frag tail if we reached
835                          * here, count them to    835                          * here, count them too, don't over scan the frag list.
836                          */                       836                          */
837                         while (frags < si->fra    837                         while (frags < si->frag_cluster_nr[order]) {
838                                 ci = list_firs    838                                 ci = list_first_entry(&si->frag_clusters[order],
839                                                   839                                                       struct swap_cluster_info, list);
840                                 /*                840                                 /*
841                                  * Rotate the     841                                  * Rotate the frag list to iterate, they were all failing
842                                  * high order     842                                  * high order allocation or moved here due to per-CPU usage,
843                                  * this help k    843                                  * this help keeping usable cluster ahead.
844                                  */               844                                  */
845                                 list_move_tail    845                                 list_move_tail(&ci->list, &si->frag_clusters[order]);
846                                 offset = alloc    846                                 offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
847                                                   847                                                                  &found, order, usage);
848                                 frags++;          848                                 frags++;
849                                 if (found)        849                                 if (found)
850                                         break;    850                                         break;
851                         }                         851                         }
852                 }                                 852                 }
853         }                                         853         }
854                                                   854 
855         if (found)                                855         if (found)
856                 goto done;                        856                 goto done;
857                                                   857 
            /* Step 4: pending discards may yield free clusters; retry from the top. */
858         if (!list_empty(&si->discard_clusters)    858         if (!list_empty(&si->discard_clusters)) {
859                 /*                                859                 /*
860                  * we don't have free cluster     860                  * we don't have free cluster but have some clusters in
861                  * discarding, do discard now     861                  * discarding, do discard now and reclaim them, then
862                  * reread cluster_next_cpu sin    862                  * reread cluster_next_cpu since we dropped si->lock
863                  */                               863                  */
864                 swap_do_scheduled_discard(si);    864                 swap_do_scheduled_discard(si);
865                 goto new_cluster;                 865                 goto new_cluster;
866         }                                         866         }
867                                                   867 
            /* Only order-0 requests go on to step 5. */
868         if (order)                                868         if (order)
869                 goto done;                        869                 goto done;
870                                                   870 
871         /* Order 0 stealing from higher order     871         /* Order 0 stealing from higher order */
            /*
             * NOTE(review): neither loop below moves ci itself, so forward
             * progress relies on the scan succeeding or on the callee changing
             * list membership -- confirm a failed scan cannot repeat forever.
             */
872         for (int o = 1; o < SWAP_NR_ORDERS; o+    872         for (int o = 1; o < SWAP_NR_ORDERS; o++) {
873                 /*                                873                 /*
874                  * Clusters here have at least    874                  * Clusters here have at least one usable slots and can't fail order 0
875                  * allocation, but reclaim may    875                  * allocation, but reclaim may drop si->lock and race with another user.
876                  */                               876                  */
877                 while (!list_empty(&si->frag_c    877                 while (!list_empty(&si->frag_clusters[o])) {
878                         ci = list_first_entry(    878                         ci = list_first_entry(&si->frag_clusters[o],
879                                                   879                                               struct swap_cluster_info, list);
880                         offset = alloc_swap_sc    880                         offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
881                                                   881                                                          &found, 0, usage);
882                         if (found)                882                         if (found)
883                                 goto done;        883                                 goto done;
884                 }                                 884                 }
885                                                   885 
886                 while (!list_empty(&si->nonful    886                 while (!list_empty(&si->nonfull_clusters[o])) {
887                         ci = list_first_entry(    887                         ci = list_first_entry(&si->nonfull_clusters[o],
888                                                   888                                               struct swap_cluster_info, list);
889                         offset = alloc_swap_sc    889                         offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
890                                                   890                                                          &found, 0, usage);
891                         if (found)                891                         if (found)
892                                 goto done;        892                                 goto done;
893                 }                                 893                 }
894         }                                         894         }
895                                                   895 
896 done:                                             896 done:
            /* Store the last scan result as this CPU's hint for the order. */
897         cluster->next[order] = offset;            897         cluster->next[order] = offset;
898         return found;                             898         return found;
899 }                                                 899 }
900                                                   900 
/*
 * Remove @si's per-node entries from the swap_avail_heads[] priority lists,
 * one plist_del() per node.  Asserts that si->lock is held.
 *
 * NOTE(review): takes no list lock itself; del_from_avail_list() wraps the
 * call in swap_avail_lock, so other callers presumably must hold it too.
 */
901 static void __del_from_avail_list(struct swap_    901 static void __del_from_avail_list(struct swap_info_struct *si)
902 {                                                 902 {
903         int nid;                                  903         int nid;
904                                                   904 
905         assert_spin_locked(&si->lock);            905         assert_spin_locked(&si->lock);
906         for_each_node(nid)                        906         for_each_node(nid)
907                 plist_del(&si->avail_lists[nid    907                 plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]);
908 }                                                 908 }
909                                                   909 
/*
 * Locked wrapper: takes swap_avail_lock around __del_from_avail_list().
 * The callee asserts si->lock, so the caller must already hold it.
 */
910 static void del_from_avail_list(struct swap_in    910 static void del_from_avail_list(struct swap_info_struct *si)
911 {                                                 911 {
912         spin_lock(&swap_avail_lock);              912         spin_lock(&swap_avail_lock);
913         __del_from_avail_list(si);                913         __del_from_avail_list(si);
914         spin_unlock(&swap_avail_lock);            914         spin_unlock(&swap_avail_lock);
915 }                                                 915 }
916                                                   916 
/*
 * Update @si's bookkeeping after nr_entries slots starting at @offset have
 * been allocated:
 *  - lowest_bit moves up when the range starts exactly at it;
 *  - highest_bit moves down when the range ends exactly at it;
 *  - inuse_pages grows by nr_entries.
 *
 * When inuse_pages reaches si->pages the device is marked as having no free
 * range (lowest_bit = si->max, highest_bit = 0), it is removed from the
 * available lists, and reclaim_work is scheduled if vm_swap_full().
 */
917 static void swap_range_alloc(struct swap_info_    917 static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
918                              unsigned int nr_e    918                              unsigned int nr_entries)
919 {                                                 919 {
            /* end is the last offset of the range (inclusive). */
920         unsigned int end = offset + nr_entries    920         unsigned int end = offset + nr_entries - 1;
921                                                   921 
922         if (offset == si->lowest_bit)             922         if (offset == si->lowest_bit)
923                 si->lowest_bit += nr_entries;     923                 si->lowest_bit += nr_entries;
924         if (end == si->highest_bit)               924         if (end == si->highest_bit)
925                 WRITE_ONCE(si->highest_bit, si    925                 WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries);
926         WRITE_ONCE(si->inuse_pages, si->inuse_    926         WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries);
            /* Every page of the device is now in use. */
927         if (si->inuse_pages == si->pages) {       927         if (si->inuse_pages == si->pages) {
928                 si->lowest_bit = si->max;         928                 si->lowest_bit = si->max;
929                 si->highest_bit = 0;              929                 si->highest_bit = 0;
930                 del_from_avail_list(si);          930                 del_from_avail_list(si);
931                                                   931 
932                 if (vm_swap_full())               932                 if (vm_swap_full())
933                         schedule_work(&si->rec    933                         schedule_work(&si->reclaim_work);
934         }                                         934         }
935 }                                                 935 }
936                                                   936 
/*
 * Insert @si's per-node entries into the swap_avail_heads[] priority lists,
 * one plist_add() per node, with swap_avail_lock held for the whole loop.
 */
937 static void add_to_avail_list(struct swap_info    937 static void add_to_avail_list(struct swap_info_struct *si)
938 {                                                 938 {
939         int nid;                                  939         int nid;
940                                                   940 
941         spin_lock(&swap_avail_lock);              941         spin_lock(&swap_avail_lock);
942         for_each_node(nid)                        942         for_each_node(nid)
943                 plist_add(&si->avail_lists[nid    943                 plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
944         spin_unlock(&swap_avail_lock);            944         spin_unlock(&swap_avail_lock);
945 }                                                 945 }
946                                                   946 
/*
 * Update @si's bookkeeping after nr_entries slots starting at @offset have
 * been freed.  In order:
 *  - clear the slots' bits in si->zeromap;
 *  - widen [lowest_bit, highest_bit] to cover the range, re-adding the device
 *    to the available lists if highest_bit was 0 and SWP_WRITEOK is set;
 *  - for every slot, call arch_swap_invalidate_page() and, on block devices
 *    that provide it, the disk's swap_slot_free_notify hook;
 *  - clear the range's shadow entries from the swap cache;
 *  - after a write barrier, add nr_entries to nr_swap_pages and subtract it
 *    from si->inuse_pages.
 */
947 static void swap_range_free(struct swap_info_s    947 static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
948                             unsigned int nr_en    948                             unsigned int nr_entries)
949 {                                                 949 {
            /* begin/end keep the inclusive range; offset itself is advanced by the loop below. */
950         unsigned long begin = offset;             950         unsigned long begin = offset;
951         unsigned long end = offset + nr_entrie    951         unsigned long end = offset + nr_entries - 1;
952         void (*swap_slot_free_notify)(struct b    952         void (*swap_slot_free_notify)(struct block_device *, unsigned long);
953         unsigned int i;                           953         unsigned int i;
954                                                   954 
955         /*                                        955         /*
956          * Use atomic clear_bit operations onl    956          * Use atomic clear_bit operations only on zeromap instead of non-atomic
957          * bitmap_clear to prevent adjacent bi    957          * bitmap_clear to prevent adjacent bits corruption due to simultaneous writes.
958          */                                       958          */
959         for (i = 0; i < nr_entries; i++)          959         for (i = 0; i < nr_entries; i++)
960                 clear_bit(offset + i, si->zero    960                 clear_bit(offset + i, si->zeromap);
961                                                   961 
962         if (offset < si->lowest_bit)              962         if (offset < si->lowest_bit)
963                 si->lowest_bit = offset;          963                 si->lowest_bit = offset;
964         if (end > si->highest_bit) {              964         if (end > si->highest_bit) {
                    /* swap_range_alloc() sets highest_bit to 0 when the device fills up. */
965                 bool was_full = !si->highest_b    965                 bool was_full = !si->highest_bit;
966                                                   966 
967                 WRITE_ONCE(si->highest_bit, en    967                 WRITE_ONCE(si->highest_bit, end);
968                 if (was_full && (si->flags & S    968                 if (was_full && (si->flags & SWP_WRITEOK))
969                         add_to_avail_list(si);    969                         add_to_avail_list(si);
970         }                                         970         }
            /* Only block devices are asked for a free-notify hook; it may still be NULL. */
971         if (si->flags & SWP_BLKDEV)               971         if (si->flags & SWP_BLKDEV)
972                 swap_slot_free_notify =           972                 swap_slot_free_notify =
973                         si->bdev->bd_disk->fop    973                         si->bdev->bd_disk->fops->swap_slot_free_notify;
974         else                                      974         else
975                 swap_slot_free_notify = NULL;     975                 swap_slot_free_notify = NULL;
976         while (offset <= end) {                   976         while (offset <= end) {
977                 arch_swap_invalidate_page(si->    977                 arch_swap_invalidate_page(si->type, offset);
978                 if (swap_slot_free_notify)        978                 if (swap_slot_free_notify)
979                         swap_slot_free_notify(    979                         swap_slot_free_notify(si->bdev, offset);
980                 offset++;                         980                 offset++;
981         }                                         981         }
982         clear_shadow_from_swap_cache(si->type,    982         clear_shadow_from_swap_cache(si->type, begin, end);
983                                                   983 
984         /*                                        984         /*
985          * Make sure that try_to_unuse() obser    985          * Make sure that try_to_unuse() observes si->inuse_pages reaching 0
986          * only after the above cleanups are d    986          * only after the above cleanups are done.
987          */                                       987          */
988         smp_wmb();                                988         smp_wmb();
989         atomic_long_add(nr_entries, &nr_swap_p    989         atomic_long_add(nr_entries, &nr_swap_pages);
990         WRITE_ONCE(si->inuse_pages, si->inuse_    990         WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
991 }                                                 991 }
992                                                   992 
/*
 * Record @next as the position to continue allocating from.
 *
 * Without SWP_SOLIDSTATE the value is stored in si->cluster_next.  With it,
 * the value goes to this CPU's *si->cluster_next_cpu; if @next lies in a
 * different SWAP_ADDRESS_SPACE_SHIFT-sized region than the previous per-CPU
 * value, a random offset in [lowest_bit, highest_bit] is chosen instead,
 * aligned down to SWAP_ADDRESS_SPACE_PAGES and then raised to at least
 * lowest_bit.  When highest_bit <= lowest_bit the per-CPU value is left
 * unchanged.
 */
993 static void set_cluster_next(struct swap_info_    993 static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
994 {                                                 994 {
995         unsigned long prev;                       995         unsigned long prev;
996                                                   996 
997         if (!(si->flags & SWP_SOLIDSTATE)) {      997         if (!(si->flags & SWP_SOLIDSTATE)) {
998                 si->cluster_next = next;          998                 si->cluster_next = next;
999                 return;                           999                 return;
1000         }                                        1000         }
1001                                                  1001 
1002         prev = this_cpu_read(*si->cluster_nex    1002         prev = this_cpu_read(*si->cluster_next_cpu);
1003         /*                                       1003         /*
1004          * Cross the swap address space size     1004          * Cross the swap address space size aligned trunk, choose
1005          * another trunk randomly to avoid lo    1005          * another trunk randomly to avoid lock contention on swap
1006          * address space if possible.            1006          * address space if possible.
1007          */                                      1007          */
1008         if ((prev >> SWAP_ADDRESS_SPACE_SHIFT    1008         if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
1009             (next >> SWAP_ADDRESS_SPACE_SHIFT    1009             (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
1010                 /* No free swap slots availab    1010                 /* No free swap slots available */
1011                 if (si->highest_bit <= si->lo    1011                 if (si->highest_bit <= si->lowest_bit)
1012                         return;                  1012                         return;
1013                 next = get_random_u32_inclusi    1013                 next = get_random_u32_inclusive(si->lowest_bit, si->highest_bit);
1014                 next = ALIGN_DOWN(next, SWAP_    1014                 next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
                     /* Aligning down may have gone below lowest_bit; clamp back up. */
1015                 next = max_t(unsigned int, ne    1015                 next = max_t(unsigned int, next, si->lowest_bit);
1016         }                                        1016         }
1017         this_cpu_write(*si->cluster_next_cpu,    1017         this_cpu_write(*si->cluster_next_cpu, next);
1018 }                                                1018 }
1019                                                  1019 
/*
 * Check, without holding si->lock, whether the slot at @offset looks usable,
 * and take si->lock if it does.
 *
 * A slot looks usable when its swap_map byte is 0, or when vm_swap_full()
 * and the byte equals SWAP_HAS_CACHE.
 *
 * Returns true with si->lock HELD, or false with the lock not taken.
 *
 * NOTE(review): the byte is read before the lock is acquired, so the caller
 * presumably has to re-validate the slot under the lock -- confirm at the
 * call sites.
 */
1020 static bool swap_offset_available_and_locked(    1020 static bool swap_offset_available_and_locked(struct swap_info_struct *si,
1021                                                  1021                                              unsigned long offset)
1022 {                                                1022 {
1023         if (data_race(!si->swap_map[offset]))    1023         if (data_race(!si->swap_map[offset])) {
1024                 spin_lock(&si->lock);            1024                 spin_lock(&si->lock);
1025                 return true;                     1025                 return true;
1026         }                                        1026         }
1027                                                  1027 
1028         if (vm_swap_full() && READ_ONCE(si->s    1028         if (vm_swap_full() && READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
1029                 spin_lock(&si->lock);            1029                 spin_lock(&si->lock);
1030                 return true;                     1030                 return true;
1031         }                                        1031         }
1032                                                  1032 
1033         return false;                            1033         return false;
1034 }                                                1034 }
1035                                                  1035 
1036 static int cluster_alloc_swap(struct swap_inf    1036 static int cluster_alloc_swap(struct swap_info_struct *si,
1037                              unsigned char us    1037                              unsigned char usage, int nr,
1038                              swp_entry_t slot    1038                              swp_entry_t slots[], int order)
1039 {                                                1039 {
1040         int n_ret = 0;                           1040         int n_ret = 0;
1041                                                  1041 
1042         VM_BUG_ON(!si->cluster_info);            1042         VM_BUG_ON(!si->cluster_info);
1043                                                  1043 
1044         while (n_ret < nr) {                     1044         while (n_ret < nr) {
1045                 unsigned long offset = cluste    1045                 unsigned long offset = cluster_alloc_swap_entry(si, order, usage);
1046                                                  1046 
1047                 if (!offset)                     1047                 if (!offset)
1048                         break;                   1048                         break;
1049                 slots[n_ret++] = swp_entry(si    1049                 slots[n_ret++] = swp_entry(si->type, offset);
1050         }                                        1050         }
1051                                                  1051 
1052         return n_ret;                            1052         return n_ret;
1053 }                                                1053 }
1054                                                  1054 
1055 static int scan_swap_map_slots(struct swap_in    1055 static int scan_swap_map_slots(struct swap_info_struct *si,
1056                                unsigned char     1056                                unsigned char usage, int nr,
1057                                swp_entry_t sl    1057                                swp_entry_t slots[], int order)
1058 {                                                1058 {
1059         unsigned long offset;                    1059         unsigned long offset;
1060         unsigned long scan_base;                 1060         unsigned long scan_base;
1061         unsigned long last_in_cluster = 0;       1061         unsigned long last_in_cluster = 0;
1062         int latency_ration = LATENCY_LIMIT;      1062         int latency_ration = LATENCY_LIMIT;
1063         unsigned int nr_pages = 1 << order;      1063         unsigned int nr_pages = 1 << order;
1064         int n_ret = 0;                           1064         int n_ret = 0;
1065         bool scanned_many = false;               1065         bool scanned_many = false;
1066                                                  1066 
1067         /*                                       1067         /*
1068          * We try to cluster swap pages by al    1068          * We try to cluster swap pages by allocating them sequentially
1069          * in swap.  Once we've allocated SWA    1069          * in swap.  Once we've allocated SWAPFILE_CLUSTER pages this
1070          * way, however, we resort to first-f    1070          * way, however, we resort to first-free allocation, starting
1071          * a new cluster.  This prevents us f    1071          * a new cluster.  This prevents us from scattering swap pages
1072          * all over the entire swap partition    1072          * all over the entire swap partition, so that we reduce
1073          * overall disk seek times between sw    1073          * overall disk seek times between swap pages.  -- sct
1074          * But we do now try to find an empty    1074          * But we do now try to find an empty cluster.  -Andrea
1075          * And we let swap pages go all over     1075          * And we let swap pages go all over an SSD partition.  Hugh
1076          */                                      1076          */
1077                                                  1077 
1078         if (order > 0) {                         1078         if (order > 0) {
1079                 /*                               1079                 /*
1080                  * Should not even be attempt    1080                  * Should not even be attempting large allocations when huge
1081                  * page swap is disabled.  Wa    1081                  * page swap is disabled.  Warn and fail the allocation.
1082                  */                              1082                  */
1083                 if (!IS_ENABLED(CONFIG_THP_SW    1083                 if (!IS_ENABLED(CONFIG_THP_SWAP) ||
1084                     nr_pages > SWAPFILE_CLUST    1084                     nr_pages > SWAPFILE_CLUSTER) {
1085                         VM_WARN_ON_ONCE(1);      1085                         VM_WARN_ON_ONCE(1);
1086                         return 0;                1086                         return 0;
1087                 }                                1087                 }
1088                                                  1088 
1089                 /*                               1089                 /*
1090                  * Swapfile is not block devi    1090                  * Swapfile is not block device or not using clusters so unable
1091                  * to allocate large entries.    1091                  * to allocate large entries.
1092                  */                              1092                  */
1093                 if (!(si->flags & SWP_BLKDEV)    1093                 if (!(si->flags & SWP_BLKDEV) || !si->cluster_info)
1094                         return 0;                1094                         return 0;
1095         }                                        1095         }
1096                                                  1096 
1097         if (si->cluster_info)                    1097         if (si->cluster_info)
1098                 return cluster_alloc_swap(si,    1098                 return cluster_alloc_swap(si, usage, nr, slots, order);
1099                                                  1099 
1100         si->flags += SWP_SCANNING;               1100         si->flags += SWP_SCANNING;
1101                                                  1101 
1102         /* For HDD, sequential access is more    1102         /* For HDD, sequential access is more important. */
1103         scan_base = si->cluster_next;            1103         scan_base = si->cluster_next;
1104         offset = scan_base;                      1104         offset = scan_base;
1105                                                  1105 
1106         if (unlikely(!si->cluster_nr--)) {       1106         if (unlikely(!si->cluster_nr--)) {
1107                 if (si->pages - si->inuse_pag    1107                 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
1108                         si->cluster_nr = SWAP    1108                         si->cluster_nr = SWAPFILE_CLUSTER - 1;
1109                         goto checks;             1109                         goto checks;
1110                 }                                1110                 }
1111                                                  1111 
1112                 spin_unlock(&si->lock);          1112                 spin_unlock(&si->lock);
1113                                                  1113 
1114                 /*                               1114                 /*
1115                  * If seek is expensive, star    1115                  * If seek is expensive, start searching for new cluster from
1116                  * start of partition, to min    1116                  * start of partition, to minimize the span of allocated swap.
1117                  */                              1117                  */
1118                 scan_base = offset = si->lowe    1118                 scan_base = offset = si->lowest_bit;
1119                 last_in_cluster = offset + SW    1119                 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
1120                                                  1120 
1121                 /* Locate the first empty (un    1121                 /* Locate the first empty (unaligned) cluster */
1122                 for (; last_in_cluster <= REA    1122                 for (; last_in_cluster <= READ_ONCE(si->highest_bit); offset++) {
1123                         if (si->swap_map[offs    1123                         if (si->swap_map[offset])
1124                                 last_in_clust    1124                                 last_in_cluster = offset + SWAPFILE_CLUSTER;
1125                         else if (offset == la    1125                         else if (offset == last_in_cluster) {
1126                                 spin_lock(&si    1126                                 spin_lock(&si->lock);
1127                                 offset -= SWA    1127                                 offset -= SWAPFILE_CLUSTER - 1;
1128                                 si->cluster_n    1128                                 si->cluster_next = offset;
1129                                 si->cluster_n    1129                                 si->cluster_nr = SWAPFILE_CLUSTER - 1;
1130                                 goto checks;     1130                                 goto checks;
1131                         }                        1131                         }
1132                         if (unlikely(--latenc    1132                         if (unlikely(--latency_ration < 0)) {
1133                                 cond_resched(    1133                                 cond_resched();
1134                                 latency_ratio    1134                                 latency_ration = LATENCY_LIMIT;
1135                         }                        1135                         }
1136                 }                                1136                 }
1137                                                  1137 
1138                 offset = scan_base;              1138                 offset = scan_base;
1139                 spin_lock(&si->lock);            1139                 spin_lock(&si->lock);
1140                 si->cluster_nr = SWAPFILE_CLU    1140                 si->cluster_nr = SWAPFILE_CLUSTER - 1;
1141         }                                        1141         }
1142                                                  1142 
1143 checks:                                          1143 checks:
1144         if (!(si->flags & SWP_WRITEOK))          1144         if (!(si->flags & SWP_WRITEOK))
1145                 goto no_page;                    1145                 goto no_page;
1146         if (!si->highest_bit)                    1146         if (!si->highest_bit)
1147                 goto no_page;                    1147                 goto no_page;
1148         if (offset > si->highest_bit)            1148         if (offset > si->highest_bit)
1149                 scan_base = offset = si->lowe    1149                 scan_base = offset = si->lowest_bit;
1150                                                  1150 
1151         /* reuse swap entry of cache-only swa    1151         /* reuse swap entry of cache-only swap if not busy. */
1152         if (vm_swap_full() && si->swap_map[of    1152         if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
1153                 int swap_was_freed;              1153                 int swap_was_freed;
1154                 spin_unlock(&si->lock);          1154                 spin_unlock(&si->lock);
1155                 swap_was_freed = __try_to_rec    1155                 swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT);
1156                 spin_lock(&si->lock);            1156                 spin_lock(&si->lock);
1157                 /* entry was freed successful    1157                 /* entry was freed successfully, try to use this again */
1158                 if (swap_was_freed > 0)          1158                 if (swap_was_freed > 0)
1159                         goto checks;             1159                         goto checks;
1160                 goto scan; /* check next one     1160                 goto scan; /* check next one */
1161         }                                        1161         }
1162                                                  1162 
1163         if (si->swap_map[offset]) {              1163         if (si->swap_map[offset]) {
1164                 if (!n_ret)                      1164                 if (!n_ret)
1165                         goto scan;               1165                         goto scan;
1166                 else                             1166                 else
1167                         goto done;               1167                         goto done;
1168         }                                        1168         }
1169         memset(si->swap_map + offset, usage,     1169         memset(si->swap_map + offset, usage, nr_pages);
1170                                                  1170 
1171         swap_range_alloc(si, offset, nr_pages    1171         swap_range_alloc(si, offset, nr_pages);
1172         slots[n_ret++] = swp_entry(si->type,     1172         slots[n_ret++] = swp_entry(si->type, offset);
1173                                                  1173 
1174         /* got enough slots or reach max slot    1174         /* got enough slots or reach max slots? */
1175         if ((n_ret == nr) || (offset >= si->h    1175         if ((n_ret == nr) || (offset >= si->highest_bit))
1176                 goto done;                       1176                 goto done;
1177                                                  1177 
1178         /* search for next available slot */     1178         /* search for next available slot */
1179                                                  1179 
1180         /* time to take a break? */              1180         /* time to take a break? */
1181         if (unlikely(--latency_ration < 0)) {    1181         if (unlikely(--latency_ration < 0)) {
1182                 if (n_ret)                       1182                 if (n_ret)
1183                         goto done;               1183                         goto done;
1184                 spin_unlock(&si->lock);          1184                 spin_unlock(&si->lock);
1185                 cond_resched();                  1185                 cond_resched();
1186                 spin_lock(&si->lock);            1186                 spin_lock(&si->lock);
1187                 latency_ration = LATENCY_LIMI    1187                 latency_ration = LATENCY_LIMIT;
1188         }                                        1188         }
1189                                                  1189 
1190         if (si->cluster_nr && !si->swap_map[+    1190         if (si->cluster_nr && !si->swap_map[++offset]) {
1191                 /* non-ssd case, still more s    1191                 /* non-ssd case, still more slots in cluster? */
1192                 --si->cluster_nr;                1192                 --si->cluster_nr;
1193                 goto checks;                     1193                 goto checks;
1194         }                                        1194         }
1195                                                  1195 
1196         /*                                       1196         /*
1197          * Even if there's no free clusters a    1197          * Even if there's no free clusters available (fragmented),
1198          * try to scan a little more quickly     1198          * try to scan a little more quickly with lock held unless we
1199          * have scanned too many slots alread    1199          * have scanned too many slots already.
1200          */                                      1200          */
1201         if (!scanned_many) {                     1201         if (!scanned_many) {
1202                 unsigned long scan_limit;        1202                 unsigned long scan_limit;
1203                                                  1203 
1204                 if (offset < scan_base)          1204                 if (offset < scan_base)
1205                         scan_limit = scan_bas    1205                         scan_limit = scan_base;
1206                 else                             1206                 else
1207                         scan_limit = si->high    1207                         scan_limit = si->highest_bit;
1208                 for (; offset <= scan_limit &    1208                 for (; offset <= scan_limit && --latency_ration > 0;
1209                      offset++) {                 1209                      offset++) {
1210                         if (!si->swap_map[off    1210                         if (!si->swap_map[offset])
1211                                 goto checks;     1211                                 goto checks;
1212                 }                                1212                 }
1213         }                                        1213         }
1214                                                  1214 
1215 done:                                            1215 done:
1216         if (order == 0)                          1216         if (order == 0)
1217                 set_cluster_next(si, offset +    1217                 set_cluster_next(si, offset + 1);
1218         si->flags -= SWP_SCANNING;               1218         si->flags -= SWP_SCANNING;
1219         return n_ret;                            1219         return n_ret;
1220                                                  1220 
1221 scan:                                            1221 scan:
1222         VM_WARN_ON(order > 0);                   1222         VM_WARN_ON(order > 0);
1223         spin_unlock(&si->lock);                  1223         spin_unlock(&si->lock);
1224         while (++offset <= READ_ONCE(si->high    1224         while (++offset <= READ_ONCE(si->highest_bit)) {
1225                 if (unlikely(--latency_ration    1225                 if (unlikely(--latency_ration < 0)) {
1226                         cond_resched();          1226                         cond_resched();
1227                         latency_ration = LATE    1227                         latency_ration = LATENCY_LIMIT;
1228                         scanned_many = true;     1228                         scanned_many = true;
1229                 }                                1229                 }
1230                 if (swap_offset_available_and    1230                 if (swap_offset_available_and_locked(si, offset))
1231                         goto checks;             1231                         goto checks;
1232         }                                        1232         }
1233         offset = si->lowest_bit;                 1233         offset = si->lowest_bit;
1234         while (offset < scan_base) {             1234         while (offset < scan_base) {
1235                 if (unlikely(--latency_ration    1235                 if (unlikely(--latency_ration < 0)) {
1236                         cond_resched();          1236                         cond_resched();
1237                         latency_ration = LATE    1237                         latency_ration = LATENCY_LIMIT;
1238                         scanned_many = true;     1238                         scanned_many = true;
1239                 }                                1239                 }
1240                 if (swap_offset_available_and    1240                 if (swap_offset_available_and_locked(si, offset))
1241                         goto checks;             1241                         goto checks;
1242                 offset++;                        1242                 offset++;
1243         }                                        1243         }
1244         spin_lock(&si->lock);                    1244         spin_lock(&si->lock);
1245                                                  1245 
1246 no_page:                                         1246 no_page:
1247         si->flags -= SWP_SCANNING;               1247         si->flags -= SWP_SCANNING;
1248         return n_ret;                            1248         return n_ret;
1249 }                                                1249 }
1250                                                  1250 
1251 int get_swap_pages(int n_goal, swp_entry_t sw    1251 int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
1252 {                                                1252 {
1253         int order = swap_entry_order(entry_or    1253         int order = swap_entry_order(entry_order);
1254         unsigned long size = 1 << order;         1254         unsigned long size = 1 << order;
1255         struct swap_info_struct *si, *next;      1255         struct swap_info_struct *si, *next;
1256         long avail_pgs;                          1256         long avail_pgs;
1257         int n_ret = 0;                           1257         int n_ret = 0;
1258         int node;                                1258         int node;
1259                                                  1259 
1260         spin_lock(&swap_avail_lock);             1260         spin_lock(&swap_avail_lock);
1261                                                  1261 
1262         avail_pgs = atomic_long_read(&nr_swap    1262         avail_pgs = atomic_long_read(&nr_swap_pages) / size;
1263         if (avail_pgs <= 0) {                    1263         if (avail_pgs <= 0) {
1264                 spin_unlock(&swap_avail_lock)    1264                 spin_unlock(&swap_avail_lock);
1265                 goto noswap;                     1265                 goto noswap;
1266         }                                        1266         }
1267                                                  1267 
1268         n_goal = min3((long)n_goal, (long)SWA    1268         n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);
1269                                                  1269 
1270         atomic_long_sub(n_goal * size, &nr_sw    1270         atomic_long_sub(n_goal * size, &nr_swap_pages);
1271                                                  1271 
1272 start_over:                                      1272 start_over:
1273         node = numa_node_id();                   1273         node = numa_node_id();
1274         plist_for_each_entry_safe(si, next, &    1274         plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
1275                 /* requeue si to after same-p    1275                 /* requeue si to after same-priority siblings */
1276                 plist_requeue(&si->avail_list    1276                 plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
1277                 spin_unlock(&swap_avail_lock)    1277                 spin_unlock(&swap_avail_lock);
1278                 spin_lock(&si->lock);            1278                 spin_lock(&si->lock);
1279                 if (!si->highest_bit || !(si-    1279                 if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
1280                         spin_lock(&swap_avail    1280                         spin_lock(&swap_avail_lock);
1281                         if (plist_node_empty(    1281                         if (plist_node_empty(&si->avail_lists[node])) {
1282                                 spin_unlock(&    1282                                 spin_unlock(&si->lock);
1283                                 goto nextsi;     1283                                 goto nextsi;
1284                         }                        1284                         }
1285                         WARN(!si->highest_bit    1285                         WARN(!si->highest_bit,
1286                              "swap_info %d in    1286                              "swap_info %d in list but !highest_bit\n",
1287                              si->type);          1287                              si->type);
1288                         WARN(!(si->flags & SW    1288                         WARN(!(si->flags & SWP_WRITEOK),
1289                              "swap_info %d in    1289                              "swap_info %d in list but !SWP_WRITEOK\n",
1290                              si->type);          1290                              si->type);
1291                         __del_from_avail_list    1291                         __del_from_avail_list(si);
1292                         spin_unlock(&si->lock    1292                         spin_unlock(&si->lock);
1293                         goto nextsi;             1293                         goto nextsi;
1294                 }                                1294                 }
1295                 n_ret = scan_swap_map_slots(s    1295                 n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
1296                                             n    1296                                             n_goal, swp_entries, order);
1297                 spin_unlock(&si->lock);          1297                 spin_unlock(&si->lock);
1298                 if (n_ret || size > 1)           1298                 if (n_ret || size > 1)
1299                         goto check_out;          1299                         goto check_out;
1300                 cond_resched();                  1300                 cond_resched();
1301                                                  1301 
1302                 spin_lock(&swap_avail_lock);     1302                 spin_lock(&swap_avail_lock);
1303 nextsi:                                          1303 nextsi:
1304                 /*                               1304                 /*
1305                  * if we got here, it's likel    1305                  * if we got here, it's likely that si was almost full before,
1306                  * and since scan_swap_map_sl    1306                  * and since scan_swap_map_slots() can drop the si->lock,
1307                  * multiple callers probably     1307                  * multiple callers probably all tried to get a page from the
1308                  * same si and it filled up b    1308                  * same si and it filled up before we could get one; or, the si
1309                  * filled up between us dropp    1309                  * filled up between us dropping swap_avail_lock and taking
1310                  * si->lock. Since we dropped    1310                  * si->lock. Since we dropped the swap_avail_lock, the
1311                  * swap_avail_head list may h    1311                  * swap_avail_head list may have been modified; so if next is
1312                  * still in the swap_avail_he    1312                  * still in the swap_avail_head list then try it, otherwise
1313                  * start over if we have not     1313                  * start over if we have not gotten any slots.
1314                  */                              1314                  */
1315                 if (plist_node_empty(&next->a    1315                 if (plist_node_empty(&next->avail_lists[node]))
1316                         goto start_over;         1316                         goto start_over;
1317         }                                        1317         }
1318                                                  1318 
1319         spin_unlock(&swap_avail_lock);           1319         spin_unlock(&swap_avail_lock);
1320                                                  1320 
1321 check_out:                                       1321 check_out:
1322         if (n_ret < n_goal)                      1322         if (n_ret < n_goal)
1323                 atomic_long_add((long)(n_goal    1323                 atomic_long_add((long)(n_goal - n_ret) * size,
1324                                 &nr_swap_page    1324                                 &nr_swap_pages);
1325 noswap:                                          1325 noswap:
1326         return n_ret;                            1326         return n_ret;
1327 }                                                1327 }
1328                                                  1328 
1329 static struct swap_info_struct *_swap_info_ge    1329 static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
1330 {                                                1330 {
1331         struct swap_info_struct *si;             1331         struct swap_info_struct *si;
1332         unsigned long offset;                    1332         unsigned long offset;
1333                                                  1333 
1334         if (!entry.val)                          1334         if (!entry.val)
1335                 goto out;                        1335                 goto out;
1336         si = swp_swap_info(entry);               1336         si = swp_swap_info(entry);
1337         if (!si)                                 1337         if (!si)
1338                 goto bad_nofile;                 1338                 goto bad_nofile;
1339         if (data_race(!(si->flags & SWP_USED)    1339         if (data_race(!(si->flags & SWP_USED)))
1340                 goto bad_device;                 1340                 goto bad_device;
1341         offset = swp_offset(entry);              1341         offset = swp_offset(entry);
1342         if (offset >= si->max)                   1342         if (offset >= si->max)
1343                 goto bad_offset;                 1343                 goto bad_offset;
1344         if (data_race(!si->swap_map[swp_offse    1344         if (data_race(!si->swap_map[swp_offset(entry)]))
1345                 goto bad_free;                   1345                 goto bad_free;
1346         return si;                               1346         return si;
1347                                                  1347 
1348 bad_free:                                        1348 bad_free:
1349         pr_err("%s: %s%08lx\n", __func__, Unu    1349         pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val);
1350         goto out;                                1350         goto out;
1351 bad_offset:                                      1351 bad_offset:
1352         pr_err("%s: %s%08lx\n", __func__, Bad    1352         pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
1353         goto out;                                1353         goto out;
1354 bad_device:                                      1354 bad_device:
1355         pr_err("%s: %s%08lx\n", __func__, Unu    1355         pr_err("%s: %s%08lx\n", __func__, Unused_file, entry.val);
1356         goto out;                                1356         goto out;
1357 bad_nofile:                                      1357 bad_nofile:
1358         pr_err("%s: %s%08lx\n", __func__, Bad    1358         pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
1359 out:                                             1359 out:
1360         return NULL;                             1360         return NULL;
1361 }                                                1361 }
1362                                                  1362 
1363 static struct swap_info_struct *swap_info_get    1363 static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
1364                                         struc    1364                                         struct swap_info_struct *q)
1365 {                                                1365 {
1366         struct swap_info_struct *p;              1366         struct swap_info_struct *p;
1367                                                  1367 
1368         p = _swap_info_get(entry);               1368         p = _swap_info_get(entry);
1369                                                  1369 
1370         if (p != q) {                            1370         if (p != q) {
1371                 if (q != NULL)                   1371                 if (q != NULL)
1372                         spin_unlock(&q->lock)    1372                         spin_unlock(&q->lock);
1373                 if (p != NULL)                   1373                 if (p != NULL)
1374                         spin_lock(&p->lock);     1374                         spin_lock(&p->lock);
1375         }                                        1375         }
1376         return p;                                1376         return p;
1377 }                                                1377 }
1378                                                  1378 
1379 static unsigned char __swap_entry_free_locked    1379 static unsigned char __swap_entry_free_locked(struct swap_info_struct *si,
1380                                                  1380                                               unsigned long offset,
1381                                                  1381                                               unsigned char usage)
1382 {                                                1382 {
1383         unsigned char count;                     1383         unsigned char count;
1384         unsigned char has_cache;                 1384         unsigned char has_cache;
1385                                                  1385 
1386         count = si->swap_map[offset];            1386         count = si->swap_map[offset];
1387                                                  1387 
1388         has_cache = count & SWAP_HAS_CACHE;      1388         has_cache = count & SWAP_HAS_CACHE;
1389         count &= ~SWAP_HAS_CACHE;                1389         count &= ~SWAP_HAS_CACHE;
1390                                                  1390 
1391         if (usage == SWAP_HAS_CACHE) {           1391         if (usage == SWAP_HAS_CACHE) {
1392                 VM_BUG_ON(!has_cache);           1392                 VM_BUG_ON(!has_cache);
1393                 has_cache = 0;                   1393                 has_cache = 0;
1394         } else if (count == SWAP_MAP_SHMEM) {    1394         } else if (count == SWAP_MAP_SHMEM) {
1395                 /*                               1395                 /*
1396                  * Or we could insist on shme    1396                  * Or we could insist on shmem.c using a special
1397                  * swap_shmem_free() and free    1397                  * swap_shmem_free() and free_shmem_swap_and_cache()...
1398                  */                              1398                  */
1399                 count = 0;                       1399                 count = 0;
1400         } else if ((count & ~COUNT_CONTINUED)    1400         } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
1401                 if (count == COUNT_CONTINUED)    1401                 if (count == COUNT_CONTINUED) {
1402                         if (swap_count_contin    1402                         if (swap_count_continued(si, offset, count))
1403                                 count = SWAP_    1403                                 count = SWAP_MAP_MAX | COUNT_CONTINUED;
1404                         else                     1404                         else
1405                                 count = SWAP_    1405                                 count = SWAP_MAP_MAX;
1406                 } else                           1406                 } else
1407                         count--;                 1407                         count--;
1408         }                                        1408         }
1409                                                  1409 
1410         usage = count | has_cache;               1410         usage = count | has_cache;
1411         if (usage)                               1411         if (usage)
1412                 WRITE_ONCE(si->swap_map[offse    1412                 WRITE_ONCE(si->swap_map[offset], usage);
1413         else                                     1413         else
1414                 WRITE_ONCE(si->swap_map[offse    1414                 WRITE_ONCE(si->swap_map[offset], SWAP_HAS_CACHE);
1415                                                  1415 
1416         return usage;                            1416         return usage;
1417 }                                                1417 }
1418                                                  1418 
1419 /*                                               1419 /*
1420  * When we get a swap entry, if there aren't     1420  * When we get a swap entry, if there aren't some other ways to
1421  * prevent swapoff, such as the folio in swap    1421  * prevent swapoff, such as the folio in swap cache is locked, RCU
1422  * reader side is locked, etc., the swap entr    1422  * reader side is locked, etc., the swap entry may become invalid
1423  * because of swapoff.  Then, we need to encl    1423  * because of swapoff.  Then, we need to enclose all swap related
1424  * functions with get_swap_device() and put_s    1424  * functions with get_swap_device() and put_swap_device(), unless the
1425  * swap functions call get/put_swap_device()     1425  * swap functions call get/put_swap_device() by themselves.
1426  *                                               1426  *
1427  * RCU reader side lock (including any spinlo    1427  * RCU reader side lock (including any spinlock) is sufficient to
1428  * prevent swapoff, because synchronize_rcu()    1428  * prevent swapoff, because synchronize_rcu() is called in swapoff()
1429  * before freeing data structures.               1429  * before freeing data structures.
1430  *                                               1430  *
1431  * Check whether swap entry is valid in the s    1431  * Check whether swap entry is valid in the swap device.  If so,
1432  * return pointer to swap_info_struct, and ke    1432  * return pointer to swap_info_struct, and keep the swap entry valid
1433  * via preventing the swap device from being     1433  * via preventing the swap device from being swapoff, until
1434  * put_swap_device() is called.  Otherwise re    1434  * put_swap_device() is called.  Otherwise return NULL.
1435  *                                               1435  *
1436  * Notice that swapoff or swapoff+swapon can     1436  * Notice that swapoff or swapoff+swapon can still happen before the
1437  * percpu_ref_tryget_live() in get_swap_devic    1437  * percpu_ref_tryget_live() in get_swap_device() or after the
1438  * percpu_ref_put() in put_swap_device() if t    1438  * percpu_ref_put() in put_swap_device() if there isn't any other way
1439  * to prevent swapoff.  The caller must be pr    1439  * to prevent swapoff.  The caller must be prepared for that.  For
1440  * example, the following situation is possib    1440  * example, the following situation is possible.
1441  *                                               1441  *
1442  *   CPU1                               CPU2     1442  *   CPU1                               CPU2
1443  *   do_swap_page()                              1443  *   do_swap_page()
1444  *     ...                              swapo    1444  *     ...                              swapoff+swapon
1445  *     __read_swap_cache_async()                 1445  *     __read_swap_cache_async()
1446  *       swapcache_prepare()                     1446  *       swapcache_prepare()
1447  *         __swap_duplicate()                    1447  *         __swap_duplicate()
1448  *           // check swap_map                   1448  *           // check swap_map
1449  *     // verify PTE not changed                 1449  *     // verify PTE not changed
1450  *                                               1450  *
 * In __swap_duplicate(), the swap_map needs to be checked before
 * changing it, partly because the specified swap entry may belong to
 * another swap device which has since been swapped off.  And in
 * do_swap_page(), after the page is read from the swap device, the
 * PTE is verified to be unchanged with the page table locked, to
 * check whether the swap device has been through swapoff or
 * swapoff+swapon.
1457  */                                              1457  */
1458 struct swap_info_struct *get_swap_device(swp_    1458 struct swap_info_struct *get_swap_device(swp_entry_t entry)
1459 {                                                1459 {
1460         struct swap_info_struct *si;             1460         struct swap_info_struct *si;
1461         unsigned long offset;                    1461         unsigned long offset;
1462                                                  1462 
1463         if (!entry.val)                          1463         if (!entry.val)
1464                 goto out;                        1464                 goto out;
1465         si = swp_swap_info(entry);               1465         si = swp_swap_info(entry);
1466         if (!si)                                 1466         if (!si)
1467                 goto bad_nofile;                 1467                 goto bad_nofile;
1468         if (!percpu_ref_tryget_live(&si->user    1468         if (!percpu_ref_tryget_live(&si->users))
1469                 goto out;                        1469                 goto out;
1470         /*                                       1470         /*
1471          * Guarantee the si->users are checke    1471          * Guarantee the si->users are checked before accessing other
1472          * fields of swap_info_struct.           1472          * fields of swap_info_struct.
1473          *                                       1473          *
1474          * Paired with the spin_unlock() afte    1474          * Paired with the spin_unlock() after setup_swap_info() in
1475          * enable_swap_info().                   1475          * enable_swap_info().
1476          */                                      1476          */
1477         smp_rmb();                               1477         smp_rmb();
1478         offset = swp_offset(entry);              1478         offset = swp_offset(entry);
1479         if (offset >= si->max)                   1479         if (offset >= si->max)
1480                 goto put_out;                    1480                 goto put_out;
1481                                                  1481 
1482         return si;                               1482         return si;
1483 bad_nofile:                                      1483 bad_nofile:
1484         pr_err("%s: %s%08lx\n", __func__, Bad    1484         pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
1485 out:                                             1485 out:
1486         return NULL;                             1486         return NULL;
1487 put_out:                                         1487 put_out:
1488         pr_err("%s: %s%08lx\n", __func__, Bad    1488         pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
1489         percpu_ref_put(&si->users);              1489         percpu_ref_put(&si->users);
1490         return NULL;                             1490         return NULL;
1491 }                                                1491 }
1492                                                  1492 
1493 static unsigned char __swap_entry_free(struct    1493 static unsigned char __swap_entry_free(struct swap_info_struct *si,
1494                                        swp_en    1494                                        swp_entry_t entry)
1495 {                                                1495 {
1496         struct swap_cluster_info *ci;            1496         struct swap_cluster_info *ci;
1497         unsigned long offset = swp_offset(ent    1497         unsigned long offset = swp_offset(entry);
1498         unsigned char usage;                     1498         unsigned char usage;
1499                                                  1499 
1500         ci = lock_cluster_or_swap_info(si, of    1500         ci = lock_cluster_or_swap_info(si, offset);
1501         usage = __swap_entry_free_locked(si,     1501         usage = __swap_entry_free_locked(si, offset, 1);
1502         unlock_cluster_or_swap_info(si, ci);     1502         unlock_cluster_or_swap_info(si, ci);
1503         if (!usage)                              1503         if (!usage)
1504                 free_swap_slot(entry);           1504                 free_swap_slot(entry);
1505                                                  1505 
1506         return usage;                            1506         return usage;
1507 }                                                1507 }
1508                                                  1508 
1509 static bool __swap_entries_free(struct swap_i    1509 static bool __swap_entries_free(struct swap_info_struct *si,
1510                 swp_entry_t entry, int nr)       1510                 swp_entry_t entry, int nr)
1511 {                                                1511 {
1512         unsigned long offset = swp_offset(ent    1512         unsigned long offset = swp_offset(entry);
1513         unsigned int type = swp_type(entry);     1513         unsigned int type = swp_type(entry);
1514         struct swap_cluster_info *ci;            1514         struct swap_cluster_info *ci;
1515         bool has_cache = false;                  1515         bool has_cache = false;
1516         unsigned char count;                     1516         unsigned char count;
1517         int i;                                   1517         int i;
1518                                                  1518 
1519         if (nr <= 1 || swap_count(data_race(s    1519         if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1)
1520                 goto fallback;                   1520                 goto fallback;
1521         /* cross into another cluster */         1521         /* cross into another cluster */
1522         if (nr > SWAPFILE_CLUSTER - offset %     1522         if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER)
1523                 goto fallback;                   1523                 goto fallback;
1524                                                  1524 
1525         ci = lock_cluster_or_swap_info(si, of    1525         ci = lock_cluster_or_swap_info(si, offset);
1526         if (!swap_is_last_map(si, offset, nr,    1526         if (!swap_is_last_map(si, offset, nr, &has_cache)) {
1527                 unlock_cluster_or_swap_info(s    1527                 unlock_cluster_or_swap_info(si, ci);
1528                 goto fallback;                   1528                 goto fallback;
1529         }                                        1529         }
1530         for (i = 0; i < nr; i++)                 1530         for (i = 0; i < nr; i++)
1531                 WRITE_ONCE(si->swap_map[offse    1531                 WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE);
1532         unlock_cluster_or_swap_info(si, ci);     1532         unlock_cluster_or_swap_info(si, ci);
1533                                                  1533 
1534         if (!has_cache) {                        1534         if (!has_cache) {
1535                 for (i = 0; i < nr; i++)         1535                 for (i = 0; i < nr; i++)
1536                         zswap_invalidate(swp_    1536                         zswap_invalidate(swp_entry(si->type, offset + i));
1537                 spin_lock(&si->lock);            1537                 spin_lock(&si->lock);
1538                 swap_entry_range_free(si, ent    1538                 swap_entry_range_free(si, entry, nr);
1539                 spin_unlock(&si->lock);          1539                 spin_unlock(&si->lock);
1540         }                                        1540         }
1541         return has_cache;                        1541         return has_cache;
1542                                                  1542 
1543 fallback:                                        1543 fallback:
1544         for (i = 0; i < nr; i++) {               1544         for (i = 0; i < nr; i++) {
1545                 if (data_race(si->swap_map[of    1545                 if (data_race(si->swap_map[offset + i])) {
1546                         count = __swap_entry_    1546                         count = __swap_entry_free(si, swp_entry(type, offset + i));
1547                         if (count == SWAP_HAS    1547                         if (count == SWAP_HAS_CACHE)
1548                                 has_cache = t    1548                                 has_cache = true;
1549                 } else {                         1549                 } else {
1550                         WARN_ON_ONCE(1);         1550                         WARN_ON_ONCE(1);
1551                 }                                1551                 }
1552         }                                        1552         }
1553         return has_cache;                        1553         return has_cache;
1554 }                                                1554 }
1555                                                  1555 
1556 /*                                               1556 /*
1557  * Drop the last HAS_CACHE flag of swap entri    1557  * Drop the last HAS_CACHE flag of swap entries, caller have to
1558  * ensure all entries belong to the same cgro    1558  * ensure all entries belong to the same cgroup.
1559  */                                              1559  */
1560 static void swap_entry_range_free(struct swap    1560 static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry,
1561                                   unsigned in    1561                                   unsigned int nr_pages)
1562 {                                                1562 {
1563         unsigned long offset = swp_offset(ent    1563         unsigned long offset = swp_offset(entry);
1564         unsigned char *map = si->swap_map + o    1564         unsigned char *map = si->swap_map + offset;
1565         unsigned char *map_end = map + nr_pag    1565         unsigned char *map_end = map + nr_pages;
1566         struct swap_cluster_info *ci;            1566         struct swap_cluster_info *ci;
1567                                                  1567 
1568         ci = lock_cluster(si, offset);           1568         ci = lock_cluster(si, offset);
1569         do {                                     1569         do {
1570                 VM_BUG_ON(*map != SWAP_HAS_CA    1570                 VM_BUG_ON(*map != SWAP_HAS_CACHE);
1571                 *map = 0;                        1571                 *map = 0;
1572         } while (++map < map_end);               1572         } while (++map < map_end);
1573         dec_cluster_info_page(si, ci, nr_page    1573         dec_cluster_info_page(si, ci, nr_pages);
1574         unlock_cluster(ci);                      1574         unlock_cluster(ci);
1575                                                  1575 
1576         mem_cgroup_uncharge_swap(entry, nr_pa    1576         mem_cgroup_uncharge_swap(entry, nr_pages);
1577         swap_range_free(si, offset, nr_pages)    1577         swap_range_free(si, offset, nr_pages);
1578 }                                                1578 }
1579                                                  1579 
1580 static void cluster_swap_free_nr(struct swap_    1580 static void cluster_swap_free_nr(struct swap_info_struct *si,
1581                 unsigned long offset, int nr_    1581                 unsigned long offset, int nr_pages,
1582                 unsigned char usage)             1582                 unsigned char usage)
1583 {                                                1583 {
1584         struct swap_cluster_info *ci;            1584         struct swap_cluster_info *ci;
1585         DECLARE_BITMAP(to_free, BITS_PER_LONG    1585         DECLARE_BITMAP(to_free, BITS_PER_LONG) = { 0 };
1586         int i, nr;                               1586         int i, nr;
1587                                                  1587 
1588         ci = lock_cluster_or_swap_info(si, of    1588         ci = lock_cluster_or_swap_info(si, offset);
1589         while (nr_pages) {                       1589         while (nr_pages) {
1590                 nr = min(BITS_PER_LONG, nr_pa    1590                 nr = min(BITS_PER_LONG, nr_pages);
1591                 for (i = 0; i < nr; i++) {       1591                 for (i = 0; i < nr; i++) {
1592                         if (!__swap_entry_fre    1592                         if (!__swap_entry_free_locked(si, offset + i, usage))
1593                                 bitmap_set(to    1593                                 bitmap_set(to_free, i, 1);
1594                 }                                1594                 }
1595                 if (!bitmap_empty(to_free, BI    1595                 if (!bitmap_empty(to_free, BITS_PER_LONG)) {
1596                         unlock_cluster_or_swa    1596                         unlock_cluster_or_swap_info(si, ci);
1597                         for_each_set_bit(i, t    1597                         for_each_set_bit(i, to_free, BITS_PER_LONG)
1598                                 free_swap_slo    1598                                 free_swap_slot(swp_entry(si->type, offset + i));
1599                         if (nr == nr_pages)      1599                         if (nr == nr_pages)
1600                                 return;          1600                                 return;
1601                         bitmap_clear(to_free,    1601                         bitmap_clear(to_free, 0, BITS_PER_LONG);
1602                         ci = lock_cluster_or_    1602                         ci = lock_cluster_or_swap_info(si, offset);
1603                 }                                1603                 }
1604                 offset += nr;                    1604                 offset += nr;
1605                 nr_pages -= nr;                  1605                 nr_pages -= nr;
1606         }                                        1606         }
1607         unlock_cluster_or_swap_info(si, ci);     1607         unlock_cluster_or_swap_info(si, ci);
1608 }                                                1608 }
1609                                                  1609 
1610 /*                                               1610 /*
1611  * Caller has made sure that the swap device     1611  * Caller has made sure that the swap device corresponding to entry
1612  * is still around or has not been recycled.     1612  * is still around or has not been recycled.
1613  */                                              1613  */
1614 void swap_free_nr(swp_entry_t entry, int nr_p    1614 void swap_free_nr(swp_entry_t entry, int nr_pages)
1615 {                                                1615 {
1616         int nr;                                  1616         int nr;
1617         struct swap_info_struct *sis;            1617         struct swap_info_struct *sis;
1618         unsigned long offset = swp_offset(ent    1618         unsigned long offset = swp_offset(entry);
1619                                                  1619 
1620         sis = _swap_info_get(entry);             1620         sis = _swap_info_get(entry);
1621         if (!sis)                                1621         if (!sis)
1622                 return;                          1622                 return;
1623                                                  1623 
1624         while (nr_pages) {                       1624         while (nr_pages) {
1625                 nr = min_t(int, nr_pages, SWA    1625                 nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
1626                 cluster_swap_free_nr(sis, off    1626                 cluster_swap_free_nr(sis, offset, nr, 1);
1627                 offset += nr;                    1627                 offset += nr;
1628                 nr_pages -= nr;                  1628                 nr_pages -= nr;
1629         }                                        1629         }
1630 }                                                1630 }
1631                                                  1631 
1632 /*                                               1632 /*
1633  * Called after dropping swapcache to decreas    1633  * Called after dropping swapcache to decrease refcnt to swap entries.
1634  */                                              1634  */
1635 void put_swap_folio(struct folio *folio, swp_    1635 void put_swap_folio(struct folio *folio, swp_entry_t entry)
1636 {                                                1636 {
1637         unsigned long offset = swp_offset(ent    1637         unsigned long offset = swp_offset(entry);
1638         struct swap_cluster_info *ci;            1638         struct swap_cluster_info *ci;
1639         struct swap_info_struct *si;             1639         struct swap_info_struct *si;
1640         int size = 1 << swap_entry_order(foli    1640         int size = 1 << swap_entry_order(folio_order(folio));
1641                                                  1641 
1642         si = _swap_info_get(entry);              1642         si = _swap_info_get(entry);
1643         if (!si)                                 1643         if (!si)
1644                 return;                          1644                 return;
1645                                                  1645 
1646         ci = lock_cluster_or_swap_info(si, of    1646         ci = lock_cluster_or_swap_info(si, offset);
1647         if (size > 1 && swap_is_has_cache(si,    1647         if (size > 1 && swap_is_has_cache(si, offset, size)) {
1648                 unlock_cluster_or_swap_info(s    1648                 unlock_cluster_or_swap_info(si, ci);
1649                 spin_lock(&si->lock);            1649                 spin_lock(&si->lock);
1650                 swap_entry_range_free(si, ent    1650                 swap_entry_range_free(si, entry, size);
1651                 spin_unlock(&si->lock);          1651                 spin_unlock(&si->lock);
1652                 return;                          1652                 return;
1653         }                                        1653         }
1654         for (int i = 0; i < size; i++, entry.    1654         for (int i = 0; i < size; i++, entry.val++) {
1655                 if (!__swap_entry_free_locked    1655                 if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
1656                         unlock_cluster_or_swa    1656                         unlock_cluster_or_swap_info(si, ci);
1657                         free_swap_slot(entry)    1657                         free_swap_slot(entry);
1658                         if (i == size - 1)       1658                         if (i == size - 1)
1659                                 return;          1659                                 return;
1660                         lock_cluster_or_swap_    1660                         lock_cluster_or_swap_info(si, offset);
1661                 }                                1661                 }
1662         }                                        1662         }
1663         unlock_cluster_or_swap_info(si, ci);     1663         unlock_cluster_or_swap_info(si, ci);
1664 }                                                1664 }
1665                                                  1665 
1666 static int swp_entry_cmp(const void *ent1, co    1666 static int swp_entry_cmp(const void *ent1, const void *ent2)
1667 {                                                1667 {
1668         const swp_entry_t *e1 = ent1, *e2 = e    1668         const swp_entry_t *e1 = ent1, *e2 = ent2;
1669                                                  1669 
1670         return (int)swp_type(*e1) - (int)swp_    1670         return (int)swp_type(*e1) - (int)swp_type(*e2);
1671 }                                                1671 }
1672                                                  1672 
1673 void swapcache_free_entries(swp_entry_t *entr    1673 void swapcache_free_entries(swp_entry_t *entries, int n)
1674 {                                                1674 {
1675         struct swap_info_struct *p, *prev;       1675         struct swap_info_struct *p, *prev;
1676         int i;                                   1676         int i;
1677                                                  1677 
1678         if (n <= 0)                              1678         if (n <= 0)
1679                 return;                          1679                 return;
1680                                                  1680 
1681         prev = NULL;                             1681         prev = NULL;
1682         p = NULL;                                1682         p = NULL;
1683                                                  1683 
1684         /*                                       1684         /*
1685          * Sort swap entries by swap device,     1685          * Sort swap entries by swap device, so each lock is only taken once.
1686          * nr_swapfiles isn't absolutely corr    1686          * nr_swapfiles isn't absolutely correct, but the overhead of sort() is
1687          * so low that it isn't necessary to     1687          * so low that it isn't necessary to optimize further.
1688          */                                      1688          */
1689         if (nr_swapfiles > 1)                    1689         if (nr_swapfiles > 1)
1690                 sort(entries, n, sizeof(entri    1690                 sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL);
1691         for (i = 0; i < n; ++i) {                1691         for (i = 0; i < n; ++i) {
1692                 p = swap_info_get_cont(entrie    1692                 p = swap_info_get_cont(entries[i], prev);
1693                 if (p)                           1693                 if (p)
1694                         swap_entry_range_free    1694                         swap_entry_range_free(p, entries[i], 1);
1695                 prev = p;                        1695                 prev = p;
1696         }                                        1696         }
1697         if (p)                                   1697         if (p)
1698                 spin_unlock(&p->lock);           1698                 spin_unlock(&p->lock);
1699 }                                                1699 }
1700                                                  1700 
1701 int __swap_count(swp_entry_t entry)              1701 int __swap_count(swp_entry_t entry)
1702 {                                                1702 {
1703         struct swap_info_struct *si = swp_swa    1703         struct swap_info_struct *si = swp_swap_info(entry);
1704         pgoff_t offset = swp_offset(entry);      1704         pgoff_t offset = swp_offset(entry);
1705                                                  1705 
1706         return swap_count(si->swap_map[offset    1706         return swap_count(si->swap_map[offset]);
1707 }                                                1707 }
1708                                                  1708 
1709 /*                                               1709 /*
1710  * How many references to @entry are currentl    1710  * How many references to @entry are currently swapped out?
1711  * This does not give an exact answer when sw    1711  * This does not give an exact answer when swap count is continued,
1712  * but does include the high COUNT_CONTINUED     1712  * but does include the high COUNT_CONTINUED flag to allow for that.
1713  */                                              1713  */
1714 int swap_swapcount(struct swap_info_struct *s    1714 int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
1715 {                                                1715 {
1716         pgoff_t offset = swp_offset(entry);      1716         pgoff_t offset = swp_offset(entry);
1717         struct swap_cluster_info *ci;            1717         struct swap_cluster_info *ci;
1718         int count;                               1718         int count;
1719                                                  1719 
1720         ci = lock_cluster_or_swap_info(si, of    1720         ci = lock_cluster_or_swap_info(si, offset);
1721         count = swap_count(si->swap_map[offse    1721         count = swap_count(si->swap_map[offset]);
1722         unlock_cluster_or_swap_info(si, ci);     1722         unlock_cluster_or_swap_info(si, ci);
1723         return count;                            1723         return count;
1724 }                                                1724 }
1725                                                  1725 
1726 /*                                               1726 /*
1727  * How many references to @entry are currentl    1727  * How many references to @entry are currently swapped out?
1728  * This considers COUNT_CONTINUED so it retur    1728  * This considers COUNT_CONTINUED so it returns exact answer.
1729  */                                              1729  */
1730 int swp_swapcount(swp_entry_t entry)             1730 int swp_swapcount(swp_entry_t entry)
1731 {                                                1731 {
1732         int count, tmp_count, n;                 1732         int count, tmp_count, n;
1733         struct swap_info_struct *si;             1733         struct swap_info_struct *si;
1734         struct swap_cluster_info *ci;            1734         struct swap_cluster_info *ci;
1735         struct page *page;                       1735         struct page *page;
1736         pgoff_t offset;                          1736         pgoff_t offset;
1737         unsigned char *map;                      1737         unsigned char *map;
1738                                                  1738 
1739         si = _swap_info_get(entry);              1739         si = _swap_info_get(entry);
1740         if (!si)                                 1740         if (!si)
1741                 return 0;                        1741                 return 0;
1742                                                  1742 
1743         offset = swp_offset(entry);              1743         offset = swp_offset(entry);
1744                                                  1744 
1745         ci = lock_cluster_or_swap_info(si, of    1745         ci = lock_cluster_or_swap_info(si, offset);
1746                                                  1746 
1747         count = swap_count(si->swap_map[offse    1747         count = swap_count(si->swap_map[offset]);
1748         if (!(count & COUNT_CONTINUED))          1748         if (!(count & COUNT_CONTINUED))
1749                 goto out;                        1749                 goto out;
1750                                                  1750 
1751         count &= ~COUNT_CONTINUED;               1751         count &= ~COUNT_CONTINUED;
1752         n = SWAP_MAP_MAX + 1;                    1752         n = SWAP_MAP_MAX + 1;
1753                                                  1753 
1754         page = vmalloc_to_page(si->swap_map +    1754         page = vmalloc_to_page(si->swap_map + offset);
1755         offset &= ~PAGE_MASK;                    1755         offset &= ~PAGE_MASK;
1756         VM_BUG_ON(page_private(page) != SWP_C    1756         VM_BUG_ON(page_private(page) != SWP_CONTINUED);
1757                                                  1757 
1758         do {                                     1758         do {
1759                 page = list_next_entry(page,     1759                 page = list_next_entry(page, lru);
1760                 map = kmap_local_page(page);     1760                 map = kmap_local_page(page);
1761                 tmp_count = map[offset];         1761                 tmp_count = map[offset];
1762                 kunmap_local(map);               1762                 kunmap_local(map);
1763                                                  1763 
1764                 count += (tmp_count & ~COUNT_    1764                 count += (tmp_count & ~COUNT_CONTINUED) * n;
1765                 n *= (SWAP_CONT_MAX + 1);        1765                 n *= (SWAP_CONT_MAX + 1);
1766         } while (tmp_count & COUNT_CONTINUED)    1766         } while (tmp_count & COUNT_CONTINUED);
1767 out:                                             1767 out:
1768         unlock_cluster_or_swap_info(si, ci);     1768         unlock_cluster_or_swap_info(si, ci);
1769         return count;                            1769         return count;
1770 }                                                1770 }
1771                                                  1771 
1772 static bool swap_page_trans_huge_swapped(stru    1772 static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
1773                                          swp_    1773                                          swp_entry_t entry, int order)
1774 {                                                1774 {
1775         struct swap_cluster_info *ci;            1775         struct swap_cluster_info *ci;
1776         unsigned char *map = si->swap_map;       1776         unsigned char *map = si->swap_map;
1777         unsigned int nr_pages = 1 << order;      1777         unsigned int nr_pages = 1 << order;
1778         unsigned long roffset = swp_offset(en    1778         unsigned long roffset = swp_offset(entry);
1779         unsigned long offset = round_down(rof    1779         unsigned long offset = round_down(roffset, nr_pages);
1780         int i;                                   1780         int i;
1781         bool ret = false;                        1781         bool ret = false;
1782                                                  1782 
1783         ci = lock_cluster_or_swap_info(si, of    1783         ci = lock_cluster_or_swap_info(si, offset);
1784         if (!ci || nr_pages == 1) {              1784         if (!ci || nr_pages == 1) {
1785                 if (swap_count(map[roffset]))    1785                 if (swap_count(map[roffset]))
1786                         ret = true;              1786                         ret = true;
1787                 goto unlock_out;                 1787                 goto unlock_out;
1788         }                                        1788         }
1789         for (i = 0; i < nr_pages; i++) {         1789         for (i = 0; i < nr_pages; i++) {
1790                 if (swap_count(map[offset + i    1790                 if (swap_count(map[offset + i])) {
1791                         ret = true;              1791                         ret = true;
1792                         break;                   1792                         break;
1793                 }                                1793                 }
1794         }                                        1794         }
1795 unlock_out:                                      1795 unlock_out:
1796         unlock_cluster_or_swap_info(si, ci);     1796         unlock_cluster_or_swap_info(si, ci);
1797         return ret;                              1797         return ret;
1798 }                                                1798 }
1799                                                  1799 
1800 static bool folio_swapped(struct folio *folio    1800 static bool folio_swapped(struct folio *folio)
1801 {                                                1801 {
1802         swp_entry_t entry = folio->swap;         1802         swp_entry_t entry = folio->swap;
1803         struct swap_info_struct *si = _swap_i    1803         struct swap_info_struct *si = _swap_info_get(entry);
1804                                                  1804 
1805         if (!si)                                 1805         if (!si)
1806                 return false;                    1806                 return false;
1807                                                  1807 
1808         if (!IS_ENABLED(CONFIG_THP_SWAP) || l    1808         if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio)))
1809                 return swap_swapcount(si, ent    1809                 return swap_swapcount(si, entry) != 0;
1810                                                  1810 
1811         return swap_page_trans_huge_swapped(s    1811         return swap_page_trans_huge_swapped(si, entry, folio_order(folio));
1812 }                                                1812 }
1813                                                  1813 
1814 static bool folio_swapcache_freeable(struct f    1814 static bool folio_swapcache_freeable(struct folio *folio)
1815 {                                                1815 {
1816         VM_BUG_ON_FOLIO(!folio_test_locked(fo    1816         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
1817                                                  1817 
1818         if (!folio_test_swapcache(folio))        1818         if (!folio_test_swapcache(folio))
1819                 return false;                    1819                 return false;
1820         if (folio_test_writeback(folio))         1820         if (folio_test_writeback(folio))
1821                 return false;                    1821                 return false;
1822                                                  1822 
1823         /*                                       1823         /*
1824          * Once hibernation has begun to crea    1824          * Once hibernation has begun to create its image of memory,
1825          * there's a danger that one of the c    1825          * there's a danger that one of the calls to folio_free_swap()
1826          * - most probably a call from __try_    1826          * - most probably a call from __try_to_reclaim_swap() while
1827          * hibernation is allocating its own     1827          * hibernation is allocating its own swap pages for the image,
1828          * but conceivably even a call from m    1828          * but conceivably even a call from memory reclaim - will free
1829          * the swap from a folio which has al    1829          * the swap from a folio which has already been recorded in the
1830          * image as a clean swapcache folio,     1830          * image as a clean swapcache folio, and then reuse its swap for
1831          * another page of the image.  On wak    1831          * another page of the image.  On waking from hibernation, the
1832          * original folio might be freed unde    1832          * original folio might be freed under memory pressure, then
1833          * later read back in from swap, now     1833          * later read back in from swap, now with the wrong data.
1834          *                                       1834          *
1835          * Hibernation suspends storage while    1835          * Hibernation suspends storage while it is writing the image
1836          * to disk so check that here.           1836          * to disk so check that here.
1837          */                                      1837          */
1838         if (pm_suspended_storage())              1838         if (pm_suspended_storage())
1839                 return false;                    1839                 return false;
1840                                                  1840 
1841         return true;                             1841         return true;
1842 }                                                1842 }
1843                                                  1843 
1844 /**                                              1844 /**
1845  * folio_free_swap() - Free the swap space us    1845  * folio_free_swap() - Free the swap space used for this folio.
1846  * @folio: The folio to remove.                  1846  * @folio: The folio to remove.
1847  *                                               1847  *
1848  * If swap is getting full, or if there are n    1848  * If swap is getting full, or if there are no more mappings of this folio,
1849  * then call folio_free_swap to free its swap    1849  * then call folio_free_swap to free its swap space.
1850  *                                               1850  *
1851  * Return: true if we were able to release th    1851  * Return: true if we were able to release the swap space.
1852  */                                              1852  */
1853 bool folio_free_swap(struct folio *folio)        1853 bool folio_free_swap(struct folio *folio)
1854 {                                                1854 {
1855         if (!folio_swapcache_freeable(folio))    1855         if (!folio_swapcache_freeable(folio))
1856                 return false;                    1856                 return false;
1857         if (folio_swapped(folio))                1857         if (folio_swapped(folio))
1858                 return false;                    1858                 return false;
1859                                                  1859 
1860         delete_from_swap_cache(folio);           1860         delete_from_swap_cache(folio);
1861         folio_set_dirty(folio);                  1861         folio_set_dirty(folio);
1862         return true;                             1862         return true;
1863 }                                                1863 }
1864                                                  1864 
1865 /**                                              1865 /**
1866  * free_swap_and_cache_nr() - Release referen    1866  * free_swap_and_cache_nr() - Release reference on range of swap entries and
1867  *                            reclaim their c    1867  *                            reclaim their cache if no more references remain.
1868  * @entry: First entry of range.                 1868  * @entry: First entry of range.
1869  * @nr: Number of entries in range.              1869  * @nr: Number of entries in range.
1870  *                                               1870  *
1871  * For each swap entry in the contiguous rang    1871  * For each swap entry in the contiguous range, release a reference. If any swap
1872  * entries become free, try to reclaim their     1872  * entries become free, try to reclaim their underlying folios, if present. The
1873  * offset range is defined by [entry.offset,     1873  * offset range is defined by [entry.offset, entry.offset + nr).
1874  */                                              1874  */
1875 void free_swap_and_cache_nr(swp_entry_t entry    1875 void free_swap_and_cache_nr(swp_entry_t entry, int nr)
1876 {                                                1876 {
1877         const unsigned long start_offset = sw    1877         const unsigned long start_offset = swp_offset(entry);
1878         const unsigned long end_offset = star    1878         const unsigned long end_offset = start_offset + nr;
1879         struct swap_info_struct *si;             1879         struct swap_info_struct *si;
1880         bool any_only_cache = false;             1880         bool any_only_cache = false;
1881         unsigned long offset;                    1881         unsigned long offset;
1882                                                  1882 
1883         if (non_swap_entry(entry))               1883         if (non_swap_entry(entry))
1884                 return;                          1884                 return;
1885                                                  1885 
1886         si = get_swap_device(entry);             1886         si = get_swap_device(entry);
1887         if (!si)                                 1887         if (!si)
1888                 return;                          1888                 return;
1889                                                  1889 
1890         if (WARN_ON(end_offset > si->max))       1890         if (WARN_ON(end_offset > si->max))
1891                 goto out;                        1891                 goto out;
1892                                                  1892 
1893         /*                                       1893         /*
1894          * First free all entries in the rang    1894          * First free all entries in the range.
1895          */                                      1895          */
1896         any_only_cache = __swap_entries_free(    1896         any_only_cache = __swap_entries_free(si, entry, nr);
1897                                                  1897 
1898         /*                                       1898         /*
1899          * Short-circuit the below loop if no    1899          * Short-circuit the below loop if none of the entries had their
1900          * reference drop to zero.               1900          * reference drop to zero.
1901          */                                      1901          */
1902         if (!any_only_cache)                     1902         if (!any_only_cache)
1903                 goto out;                        1903                 goto out;
1904                                                  1904 
1905         /*                                       1905         /*
1906          * Now go back over the range trying     1906          * Now go back over the range trying to reclaim the swap cache. This is
1907          * more efficient for large folios be    1907          * more efficient for large folios because we will only try to reclaim
1908          * the swap once per folio in the com    1908          * the swap once per folio in the common case. If we do
1909          * __swap_entry_free() and __try_to_r    1909          * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the
1910          * latter will get a reference and lo    1910          * latter will get a reference and lock the folio for every individual
1911          * page but will only succeed once th    1911          * page but will only succeed once the swap slot for every subpage is
1912          * zero.                                 1912          * zero.
1913          */                                      1913          */
1914         for (offset = start_offset; offset <     1914         for (offset = start_offset; offset < end_offset; offset += nr) {
1915                 nr = 1;                          1915                 nr = 1;
1916                 if (READ_ONCE(si->swap_map[of    1916                 if (READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
1917                         /*                       1917                         /*
1918                          * Folios are always     1918                          * Folios are always naturally aligned in swap so
1919                          * advance forward to    1919                          * advance forward to the next boundary. Zero means no
1920                          * folio was found fo    1920                          * folio was found for the swap entry, so advance by 1
1921                          * in this case. Nega    1921                          * in this case. Negative value means folio was found
1922                          * but could not be r    1922                          * but could not be reclaimed. Here we can still advance
1923                          * to the next bounda    1923                          * to the next boundary.
1924                          */                      1924                          */
1925                         nr = __try_to_reclaim    1925                         nr = __try_to_reclaim_swap(si, offset,
1926                                                  1926                                                    TTRS_UNMAPPED | TTRS_FULL);
1927                         if (nr == 0)             1927                         if (nr == 0)
1928                                 nr = 1;          1928                                 nr = 1;
1929                         else if (nr < 0)         1929                         else if (nr < 0)
1930                                 nr = -nr;        1930                                 nr = -nr;
1931                         nr = ALIGN(offset + 1    1931                         nr = ALIGN(offset + 1, nr) - offset;
1932                 }                                1932                 }
1933         }                                        1933         }
1934                                                  1934 
1935 out:                                             1935 out:
1936         put_swap_device(si);                     1936         put_swap_device(si);
1937 }                                                1937 }
1938                                                  1938 
1939 #ifdef CONFIG_HIBERNATION                        1939 #ifdef CONFIG_HIBERNATION
1940                                                  1940 
1941 swp_entry_t get_swap_page_of_type(int type)      1941 swp_entry_t get_swap_page_of_type(int type)
1942 {                                                1942 {
1943         struct swap_info_struct *si = swap_ty    1943         struct swap_info_struct *si = swap_type_to_swap_info(type);
1944         swp_entry_t entry = {0};                 1944         swp_entry_t entry = {0};
1945                                                  1945 
1946         if (!si)                                 1946         if (!si)
1947                 goto fail;                       1947                 goto fail;
1948                                                  1948 
1949         /* This is called for allocating swap    1949         /* This is called for allocating swap entry, not cache */
1950         spin_lock(&si->lock);                    1950         spin_lock(&si->lock);
1951         if ((si->flags & SWP_WRITEOK) && scan    1951         if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0))
1952                 atomic_long_dec(&nr_swap_page    1952                 atomic_long_dec(&nr_swap_pages);
1953         spin_unlock(&si->lock);                  1953         spin_unlock(&si->lock);
1954 fail:                                            1954 fail:
1955         return entry;                            1955         return entry;
1956 }                                                1956 }
1957                                                  1957 
1958 /*                                               1958 /*
1959  * Find the swap type that corresponds to giv    1959  * Find the swap type that corresponds to given device (if any).
1960  *                                               1960  *
1961  * @offset - number of the PAGE_SIZE-sized bl    1961  * @offset - number of the PAGE_SIZE-sized block of the device, starting
1962  * from 0, in which the swap header is expect    1962  * from 0, in which the swap header is expected to be located.
1963  *                                               1963  *
1964  * This is needed for the suspend to disk (ak    1964  * This is needed for the suspend to disk (aka swsusp).
1965  */                                              1965  */
1966 int swap_type_of(dev_t device, sector_t offse    1966 int swap_type_of(dev_t device, sector_t offset)
1967 {                                                1967 {
1968         int type;                                1968         int type;
1969                                                  1969 
1970         if (!device)                             1970         if (!device)
1971                 return -1;                       1971                 return -1;
1972                                                  1972 
1973         spin_lock(&swap_lock);                   1973         spin_lock(&swap_lock);
1974         for (type = 0; type < nr_swapfiles; t    1974         for (type = 0; type < nr_swapfiles; type++) {
1975                 struct swap_info_struct *sis     1975                 struct swap_info_struct *sis = swap_info[type];
1976                                                  1976 
1977                 if (!(sis->flags & SWP_WRITEO    1977                 if (!(sis->flags & SWP_WRITEOK))
1978                         continue;                1978                         continue;
1979                                                  1979 
1980                 if (device == sis->bdev->bd_d    1980                 if (device == sis->bdev->bd_dev) {
1981                         struct swap_extent *s    1981                         struct swap_extent *se = first_se(sis);
1982                                                  1982 
1983                         if (se->start_block =    1983                         if (se->start_block == offset) {
1984                                 spin_unlock(&    1984                                 spin_unlock(&swap_lock);
1985                                 return type;     1985                                 return type;
1986                         }                        1986                         }
1987                 }                                1987                 }
1988         }                                        1988         }
1989         spin_unlock(&swap_lock);                 1989         spin_unlock(&swap_lock);
1990         return -ENODEV;                          1990         return -ENODEV;
1991 }                                                1991 }
1992                                                  1992 
1993 int find_first_swap(dev_t *device)               1993 int find_first_swap(dev_t *device)
1994 {                                                1994 {
1995         int type;                                1995         int type;
1996                                                  1996 
1997         spin_lock(&swap_lock);                   1997         spin_lock(&swap_lock);
1998         for (type = 0; type < nr_swapfiles; t    1998         for (type = 0; type < nr_swapfiles; type++) {
1999                 struct swap_info_struct *sis     1999                 struct swap_info_struct *sis = swap_info[type];
2000                                                  2000 
2001                 if (!(sis->flags & SWP_WRITEO    2001                 if (!(sis->flags & SWP_WRITEOK))
2002                         continue;                2002                         continue;
2003                 *device = sis->bdev->bd_dev;     2003                 *device = sis->bdev->bd_dev;
2004                 spin_unlock(&swap_lock);         2004                 spin_unlock(&swap_lock);
2005                 return type;                     2005                 return type;
2006         }                                        2006         }
2007         spin_unlock(&swap_lock);                 2007         spin_unlock(&swap_lock);
2008         return -ENODEV;                          2008         return -ENODEV;
2009 }                                                2009 }
2010                                                  2010 
2011 /*                                               2011 /*
2012  * Get the (PAGE_SIZE) block corresponding to    2012  * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
2013  * corresponding to given index in swap_info     2013  * corresponding to given index in swap_info (swap type).
2014  */                                              2014  */
2015 sector_t swapdev_block(int type, pgoff_t offs    2015 sector_t swapdev_block(int type, pgoff_t offset)
2016 {                                                2016 {
2017         struct swap_info_struct *si = swap_ty    2017         struct swap_info_struct *si = swap_type_to_swap_info(type);
2018         struct swap_extent *se;                  2018         struct swap_extent *se;
2019                                                  2019 
2020         if (!si || !(si->flags & SWP_WRITEOK)    2020         if (!si || !(si->flags & SWP_WRITEOK))
2021                 return 0;                        2021                 return 0;
2022         se = offset_to_swap_extent(si, offset    2022         se = offset_to_swap_extent(si, offset);
2023         return se->start_block + (offset - se    2023         return se->start_block + (offset - se->start_page);
2024 }                                                2024 }
2025                                                  2025 
2026 /*                                               2026 /*
2027  * Return either the total number of swap pag    2027  * Return either the total number of swap pages of given type, or the number
2028  * of free pages of that type (depending on @    2028  * of free pages of that type (depending on @free)
2029  *                                               2029  *
2030  * This is needed for software suspend           2030  * This is needed for software suspend
2031  */                                              2031  */
2032 unsigned int count_swap_pages(int type, int f    2032 unsigned int count_swap_pages(int type, int free)
2033 {                                                2033 {
2034         unsigned int n = 0;                      2034         unsigned int n = 0;
2035                                                  2035 
2036         spin_lock(&swap_lock);                   2036         spin_lock(&swap_lock);
2037         if ((unsigned int)type < nr_swapfiles    2037         if ((unsigned int)type < nr_swapfiles) {
2038                 struct swap_info_struct *sis     2038                 struct swap_info_struct *sis = swap_info[type];
2039                                                  2039 
2040                 spin_lock(&sis->lock);           2040                 spin_lock(&sis->lock);
2041                 if (sis->flags & SWP_WRITEOK)    2041                 if (sis->flags & SWP_WRITEOK) {
2042                         n = sis->pages;          2042                         n = sis->pages;
2043                         if (free)                2043                         if (free)
2044                                 n -= sis->inu    2044                                 n -= sis->inuse_pages;
2045                 }                                2045                 }
2046                 spin_unlock(&sis->lock);         2046                 spin_unlock(&sis->lock);
2047         }                                        2047         }
2048         spin_unlock(&swap_lock);                 2048         spin_unlock(&swap_lock);
2049         return n;                                2049         return n;
2050 }                                                2050 }
2051 #endif /* CONFIG_HIBERNATION */                  2051 #endif /* CONFIG_HIBERNATION */
2052                                                  2052 
2053 static inline int pte_same_as_swp(pte_t pte,     2053 static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
2054 {                                                2054 {
2055         return pte_same(pte_swp_clear_flags(p    2055         return pte_same(pte_swp_clear_flags(pte), swp_pte);
2056 }                                                2056 }
2057                                                  2057 
2058 /*                                               2058 /*
2059  * No need to decide whether this PTE shares     2059  * No need to decide whether this PTE shares the swap entry with others,
2060  * just let do_wp_page work it out if a write    2060  * just let do_wp_page work it out if a write is requested later - to
2061  * force COW, vm_page_prot omits write permis    2061  * force COW, vm_page_prot omits write permission from any private vma.
2062  */                                              2062  */
2063 static int unuse_pte(struct vm_area_struct *v    2063 static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
2064                 unsigned long addr, swp_entry    2064                 unsigned long addr, swp_entry_t entry, struct folio *folio)
2065 {                                                2065 {
2066         struct page *page;                       2066         struct page *page;
2067         struct folio *swapcache;                 2067         struct folio *swapcache;
2068         spinlock_t *ptl;                         2068         spinlock_t *ptl;
2069         pte_t *pte, new_pte, old_pte;            2069         pte_t *pte, new_pte, old_pte;
2070         bool hwpoisoned = false;                 2070         bool hwpoisoned = false;
2071         int ret = 1;                             2071         int ret = 1;
2072                                                  2072 
2073         swapcache = folio;                       2073         swapcache = folio;
2074         folio = ksm_might_need_to_copy(folio,    2074         folio = ksm_might_need_to_copy(folio, vma, addr);
2075         if (unlikely(!folio))                    2075         if (unlikely(!folio))
2076                 return -ENOMEM;                  2076                 return -ENOMEM;
2077         else if (unlikely(folio == ERR_PTR(-E    2077         else if (unlikely(folio == ERR_PTR(-EHWPOISON))) {
2078                 hwpoisoned = true;               2078                 hwpoisoned = true;
2079                 folio = swapcache;               2079                 folio = swapcache;
2080         }                                        2080         }
2081                                                  2081 
2082         page = folio_file_page(folio, swp_off    2082         page = folio_file_page(folio, swp_offset(entry));
2083         if (PageHWPoison(page))                  2083         if (PageHWPoison(page))
2084                 hwpoisoned = true;               2084                 hwpoisoned = true;
2085                                                  2085 
2086         pte = pte_offset_map_lock(vma->vm_mm,    2086         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
2087         if (unlikely(!pte || !pte_same_as_swp    2087         if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte),
2088                                                  2088                                                 swp_entry_to_pte(entry)))) {
2089                 ret = 0;                         2089                 ret = 0;
2090                 goto out;                        2090                 goto out;
2091         }                                        2091         }
2092                                                  2092 
2093         old_pte = ptep_get(pte);                 2093         old_pte = ptep_get(pte);
2094                                                  2094 
2095         if (unlikely(hwpoisoned || !folio_tes    2095         if (unlikely(hwpoisoned || !folio_test_uptodate(folio))) {
2096                 swp_entry_t swp_entry;           2096                 swp_entry_t swp_entry;
2097                                                  2097 
2098                 dec_mm_counter(vma->vm_mm, MM    2098                 dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
2099                 if (hwpoisoned) {                2099                 if (hwpoisoned) {
2100                         swp_entry = make_hwpo    2100                         swp_entry = make_hwpoison_entry(page);
2101                 } else {                         2101                 } else {
2102                         swp_entry = make_pois    2102                         swp_entry = make_poisoned_swp_entry();
2103                 }                                2103                 }
2104                 new_pte = swp_entry_to_pte(sw    2104                 new_pte = swp_entry_to_pte(swp_entry);
2105                 ret = 0;                         2105                 ret = 0;
2106                 goto setpte;                     2106                 goto setpte;
2107         }                                        2107         }
2108                                                  2108 
2109         /*                                       2109         /*
2110          * Some architectures may have to res    2110          * Some architectures may have to restore extra metadata to the page
2111          * when reading from swap. This metad    2111          * when reading from swap. This metadata may be indexed by swap entry
2112          * so this must be called before swap    2112          * so this must be called before swap_free().
2113          */                                      2113          */
2114         arch_swap_restore(folio_swap(entry, f    2114         arch_swap_restore(folio_swap(entry, folio), folio);
2115                                                  2115 
2116         dec_mm_counter(vma->vm_mm, MM_SWAPENT    2116         dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
2117         inc_mm_counter(vma->vm_mm, MM_ANONPAG    2117         inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
2118         folio_get(folio);                        2118         folio_get(folio);
2119         if (folio == swapcache) {                2119         if (folio == swapcache) {
2120                 rmap_t rmap_flags = RMAP_NONE    2120                 rmap_t rmap_flags = RMAP_NONE;
2121                                                  2121 
2122                 /*                               2122                 /*
2123                  * See do_swap_page(): writeb    2123                  * See do_swap_page(): writeback would be problematic.
2124                  * However, we do a folio_wai    2124                  * However, we do a folio_wait_writeback() just before this
2125                  * call and have the folio lo    2125                  * call and have the folio locked.
2126                  */                              2126                  */
2127                 VM_BUG_ON_FOLIO(folio_test_wr    2127                 VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
2128                 if (pte_swp_exclusive(old_pte    2128                 if (pte_swp_exclusive(old_pte))
2129                         rmap_flags |= RMAP_EX    2129                         rmap_flags |= RMAP_EXCLUSIVE;
2130                 /*                               2130                 /*
2131                  * We currently only expect s    2131                  * We currently only expect small !anon folios, which are either
2132                  * fully exclusive or fully s    2132                  * fully exclusive or fully shared. If we ever get large folios
2133                  * here, we have to be carefu    2133                  * here, we have to be careful.
2134                  */                              2134                  */
2135                 if (!folio_test_anon(folio))     2135                 if (!folio_test_anon(folio)) {
2136                         VM_WARN_ON_ONCE(folio    2136                         VM_WARN_ON_ONCE(folio_test_large(folio));
2137                         VM_WARN_ON_FOLIO(!fol    2137                         VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
2138                         folio_add_new_anon_rm    2138                         folio_add_new_anon_rmap(folio, vma, addr, rmap_flags);
2139                 } else {                         2139                 } else {
2140                         folio_add_anon_rmap_p    2140                         folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags);
2141                 }                                2141                 }
2142         } else { /* ksm created a completely     2142         } else { /* ksm created a completely new copy */
2143                 folio_add_new_anon_rmap(folio    2143                 folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
2144                 folio_add_lru_vma(folio, vma)    2144                 folio_add_lru_vma(folio, vma);
2145         }                                        2145         }
2146         new_pte = pte_mkold(mk_pte(page, vma-    2146         new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
2147         if (pte_swp_soft_dirty(old_pte))         2147         if (pte_swp_soft_dirty(old_pte))
2148                 new_pte = pte_mksoft_dirty(ne    2148                 new_pte = pte_mksoft_dirty(new_pte);
2149         if (pte_swp_uffd_wp(old_pte))            2149         if (pte_swp_uffd_wp(old_pte))
2150                 new_pte = pte_mkuffd_wp(new_p    2150                 new_pte = pte_mkuffd_wp(new_pte);
2151 setpte:                                          2151 setpte:
2152         set_pte_at(vma->vm_mm, addr, pte, new    2152         set_pte_at(vma->vm_mm, addr, pte, new_pte);
2153         swap_free(entry);                        2153         swap_free(entry);
2154 out:                                             2154 out:
2155         if (pte)                                 2155         if (pte)
2156                 pte_unmap_unlock(pte, ptl);      2156                 pte_unmap_unlock(pte, ptl);
2157         if (folio != swapcache) {                2157         if (folio != swapcache) {
2158                 folio_unlock(folio);             2158                 folio_unlock(folio);
2159                 folio_put(folio);                2159                 folio_put(folio);
2160         }                                        2160         }
2161         return ret;                              2161         return ret;
2162 }                                                2162 }
2163                                                  2163 
2164 static int unuse_pte_range(struct vm_area_str    2164 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
2165                         unsigned long addr, u    2165                         unsigned long addr, unsigned long end,
2166                         unsigned int type)       2166                         unsigned int type)
2167 {                                                2167 {
2168         pte_t *pte = NULL;                       2168         pte_t *pte = NULL;
2169         struct swap_info_struct *si;             2169         struct swap_info_struct *si;
2170                                                  2170 
2171         si = swap_info[type];                    2171         si = swap_info[type];
2172         do {                                     2172         do {
2173                 struct folio *folio;             2173                 struct folio *folio;
2174                 unsigned long offset;            2174                 unsigned long offset;
2175                 unsigned char swp_count;         2175                 unsigned char swp_count;
2176                 swp_entry_t entry;               2176                 swp_entry_t entry;
2177                 int ret;                         2177                 int ret;
2178                 pte_t ptent;                     2178                 pte_t ptent;
2179                                                  2179 
2180                 if (!pte++) {                    2180                 if (!pte++) {
2181                         pte = pte_offset_map(    2181                         pte = pte_offset_map(pmd, addr);
2182                         if (!pte)                2182                         if (!pte)
2183                                 break;           2183                                 break;
2184                 }                                2184                 }
2185                                                  2185 
2186                 ptent = ptep_get_lockless(pte    2186                 ptent = ptep_get_lockless(pte);
2187                                                  2187 
2188                 if (!is_swap_pte(ptent))         2188                 if (!is_swap_pte(ptent))
2189                         continue;                2189                         continue;
2190                                                  2190 
2191                 entry = pte_to_swp_entry(pten    2191                 entry = pte_to_swp_entry(ptent);
2192                 if (swp_type(entry) != type)     2192                 if (swp_type(entry) != type)
2193                         continue;                2193                         continue;
2194                                                  2194 
2195                 offset = swp_offset(entry);      2195                 offset = swp_offset(entry);
2196                 pte_unmap(pte);                  2196                 pte_unmap(pte);
2197                 pte = NULL;                      2197                 pte = NULL;
2198                                                  2198 
2199                 folio = swap_cache_get_folio(    2199                 folio = swap_cache_get_folio(entry, vma, addr);
2200                 if (!folio) {                    2200                 if (!folio) {
2201                         struct vm_fault vmf =    2201                         struct vm_fault vmf = {
2202                                 .vma = vma,      2202                                 .vma = vma,
2203                                 .address = ad    2203                                 .address = addr,
2204                                 .real_address    2204                                 .real_address = addr,
2205                                 .pmd = pmd,      2205                                 .pmd = pmd,
2206                         };                       2206                         };
2207                                                  2207 
2208                         folio = swapin_readah    2208                         folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
2209                                                  2209                                                 &vmf);
2210                 }                                2210                 }
2211                 if (!folio) {                    2211                 if (!folio) {
2212                         swp_count = READ_ONCE    2212                         swp_count = READ_ONCE(si->swap_map[offset]);
2213                         if (swp_count == 0 ||    2213                         if (swp_count == 0 || swp_count == SWAP_MAP_BAD)
2214                                 continue;        2214                                 continue;
2215                         return -ENOMEM;          2215                         return -ENOMEM;
2216                 }                                2216                 }
2217                                                  2217 
2218                 folio_lock(folio);               2218                 folio_lock(folio);
2219                 folio_wait_writeback(folio);     2219                 folio_wait_writeback(folio);
2220                 ret = unuse_pte(vma, pmd, add    2220                 ret = unuse_pte(vma, pmd, addr, entry, folio);
2221                 if (ret < 0) {                   2221                 if (ret < 0) {
2222                         folio_unlock(folio);     2222                         folio_unlock(folio);
2223                         folio_put(folio);        2223                         folio_put(folio);
2224                         return ret;              2224                         return ret;
2225                 }                                2225                 }
2226                                                  2226 
2227                 folio_free_swap(folio);          2227                 folio_free_swap(folio);
2228                 folio_unlock(folio);             2228                 folio_unlock(folio);
2229                 folio_put(folio);                2229                 folio_put(folio);
2230         } while (addr += PAGE_SIZE, addr != e    2230         } while (addr += PAGE_SIZE, addr != end);
2231                                                  2231 
2232         if (pte)                                 2232         if (pte)
2233                 pte_unmap(pte);                  2233                 pte_unmap(pte);
2234         return 0;                                2234         return 0;
2235 }                                                2235 }
2236                                                  2236 
2237 static inline int unuse_pmd_range(struct vm_a    2237 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
2238                                 unsigned long    2238                                 unsigned long addr, unsigned long end,
2239                                 unsigned int     2239                                 unsigned int type)
2240 {                                                2240 {
2241         pmd_t *pmd;                              2241         pmd_t *pmd;
2242         unsigned long next;                      2242         unsigned long next;
2243         int ret;                                 2243         int ret;
2244                                                  2244 
2245         pmd = pmd_offset(pud, addr);             2245         pmd = pmd_offset(pud, addr);
2246         do {                                     2246         do {
2247                 cond_resched();                  2247                 cond_resched();
2248                 next = pmd_addr_end(addr, end    2248                 next = pmd_addr_end(addr, end);
2249                 ret = unuse_pte_range(vma, pm    2249                 ret = unuse_pte_range(vma, pmd, addr, next, type);
2250                 if (ret)                         2250                 if (ret)
2251                         return ret;              2251                         return ret;
2252         } while (pmd++, addr = next, addr !=     2252         } while (pmd++, addr = next, addr != end);
2253         return 0;                                2253         return 0;
2254 }                                                2254 }
2255                                                  2255 
2256 static inline int unuse_pud_range(struct vm_a    2256 static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
2257                                 unsigned long    2257                                 unsigned long addr, unsigned long end,
2258                                 unsigned int     2258                                 unsigned int type)
2259 {                                                2259 {
2260         pud_t *pud;                              2260         pud_t *pud;
2261         unsigned long next;                      2261         unsigned long next;
2262         int ret;                                 2262         int ret;
2263                                                  2263 
2264         pud = pud_offset(p4d, addr);             2264         pud = pud_offset(p4d, addr);
2265         do {                                     2265         do {
2266                 next = pud_addr_end(addr, end    2266                 next = pud_addr_end(addr, end);
2267                 if (pud_none_or_clear_bad(pud    2267                 if (pud_none_or_clear_bad(pud))
2268                         continue;                2268                         continue;
2269                 ret = unuse_pmd_range(vma, pu    2269                 ret = unuse_pmd_range(vma, pud, addr, next, type);
2270                 if (ret)                         2270                 if (ret)
2271                         return ret;              2271                         return ret;
2272         } while (pud++, addr = next, addr !=     2272         } while (pud++, addr = next, addr != end);
2273         return 0;                                2273         return 0;
2274 }                                                2274 }
2275                                                  2275 
2276 static inline int unuse_p4d_range(struct vm_a    2276 static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
2277                                 unsigned long    2277                                 unsigned long addr, unsigned long end,
2278                                 unsigned int     2278                                 unsigned int type)
2279 {                                                2279 {
2280         p4d_t *p4d;                              2280         p4d_t *p4d;
2281         unsigned long next;                      2281         unsigned long next;
2282         int ret;                                 2282         int ret;
2283                                                  2283 
2284         p4d = p4d_offset(pgd, addr);             2284         p4d = p4d_offset(pgd, addr);
2285         do {                                     2285         do {
2286                 next = p4d_addr_end(addr, end    2286                 next = p4d_addr_end(addr, end);
2287                 if (p4d_none_or_clear_bad(p4d    2287                 if (p4d_none_or_clear_bad(p4d))
2288                         continue;                2288                         continue;
2289                 ret = unuse_pud_range(vma, p4    2289                 ret = unuse_pud_range(vma, p4d, addr, next, type);
2290                 if (ret)                         2290                 if (ret)
2291                         return ret;              2291                         return ret;
2292         } while (p4d++, addr = next, addr !=     2292         } while (p4d++, addr = next, addr != end);
2293         return 0;                                2293         return 0;
2294 }                                                2294 }
2295                                                  2295 
2296 static int unuse_vma(struct vm_area_struct *v    2296 static int unuse_vma(struct vm_area_struct *vma, unsigned int type)
2297 {                                                2297 {
2298         pgd_t *pgd;                              2298         pgd_t *pgd;
2299         unsigned long addr, end, next;           2299         unsigned long addr, end, next;
2300         int ret;                                 2300         int ret;
2301                                                  2301 
2302         addr = vma->vm_start;                    2302         addr = vma->vm_start;
2303         end = vma->vm_end;                       2303         end = vma->vm_end;
2304                                                  2304 
2305         pgd = pgd_offset(vma->vm_mm, addr);      2305         pgd = pgd_offset(vma->vm_mm, addr);
2306         do {                                     2306         do {
2307                 next = pgd_addr_end(addr, end    2307                 next = pgd_addr_end(addr, end);
2308                 if (pgd_none_or_clear_bad(pgd    2308                 if (pgd_none_or_clear_bad(pgd))
2309                         continue;                2309                         continue;
2310                 ret = unuse_p4d_range(vma, pg    2310                 ret = unuse_p4d_range(vma, pgd, addr, next, type);
2311                 if (ret)                         2311                 if (ret)
2312                         return ret;              2312                         return ret;
2313         } while (pgd++, addr = next, addr !=     2313         } while (pgd++, addr = next, addr != end);
2314         return 0;                                2314         return 0;
2315 }                                                2315 }
2316                                                  2316 
2317 static int unuse_mm(struct mm_struct *mm, uns    2317 static int unuse_mm(struct mm_struct *mm, unsigned int type)
2318 {                                                2318 {
2319         struct vm_area_struct *vma;              2319         struct vm_area_struct *vma;
2320         int ret = 0;                             2320         int ret = 0;
2321         VMA_ITERATOR(vmi, mm, 0);                2321         VMA_ITERATOR(vmi, mm, 0);
2322                                                  2322 
2323         mmap_read_lock(mm);                      2323         mmap_read_lock(mm);
2324         for_each_vma(vmi, vma) {                 2324         for_each_vma(vmi, vma) {
2325                 if (vma->anon_vma && !is_vm_h    2325                 if (vma->anon_vma && !is_vm_hugetlb_page(vma)) {
2326                         ret = unuse_vma(vma,     2326                         ret = unuse_vma(vma, type);
2327                         if (ret)                 2327                         if (ret)
2328                                 break;           2328                                 break;
2329                 }                                2329                 }
2330                                                  2330 
2331                 cond_resched();                  2331                 cond_resched();
2332         }                                        2332         }
2333         mmap_read_unlock(mm);                    2333         mmap_read_unlock(mm);
2334         return ret;                              2334         return ret;
2335 }                                                2335 }
2336                                                  2336 
2337 /*                                               2337 /*
2338  * Scan swap_map from current position to nex    2338  * Scan swap_map from current position to next entry still in use.
2339  * Return 0 if there are no inuse entries aft    2339  * Return 0 if there are no inuse entries after prev till end of
2340  * the map.                                      2340  * the map.
2341  */                                              2341  */
2342 static unsigned int find_next_to_unuse(struct    2342 static unsigned int find_next_to_unuse(struct swap_info_struct *si,
2343                                         unsig    2343                                         unsigned int prev)
2344 {                                                2344 {
2345         unsigned int i;                          2345         unsigned int i;
2346         unsigned char count;                     2346         unsigned char count;
2347                                                  2347 
2348         /*                                       2348         /*
2349          * No need for swap_lock here: we're     2349          * No need for swap_lock here: we're just looking
2350          * for whether an entry is in use, no    2350          * for whether an entry is in use, not modifying it; false
2351          * hits are okay, and sys_swapoff() h    2351          * hits are okay, and sys_swapoff() has already prevented new
2352          * allocations from this area (while     2352          * allocations from this area (while holding swap_lock).
2353          */                                      2353          */
2354         for (i = prev + 1; i < si->max; i++)     2354         for (i = prev + 1; i < si->max; i++) {
2355                 count = READ_ONCE(si->swap_ma    2355                 count = READ_ONCE(si->swap_map[i]);
2356                 if (count && swap_count(count    2356                 if (count && swap_count(count) != SWAP_MAP_BAD)
2357                         break;                   2357                         break;
2358                 if ((i % LATENCY_LIMIT) == 0)    2358                 if ((i % LATENCY_LIMIT) == 0)
2359                         cond_resched();          2359                         cond_resched();
2360         }                                        2360         }
2361                                                  2361 
2362         if (i == si->max)                        2362         if (i == si->max)
2363                 i = 0;                           2363                 i = 0;
2364                                                  2364 
2365         return i;                                2365         return i;
2366 }                                                2366 }
2367                                                  2367 
2368 static int try_to_unuse(unsigned int type)       2368 static int try_to_unuse(unsigned int type)
2369 {                                                2369 {
2370         struct mm_struct *prev_mm;               2370         struct mm_struct *prev_mm;
2371         struct mm_struct *mm;                    2371         struct mm_struct *mm;
2372         struct list_head *p;                     2372         struct list_head *p;
2373         int retval = 0;                          2373         int retval = 0;
2374         struct swap_info_struct *si = swap_in    2374         struct swap_info_struct *si = swap_info[type];
2375         struct folio *folio;                     2375         struct folio *folio;
2376         swp_entry_t entry;                       2376         swp_entry_t entry;
2377         unsigned int i;                          2377         unsigned int i;
2378                                                  2378 
2379         if (!READ_ONCE(si->inuse_pages))         2379         if (!READ_ONCE(si->inuse_pages))
2380                 goto success;                    2380                 goto success;
2381                                                  2381 
2382 retry:                                           2382 retry:
2383         retval = shmem_unuse(type);              2383         retval = shmem_unuse(type);
2384         if (retval)                              2384         if (retval)
2385                 return retval;                   2385                 return retval;
2386                                                  2386 
2387         prev_mm = &init_mm;                      2387         prev_mm = &init_mm;
2388         mmget(prev_mm);                          2388         mmget(prev_mm);
2389                                                  2389 
2390         spin_lock(&mmlist_lock);                 2390         spin_lock(&mmlist_lock);
2391         p = &init_mm.mmlist;                     2391         p = &init_mm.mmlist;
2392         while (READ_ONCE(si->inuse_pages) &&     2392         while (READ_ONCE(si->inuse_pages) &&
2393                !signal_pending(current) &&       2393                !signal_pending(current) &&
2394                (p = p->next) != &init_mm.mmli    2394                (p = p->next) != &init_mm.mmlist) {
2395                                                  2395 
2396                 mm = list_entry(p, struct mm_    2396                 mm = list_entry(p, struct mm_struct, mmlist);
2397                 if (!mmget_not_zero(mm))         2397                 if (!mmget_not_zero(mm))
2398                         continue;                2398                         continue;
2399                 spin_unlock(&mmlist_lock);       2399                 spin_unlock(&mmlist_lock);
2400                 mmput(prev_mm);                  2400                 mmput(prev_mm);
2401                 prev_mm = mm;                    2401                 prev_mm = mm;
2402                 retval = unuse_mm(mm, type);     2402                 retval = unuse_mm(mm, type);
2403                 if (retval) {                    2403                 if (retval) {
2404                         mmput(prev_mm);          2404                         mmput(prev_mm);
2405                         return retval;           2405                         return retval;
2406                 }                                2406                 }
2407                                                  2407 
2408                 /*                               2408                 /*
2409                  * Make sure that we aren't c    2409                  * Make sure that we aren't completely killing
2410                  * interactive performance.      2410                  * interactive performance.
2411                  */                              2411                  */
2412                 cond_resched();                  2412                 cond_resched();
2413                 spin_lock(&mmlist_lock);         2413                 spin_lock(&mmlist_lock);
2414         }                                        2414         }
2415         spin_unlock(&mmlist_lock);               2415         spin_unlock(&mmlist_lock);
2416                                                  2416 
2417         mmput(prev_mm);                          2417         mmput(prev_mm);
2418                                                  2418 
2419         i = 0;                                   2419         i = 0;
2420         while (READ_ONCE(si->inuse_pages) &&     2420         while (READ_ONCE(si->inuse_pages) &&
2421                !signal_pending(current) &&       2421                !signal_pending(current) &&
2422                (i = find_next_to_unuse(si, i)    2422                (i = find_next_to_unuse(si, i)) != 0) {
2423                                                  2423 
2424                 entry = swp_entry(type, i);      2424                 entry = swp_entry(type, i);
2425                 folio = filemap_get_folio(swa    2425                 folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
2426                 if (IS_ERR(folio))               2426                 if (IS_ERR(folio))
2427                         continue;                2427                         continue;
2428                                                  2428 
2429                 /*                               2429                 /*
2430                  * It is conceivable that a r    2430                  * It is conceivable that a racing task removed this folio from
2431                  * swap cache just before we     2431                  * swap cache just before we acquired the page lock. The folio
2432                  * might even be back in swap    2432                  * might even be back in swap cache on another swap area. But
2433                  * that is okay, folio_free_s    2433                  * that is okay, folio_free_swap() only removes stale folios.
2434                  */                              2434                  */
2435                 folio_lock(folio);               2435                 folio_lock(folio);
2436                 folio_wait_writeback(folio);     2436                 folio_wait_writeback(folio);
2437                 folio_free_swap(folio);          2437                 folio_free_swap(folio);
2438                 folio_unlock(folio);             2438                 folio_unlock(folio);
2439                 folio_put(folio);                2439                 folio_put(folio);
2440         }                                        2440         }
2441                                                  2441 
2442         /*                                       2442         /*
2443          * Lets check again to see if there a    2443          * Lets check again to see if there are still swap entries in the map.
2444          * If yes, we would need to do retry     2444          * If yes, we would need to do retry the unuse logic again.
2445          * Under global memory pressure, swap    2445          * Under global memory pressure, swap entries can be reinserted back
2446          * into process space after the mmlis    2446          * into process space after the mmlist loop above passes over them.
2447          *                                       2447          *
2448          * Limit the number of retries? No: w    2448          * Limit the number of retries? No: when mmget_not_zero()
2449          * above fails, that mm is likely to     2449          * above fails, that mm is likely to be freeing swap from
2450          * exit_mmap(), which proceeds at its    2450          * exit_mmap(), which proceeds at its own independent pace;
2451          * and even shmem_writepage() could h    2451          * and even shmem_writepage() could have been preempted after
2452          * folio_alloc_swap(), temporarily hi    2452          * folio_alloc_swap(), temporarily hiding that swap.  It's easy
2453          * and robust (though cpu-intensive)     2453          * and robust (though cpu-intensive) just to keep retrying.
2454          */                                      2454          */
2455         if (READ_ONCE(si->inuse_pages)) {        2455         if (READ_ONCE(si->inuse_pages)) {
2456                 if (!signal_pending(current))    2456                 if (!signal_pending(current))
2457                         goto retry;              2457                         goto retry;
2458                 return -EINTR;                   2458                 return -EINTR;
2459         }                                        2459         }
2460                                                  2460 
2461 success:                                         2461 success:
2462         /*                                       2462         /*
2463          * Make sure that further cleanups af    2463          * Make sure that further cleanups after try_to_unuse() returns happen
2464          * after swap_range_free() reduces si    2464          * after swap_range_free() reduces si->inuse_pages to 0.
2465          */                                      2465          */
2466         smp_mb();                                2466         smp_mb();
2467         return 0;                                2467         return 0;
2468 }                                                2468 }
2469                                                  2469 
2470 /*                                               2470 /*
2471  * After a successful try_to_unuse, if no swa    2471  * After a successful try_to_unuse, if no swap is now in use, we know
2472  * we can empty the mmlist.  swap_lock must b    2472  * we can empty the mmlist.  swap_lock must be held on entry and exit.
2473  * Note that mmlist_lock nests inside swap_lo    2473  * Note that mmlist_lock nests inside swap_lock, and an mm must be
2474  * added to the mmlist just after page_duplic    2474  * added to the mmlist just after page_duplicate - before would be racy.
2475  */                                              2475  */
2476 static void drain_mmlist(void)                   2476 static void drain_mmlist(void)
2477 {                                                2477 {
2478         struct list_head *p, *next;              2478         struct list_head *p, *next;
2479         unsigned int type;                       2479         unsigned int type;
2480                                                  2480 
2481         for (type = 0; type < nr_swapfiles; t    2481         for (type = 0; type < nr_swapfiles; type++)
2482                 if (swap_info[type]->inuse_pa    2482                 if (swap_info[type]->inuse_pages)
2483                         return;                  2483                         return;
2484         spin_lock(&mmlist_lock);                 2484         spin_lock(&mmlist_lock);
2485         list_for_each_safe(p, next, &init_mm.    2485         list_for_each_safe(p, next, &init_mm.mmlist)
2486                 list_del_init(p);                2486                 list_del_init(p);
2487         spin_unlock(&mmlist_lock);               2487         spin_unlock(&mmlist_lock);
2488 }                                                2488 }
2489                                                  2489 
2490 /*                                               2490 /*
2491  * Free all of a swapdev's extent information    2491  * Free all of a swapdev's extent information
2492  */                                              2492  */
2493 static void destroy_swap_extents(struct swap_    2493 static void destroy_swap_extents(struct swap_info_struct *sis)
2494 {                                                2494 {
2495         while (!RB_EMPTY_ROOT(&sis->swap_exte    2495         while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
2496                 struct rb_node *rb = sis->swa    2496                 struct rb_node *rb = sis->swap_extent_root.rb_node;
2497                 struct swap_extent *se = rb_e    2497                 struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);
2498                                                  2498 
2499                 rb_erase(rb, &sis->swap_exten    2499                 rb_erase(rb, &sis->swap_extent_root);
2500                 kfree(se);                       2500                 kfree(se);
2501         }                                        2501         }
2502                                                  2502 
2503         if (sis->flags & SWP_ACTIVATED) {        2503         if (sis->flags & SWP_ACTIVATED) {
2504                 struct file *swap_file = sis-    2504                 struct file *swap_file = sis->swap_file;
2505                 struct address_space *mapping    2505                 struct address_space *mapping = swap_file->f_mapping;
2506                                                  2506 
2507                 sis->flags &= ~SWP_ACTIVATED;    2507                 sis->flags &= ~SWP_ACTIVATED;
2508                 if (mapping->a_ops->swap_deac    2508                 if (mapping->a_ops->swap_deactivate)
2509                         mapping->a_ops->swap_    2509                         mapping->a_ops->swap_deactivate(swap_file);
2510         }                                        2510         }
2511 }                                                2511 }
2512                                                  2512 
2513 /*                                               2513 /*
2514  * Add a block range (and the corresponding p    2514  * Add a block range (and the corresponding page range) into this swapdev's
2515  * extent tree.                                  2515  * extent tree.
2516  *                                               2516  *
2517  * This function rather assumes that it is ca    2517  * This function rather assumes that it is called in ascending page order.
2518  */                                              2518  */
2519 int                                              2519 int
2520 add_swap_extent(struct swap_info_struct *sis,    2520 add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
2521                 unsigned long nr_pages, secto    2521                 unsigned long nr_pages, sector_t start_block)
2522 {                                                2522 {
2523         struct rb_node **link = &sis->swap_ex    2523         struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
2524         struct swap_extent *se;                  2524         struct swap_extent *se;
2525         struct swap_extent *new_se;              2525         struct swap_extent *new_se;
2526                                                  2526 
2527         /*                                       2527         /*
2528          * place the new node at the right mo    2528          * place the new node at the right most since the
2529          * function is called in ascending pa    2529          * function is called in ascending page order.
2530          */                                      2530          */
2531         while (*link) {                          2531         while (*link) {
2532                 parent = *link;                  2532                 parent = *link;
2533                 link = &parent->rb_right;        2533                 link = &parent->rb_right;
2534         }                                        2534         }
2535                                                  2535 
2536         if (parent) {                            2536         if (parent) {
2537                 se = rb_entry(parent, struct     2537                 se = rb_entry(parent, struct swap_extent, rb_node);
2538                 BUG_ON(se->start_page + se->n    2538                 BUG_ON(se->start_page + se->nr_pages != start_page);
2539                 if (se->start_block + se->nr_    2539                 if (se->start_block + se->nr_pages == start_block) {
2540                         /* Merge it */           2540                         /* Merge it */
2541                         se->nr_pages += nr_pa    2541                         se->nr_pages += nr_pages;
2542                         return 0;                2542                         return 0;
2543                 }                                2543                 }
2544         }                                        2544         }
2545                                                  2545 
2546         /* No merge, insert a new extent. */     2546         /* No merge, insert a new extent. */
2547         new_se = kmalloc(sizeof(*se), GFP_KER    2547         new_se = kmalloc(sizeof(*se), GFP_KERNEL);
2548         if (new_se == NULL)                      2548         if (new_se == NULL)
2549                 return -ENOMEM;                  2549                 return -ENOMEM;
2550         new_se->start_page = start_page;         2550         new_se->start_page = start_page;
2551         new_se->nr_pages = nr_pages;             2551         new_se->nr_pages = nr_pages;
2552         new_se->start_block = start_block;       2552         new_se->start_block = start_block;
2553                                                  2553 
2554         rb_link_node(&new_se->rb_node, parent    2554         rb_link_node(&new_se->rb_node, parent, link);
2555         rb_insert_color(&new_se->rb_node, &si    2555         rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
2556         return 1;                                2556         return 1;
2557 }                                                2557 }
2558 EXPORT_SYMBOL_GPL(add_swap_extent);              2558 EXPORT_SYMBOL_GPL(add_swap_extent);
2559                                                  2559 
2560 /*                                               2560 /*
2561  * A `swap extent' is a simple thing which ma    2561  * A `swap extent' is a simple thing which maps a contiguous range of pages
2562  * onto a contiguous range of disk blocks.  A    2562  * onto a contiguous range of disk blocks.  A rbtree of swap extents is
2563  * built at swapon time and is then used at s    2563  * built at swapon time and is then used at swap_writepage/swap_read_folio
2564  * time for locating where on disk a page bel    2564  * time for locating where on disk a page belongs.
2565  *                                               2565  *
2566  * If the swapfile is an S_ISBLK block device    2566  * If the swapfile is an S_ISBLK block device, a single extent is installed.
2567  * This is done so that the main operating co    2567  * This is done so that the main operating code can treat S_ISBLK and S_ISREG
2568  * swap files identically.                       2568  * swap files identically.
2569  *                                               2569  *
2570  * Whether the swapdev is an S_ISREG file or     2570  * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
2571  * extent rbtree operates in PAGE_SIZE disk b    2571  * extent rbtree operates in PAGE_SIZE disk blocks.  Both S_ISREG and S_ISBLK
2572  * swapfiles are handled *identically* after     2572  * swapfiles are handled *identically* after swapon time.
2573  *                                               2573  *
2574  * For S_ISREG swapfiles, setup_swap_extents(    2574  * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
2575  * and will parse them into a rbtree, in PAGE    2575  * and will parse them into a rbtree, in PAGE_SIZE chunks.  If some stray
2576  * blocks are found which do not fall within     2576  * blocks are found which do not fall within the PAGE_SIZE alignment
2577  * requirements, they are simply tossed out -    2577  * requirements, they are simply tossed out - we will never use those blocks
2578  * for swapping.                                 2578  * for swapping.
2579  *                                               2579  *
2580  * For all swap devices we set S_SWAPFILE acr    2580  * For all swap devices we set S_SWAPFILE across the life of the swapon.  This
2581  * prevents users from writing to the swap de    2581  * prevents users from writing to the swap device, which will corrupt memory.
2582  *                                               2582  *
2583  * The amount of disk space which a single sw    2583  * The amount of disk space which a single swap extent represents varies.
2584  * Typically it is in the 1-4 megabyte range.    2584  * Typically it is in the 1-4 megabyte range.  So we can have hundreds of
2585  * extents in the rbtree. - akpm.                2585  * extents in the rbtree. - akpm.
2586  */                                              2586  */
2587 static int setup_swap_extents(struct swap_inf    2587 static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
2588 {                                                2588 {
2589         struct file *swap_file = sis->swap_fi    2589         struct file *swap_file = sis->swap_file;
2590         struct address_space *mapping = swap_    2590         struct address_space *mapping = swap_file->f_mapping;
2591         struct inode *inode = mapping->host;     2591         struct inode *inode = mapping->host;
2592         int ret;                                 2592         int ret;
2593                                                  2593 
2594         if (S_ISBLK(inode->i_mode)) {            2594         if (S_ISBLK(inode->i_mode)) {
2595                 ret = add_swap_extent(sis, 0,    2595                 ret = add_swap_extent(sis, 0, sis->max, 0);
2596                 *span = sis->pages;              2596                 *span = sis->pages;
2597                 return ret;                      2597                 return ret;
2598         }                                        2598         }
2599                                                  2599 
2600         if (mapping->a_ops->swap_activate) {     2600         if (mapping->a_ops->swap_activate) {
2601                 ret = mapping->a_ops->swap_ac    2601                 ret = mapping->a_ops->swap_activate(sis, swap_file, span);
2602                 if (ret < 0)                     2602                 if (ret < 0)
2603                         return ret;              2603                         return ret;
2604                 sis->flags |= SWP_ACTIVATED;     2604                 sis->flags |= SWP_ACTIVATED;
2605                 if ((sis->flags & SWP_FS_OPS)    2605                 if ((sis->flags & SWP_FS_OPS) &&
2606                     sio_pool_init() != 0) {      2606                     sio_pool_init() != 0) {
2607                         destroy_swap_extents(    2607                         destroy_swap_extents(sis);
2608                         return -ENOMEM;          2608                         return -ENOMEM;
2609                 }                                2609                 }
2610                 return ret;                      2610                 return ret;
2611         }                                        2611         }
2612                                                  2612 
2613         return generic_swapfile_activate(sis,    2613         return generic_swapfile_activate(sis, swap_file, span);
2614 }                                                2614 }
2615                                                  2615 
2616 static int swap_node(struct swap_info_struct     2616 static int swap_node(struct swap_info_struct *si)
2617 {                                                2617 {
2618         struct block_device *bdev;               2618         struct block_device *bdev;
2619                                                  2619 
2620         if (si->bdev)                            2620         if (si->bdev)
2621                 bdev = si->bdev;                 2621                 bdev = si->bdev;
2622         else                                     2622         else
2623                 bdev = si->swap_file->f_inode    2623                 bdev = si->swap_file->f_inode->i_sb->s_bdev;
2624                                                  2624 
2625         return bdev ? bdev->bd_disk->node_id     2625         return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
2626 }                                                2626 }
2627                                                  2627 
2628 static void setup_swap_info(struct swap_info_    2628 static void setup_swap_info(struct swap_info_struct *si, int prio,
2629                             unsigned char *sw    2629                             unsigned char *swap_map,
2630                             struct swap_clust    2630                             struct swap_cluster_info *cluster_info,
2631                             unsigned long *ze    2631                             unsigned long *zeromap)
2632 {                                                2632 {
2633         int i;                                   2633         int i;
2634                                                  2634 
2635         if (prio >= 0)                           2635         if (prio >= 0)
2636                 si->prio = prio;                 2636                 si->prio = prio;
2637         else                                     2637         else
2638                 si->prio = --least_priority;     2638                 si->prio = --least_priority;
2639         /*                                       2639         /*
2640          * the plist prio is negated because     2640          * the plist prio is negated because plist ordering is
2641          * low-to-high, while swap ordering i    2641          * low-to-high, while swap ordering is high-to-low
2642          */                                      2642          */
2643         si->list.prio = -si->prio;               2643         si->list.prio = -si->prio;
2644         for_each_node(i) {                       2644         for_each_node(i) {
2645                 if (si->prio >= 0)               2645                 if (si->prio >= 0)
2646                         si->avail_lists[i].pr    2646                         si->avail_lists[i].prio = -si->prio;
2647                 else {                           2647                 else {
2648                         if (swap_node(si) ==     2648                         if (swap_node(si) == i)
2649                                 si->avail_lis    2649                                 si->avail_lists[i].prio = 1;
2650                         else                     2650                         else
2651                                 si->avail_lis    2651                                 si->avail_lists[i].prio = -si->prio;
2652                 }                                2652                 }
2653         }                                        2653         }
2654         si->swap_map = swap_map;                 2654         si->swap_map = swap_map;
2655         si->cluster_info = cluster_info;         2655         si->cluster_info = cluster_info;
2656         si->zeromap = zeromap;                   2656         si->zeromap = zeromap;
2657 }                                                2657 }
2658                                                  2658 
2659 static void _enable_swap_info(struct swap_inf    2659 static void _enable_swap_info(struct swap_info_struct *si)
2660 {                                                2660 {
2661         si->flags |= SWP_WRITEOK;                2661         si->flags |= SWP_WRITEOK;
2662         atomic_long_add(si->pages, &nr_swap_p    2662         atomic_long_add(si->pages, &nr_swap_pages);
2663         total_swap_pages += si->pages;           2663         total_swap_pages += si->pages;
2664                                                  2664 
2665         assert_spin_locked(&swap_lock);          2665         assert_spin_locked(&swap_lock);
2666         /*                                       2666         /*
2667          * both lists are plists, and thus pr    2667          * both lists are plists, and thus priority ordered.
2668          * swap_active_head needs to be prior    2668          * swap_active_head needs to be priority ordered for swapoff(),
2669          * which on removal of any swap_info_    2669          * which on removal of any swap_info_struct with an auto-assigned
2670          * (i.e. negative) priority increment    2670          * (i.e. negative) priority increments the auto-assigned priority
2671          * of any lower-priority swap_info_st    2671          * of any lower-priority swap_info_structs.
2672          * swap_avail_head needs to be priori    2672          * swap_avail_head needs to be priority ordered for folio_alloc_swap(),
2673          * which allocates swap pages from th    2673          * which allocates swap pages from the highest available priority
2674          * swap_info_struct.                     2674          * swap_info_struct.
2675          */                                      2675          */
2676         plist_add(&si->list, &swap_active_hea    2676         plist_add(&si->list, &swap_active_head);
2677                                                  2677 
2678         /* add to available list iff swap dev    2678         /* add to available list iff swap device is not full */
2679         if (si->highest_bit)                     2679         if (si->highest_bit)
2680                 add_to_avail_list(si);           2680                 add_to_avail_list(si);
2681 }                                                2681 }
2682                                                  2682 
/*
 * Bring a newly initialised swap area online.
 *
 * Runs in three steps, always taking swap_lock before si->lock:
 *  1. with both locks held, setup_swap_info() records the priority and
 *     stores swap_map, cluster_info and zeromap in si;
 *  2. with both locks dropped, percpu_ref_resurrect() is called on
 *     si->users;
 *  3. with both locks retaken, _enable_swap_info() sets SWP_WRITEOK,
 *     accounts the area's pages and adds it to the swap plists.
 *
 * A negative prio makes setup_swap_info() pick the next automatically
 * assigned priority.
 */
static void enable_swap_info(struct swap_info_struct *si, int prio,
				unsigned char *swap_map,
				struct swap_cluster_info *cluster_info,
				unsigned long *zeromap)
{
	spin_lock(&swap_lock);
	spin_lock(&si->lock);
	setup_swap_info(si, prio, swap_map, cluster_info, zeromap);
	spin_unlock(&si->lock);
	spin_unlock(&swap_lock);
	/*
	 * Finished initializing swap device, now it's safe to reference it.
	 */
	percpu_ref_resurrect(&si->users);
	/* Retake both locks before the area is made visible on the plists. */
	spin_lock(&swap_lock);
	spin_lock(&si->lock);
	_enable_swap_info(si);
	spin_unlock(&si->lock);
	spin_unlock(&swap_lock);
}
2703                                                  2703 
/*
 * Put an existing swap area back into service using the priority and maps
 * it already carries.
 *
 * Unlike enable_swap_info(), everything happens in a single critical
 * section under swap_lock and si->lock, and si->users is not touched.
 * Because si->prio is passed back into setup_swap_info(), an area whose
 * priority is negative is given a freshly decremented auto priority.
 */
static void reinsert_swap_info(struct swap_info_struct *si)
{
	spin_lock(&swap_lock);
	spin_lock(&si->lock);
	setup_swap_info(si, si->prio, si->swap_map, si->cluster_info, si->zeromap);
	_enable_swap_info(si);
	spin_unlock(&si->lock);
	spin_unlock(&swap_lock);
}
2713                                                  2713 
2714 static bool __has_usable_swap(void)              2714 static bool __has_usable_swap(void)
2715 {                                                2715 {
2716         return !plist_head_empty(&swap_active    2716         return !plist_head_empty(&swap_active_head);
2717 }                                                2717 }
2718                                                  2718 
2719 bool has_usable_swap(void)                       2719 bool has_usable_swap(void)
2720 {                                                2720 {
2721         bool ret;                                2721         bool ret;
2722                                                  2722 
2723         spin_lock(&swap_lock);                   2723         spin_lock(&swap_lock);
2724         ret = __has_usable_swap();               2724         ret = __has_usable_swap();
2725         spin_unlock(&swap_lock);                 2725         spin_unlock(&swap_lock);
2726         return ret;                              2726         return ret;
2727 }                                                2727 }
2728                                                  2728 
2729 SYSCALL_DEFINE1(swapoff, const char __user *,    2729 SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
2730 {                                                2730 {
2731         struct swap_info_struct *p = NULL;       2731         struct swap_info_struct *p = NULL;
2732         unsigned char *swap_map;                 2732         unsigned char *swap_map;
2733         unsigned long *zeromap;                  2733         unsigned long *zeromap;
2734         struct swap_cluster_info *cluster_inf    2734         struct swap_cluster_info *cluster_info;
2735         struct file *swap_file, *victim;         2735         struct file *swap_file, *victim;
2736         struct address_space *mapping;           2736         struct address_space *mapping;
2737         struct inode *inode;                     2737         struct inode *inode;
2738         struct filename *pathname;               2738         struct filename *pathname;
2739         int err, found = 0;                      2739         int err, found = 0;
2740                                                  2740 
2741         if (!capable(CAP_SYS_ADMIN))             2741         if (!capable(CAP_SYS_ADMIN))
2742                 return -EPERM;                   2742                 return -EPERM;
2743                                                  2743 
2744         BUG_ON(!current->mm);                    2744         BUG_ON(!current->mm);
2745                                                  2745 
2746         pathname = getname(specialfile);         2746         pathname = getname(specialfile);
2747         if (IS_ERR(pathname))                    2747         if (IS_ERR(pathname))
2748                 return PTR_ERR(pathname);        2748                 return PTR_ERR(pathname);
2749                                                  2749 
2750         victim = file_open_name(pathname, O_R    2750         victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
2751         err = PTR_ERR(victim);                   2751         err = PTR_ERR(victim);
2752         if (IS_ERR(victim))                      2752         if (IS_ERR(victim))
2753                 goto out;                        2753                 goto out;
2754                                                  2754 
2755         mapping = victim->f_mapping;             2755         mapping = victim->f_mapping;
2756         spin_lock(&swap_lock);                   2756         spin_lock(&swap_lock);
2757         plist_for_each_entry(p, &swap_active_    2757         plist_for_each_entry(p, &swap_active_head, list) {
2758                 if (p->flags & SWP_WRITEOK) {    2758                 if (p->flags & SWP_WRITEOK) {
2759                         if (p->swap_file->f_m    2759                         if (p->swap_file->f_mapping == mapping) {
2760                                 found = 1;       2760                                 found = 1;
2761                                 break;           2761                                 break;
2762                         }                        2762                         }
2763                 }                                2763                 }
2764         }                                        2764         }
2765         if (!found) {                            2765         if (!found) {
2766                 err = -EINVAL;                   2766                 err = -EINVAL;
2767                 spin_unlock(&swap_lock);         2767                 spin_unlock(&swap_lock);
2768                 goto out_dput;                   2768                 goto out_dput;
2769         }                                        2769         }
2770         if (!security_vm_enough_memory_mm(cur    2770         if (!security_vm_enough_memory_mm(current->mm, p->pages))
2771                 vm_unacct_memory(p->pages);      2771                 vm_unacct_memory(p->pages);
2772         else {                                   2772         else {
2773                 err = -ENOMEM;                   2773                 err = -ENOMEM;
2774                 spin_unlock(&swap_lock);         2774                 spin_unlock(&swap_lock);
2775                 goto out_dput;                   2775                 goto out_dput;
2776         }                                        2776         }
2777         spin_lock(&p->lock);                     2777         spin_lock(&p->lock);
2778         del_from_avail_list(p);                  2778         del_from_avail_list(p);
2779         if (p->prio < 0) {                       2779         if (p->prio < 0) {
2780                 struct swap_info_struct *si =    2780                 struct swap_info_struct *si = p;
2781                 int nid;                         2781                 int nid;
2782                                                  2782 
2783                 plist_for_each_entry_continue    2783                 plist_for_each_entry_continue(si, &swap_active_head, list) {
2784                         si->prio++;              2784                         si->prio++;
2785                         si->list.prio--;         2785                         si->list.prio--;
2786                         for_each_node(nid) {     2786                         for_each_node(nid) {
2787                                 if (si->avail    2787                                 if (si->avail_lists[nid].prio != 1)
2788                                         si->a    2788                                         si->avail_lists[nid].prio--;
2789                         }                        2789                         }
2790                 }                                2790                 }
2791                 least_priority++;                2791                 least_priority++;
2792         }                                        2792         }
2793         plist_del(&p->list, &swap_active_head    2793         plist_del(&p->list, &swap_active_head);
2794         atomic_long_sub(p->pages, &nr_swap_pa    2794         atomic_long_sub(p->pages, &nr_swap_pages);
2795         total_swap_pages -= p->pages;            2795         total_swap_pages -= p->pages;
2796         p->flags &= ~SWP_WRITEOK;                2796         p->flags &= ~SWP_WRITEOK;
2797         spin_unlock(&p->lock);                   2797         spin_unlock(&p->lock);
2798         spin_unlock(&swap_lock);                 2798         spin_unlock(&swap_lock);
2799                                                  2799 
2800         disable_swap_slots_cache_lock();         2800         disable_swap_slots_cache_lock();
2801                                                  2801 
2802         set_current_oom_origin();                2802         set_current_oom_origin();
2803         err = try_to_unuse(p->type);             2803         err = try_to_unuse(p->type);
2804         clear_current_oom_origin();              2804         clear_current_oom_origin();
2805                                                  2805 
2806         if (err) {                               2806         if (err) {
2807                 /* re-insert swap space back     2807                 /* re-insert swap space back into swap_list */
2808                 reinsert_swap_info(p);           2808                 reinsert_swap_info(p);
2809                 reenable_swap_slots_cache_unl    2809                 reenable_swap_slots_cache_unlock();
2810                 goto out_dput;                   2810                 goto out_dput;
2811         }                                        2811         }
2812                                                  2812 
2813         reenable_swap_slots_cache_unlock();      2813         reenable_swap_slots_cache_unlock();
2814                                                  2814 
2815         /*                                       2815         /*
2816          * Wait for swap operations protected    2816          * Wait for swap operations protected by get/put_swap_device()
2817          * to complete.  Because of synchroni    2817          * to complete.  Because of synchronize_rcu() here, all swap
2818          * operations protected by RCU reader    2818          * operations protected by RCU reader side lock (including any
2819          * spinlock) will be waited too.  Thi    2819          * spinlock) will be waited too.  This makes it easy to
2820          * prevent folio_test_swapcache() and    2820          * prevent folio_test_swapcache() and the following swap cache
2821          * operations from racing with swapof    2821          * operations from racing with swapoff.
2822          */                                      2822          */
2823         percpu_ref_kill(&p->users);              2823         percpu_ref_kill(&p->users);
2824         synchronize_rcu();                       2824         synchronize_rcu();
2825         wait_for_completion(&p->comp);           2825         wait_for_completion(&p->comp);
2826                                                  2826 
2827         flush_work(&p->discard_work);            2827         flush_work(&p->discard_work);
2828         flush_work(&p->reclaim_work);            2828         flush_work(&p->reclaim_work);
2829                                                  2829 
2830         destroy_swap_extents(p);                 2830         destroy_swap_extents(p);
2831         if (p->flags & SWP_CONTINUED)            2831         if (p->flags & SWP_CONTINUED)
2832                 free_swap_count_continuations    2832                 free_swap_count_continuations(p);
2833                                                  2833 
2834         if (!p->bdev || !bdev_nonrot(p->bdev)    2834         if (!p->bdev || !bdev_nonrot(p->bdev))
2835                 atomic_dec(&nr_rotate_swap);     2835                 atomic_dec(&nr_rotate_swap);
2836                                                  2836 
2837         mutex_lock(&swapon_mutex);               2837         mutex_lock(&swapon_mutex);
2838         spin_lock(&swap_lock);                   2838         spin_lock(&swap_lock);
2839         spin_lock(&p->lock);                     2839         spin_lock(&p->lock);
2840         drain_mmlist();                          2840         drain_mmlist();
2841                                                  2841 
2842         /* wait for anyone still in scan_swap    2842         /* wait for anyone still in scan_swap_map_slots */
2843         p->highest_bit = 0;             /* cu    2843         p->highest_bit = 0;             /* cuts scans short */
2844         while (p->flags >= SWP_SCANNING) {       2844         while (p->flags >= SWP_SCANNING) {
2845                 spin_unlock(&p->lock);           2845                 spin_unlock(&p->lock);
2846                 spin_unlock(&swap_lock);         2846                 spin_unlock(&swap_lock);
2847                 schedule_timeout_uninterrupti    2847                 schedule_timeout_uninterruptible(1);
2848                 spin_lock(&swap_lock);           2848                 spin_lock(&swap_lock);
2849                 spin_lock(&p->lock);             2849                 spin_lock(&p->lock);
2850         }                                        2850         }
2851                                                  2851 
2852         swap_file = p->swap_file;                2852         swap_file = p->swap_file;
2853         p->swap_file = NULL;                     2853         p->swap_file = NULL;
2854         p->max = 0;                              2854         p->max = 0;
2855         swap_map = p->swap_map;                  2855         swap_map = p->swap_map;
2856         p->swap_map = NULL;                      2856         p->swap_map = NULL;
2857         zeromap = p->zeromap;                    2857         zeromap = p->zeromap;
2858         p->zeromap = NULL;                       2858         p->zeromap = NULL;
2859         cluster_info = p->cluster_info;          2859         cluster_info = p->cluster_info;
2860         p->cluster_info = NULL;                  2860         p->cluster_info = NULL;
2861         spin_unlock(&p->lock);                   2861         spin_unlock(&p->lock);
2862         spin_unlock(&swap_lock);                 2862         spin_unlock(&swap_lock);
2863         arch_swap_invalidate_area(p->type);      2863         arch_swap_invalidate_area(p->type);
2864         zswap_swapoff(p->type);                  2864         zswap_swapoff(p->type);
2865         mutex_unlock(&swapon_mutex);             2865         mutex_unlock(&swapon_mutex);
2866         free_percpu(p->percpu_cluster);          2866         free_percpu(p->percpu_cluster);
2867         p->percpu_cluster = NULL;                2867         p->percpu_cluster = NULL;
2868         free_percpu(p->cluster_next_cpu);        2868         free_percpu(p->cluster_next_cpu);
2869         p->cluster_next_cpu = NULL;              2869         p->cluster_next_cpu = NULL;
2870         vfree(swap_map);                         2870         vfree(swap_map);
2871         kvfree(zeromap);                         2871         kvfree(zeromap);
2872         kvfree(cluster_info);                    2872         kvfree(cluster_info);
2873         /* Destroy swap account information *    2873         /* Destroy swap account information */
2874         swap_cgroup_swapoff(p->type);            2874         swap_cgroup_swapoff(p->type);
2875         exit_swap_address_space(p->type);        2875         exit_swap_address_space(p->type);
2876                                                  2876 
2877         inode = mapping->host;                   2877         inode = mapping->host;
2878                                                  2878 
2879         inode_lock(inode);                       2879         inode_lock(inode);
2880         inode->i_flags &= ~S_SWAPFILE;           2880         inode->i_flags &= ~S_SWAPFILE;
2881         inode_unlock(inode);                     2881         inode_unlock(inode);
2882         filp_close(swap_file, NULL);             2882         filp_close(swap_file, NULL);
2883                                                  2883 
2884         /*                                       2884         /*
2885          * Clear the SWP_USED flag after all     2885          * Clear the SWP_USED flag after all resources are freed so that swapon
2886          * can reuse this swap_info in alloc_    2886          * can reuse this swap_info in alloc_swap_info() safely.  It is ok to
2887          * not hold p->lock after we cleared     2887          * not hold p->lock after we cleared its SWP_WRITEOK.
2888          */                                      2888          */
2889         spin_lock(&swap_lock);                   2889         spin_lock(&swap_lock);
2890         p->flags = 0;                            2890         p->flags = 0;
2891         spin_unlock(&swap_lock);                 2891         spin_unlock(&swap_lock);
2892                                                  2892 
2893         err = 0;                                 2893         err = 0;
2894         atomic_inc(&proc_poll_event);            2894         atomic_inc(&proc_poll_event);
2895         wake_up_interruptible(&proc_poll_wait    2895         wake_up_interruptible(&proc_poll_wait);
2896                                                  2896 
2897 out_dput:                                        2897 out_dput:
2898         filp_close(victim, NULL);                2898         filp_close(victim, NULL);
2899 out:                                             2899 out:
2900         putname(pathname);                       2900         putname(pathname);
2901         return err;                              2901         return err;
2902 }                                                2902 }
2903                                                  2903 
2904 #ifdef CONFIG_PROC_FS                            2904 #ifdef CONFIG_PROC_FS
2905 static __poll_t swaps_poll(struct file *file,    2905 static __poll_t swaps_poll(struct file *file, poll_table *wait)
2906 {                                                2906 {
2907         struct seq_file *seq = file->private_    2907         struct seq_file *seq = file->private_data;
2908                                                  2908 
2909         poll_wait(file, &proc_poll_wait, wait    2909         poll_wait(file, &proc_poll_wait, wait);
2910                                                  2910 
2911         if (seq->poll_event != atomic_read(&p    2911         if (seq->poll_event != atomic_read(&proc_poll_event)) {
2912                 seq->poll_event = atomic_read    2912                 seq->poll_event = atomic_read(&proc_poll_event);
2913                 return EPOLLIN | EPOLLRDNORM     2913                 return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
2914         }                                        2914         }
2915                                                  2915 
2916         return EPOLLIN | EPOLLRDNORM;            2916         return EPOLLIN | EPOLLRDNORM;
2917 }                                                2917 }
2918                                                  2918 
2919 /* iterator */                                   2919 /* iterator */
2920 static void *swap_start(struct seq_file *swap    2920 static void *swap_start(struct seq_file *swap, loff_t *pos)
2921 {                                                2921 {
2922         struct swap_info_struct *si;             2922         struct swap_info_struct *si;
2923         int type;                                2923         int type;
2924         loff_t l = *pos;                         2924         loff_t l = *pos;
2925                                                  2925 
2926         mutex_lock(&swapon_mutex);               2926         mutex_lock(&swapon_mutex);
2927                                                  2927 
2928         if (!l)                                  2928         if (!l)
2929                 return SEQ_START_TOKEN;          2929                 return SEQ_START_TOKEN;
2930                                                  2930 
2931         for (type = 0; (si = swap_type_to_swa    2931         for (type = 0; (si = swap_type_to_swap_info(type)); type++) {
2932                 if (!(si->flags & SWP_USED) |    2932                 if (!(si->flags & SWP_USED) || !si->swap_map)
2933                         continue;                2933                         continue;
2934                 if (!--l)                        2934                 if (!--l)
2935                         return si;               2935                         return si;
2936         }                                        2936         }
2937                                                  2937 
2938         return NULL;                             2938         return NULL;
2939 }                                                2939 }
2940                                                  2940 
2941 static void *swap_next(struct seq_file *swap,    2941 static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
2942 {                                                2942 {
2943         struct swap_info_struct *si = v;         2943         struct swap_info_struct *si = v;
2944         int type;                                2944         int type;
2945                                                  2945 
2946         if (v == SEQ_START_TOKEN)                2946         if (v == SEQ_START_TOKEN)
2947                 type = 0;                        2947                 type = 0;
2948         else                                     2948         else
2949                 type = si->type + 1;             2949                 type = si->type + 1;
2950                                                  2950 
2951         ++(*pos);                                2951         ++(*pos);
2952         for (; (si = swap_type_to_swap_info(t    2952         for (; (si = swap_type_to_swap_info(type)); type++) {
2953                 if (!(si->flags & SWP_USED) |    2953                 if (!(si->flags & SWP_USED) || !si->swap_map)
2954                         continue;                2954                         continue;
2955                 return si;                       2955                 return si;
2956         }                                        2956         }
2957                                                  2957 
2958         return NULL;                             2958         return NULL;
2959 }                                                2959 }
2960                                                  2960 
2961 static void swap_stop(struct seq_file *swap,     2961 static void swap_stop(struct seq_file *swap, void *v)
2962 {                                                2962 {
2963         mutex_unlock(&swapon_mutex);             2963         mutex_unlock(&swapon_mutex);
2964 }                                                2964 }
2965                                                  2965 
2966 static int swap_show(struct seq_file *swap, v    2966 static int swap_show(struct seq_file *swap, void *v)
2967 {                                                2967 {
2968         struct swap_info_struct *si = v;         2968         struct swap_info_struct *si = v;
2969         struct file *file;                       2969         struct file *file;
2970         int len;                                 2970         int len;
2971         unsigned long bytes, inuse;              2971         unsigned long bytes, inuse;
2972                                                  2972 
2973         if (si == SEQ_START_TOKEN) {             2973         if (si == SEQ_START_TOKEN) {
2974                 seq_puts(swap, "Filename\t\t\    2974                 seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
2975                 return 0;                        2975                 return 0;
2976         }                                        2976         }
2977                                                  2977 
2978         bytes = K(si->pages);                    2978         bytes = K(si->pages);
2979         inuse = K(READ_ONCE(si->inuse_pages))    2979         inuse = K(READ_ONCE(si->inuse_pages));
2980                                                  2980 
2981         file = si->swap_file;                    2981         file = si->swap_file;
2982         len = seq_file_path(swap, file, " \t\    2982         len = seq_file_path(swap, file, " \t\n\\");
2983         seq_printf(swap, "%*s%s\t%lu\t%s%lu\t    2983         seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
2984                         len < 40 ? 40 - len :    2984                         len < 40 ? 40 - len : 1, " ",
2985                         S_ISBLK(file_inode(fi    2985                         S_ISBLK(file_inode(file)->i_mode) ?
2986                                 "partition" :    2986                                 "partition" : "file\t",
2987                         bytes, bytes < 100000    2987                         bytes, bytes < 10000000 ? "\t" : "",
2988                         inuse, inuse < 100000    2988                         inuse, inuse < 10000000 ? "\t" : "",
2989                         si->prio);               2989                         si->prio);
2990         return 0;                                2990         return 0;
2991 }                                                2991 }
2992                                                  2992 
2993 static const struct seq_operations swaps_op =    2993 static const struct seq_operations swaps_op = {
2994         .start =        swap_start,              2994         .start =        swap_start,
2995         .next =         swap_next,               2995         .next =         swap_next,
2996         .stop =         swap_stop,               2996         .stop =         swap_stop,
2997         .show =         swap_show                2997         .show =         swap_show
2998 };                                               2998 };
2999                                                  2999 
3000 static int swaps_open(struct inode *inode, st    3000 static int swaps_open(struct inode *inode, struct file *file)
3001 {                                                3001 {
3002         struct seq_file *seq;                    3002         struct seq_file *seq;
3003         int ret;                                 3003         int ret;
3004                                                  3004 
3005         ret = seq_open(file, &swaps_op);         3005         ret = seq_open(file, &swaps_op);
3006         if (ret)                                 3006         if (ret)
3007                 return ret;                      3007                 return ret;
3008                                                  3008 
3009         seq = file->private_data;                3009         seq = file->private_data;
3010         seq->poll_event = atomic_read(&proc_p    3010         seq->poll_event = atomic_read(&proc_poll_event);
3011         return 0;                                3011         return 0;
3012 }                                                3012 }
3013                                                  3013 
3014 static const struct proc_ops swaps_proc_ops =    3014 static const struct proc_ops swaps_proc_ops = {
3015         .proc_flags     = PROC_ENTRY_PERMANEN    3015         .proc_flags     = PROC_ENTRY_PERMANENT,
3016         .proc_open      = swaps_open,            3016         .proc_open      = swaps_open,
3017         .proc_read      = seq_read,              3017         .proc_read      = seq_read,
3018         .proc_lseek     = seq_lseek,             3018         .proc_lseek     = seq_lseek,
3019         .proc_release   = seq_release,           3019         .proc_release   = seq_release,
3020         .proc_poll      = swaps_poll,            3020         .proc_poll      = swaps_poll,
3021 };                                               3021 };
3022                                                  3022 
3023 static int __init procswaps_init(void)           3023 static int __init procswaps_init(void)
3024 {                                                3024 {
3025         proc_create("swaps", 0, NULL, &swaps_    3025         proc_create("swaps", 0, NULL, &swaps_proc_ops);
3026         return 0;                                3026         return 0;
3027 }                                                3027 }
3028 __initcall(procswaps_init);                      3028 __initcall(procswaps_init);
3029 #endif /* CONFIG_PROC_FS */                      3029 #endif /* CONFIG_PROC_FS */
3030                                                  3030 
3031 #ifdef MAX_SWAPFILES_CHECK                       3031 #ifdef MAX_SWAPFILES_CHECK
3032 static int __init max_swapfiles_check(void)      3032 static int __init max_swapfiles_check(void)
3033 {                                                3033 {
3034         MAX_SWAPFILES_CHECK();                   3034         MAX_SWAPFILES_CHECK();
3035         return 0;                                3035         return 0;
3036 }                                                3036 }
3037 late_initcall(max_swapfiles_check);              3037 late_initcall(max_swapfiles_check);
3038 #endif                                           3038 #endif
3039                                                  3039 
3040 static struct swap_info_struct *alloc_swap_in    3040 static struct swap_info_struct *alloc_swap_info(void)
3041 {                                                3041 {
3042         struct swap_info_struct *p;              3042         struct swap_info_struct *p;
3043         struct swap_info_struct *defer = NULL    3043         struct swap_info_struct *defer = NULL;
3044         unsigned int type;                       3044         unsigned int type;
3045         int i;                                   3045         int i;
3046                                                  3046 
3047         p = kvzalloc(struct_size(p, avail_lis    3047         p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
3048         if (!p)                                  3048         if (!p)
3049                 return ERR_PTR(-ENOMEM);         3049                 return ERR_PTR(-ENOMEM);
3050                                                  3050 
3051         if (percpu_ref_init(&p->users, swap_u    3051         if (percpu_ref_init(&p->users, swap_users_ref_free,
3052                             PERCPU_REF_INIT_D    3052                             PERCPU_REF_INIT_DEAD, GFP_KERNEL)) {
3053                 kvfree(p);                       3053                 kvfree(p);
3054                 return ERR_PTR(-ENOMEM);         3054                 return ERR_PTR(-ENOMEM);
3055         }                                        3055         }
3056                                                  3056 
3057         spin_lock(&swap_lock);                   3057         spin_lock(&swap_lock);
3058         for (type = 0; type < nr_swapfiles; t    3058         for (type = 0; type < nr_swapfiles; type++) {
3059                 if (!(swap_info[type]->flags     3059                 if (!(swap_info[type]->flags & SWP_USED))
3060                         break;                   3060                         break;
3061         }                                        3061         }
3062         if (type >= MAX_SWAPFILES) {             3062         if (type >= MAX_SWAPFILES) {
3063                 spin_unlock(&swap_lock);         3063                 spin_unlock(&swap_lock);
3064                 percpu_ref_exit(&p->users);      3064                 percpu_ref_exit(&p->users);
3065                 kvfree(p);                       3065                 kvfree(p);
3066                 return ERR_PTR(-EPERM);          3066                 return ERR_PTR(-EPERM);
3067         }                                        3067         }
3068         if (type >= nr_swapfiles) {              3068         if (type >= nr_swapfiles) {
3069                 p->type = type;                  3069                 p->type = type;
3070                 /*                               3070                 /*
3071                  * Publish the swap_info_stru    3071                  * Publish the swap_info_struct after initializing it.
3072                  * Note that kvzalloc() above    3072                  * Note that kvzalloc() above zeroes all its fields.
3073                  */                              3073                  */
3074                 smp_store_release(&swap_info[    3074                 smp_store_release(&swap_info[type], p); /* rcu_assign_pointer() */
3075                 nr_swapfiles++;                  3075                 nr_swapfiles++;
3076         } else {                                 3076         } else {
3077                 defer = p;                       3077                 defer = p;
3078                 p = swap_info[type];             3078                 p = swap_info[type];
3079                 /*                               3079                 /*
3080                  * Do not memset this entry:     3080                  * Do not memset this entry: a racing procfs swap_next()
3081                  * would be relying on p->typ    3081                  * would be relying on p->type to remain valid.
3082                  */                              3082                  */
3083         }                                        3083         }
3084         p->swap_extent_root = RB_ROOT;           3084         p->swap_extent_root = RB_ROOT;
3085         plist_node_init(&p->list, 0);            3085         plist_node_init(&p->list, 0);
3086         for_each_node(i)                         3086         for_each_node(i)
3087                 plist_node_init(&p->avail_lis    3087                 plist_node_init(&p->avail_lists[i], 0);
3088         p->flags = SWP_USED;                     3088         p->flags = SWP_USED;
3089         spin_unlock(&swap_lock);                 3089         spin_unlock(&swap_lock);
3090         if (defer) {                             3090         if (defer) {
3091                 percpu_ref_exit(&defer->users    3091                 percpu_ref_exit(&defer->users);
3092                 kvfree(defer);                   3092                 kvfree(defer);
3093         }                                        3093         }
3094         spin_lock_init(&p->lock);                3094         spin_lock_init(&p->lock);
3095         spin_lock_init(&p->cont_lock);           3095         spin_lock_init(&p->cont_lock);
3096         init_completion(&p->comp);               3096         init_completion(&p->comp);
3097                                                  3097 
3098         return p;                                3098         return p;
3099 }                                                3099 }
3100                                                  3100 
3101 static int claim_swapfile(struct swap_info_st    3101 static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
3102 {                                                3102 {
3103         if (S_ISBLK(inode->i_mode)) {            3103         if (S_ISBLK(inode->i_mode)) {
3104                 si->bdev = I_BDEV(inode);        3104                 si->bdev = I_BDEV(inode);
3105                 /*                               3105                 /*
3106                  * Zoned block devices contai    3106                  * Zoned block devices contain zones that have a sequential
3107                  * write only restriction.  H    3107                  * write only restriction.  Hence zoned block devices are not
3108                  * suitable for swapping.  Di    3108                  * suitable for swapping.  Disallow them here.
3109                  */                              3109                  */
3110                 if (bdev_is_zoned(si->bdev))     3110                 if (bdev_is_zoned(si->bdev))
3111                         return -EINVAL;          3111                         return -EINVAL;
3112                 si->flags |= SWP_BLKDEV;         3112                 si->flags |= SWP_BLKDEV;
3113         } else if (S_ISREG(inode->i_mode)) {     3113         } else if (S_ISREG(inode->i_mode)) {
3114                 si->bdev = inode->i_sb->s_bde    3114                 si->bdev = inode->i_sb->s_bdev;
3115         }                                        3115         }
3116                                                  3116 
3117         return 0;                                3117         return 0;
3118 }                                                3118 }
3119                                                  3119 
3120                                                  3120 
/*
 * Work out how many pages a single swap device may hold.  Two things
 * limit this:
 * 1) the number of bits available for the swap offset in swp_entry_t, and
 * 2) the number of bits available in the architecture's swap pte.
 *
 * To find the widest usable mask, build a swap entry with type 0 and
 * offset ~0UL, encode it as a swap pte, decode that pte back into a
 * swp_entry_t and read the offset out again.
 *
 * Every bit of the initial ~0UL that cannot survive either encoding is
 * dropped along the way, so what remains is the largest offset that can
 * be represented; the page count is that offset plus one.
 */
unsigned long generic_max_swapfile_size(void)
{
	unsigned long max_offset;

	max_offset = swp_offset(pte_to_swp_entry(
				swp_entry_to_pte(swp_entry(0, ~0UL))));

	return max_offset + 1;
}
3142                                                  3142 
3143 /* Can be overridden by an architecture for a    3143 /* Can be overridden by an architecture for additional checks. */
3144 __weak unsigned long arch_max_swapfile_size(v    3144 __weak unsigned long arch_max_swapfile_size(void)
3145 {                                                3145 {
3146         return generic_max_swapfile_size();      3146         return generic_max_swapfile_size();
3147 }                                                3147 }
3148                                                  3148 
3149 static unsigned long read_swap_header(struct     3149 static unsigned long read_swap_header(struct swap_info_struct *si,
3150                                         union    3150                                         union swap_header *swap_header,
3151                                         struc    3151                                         struct inode *inode)
3152 {                                                3152 {
3153         int i;                                   3153         int i;
3154         unsigned long maxpages;                  3154         unsigned long maxpages;
3155         unsigned long swapfilepages;             3155         unsigned long swapfilepages;
3156         unsigned long last_page;                 3156         unsigned long last_page;
3157                                                  3157 
3158         if (memcmp("SWAPSPACE2", swap_header-    3158         if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
3159                 pr_err("Unable to find swap-s    3159                 pr_err("Unable to find swap-space signature\n");
3160                 return 0;                        3160                 return 0;
3161         }                                        3161         }
3162                                                  3162 
3163         /* swap partition endianness hack...     3163         /* swap partition endianness hack... */
3164         if (swab32(swap_header->info.version)    3164         if (swab32(swap_header->info.version) == 1) {
3165                 swab32s(&swap_header->info.ve    3165                 swab32s(&swap_header->info.version);
3166                 swab32s(&swap_header->info.la    3166                 swab32s(&swap_header->info.last_page);
3167                 swab32s(&swap_header->info.nr    3167                 swab32s(&swap_header->info.nr_badpages);
3168                 if (swap_header->info.nr_badp    3168                 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
3169                         return 0;                3169                         return 0;
3170                 for (i = 0; i < swap_header->    3170                 for (i = 0; i < swap_header->info.nr_badpages; i++)
3171                         swab32s(&swap_header-    3171                         swab32s(&swap_header->info.badpages[i]);
3172         }                                        3172         }
3173         /* Check the swap header's sub-versio    3173         /* Check the swap header's sub-version */
3174         if (swap_header->info.version != 1) {    3174         if (swap_header->info.version != 1) {
3175                 pr_warn("Unable to handle swa    3175                 pr_warn("Unable to handle swap header version %d\n",
3176                         swap_header->info.ver    3176                         swap_header->info.version);
3177                 return 0;                        3177                 return 0;
3178         }                                        3178         }
3179                                                  3179 
3180         si->lowest_bit  = 1;                     3180         si->lowest_bit  = 1;
3181         si->cluster_next = 1;                    3181         si->cluster_next = 1;
3182         si->cluster_nr = 0;                      3182         si->cluster_nr = 0;
3183                                                  3183 
3184         maxpages = swapfile_maximum_size;        3184         maxpages = swapfile_maximum_size;
3185         last_page = swap_header->info.last_pa    3185         last_page = swap_header->info.last_page;
3186         if (!last_page) {                        3186         if (!last_page) {
3187                 pr_warn("Empty swap-file\n");    3187                 pr_warn("Empty swap-file\n");
3188                 return 0;                        3188                 return 0;
3189         }                                        3189         }
3190         if (last_page > maxpages) {              3190         if (last_page > maxpages) {
3191                 pr_warn("Truncating oversized    3191                 pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
3192                         K(maxpages), K(last_p    3192                         K(maxpages), K(last_page));
3193         }                                        3193         }
3194         if (maxpages > last_page) {              3194         if (maxpages > last_page) {
3195                 maxpages = last_page + 1;        3195                 maxpages = last_page + 1;
3196                 /* p->max is an unsigned int:    3196                 /* p->max is an unsigned int: don't overflow it */
3197                 if ((unsigned int)maxpages ==    3197                 if ((unsigned int)maxpages == 0)
3198                         maxpages = UINT_MAX;     3198                         maxpages = UINT_MAX;
3199         }                                        3199         }
3200         si->highest_bit = maxpages - 1;          3200         si->highest_bit = maxpages - 1;
3201                                                  3201 
3202         if (!maxpages)                           3202         if (!maxpages)
3203                 return 0;                        3203                 return 0;
3204         swapfilepages = i_size_read(inode) >>    3204         swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
3205         if (swapfilepages && maxpages > swapf    3205         if (swapfilepages && maxpages > swapfilepages) {
3206                 pr_warn("Swap area shorter th    3206                 pr_warn("Swap area shorter than signature indicates\n");
3207                 return 0;                        3207                 return 0;
3208         }                                        3208         }
3209         if (swap_header->info.nr_badpages &&     3209         if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
3210                 return 0;                        3210                 return 0;
3211         if (swap_header->info.nr_badpages > M    3211         if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
3212                 return 0;                        3212                 return 0;
3213                                                  3213 
3214         return maxpages;                         3214         return maxpages;
3215 }                                                3215 }
3216                                                  3216 
/* swap_cluster_info entries per L1 cache line, rounded up. */
#define SWAP_CLUSTER_INFO_COLS \
	DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))

/* Clusters per SWAP_ADDRESS_SPACE_PAGES pages, rounded up. */
#define SWAP_CLUSTER_SPACE_COLS \
	DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)

/* Column count used by setup_clusters(): the larger of the two above. */
#define SWAP_CLUSTER_COLS \
	max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
3223                                                  3223 
3224 static int setup_swap_map_and_extents(struct     3224 static int setup_swap_map_and_extents(struct swap_info_struct *si,
3225                                         union    3225                                         union swap_header *swap_header,
3226                                         unsig    3226                                         unsigned char *swap_map,
3227                                         unsig    3227                                         unsigned long maxpages,
3228                                         secto    3228                                         sector_t *span)
3229 {                                                3229 {
3230         unsigned int nr_good_pages;              3230         unsigned int nr_good_pages;
3231         unsigned long i;                         3231         unsigned long i;
3232         int nr_extents;                          3232         int nr_extents;
3233                                                  3233 
3234         nr_good_pages = maxpages - 1;   /* om    3234         nr_good_pages = maxpages - 1;   /* omit header page */
3235                                                  3235 
3236         for (i = 0; i < swap_header->info.nr_    3236         for (i = 0; i < swap_header->info.nr_badpages; i++) {
3237                 unsigned int page_nr = swap_h    3237                 unsigned int page_nr = swap_header->info.badpages[i];
3238                 if (page_nr == 0 || page_nr >    3238                 if (page_nr == 0 || page_nr > swap_header->info.last_page)
3239                         return -EINVAL;          3239                         return -EINVAL;
3240                 if (page_nr < maxpages) {        3240                 if (page_nr < maxpages) {
3241                         swap_map[page_nr] = S    3241                         swap_map[page_nr] = SWAP_MAP_BAD;
3242                         nr_good_pages--;         3242                         nr_good_pages--;
3243                 }                                3243                 }
3244         }                                        3244         }
3245                                                  3245 
3246         if (nr_good_pages) {                     3246         if (nr_good_pages) {
3247                 swap_map[0] = SWAP_MAP_BAD;      3247                 swap_map[0] = SWAP_MAP_BAD;
3248                 si->max = maxpages;              3248                 si->max = maxpages;
3249                 si->pages = nr_good_pages;       3249                 si->pages = nr_good_pages;
3250                 nr_extents = setup_swap_exten    3250                 nr_extents = setup_swap_extents(si, span);
3251                 if (nr_extents < 0)              3251                 if (nr_extents < 0)
3252                         return nr_extents;       3252                         return nr_extents;
3253                 nr_good_pages = si->pages;       3253                 nr_good_pages = si->pages;
3254         }                                        3254         }
3255         if (!nr_good_pages) {                    3255         if (!nr_good_pages) {
3256                 pr_warn("Empty swap-file\n");    3256                 pr_warn("Empty swap-file\n");
3257                 return -EINVAL;                  3257                 return -EINVAL;
3258         }                                        3258         }
3259                                                  3259 
3260         return nr_extents;                       3260         return nr_extents;
3261 }                                                3261 }
3262                                                  3262 
3263 static struct swap_cluster_info *setup_cluste    3263 static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
3264                                                  3264                                                 union swap_header *swap_header,
3265                                                  3265                                                 unsigned long maxpages)
3266 {                                                3266 {
3267         unsigned long nr_clusters = DIV_ROUND    3267         unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
3268         unsigned long col = si->cluster_next     3268         unsigned long col = si->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
3269         struct swap_cluster_info *cluster_inf    3269         struct swap_cluster_info *cluster_info;
3270         unsigned long i, j, k, idx;              3270         unsigned long i, j, k, idx;
3271         int cpu, err = -ENOMEM;                  3271         int cpu, err = -ENOMEM;
3272                                                  3272 
3273         cluster_info = kvcalloc(nr_clusters,     3273         cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL);
3274         if (!cluster_info)                       3274         if (!cluster_info)
3275                 goto err;                        3275                 goto err;
3276                                                  3276 
3277         for (i = 0; i < nr_clusters; i++)        3277         for (i = 0; i < nr_clusters; i++)
3278                 spin_lock_init(&cluster_info[    3278                 spin_lock_init(&cluster_info[i].lock);
3279                                                  3279 
3280         si->cluster_next_cpu = alloc_percpu(u    3280         si->cluster_next_cpu = alloc_percpu(unsigned int);
3281         if (!si->cluster_next_cpu)               3281         if (!si->cluster_next_cpu)
3282                 goto err_free;                   3282                 goto err_free;
3283                                                  3283 
3284         /* Random start position to help with    3284         /* Random start position to help with wear leveling */
3285         for_each_possible_cpu(cpu)               3285         for_each_possible_cpu(cpu)
3286                 per_cpu(*si->cluster_next_cpu    3286                 per_cpu(*si->cluster_next_cpu, cpu) =
3287                 get_random_u32_inclusive(1, s    3287                 get_random_u32_inclusive(1, si->highest_bit);
3288                                                  3288 
3289         si->percpu_cluster = alloc_percpu(str    3289         si->percpu_cluster = alloc_percpu(struct percpu_cluster);
3290         if (!si->percpu_cluster)                 3290         if (!si->percpu_cluster)
3291                 goto err_free;                   3291                 goto err_free;
3292                                                  3292 
3293         for_each_possible_cpu(cpu) {             3293         for_each_possible_cpu(cpu) {
3294                 struct percpu_cluster *cluste    3294                 struct percpu_cluster *cluster;
3295                                                  3295 
3296                 cluster = per_cpu_ptr(si->per    3296                 cluster = per_cpu_ptr(si->percpu_cluster, cpu);
3297                 for (i = 0; i < SWAP_NR_ORDER    3297                 for (i = 0; i < SWAP_NR_ORDERS; i++)
3298                         cluster->next[i] = SW    3298                         cluster->next[i] = SWAP_NEXT_INVALID;
3299         }                                        3299         }
3300                                                  3300 
3301         /*                                       3301         /*
3302          * Mark unusable pages as unavailable    3302          * Mark unusable pages as unavailable. The clusters aren't
3303          * marked free yet, so no list operat    3303          * marked free yet, so no list operations are involved yet.
3304          *                                       3304          *
3305          * See setup_swap_map_and_extents():     3305          * See setup_swap_map_and_extents(): header page, bad pages,
3306          * and the EOF part of the last clust    3306          * and the EOF part of the last cluster.
3307          */                                      3307          */
3308         inc_cluster_info_page(si, cluster_inf    3308         inc_cluster_info_page(si, cluster_info, 0);
3309         for (i = 0; i < swap_header->info.nr_    3309         for (i = 0; i < swap_header->info.nr_badpages; i++)
3310                 inc_cluster_info_page(si, clu    3310                 inc_cluster_info_page(si, cluster_info,
3311                                       swap_he    3311                                       swap_header->info.badpages[i]);
3312         for (i = maxpages; i < round_up(maxpa    3312         for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
3313                 inc_cluster_info_page(si, clu    3313                 inc_cluster_info_page(si, cluster_info, i);
3314                                                  3314 
3315         INIT_LIST_HEAD(&si->free_clusters);      3315         INIT_LIST_HEAD(&si->free_clusters);
3316         INIT_LIST_HEAD(&si->full_clusters);      3316         INIT_LIST_HEAD(&si->full_clusters);
3317         INIT_LIST_HEAD(&si->discard_clusters)    3317         INIT_LIST_HEAD(&si->discard_clusters);
3318                                                  3318 
3319         for (i = 0; i < SWAP_NR_ORDERS; i++)     3319         for (i = 0; i < SWAP_NR_ORDERS; i++) {
3320                 INIT_LIST_HEAD(&si->nonfull_c    3320                 INIT_LIST_HEAD(&si->nonfull_clusters[i]);
3321                 INIT_LIST_HEAD(&si->frag_clus    3321                 INIT_LIST_HEAD(&si->frag_clusters[i]);
3322                 si->frag_cluster_nr[i] = 0;      3322                 si->frag_cluster_nr[i] = 0;
3323         }                                        3323         }
3324                                                  3324 
3325         /*                                       3325         /*
3326          * Reduce false cache line sharing be    3326          * Reduce false cache line sharing between cluster_info and
3327          * sharing same address space.           3327          * sharing same address space.
3328          */                                      3328          */
3329         for (k = 0; k < SWAP_CLUSTER_COLS; k+    3329         for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
3330                 j = (k + col) % SWAP_CLUSTER_    3330                 j = (k + col) % SWAP_CLUSTER_COLS;
3331                 for (i = 0; i < DIV_ROUND_UP(    3331                 for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
3332                         struct swap_cluster_i    3332                         struct swap_cluster_info *ci;
3333                         idx = i * SWAP_CLUSTE    3333                         idx = i * SWAP_CLUSTER_COLS + j;
3334                         ci = cluster_info + i    3334                         ci = cluster_info + idx;
3335                         if (idx >= nr_cluster    3335                         if (idx >= nr_clusters)
3336                                 continue;        3336                                 continue;
3337                         if (ci->count) {         3337                         if (ci->count) {
3338                                 ci->flags = C    3338                                 ci->flags = CLUSTER_FLAG_NONFULL;
3339                                 list_add_tail    3339                                 list_add_tail(&ci->list, &si->nonfull_clusters[0]);
3340                                 continue;        3340                                 continue;
3341                         }                        3341                         }
3342                         ci->flags = CLUSTER_F    3342                         ci->flags = CLUSTER_FLAG_FREE;
3343                         list_add_tail(&ci->li    3343                         list_add_tail(&ci->list, &si->free_clusters);
3344                 }                                3344                 }
3345         }                                        3345         }
3346                                                  3346 
3347         return cluster_info;                     3347         return cluster_info;
3348                                                  3348 
3349 err_free:                                        3349 err_free:
3350         kvfree(cluster_info);                    3350         kvfree(cluster_info);
3351 err:                                             3351 err:
3352         return ERR_PTR(err);                     3352         return ERR_PTR(err);
3353 }                                                3353 }
3354                                                  3354 
3355 SYSCALL_DEFINE2(swapon, const char __user *,     3355 SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
3356 {                                                3356 {
3357         struct swap_info_struct *si;             3357         struct swap_info_struct *si;
3358         struct filename *name;                   3358         struct filename *name;
3359         struct file *swap_file = NULL;           3359         struct file *swap_file = NULL;
3360         struct address_space *mapping;           3360         struct address_space *mapping;
3361         struct dentry *dentry;                   3361         struct dentry *dentry;
3362         int prio;                                3362         int prio;
3363         int error;                               3363         int error;
3364         union swap_header *swap_header;          3364         union swap_header *swap_header;
3365         int nr_extents;                          3365         int nr_extents;
3366         sector_t span;                           3366         sector_t span;
3367         unsigned long maxpages;                  3367         unsigned long maxpages;
3368         unsigned char *swap_map = NULL;          3368         unsigned char *swap_map = NULL;
3369         unsigned long *zeromap = NULL;           3369         unsigned long *zeromap = NULL;
3370         struct swap_cluster_info *cluster_inf    3370         struct swap_cluster_info *cluster_info = NULL;
3371         struct folio *folio = NULL;              3371         struct folio *folio = NULL;
3372         struct inode *inode = NULL;              3372         struct inode *inode = NULL;
3373         bool inced_nr_rotate_swap = false;       3373         bool inced_nr_rotate_swap = false;
3374                                                  3374 
3375         if (swap_flags & ~SWAP_FLAGS_VALID)      3375         if (swap_flags & ~SWAP_FLAGS_VALID)
3376                 return -EINVAL;                  3376                 return -EINVAL;
3377                                                  3377 
3378         if (!capable(CAP_SYS_ADMIN))             3378         if (!capable(CAP_SYS_ADMIN))
3379                 return -EPERM;                   3379                 return -EPERM;
3380                                                  3380 
3381         if (!swap_avail_heads)                   3381         if (!swap_avail_heads)
3382                 return -ENOMEM;                  3382                 return -ENOMEM;
3383                                                  3383 
3384         si = alloc_swap_info();                  3384         si = alloc_swap_info();
3385         if (IS_ERR(si))                          3385         if (IS_ERR(si))
3386                 return PTR_ERR(si);              3386                 return PTR_ERR(si);
3387                                                  3387 
3388         INIT_WORK(&si->discard_work, swap_dis    3388         INIT_WORK(&si->discard_work, swap_discard_work);
3389         INIT_WORK(&si->reclaim_work, swap_rec    3389         INIT_WORK(&si->reclaim_work, swap_reclaim_work);
3390                                                  3390 
3391         name = getname(specialfile);             3391         name = getname(specialfile);
3392         if (IS_ERR(name)) {                      3392         if (IS_ERR(name)) {
3393                 error = PTR_ERR(name);           3393                 error = PTR_ERR(name);
3394                 name = NULL;                     3394                 name = NULL;
3395                 goto bad_swap;                   3395                 goto bad_swap;
3396         }                                        3396         }
3397         swap_file = file_open_name(name, O_RD    3397         swap_file = file_open_name(name, O_RDWR | O_LARGEFILE | O_EXCL, 0);
3398         if (IS_ERR(swap_file)) {                 3398         if (IS_ERR(swap_file)) {
3399                 error = PTR_ERR(swap_file);      3399                 error = PTR_ERR(swap_file);
3400                 swap_file = NULL;                3400                 swap_file = NULL;
3401                 goto bad_swap;                   3401                 goto bad_swap;
3402         }                                        3402         }
3403                                                  3403 
3404         si->swap_file = swap_file;               3404         si->swap_file = swap_file;
3405         mapping = swap_file->f_mapping;          3405         mapping = swap_file->f_mapping;
3406         dentry = swap_file->f_path.dentry;       3406         dentry = swap_file->f_path.dentry;
3407         inode = mapping->host;                   3407         inode = mapping->host;
3408                                                  3408 
3409         error = claim_swapfile(si, inode);       3409         error = claim_swapfile(si, inode);
3410         if (unlikely(error))                     3410         if (unlikely(error))
3411                 goto bad_swap;                   3411                 goto bad_swap;
3412                                                  3412 
3413         inode_lock(inode);                       3413         inode_lock(inode);
3414         if (d_unlinked(dentry) || cant_mount(    3414         if (d_unlinked(dentry) || cant_mount(dentry)) {
3415                 error = -ENOENT;                 3415                 error = -ENOENT;
3416                 goto bad_swap_unlock_inode;      3416                 goto bad_swap_unlock_inode;
3417         }                                        3417         }
3418         if (IS_SWAPFILE(inode)) {                3418         if (IS_SWAPFILE(inode)) {
3419                 error = -EBUSY;                  3419                 error = -EBUSY;
3420                 goto bad_swap_unlock_inode;      3420                 goto bad_swap_unlock_inode;
3421         }                                        3421         }
3422                                                  3422 
3423         /*                                       3423         /*
3424          * Read the swap header.                 3424          * Read the swap header.
3425          */                                      3425          */
3426         if (!mapping->a_ops->read_folio) {       3426         if (!mapping->a_ops->read_folio) {
3427                 error = -EINVAL;                 3427                 error = -EINVAL;
3428                 goto bad_swap_unlock_inode;      3428                 goto bad_swap_unlock_inode;
3429         }                                        3429         }
3430         folio = read_mapping_folio(mapping, 0    3430         folio = read_mapping_folio(mapping, 0, swap_file);
3431         if (IS_ERR(folio)) {                     3431         if (IS_ERR(folio)) {
3432                 error = PTR_ERR(folio);          3432                 error = PTR_ERR(folio);
3433                 goto bad_swap_unlock_inode;      3433                 goto bad_swap_unlock_inode;
3434         }                                        3434         }
3435         swap_header = kmap_local_folio(folio,    3435         swap_header = kmap_local_folio(folio, 0);
3436                                                  3436 
3437         maxpages = read_swap_header(si, swap_    3437         maxpages = read_swap_header(si, swap_header, inode);
3438         if (unlikely(!maxpages)) {               3438         if (unlikely(!maxpages)) {
3439                 error = -EINVAL;                 3439                 error = -EINVAL;
3440                 goto bad_swap_unlock_inode;      3440                 goto bad_swap_unlock_inode;
3441         }                                        3441         }
3442                                                  3442 
3443         /* OK, set up the swap map and apply     3443         /* OK, set up the swap map and apply the bad block list */
3444         swap_map = vzalloc(maxpages);            3444         swap_map = vzalloc(maxpages);
3445         if (!swap_map) {                         3445         if (!swap_map) {
3446                 error = -ENOMEM;                 3446                 error = -ENOMEM;
3447                 goto bad_swap_unlock_inode;      3447                 goto bad_swap_unlock_inode;
3448         }                                        3448         }
3449                                                  3449 
3450         error = swap_cgroup_swapon(si->type,     3450         error = swap_cgroup_swapon(si->type, maxpages);
3451         if (error)                               3451         if (error)
3452                 goto bad_swap_unlock_inode;      3452                 goto bad_swap_unlock_inode;
3453                                                  3453 
3454         nr_extents = setup_swap_map_and_exten    3454         nr_extents = setup_swap_map_and_extents(si, swap_header, swap_map,
3455                                                  3455                                                 maxpages, &span);
3456         if (unlikely(nr_extents < 0)) {          3456         if (unlikely(nr_extents < 0)) {
3457                 error = nr_extents;              3457                 error = nr_extents;
3458                 goto bad_swap_unlock_inode;      3458                 goto bad_swap_unlock_inode;
3459         }                                        3459         }
3460                                                  3460 
3461         /*                                       3461         /*
3462          * Use kvmalloc_array instead of bitm    3462          * Use kvmalloc_array instead of bitmap_zalloc as the allocation order might
3463          * be above MAX_PAGE_ORDER incase of     3463          * be above MAX_PAGE_ORDER incase of a large swap file.
3464          */                                      3464          */
3465         zeromap = kvmalloc_array(BITS_TO_LONG    3465         zeromap = kvmalloc_array(BITS_TO_LONGS(maxpages), sizeof(long),
3466                                     GFP_KERNE    3466                                     GFP_KERNEL | __GFP_ZERO);
3467         if (!zeromap) {                          3467         if (!zeromap) {
3468                 error = -ENOMEM;                 3468                 error = -ENOMEM;
3469                 goto bad_swap_unlock_inode;      3469                 goto bad_swap_unlock_inode;
3470         }                                        3470         }
3471                                                  3471 
3472         if (si->bdev && bdev_stable_writes(si    3472         if (si->bdev && bdev_stable_writes(si->bdev))
3473                 si->flags |= SWP_STABLE_WRITE    3473                 si->flags |= SWP_STABLE_WRITES;
3474                                                  3474 
3475         if (si->bdev && bdev_synchronous(si->    3475         if (si->bdev && bdev_synchronous(si->bdev))
3476                 si->flags |= SWP_SYNCHRONOUS_    3476                 si->flags |= SWP_SYNCHRONOUS_IO;
3477                                                  3477 
3478         if (si->bdev && bdev_nonrot(si->bdev)    3478         if (si->bdev && bdev_nonrot(si->bdev)) {
3479                 si->flags |= SWP_SOLIDSTATE;     3479                 si->flags |= SWP_SOLIDSTATE;
3480                                                  3480 
3481                 cluster_info = setup_clusters    3481                 cluster_info = setup_clusters(si, swap_header, maxpages);
3482                 if (IS_ERR(cluster_info)) {      3482                 if (IS_ERR(cluster_info)) {
3483                         error = PTR_ERR(clust    3483                         error = PTR_ERR(cluster_info);
3484                         cluster_info = NULL;     3484                         cluster_info = NULL;
3485                         goto bad_swap_unlock_    3485                         goto bad_swap_unlock_inode;
3486                 }                                3486                 }
3487         } else {                                 3487         } else {
3488                 atomic_inc(&nr_rotate_swap);     3488                 atomic_inc(&nr_rotate_swap);
3489                 inced_nr_rotate_swap = true;     3489                 inced_nr_rotate_swap = true;
3490         }                                        3490         }
3491                                                  3491 
3492         if ((swap_flags & SWAP_FLAG_DISCARD)     3492         if ((swap_flags & SWAP_FLAG_DISCARD) &&
3493             si->bdev && bdev_max_discard_sect    3493             si->bdev && bdev_max_discard_sectors(si->bdev)) {
3494                 /*                               3494                 /*
3495                  * When discard is enabled fo    3495                  * When discard is enabled for swap with no particular
3496                  * policy flagged, we set all    3496                  * policy flagged, we set all swap discard flags here in
3497                  * order to sustain backward     3497                  * order to sustain backward compatibility with older
3498                  * swapon(8) releases.           3498                  * swapon(8) releases.
3499                  */                              3499                  */
3500                 si->flags |= (SWP_DISCARDABLE    3500                 si->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
3501                              SWP_PAGE_DISCARD    3501                              SWP_PAGE_DISCARD);
3502                                                  3502 
3503                 /*                               3503                 /*
3504                  * By flagging sys_swapon, a     3504                  * By flagging sys_swapon, a sysadmin can tell us to
3505                  * either do single-time area    3505                  * either do single-time area discards only, or to just
3506                  * perform discards for relea    3506                  * perform discards for released swap page-clusters.
3507                  * Now it's time to adjust th    3507                  * Now it's time to adjust the p->flags accordingly.
3508                  */                              3508                  */
3509                 if (swap_flags & SWAP_FLAG_DI    3509                 if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
3510                         si->flags &= ~SWP_PAG    3510                         si->flags &= ~SWP_PAGE_DISCARD;
3511                 else if (swap_flags & SWAP_FL    3511                 else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
3512                         si->flags &= ~SWP_ARE    3512                         si->flags &= ~SWP_AREA_DISCARD;
3513                                                  3513 
3514                 /* issue a swapon-time discar    3514                 /* issue a swapon-time discard if it's still required */
3515                 if (si->flags & SWP_AREA_DISC    3515                 if (si->flags & SWP_AREA_DISCARD) {
3516                         int err = discard_swa    3516                         int err = discard_swap(si);
3517                         if (unlikely(err))       3517                         if (unlikely(err))
3518                                 pr_err("swapo    3518                                 pr_err("swapon: discard_swap(%p): %d\n",
3519                                         si, e    3519                                         si, err);
3520                 }                                3520                 }
3521         }                                        3521         }
3522                                                  3522 
3523         error = init_swap_address_space(si->t    3523         error = init_swap_address_space(si->type, maxpages);
3524         if (error)                               3524         if (error)
3525                 goto bad_swap_unlock_inode;      3525                 goto bad_swap_unlock_inode;
3526                                                  3526 
3527         error = zswap_swapon(si->type, maxpag    3527         error = zswap_swapon(si->type, maxpages);
3528         if (error)                               3528         if (error)
3529                 goto free_swap_address_space;    3529                 goto free_swap_address_space;
3530                                                  3530 
3531         /*                                       3531         /*
3532          * Flush any pending IO and dirty map    3532          * Flush any pending IO and dirty mappings before we start using this
3533          * swap device.                          3533          * swap device.
3534          */                                      3534          */
3535         inode->i_flags |= S_SWAPFILE;            3535         inode->i_flags |= S_SWAPFILE;
3536         error = inode_drain_writes(inode);       3536         error = inode_drain_writes(inode);
3537         if (error) {                             3537         if (error) {
3538                 inode->i_flags &= ~S_SWAPFILE    3538                 inode->i_flags &= ~S_SWAPFILE;
3539                 goto free_swap_zswap;            3539                 goto free_swap_zswap;
3540         }                                        3540         }
3541                                                  3541 
3542         mutex_lock(&swapon_mutex);               3542         mutex_lock(&swapon_mutex);
3543         prio = -1;                               3543         prio = -1;
3544         if (swap_flags & SWAP_FLAG_PREFER)       3544         if (swap_flags & SWAP_FLAG_PREFER)
3545                 prio =                           3545                 prio =
3546                   (swap_flags & SWAP_FLAG_PRI    3546                   (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
3547         enable_swap_info(si, prio, swap_map,     3547         enable_swap_info(si, prio, swap_map, cluster_info, zeromap);
3548                                                  3548 
3549         pr_info("Adding %uk swap on %s.  Prio    3549         pr_info("Adding %uk swap on %s.  Priority:%d extents:%d across:%lluk %s%s%s%s\n",
3550                 K(si->pages), name->name, si-    3550                 K(si->pages), name->name, si->prio, nr_extents,
3551                 K((unsigned long long)span),     3551                 K((unsigned long long)span),
3552                 (si->flags & SWP_SOLIDSTATE)     3552                 (si->flags & SWP_SOLIDSTATE) ? "SS" : "",
3553                 (si->flags & SWP_DISCARDABLE)    3553                 (si->flags & SWP_DISCARDABLE) ? "D" : "",
3554                 (si->flags & SWP_AREA_DISCARD    3554                 (si->flags & SWP_AREA_DISCARD) ? "s" : "",
3555                 (si->flags & SWP_PAGE_DISCARD    3555                 (si->flags & SWP_PAGE_DISCARD) ? "c" : "");
3556                                                  3556 
3557         mutex_unlock(&swapon_mutex);             3557         mutex_unlock(&swapon_mutex);
3558         atomic_inc(&proc_poll_event);            3558         atomic_inc(&proc_poll_event);
3559         wake_up_interruptible(&proc_poll_wait    3559         wake_up_interruptible(&proc_poll_wait);
3560                                                  3560 
3561         error = 0;                               3561         error = 0;
3562         goto out;                                3562         goto out;
3563 free_swap_zswap:                                 3563 free_swap_zswap:
3564         zswap_swapoff(si->type);                 3564         zswap_swapoff(si->type);
3565 free_swap_address_space:                         3565 free_swap_address_space:
3566         exit_swap_address_space(si->type);       3566         exit_swap_address_space(si->type);
3567 bad_swap_unlock_inode:                           3567 bad_swap_unlock_inode:
3568         inode_unlock(inode);                     3568         inode_unlock(inode);
3569 bad_swap:                                        3569 bad_swap:
3570         free_percpu(si->percpu_cluster);         3570         free_percpu(si->percpu_cluster);
3571         si->percpu_cluster = NULL;               3571         si->percpu_cluster = NULL;
3572         free_percpu(si->cluster_next_cpu);       3572         free_percpu(si->cluster_next_cpu);
3573         si->cluster_next_cpu = NULL;             3573         si->cluster_next_cpu = NULL;
3574         inode = NULL;                            3574         inode = NULL;
3575         destroy_swap_extents(si);                3575         destroy_swap_extents(si);
3576         swap_cgroup_swapoff(si->type);           3576         swap_cgroup_swapoff(si->type);
3577         spin_lock(&swap_lock);                   3577         spin_lock(&swap_lock);
3578         si->swap_file = NULL;                    3578         si->swap_file = NULL;
3579         si->flags = 0;                           3579         si->flags = 0;
3580         spin_unlock(&swap_lock);                 3580         spin_unlock(&swap_lock);
3581         vfree(swap_map);                         3581         vfree(swap_map);
3582         kvfree(zeromap);                         3582         kvfree(zeromap);
3583         kvfree(cluster_info);                    3583         kvfree(cluster_info);
3584         if (inced_nr_rotate_swap)                3584         if (inced_nr_rotate_swap)
3585                 atomic_dec(&nr_rotate_swap);     3585                 atomic_dec(&nr_rotate_swap);
3586         if (swap_file)                           3586         if (swap_file)
3587                 filp_close(swap_file, NULL);     3587                 filp_close(swap_file, NULL);
3588 out:                                             3588 out:
3589         if (!IS_ERR_OR_NULL(folio))              3589         if (!IS_ERR_OR_NULL(folio))
3590                 folio_release_kmap(folio, swa    3590                 folio_release_kmap(folio, swap_header);
3591         if (name)                                3591         if (name)
3592                 putname(name);                   3592                 putname(name);
3593         if (inode)                               3593         if (inode)
3594                 inode_unlock(inode);             3594                 inode_unlock(inode);
3595         if (!error)                              3595         if (!error)
3596                 enable_swap_slots_cache();       3596                 enable_swap_slots_cache();
3597         return error;                            3597         return error;
3598 }                                                3598 }
3599                                                  3599 
3600 void si_swapinfo(struct sysinfo *val)            3600 void si_swapinfo(struct sysinfo *val)
3601 {                                                3601 {
3602         unsigned int type;                       3602         unsigned int type;
3603         unsigned long nr_to_be_unused = 0;       3603         unsigned long nr_to_be_unused = 0;
3604                                                  3604 
3605         spin_lock(&swap_lock);                   3605         spin_lock(&swap_lock);
3606         for (type = 0; type < nr_swapfiles; t    3606         for (type = 0; type < nr_swapfiles; type++) {
3607                 struct swap_info_struct *si =    3607                 struct swap_info_struct *si = swap_info[type];
3608                                                  3608 
3609                 if ((si->flags & SWP_USED) &&    3609                 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
3610                         nr_to_be_unused += RE    3610                         nr_to_be_unused += READ_ONCE(si->inuse_pages);
3611         }                                        3611         }
3612         val->freeswap = atomic_long_read(&nr_    3612         val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
3613         val->totalswap = total_swap_pages + n    3613         val->totalswap = total_swap_pages + nr_to_be_unused;
3614         spin_unlock(&swap_lock);                 3614         spin_unlock(&swap_lock);
3615 }                                                3615 }
3616                                                  3616 
3617 /*                                               3617 /*
3618  * Verify that nr swap entries are valid and     3618  * Verify that nr swap entries are valid and increment their swap map counts.
3619  *                                               3619  *
3620  * Returns error code in following case.         3620  * Returns error code in following case.
3621  * - success -> 0                                3621  * - success -> 0
3622  * - swp_entry is invalid -> EINVAL              3622  * - swp_entry is invalid -> EINVAL
3623  * - swp_entry is migration entry -> EINVAL      3623  * - swp_entry is migration entry -> EINVAL
3624  * - swap-cache reference is requested but th    3624  * - swap-cache reference is requested but there is already one. -> EEXIST
3625  * - swap-cache reference is requested but th    3625  * - swap-cache reference is requested but the entry is not used. -> ENOENT
3626  * - swap-mapped reference requested but need    3626  * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
3627  */                                              3627  */
3628 static int __swap_duplicate(swp_entry_t entry    3628 static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
3629 {                                                3629 {
3630         struct swap_info_struct *si;             3630         struct swap_info_struct *si;
3631         struct swap_cluster_info *ci;            3631         struct swap_cluster_info *ci;
3632         unsigned long offset;                    3632         unsigned long offset;
3633         unsigned char count;                     3633         unsigned char count;
3634         unsigned char has_cache;                 3634         unsigned char has_cache;
3635         int err, i;                              3635         int err, i;
3636                                                  3636 
3637         si = swp_swap_info(entry);               3637         si = swp_swap_info(entry);
3638                                                  3638 
3639         offset = swp_offset(entry);              3639         offset = swp_offset(entry);
3640         VM_WARN_ON(nr > SWAPFILE_CLUSTER - of    3640         VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
3641         VM_WARN_ON(usage == 1 && nr > 1);        3641         VM_WARN_ON(usage == 1 && nr > 1);
3642         ci = lock_cluster_or_swap_info(si, of    3642         ci = lock_cluster_or_swap_info(si, offset);
3643                                                  3643 
3644         err = 0;                                 3644         err = 0;
3645         for (i = 0; i < nr; i++) {               3645         for (i = 0; i < nr; i++) {
3646                 count = si->swap_map[offset +    3646                 count = si->swap_map[offset + i];
3647                                                  3647 
3648                 /*                               3648                 /*
3649                  * swapin_readahead() doesn't    3649                  * swapin_readahead() doesn't check if a swap entry is valid, so the
3650                  * swap entry could be SWAP_M    3650                  * swap entry could be SWAP_MAP_BAD. Check here with lock held.
3651                  */                              3651                  */
3652                 if (unlikely(swap_count(count    3652                 if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
3653                         err = -ENOENT;           3653                         err = -ENOENT;
3654                         goto unlock_out;         3654                         goto unlock_out;
3655                 }                                3655                 }
3656                                                  3656 
3657                 has_cache = count & SWAP_HAS_    3657                 has_cache = count & SWAP_HAS_CACHE;
3658                 count &= ~SWAP_HAS_CACHE;        3658                 count &= ~SWAP_HAS_CACHE;
3659                                                  3659 
3660                 if (!count && !has_cache) {      3660                 if (!count && !has_cache) {
3661                         err = -ENOENT;           3661                         err = -ENOENT;
3662                 } else if (usage == SWAP_HAS_    3662                 } else if (usage == SWAP_HAS_CACHE) {
3663                         if (has_cache)           3663                         if (has_cache)
3664                                 err = -EEXIST    3664                                 err = -EEXIST;
3665                 } else if ((count & ~COUNT_CO    3665                 } else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) {
3666                         err = -EINVAL;           3666                         err = -EINVAL;
3667                 }                                3667                 }
3668                                                  3668 
3669                 if (err)                         3669                 if (err)
3670                         goto unlock_out;         3670                         goto unlock_out;
3671         }                                        3671         }
3672                                                  3672 
3673         for (i = 0; i < nr; i++) {               3673         for (i = 0; i < nr; i++) {
3674                 count = si->swap_map[offset +    3674                 count = si->swap_map[offset + i];
3675                 has_cache = count & SWAP_HAS_    3675                 has_cache = count & SWAP_HAS_CACHE;
3676                 count &= ~SWAP_HAS_CACHE;        3676                 count &= ~SWAP_HAS_CACHE;
3677                                                  3677 
3678                 if (usage == SWAP_HAS_CACHE)     3678                 if (usage == SWAP_HAS_CACHE)
3679                         has_cache = SWAP_HAS_    3679                         has_cache = SWAP_HAS_CACHE;
3680                 else if ((count & ~COUNT_CONT    3680                 else if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
3681                         count += usage;          3681                         count += usage;
3682                 else if (swap_count_continued    3682                 else if (swap_count_continued(si, offset + i, count))
3683                         count = COUNT_CONTINU    3683                         count = COUNT_CONTINUED;
3684                 else {                           3684                 else {
3685                         /*                       3685                         /*
3686                          * Don't need to roll    3686                          * Don't need to rollback changes, because if
3687                          * usage == 1, there     3687                          * usage == 1, there must be nr == 1.
3688                          */                      3688                          */
3689                         err = -ENOMEM;           3689                         err = -ENOMEM;
3690                         goto unlock_out;         3690                         goto unlock_out;
3691                 }                                3691                 }
3692                                                  3692 
3693                 WRITE_ONCE(si->swap_map[offse    3693                 WRITE_ONCE(si->swap_map[offset + i], count | has_cache);
3694         }                                        3694         }
3695                                                  3695 
3696 unlock_out:                                      3696 unlock_out:
3697         unlock_cluster_or_swap_info(si, ci);     3697         unlock_cluster_or_swap_info(si, ci);
3698         return err;                              3698         return err;
3699 }                                                3699 }
3700                                                  3700 
3701 /*                                               3701 /*
3702  * Help swapoff by noting that swap entry bel    3702  * Help swapoff by noting that swap entry belongs to shmem/tmpfs
3703  * (in which case its reference count is neve    3703  * (in which case its reference count is never incremented).
3704  */                                              3704  */
3705 void swap_shmem_alloc(swp_entry_t entry, int     3705 void swap_shmem_alloc(swp_entry_t entry, int nr)
3706 {                                                3706 {
3707         __swap_duplicate(entry, SWAP_MAP_SHME    3707         __swap_duplicate(entry, SWAP_MAP_SHMEM, nr);
3708 }                                                3708 }
3709                                                  3709 
3710 /*                                               3710 /*
3711  * Increase reference count of swap entry by     3711  * Increase reference count of swap entry by 1.
3712  * Returns 0 for success, or -ENOMEM if a swa    3712  * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
3713  * but could not be atomically allocated.  Re    3713  * but could not be atomically allocated.  Returns 0, just as if it succeeded,
3714  * if __swap_duplicate() fails for another re    3714  * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
3715  * might occur if a page table entry has got     3715  * might occur if a page table entry has got corrupted.
3716  */                                              3716  */
3717 int swap_duplicate(swp_entry_t entry)            3717 int swap_duplicate(swp_entry_t entry)
3718 {                                                3718 {
3719         int err = 0;                             3719         int err = 0;
3720                                                  3720 
3721         while (!err && __swap_duplicate(entry    3721         while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM)
3722                 err = add_swap_count_continua    3722                 err = add_swap_count_continuation(entry, GFP_ATOMIC);
3723         return err;                              3723         return err;
3724 }                                                3724 }
3725                                                  3725 
3726 /*                                               3726 /*
3727  * @entry: first swap entry from which we all    3727  * @entry: first swap entry from which we allocate nr swap cache.
3728  *                                               3728  *
3729  * Called when allocating swap cache for exis    3729  * Called when allocating swap cache for existing swap entries,
3730  * This can return error codes. Returns 0 at     3730  * This can return error codes. Returns 0 at success.
3731  * -EEXIST means there is a swap cache.          3731  * -EEXIST means there is a swap cache.
3732  * Note: return code is different from swap_d    3732  * Note: return code is different from swap_duplicate().
3733  */                                              3733  */
3734 int swapcache_prepare(swp_entry_t entry, int     3734 int swapcache_prepare(swp_entry_t entry, int nr)
3735 {                                                3735 {
3736         return __swap_duplicate(entry, SWAP_H    3736         return __swap_duplicate(entry, SWAP_HAS_CACHE, nr);
3737 }                                                3737 }
3738                                                  3738 
3739 void swapcache_clear(struct swap_info_struct     3739 void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr)
3740 {                                                3740 {
3741         unsigned long offset = swp_offset(ent    3741         unsigned long offset = swp_offset(entry);
3742                                                  3742 
3743         cluster_swap_free_nr(si, offset, nr,     3743         cluster_swap_free_nr(si, offset, nr, SWAP_HAS_CACHE);
3744 }                                                3744 }
3745                                                  3745 
3746 struct swap_info_struct *swp_swap_info(swp_en    3746 struct swap_info_struct *swp_swap_info(swp_entry_t entry)
3747 {                                                3747 {
3748         return swap_type_to_swap_info(swp_typ    3748         return swap_type_to_swap_info(swp_type(entry));
3749 }                                                3749 }
3750                                                  3750 
3751 /*                                               3751 /*
3752  * out-of-line methods to avoid include hell.    3752  * out-of-line methods to avoid include hell.
3753  */                                              3753  */
3754 struct address_space *swapcache_mapping(struc    3754 struct address_space *swapcache_mapping(struct folio *folio)
3755 {                                                3755 {
3756         return swp_swap_info(folio->swap)->sw    3756         return swp_swap_info(folio->swap)->swap_file->f_mapping;
3757 }                                                3757 }
3758 EXPORT_SYMBOL_GPL(swapcache_mapping);            3758 EXPORT_SYMBOL_GPL(swapcache_mapping);
3759                                                  3759 
3760 pgoff_t __folio_swap_cache_index(struct folio    3760 pgoff_t __folio_swap_cache_index(struct folio *folio)
3761 {                                                3761 {
3762         return swap_cache_index(folio->swap);    3762         return swap_cache_index(folio->swap);
3763 }                                                3763 }
3764 EXPORT_SYMBOL_GPL(__folio_swap_cache_index);     3764 EXPORT_SYMBOL_GPL(__folio_swap_cache_index);
3765                                                  3765 
3766 /*                                               3766 /*
3767  * add_swap_count_continuation - called when     3767  * add_swap_count_continuation - called when a swap count is duplicated
3768  * beyond SWAP_MAP_MAX, it allocates a new pa    3768  * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
3769  * page of the original vmalloc'ed swap_map,     3769  * page of the original vmalloc'ed swap_map, to hold the continuation count
3770  * (for that entry and for its neighbouring P    3770  * (for that entry and for its neighbouring PAGE_SIZE swap entries).  Called
3771  * again when count is duplicated beyond SWAP    3771  * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
3772  *                                               3772  *
3773  * These continuation pages are seldom refere    3773  * These continuation pages are seldom referenced: the common paths all work
3774  * on the original swap_map, only referring t    3774  * on the original swap_map, only referring to a continuation page when the
3775  * low "digit" of a count is incremented or d    3775  * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
3776  *                                               3776  *
3777  * add_swap_count_continuation(, GFP_ATOMIC)     3777  * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
3778  * page table locks; if it fails, add_swap_co    3778  * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
3779  * can be called after dropping locks.           3779  * can be called after dropping locks.
3780  */                                              3780  */
3781 int add_swap_count_continuation(swp_entry_t e    3781 int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
3782 {                                                3782 {
3783         struct swap_info_struct *si;             3783         struct swap_info_struct *si;
3784         struct swap_cluster_info *ci;            3784         struct swap_cluster_info *ci;
3785         struct page *head;                       3785         struct page *head;
3786         struct page *page;                       3786         struct page *page;
3787         struct page *list_page;                  3787         struct page *list_page;
3788         pgoff_t offset;                          3788         pgoff_t offset;
3789         unsigned char count;                     3789         unsigned char count;
3790         int ret = 0;                             3790         int ret = 0;
3791                                                  3791 
3792         /*                                       3792         /*
3793          * When debugging, it's easier to use    3793          * When debugging, it's easier to use __GFP_ZERO here; but it's better
3794          * for latency not to zero a page whi    3794          * for latency not to zero a page while GFP_ATOMIC and holding locks.
3795          */                                      3795          */
3796         page = alloc_page(gfp_mask | __GFP_HI    3796         page = alloc_page(gfp_mask | __GFP_HIGHMEM);
3797                                                  3797 
3798         si = get_swap_device(entry);             3798         si = get_swap_device(entry);
3799         if (!si) {                               3799         if (!si) {
3800                 /*                               3800                 /*
3801                  * An acceptable race has occ    3801                  * An acceptable race has occurred since the failing
3802                  * __swap_duplicate(): the sw    3802                  * __swap_duplicate(): the swap device may be swapoff
3803                  */                              3803                  */
3804                 goto outer;                      3804                 goto outer;
3805         }                                        3805         }
3806         spin_lock(&si->lock);                    3806         spin_lock(&si->lock);
3807                                                  3807 
3808         offset = swp_offset(entry);              3808         offset = swp_offset(entry);
3809                                                  3809 
3810         ci = lock_cluster(si, offset);           3810         ci = lock_cluster(si, offset);
3811                                                  3811 
3812         count = swap_count(si->swap_map[offse    3812         count = swap_count(si->swap_map[offset]);
3813                                                  3813 
3814         if ((count & ~COUNT_CONTINUED) != SWA    3814         if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
3815                 /*                               3815                 /*
3816                  * The higher the swap count,    3816                  * The higher the swap count, the more likely it is that tasks
3817                  * will race to add swap coun    3817                  * will race to add swap count continuation: we need to avoid
3818                  * over-provisioning.            3818                  * over-provisioning.
3819                  */                              3819                  */
3820                 goto out;                        3820                 goto out;
3821         }                                        3821         }
3822                                                  3822 
3823         if (!page) {                             3823         if (!page) {
3824                 ret = -ENOMEM;                   3824                 ret = -ENOMEM;
3825                 goto out;                        3825                 goto out;
3826         }                                        3826         }
3827                                                  3827 
3828         head = vmalloc_to_page(si->swap_map +    3828         head = vmalloc_to_page(si->swap_map + offset);
3829         offset &= ~PAGE_MASK;                    3829         offset &= ~PAGE_MASK;
3830                                                  3830 
3831         spin_lock(&si->cont_lock);               3831         spin_lock(&si->cont_lock);
3832         /*                                       3832         /*
3833          * Page allocation does not initializ    3833          * Page allocation does not initialize the page's lru field,
3834          * but it does always reset its priva    3834          * but it does always reset its private field.
3835          */                                      3835          */
3836         if (!page_private(head)) {               3836         if (!page_private(head)) {
3837                 BUG_ON(count & COUNT_CONTINUE    3837                 BUG_ON(count & COUNT_CONTINUED);
3838                 INIT_LIST_HEAD(&head->lru);      3838                 INIT_LIST_HEAD(&head->lru);
3839                 set_page_private(head, SWP_CO    3839                 set_page_private(head, SWP_CONTINUED);
3840                 si->flags |= SWP_CONTINUED;      3840                 si->flags |= SWP_CONTINUED;
3841         }                                        3841         }
3842                                                  3842 
3843         list_for_each_entry(list_page, &head-    3843         list_for_each_entry(list_page, &head->lru, lru) {
3844                 unsigned char *map;              3844                 unsigned char *map;
3845                                                  3845 
3846                 /*                               3846                 /*
3847                  * If the previous map said n    3847                  * If the previous map said no continuation, but we've found
3848                  * a continuation page, free     3848                  * a continuation page, free our allocation and use this one.
3849                  */                              3849                  */
3850                 if (!(count & COUNT_CONTINUED    3850                 if (!(count & COUNT_CONTINUED))
3851                         goto out_unlock_cont;    3851                         goto out_unlock_cont;
3852                                                  3852 
3853                 map = kmap_local_page(list_pa    3853                 map = kmap_local_page(list_page) + offset;
3854                 count = *map;                    3854                 count = *map;
3855                 kunmap_local(map);               3855                 kunmap_local(map);
3856                                                  3856 
3857                 /*                               3857                 /*
3858                  * If this continuation count    3858                  * If this continuation count now has some space in it,
3859                  * free our allocation and us    3859                  * free our allocation and use this one.
3860                  */                              3860                  */
3861                 if ((count & ~COUNT_CONTINUED    3861                 if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
3862                         goto out_unlock_cont;    3862                         goto out_unlock_cont;
3863         }                                        3863         }
3864                                                  3864 
3865         list_add_tail(&page->lru, &head->lru)    3865         list_add_tail(&page->lru, &head->lru);
3866         page = NULL;                    /* no    3866         page = NULL;                    /* now it's attached, don't free it */
3867 out_unlock_cont:                                 3867 out_unlock_cont:
3868         spin_unlock(&si->cont_lock);             3868         spin_unlock(&si->cont_lock);
3869 out:                                             3869 out:
3870         unlock_cluster(ci);                      3870         unlock_cluster(ci);
3871         spin_unlock(&si->lock);                  3871         spin_unlock(&si->lock);
3872         put_swap_device(si);                     3872         put_swap_device(si);
3873 outer:                                           3873 outer:
3874         if (page)                                3874         if (page)
3875                 __free_page(page);               3875                 __free_page(page);
3876         return ret;                              3876         return ret;
3877 }                                                3877 }
3878                                                  3878 
3879 /*                                               3879 /*
3880  * swap_count_continued - when the original s    3880  * swap_count_continued - when the original swap_map count is incremented
3881  * from SWAP_MAP_MAX, check if there is alrea    3881  * from SWAP_MAP_MAX, check if there is already a continuation page to carry
3882  * into, carry if so, or else fail until a ne    3882  * into, carry if so, or else fail until a new continuation page is allocated;
3883  * when the original swap_map count is decrem    3883  * when the original swap_map count is decremented from 0 with continuation,
3884  * borrow from the continuation and report wh    3884  * borrow from the continuation and report whether it still holds more.
3885  * Called while __swap_duplicate() or swap_en    3885  * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster
3886  * lock.                                         3886  * lock.
3887  */                                              3887  */
3888 static bool swap_count_continued(struct swap_    3888 static bool swap_count_continued(struct swap_info_struct *si,
3889                                  pgoff_t offs    3889                                  pgoff_t offset, unsigned char count)
3890 {                                                3890 {
3891         struct page *head;                       3891         struct page *head;
3892         struct page *page;                       3892         struct page *page;
3893         unsigned char *map;                      3893         unsigned char *map;
3894         bool ret;                                3894         bool ret;
3895                                                  3895 
3896         head = vmalloc_to_page(si->swap_map +    3896         head = vmalloc_to_page(si->swap_map + offset);
3897         if (page_private(head) != SWP_CONTINU    3897         if (page_private(head) != SWP_CONTINUED) {
3898                 BUG_ON(count & COUNT_CONTINUE    3898                 BUG_ON(count & COUNT_CONTINUED);
3899                 return false;           /* ne    3899                 return false;           /* need to add count continuation */
3900         }                                        3900         }
3901                                                  3901 
3902         spin_lock(&si->cont_lock);               3902         spin_lock(&si->cont_lock);
3903         offset &= ~PAGE_MASK;                    3903         offset &= ~PAGE_MASK;
3904         page = list_next_entry(head, lru);       3904         page = list_next_entry(head, lru);
3905         map = kmap_local_page(page) + offset;    3905         map = kmap_local_page(page) + offset;
3906                                                  3906 
3907         if (count == SWAP_MAP_MAX)      /* in    3907         if (count == SWAP_MAP_MAX)      /* initial increment from swap_map */
3908                 goto init_map;          /* ju    3908                 goto init_map;          /* jump over SWAP_CONT_MAX checks */
3909                                                  3909 
3910         if (count == (SWAP_MAP_MAX | COUNT_CO    3910         if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
3911                 /*                               3911                 /*
3912                  * Think of how you add 1 to     3912                  * Think of how you add 1 to 999
3913                  */                              3913                  */
3914                 while (*map == (SWAP_CONT_MAX    3914                 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
3915                         kunmap_local(map);       3915                         kunmap_local(map);
3916                         page = list_next_entr    3916                         page = list_next_entry(page, lru);
3917                         BUG_ON(page == head);    3917                         BUG_ON(page == head);
3918                         map = kmap_local_page    3918                         map = kmap_local_page(page) + offset;
3919                 }                                3919                 }
3920                 if (*map == SWAP_CONT_MAX) {     3920                 if (*map == SWAP_CONT_MAX) {
3921                         kunmap_local(map);       3921                         kunmap_local(map);
3922                         page = list_next_entr    3922                         page = list_next_entry(page, lru);
3923                         if (page == head) {      3923                         if (page == head) {
3924                                 ret = false;     3924                                 ret = false;    /* add count continuation */
3925                                 goto out;        3925                                 goto out;
3926                         }                        3926                         }
3927                         map = kmap_local_page    3927                         map = kmap_local_page(page) + offset;
3928 init_map:               *map = 0;                3928 init_map:               *map = 0;               /* we didn't zero the page */
3929                 }                                3929                 }
3930                 *map += 1;                       3930                 *map += 1;
3931                 kunmap_local(map);               3931                 kunmap_local(map);
3932                 while ((page = list_prev_entr    3932                 while ((page = list_prev_entry(page, lru)) != head) {
3933                         map = kmap_local_page    3933                         map = kmap_local_page(page) + offset;
3934                         *map = COUNT_CONTINUE    3934                         *map = COUNT_CONTINUED;
3935                         kunmap_local(map);       3935                         kunmap_local(map);
3936                 }                                3936                 }
3937                 ret = true;                      3937                 ret = true;                     /* incremented */
3938                                                  3938 
3939         } else {                                 3939         } else {                                /* decrementing */
3940                 /*                               3940                 /*
3941                  * Think of how you subtract     3941                  * Think of how you subtract 1 from 1000
3942                  */                              3942                  */
3943                 BUG_ON(count != COUNT_CONTINU    3943                 BUG_ON(count != COUNT_CONTINUED);
3944                 while (*map == COUNT_CONTINUE    3944                 while (*map == COUNT_CONTINUED) {
3945                         kunmap_local(map);       3945                         kunmap_local(map);
3946                         page = list_next_entr    3946                         page = list_next_entry(page, lru);
3947                         BUG_ON(page == head);    3947                         BUG_ON(page == head);
3948                         map = kmap_local_page    3948                         map = kmap_local_page(page) + offset;
3949                 }                                3949                 }
3950                 BUG_ON(*map == 0);               3950                 BUG_ON(*map == 0);
3951                 *map -= 1;                       3951                 *map -= 1;
3952                 if (*map == 0)                   3952                 if (*map == 0)
3953                         count = 0;               3953                         count = 0;
3954                 kunmap_local(map);               3954                 kunmap_local(map);
3955                 while ((page = list_prev_entr    3955                 while ((page = list_prev_entry(page, lru)) != head) {
3956                         map = kmap_local_page    3956                         map = kmap_local_page(page) + offset;
3957                         *map = SWAP_CONT_MAX     3957                         *map = SWAP_CONT_MAX | count;
3958                         count = COUNT_CONTINU    3958                         count = COUNT_CONTINUED;
3959                         kunmap_local(map);       3959                         kunmap_local(map);
3960                 }                                3960                 }
3961                 ret = count == COUNT_CONTINUE    3961                 ret = count == COUNT_CONTINUED;
3962         }                                        3962         }
3963 out:                                             3963 out:
3964         spin_unlock(&si->cont_lock);             3964         spin_unlock(&si->cont_lock);
3965         return ret;                              3965         return ret;
3966 }                                                3966 }
3967                                                  3967 
3968 /*                                               3968 /*
3969  * free_swap_count_continuations - swapoff fr    3969  * free_swap_count_continuations - swapoff free all the continuation pages
3970  * appended to the swap_map, after swap_map i    3970  * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
3971  */                                              3971  */
3972 static void free_swap_count_continuations(str    3972 static void free_swap_count_continuations(struct swap_info_struct *si)
3973 {                                                3973 {
3974         pgoff_t offset;                          3974         pgoff_t offset;
3975                                                  3975 
3976         for (offset = 0; offset < si->max; of    3976         for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
3977                 struct page *head;               3977                 struct page *head;
3978                 head = vmalloc_to_page(si->sw    3978                 head = vmalloc_to_page(si->swap_map + offset);
3979                 if (page_private(head)) {        3979                 if (page_private(head)) {
3980                         struct page *page, *n    3980                         struct page *page, *next;
3981                                                  3981 
3982                         list_for_each_entry_s    3982                         list_for_each_entry_safe(page, next, &head->lru, lru) {
3983                                 list_del(&pag    3983                                 list_del(&page->lru);
3984                                 __free_page(p    3984                                 __free_page(page);
3985                         }                        3985                         }
3986                 }                                3986                 }
3987         }                                        3987         }
3988 }                                                3988 }
3989                                                  3989 
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
/*
 * __folio_throttle_swaprate - schedule a block-cgroup throttle against the
 * first block-device-backed swap area available on the folio's node.
 *
 * Does nothing when @gfp lacks __GFP_IO, when there is no usable swap, when
 * the block cgroup is not congested, or when the current task already has a
 * throttle scheduled.
 */
void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
	struct swap_info_struct *si, *next;
	int nid = folio_nid(folio);

	/* Checks are evaluated in this order; the first failure bails out. */
	if (!(gfp & __GFP_IO) || !__has_usable_swap() ||
	    !blk_cgroup_congested())
		return;

	/*
	 * A throttle is already pending for this task, so skip taking the
	 * global swap lock.
	 */
	if (current->throttle_disk)
		return;

	spin_lock(&swap_avail_lock);
	plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
				  avail_lists[nid]) {
		/* Swap areas without a block device cannot be throttled. */
		if (!si->bdev)
			continue;

		blkcg_schedule_throttle(si->bdev->bd_disk, true);
		break;
	}
	spin_unlock(&swap_avail_lock);
}
#endif
4023                                                  4023 
4024 static int __init swapfile_init(void)            4024 static int __init swapfile_init(void)
4025 {                                                4025 {
4026         int nid;                                 4026         int nid;
4027                                                  4027 
4028         swap_avail_heads = kmalloc_array(nr_n    4028         swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
4029                                          GFP_    4029                                          GFP_KERNEL);
4030         if (!swap_avail_heads) {                 4030         if (!swap_avail_heads) {
4031                 pr_emerg("Not enough memory f    4031                 pr_emerg("Not enough memory for swap heads, swap is disabled\n");
4032                 return -ENOMEM;                  4032                 return -ENOMEM;
4033         }                                        4033         }
4034                                                  4034 
4035         for_each_node(nid)                       4035         for_each_node(nid)
4036                 plist_head_init(&swap_avail_h    4036                 plist_head_init(&swap_avail_heads[nid]);
4037                                                  4037 
4038         swapfile_maximum_size = arch_max_swap    4038         swapfile_maximum_size = arch_max_swapfile_size();
4039                                                  4039 
4040 #ifdef CONFIG_MIGRATION                          4040 #ifdef CONFIG_MIGRATION
4041         if (swapfile_maximum_size >= (1UL <<     4041         if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS))
4042                 swap_migration_ad_supported =    4042                 swap_migration_ad_supported = true;
4043 #endif  /* CONFIG_MIGRATION */                   4043 #endif  /* CONFIG_MIGRATION */
4044                                                  4044 
4045         return 0;                                4045         return 0;
4046 }                                                4046 }
4047 subsys_initcall(swapfile_init);                  4047 subsys_initcall(swapfile_init);
4048                                                  4048
~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.
TOMOYO Linux Cross Reference Linux/mm/swapfile.c

Diff markup

Differences between /mm/swapfile.c (Architecture i386) and /mm/swapfile.c (Architecture sparc64)

TOMOYO Linux Cross Reference
Linux/mm/swapfile.c