1 // SPDX-License-Identifier: GPL-2.0-only 1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 2 /* 3 * linux/mm/swapfile.c 3 * linux/mm/swapfile.c 4 * 4 * 5 * Copyright (C) 1991, 1992, 1993, 1994 Linu 5 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 6 * Swap reorganised 29.12.95, Stephen Tweedie 6 * Swap reorganised 29.12.95, Stephen Tweedie 7 */ 7 */ 8 8 9 #include <linux/blkdev.h> 9 #include <linux/blkdev.h> 10 #include <linux/mm.h> 10 #include <linux/mm.h> 11 #include <linux/sched/mm.h> 11 #include <linux/sched/mm.h> 12 #include <linux/sched/task.h> 12 #include <linux/sched/task.h> 13 #include <linux/hugetlb.h> 13 #include <linux/hugetlb.h> 14 #include <linux/mman.h> 14 #include <linux/mman.h> 15 #include <linux/slab.h> 15 #include <linux/slab.h> 16 #include <linux/kernel_stat.h> 16 #include <linux/kernel_stat.h> 17 #include <linux/swap.h> 17 #include <linux/swap.h> 18 #include <linux/vmalloc.h> 18 #include <linux/vmalloc.h> 19 #include <linux/pagemap.h> 19 #include <linux/pagemap.h> 20 #include <linux/namei.h> 20 #include <linux/namei.h> 21 #include <linux/shmem_fs.h> 21 #include <linux/shmem_fs.h> 22 #include <linux/blk-cgroup.h> 22 #include <linux/blk-cgroup.h> 23 #include <linux/random.h> 23 #include <linux/random.h> 24 #include <linux/writeback.h> 24 #include <linux/writeback.h> 25 #include <linux/proc_fs.h> 25 #include <linux/proc_fs.h> 26 #include <linux/seq_file.h> 26 #include <linux/seq_file.h> 27 #include <linux/init.h> 27 #include <linux/init.h> 28 #include <linux/ksm.h> 28 #include <linux/ksm.h> 29 #include <linux/rmap.h> 29 #include <linux/rmap.h> 30 #include <linux/security.h> 30 #include <linux/security.h> 31 #include <linux/backing-dev.h> 31 #include <linux/backing-dev.h> 32 #include <linux/mutex.h> 32 #include <linux/mutex.h> 33 #include <linux/capability.h> 33 #include <linux/capability.h> 34 #include <linux/syscalls.h> 34 #include <linux/syscalls.h> 35 #include <linux/memcontrol.h> 35 #include <linux/memcontrol.h> 36 #include <linux/poll.h> 36 #include <linux/poll.h> 37 #include <linux/oom.h> 37 #include <linux/oom.h> >> 38 #include <linux/frontswap.h> 38 #include <linux/swapfile.h> 39 #include <linux/swapfile.h> 39 #include <linux/export.h> 40 #include <linux/export.h> 40 #include <linux/swap_slots.h> 41 #include <linux/swap_slots.h> 41 #include <linux/sort.h> 42 #include <linux/sort.h> 42 #include <linux/completion.h> 43 #include <linux/completion.h> 43 #include <linux/suspend.h> 44 #include <linux/suspend.h> 44 #include <linux/zswap.h> << 45 #include <linux/plist.h> << 46 45 47 #include <asm/tlbflush.h> 46 #include <asm/tlbflush.h> 48 #include <linux/swapops.h> 47 #include <linux/swapops.h> 49 #include <linux/swap_cgroup.h> 48 #include <linux/swap_cgroup.h> 50 #include "internal.h" << 51 #include "swap.h" 49 #include "swap.h" 52 50 53 static bool swap_count_continued(struct swap_i 51 static bool swap_count_continued(struct swap_info_struct *, pgoff_t, 54 unsigned char 52 unsigned char); 55 static void free_swap_count_continuations(stru 53 static void free_swap_count_continuations(struct swap_info_struct *); 56 static void swap_entry_range_free(struct swap_ << 57 unsigned int << 58 static void swap_range_alloc(struct swap_info_ << 59 unsigned int nr_e << 60 static bool folio_swapcache_freeable(struct fo << 61 static struct swap_cluster_info *lock_cluster_ << 62 struct swap_info_struct *si, u << 63 static void unlock_cluster_or_swap_info(struct << 64 struct << 65 54 66 static DEFINE_SPINLOCK(swap_lock); 55 static DEFINE_SPINLOCK(swap_lock); 67 static unsigned int 
nr_swapfiles; 56 static unsigned int nr_swapfiles; 68 atomic_long_t nr_swap_pages; 57 atomic_long_t nr_swap_pages; 69 /* 58 /* 70 * Some modules use swappable objects and may 59 * Some modules use swappable objects and may try to swap them out under 71 * memory pressure (via the shrinker). Before 60 * memory pressure (via the shrinker). Before doing so, they may wish to 72 * check to see if any swap space is available 61 * check to see if any swap space is available. 73 */ 62 */ 74 EXPORT_SYMBOL_GPL(nr_swap_pages); 63 EXPORT_SYMBOL_GPL(nr_swap_pages); 75 /* protected with swap_lock. reading in vm_swa 64 /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ 76 long total_swap_pages; 65 long total_swap_pages; 77 static int least_priority = -1; 66 static int least_priority = -1; 78 unsigned long swapfile_maximum_size; 67 unsigned long swapfile_maximum_size; 79 #ifdef CONFIG_MIGRATION 68 #ifdef CONFIG_MIGRATION 80 bool swap_migration_ad_supported; 69 bool swap_migration_ad_supported; 81 #endif /* CONFIG_MIGRATION */ 70 #endif /* CONFIG_MIGRATION */ 82 71 83 static const char Bad_file[] = "Bad swap file 72 static const char Bad_file[] = "Bad swap file entry "; 84 static const char Unused_file[] = "Unused swap 73 static const char Unused_file[] = "Unused swap file entry "; 85 static const char Bad_offset[] = "Bad swap off 74 static const char Bad_offset[] = "Bad swap offset entry "; 86 static const char Unused_offset[] = "Unused sw 75 static const char Unused_offset[] = "Unused swap offset entry "; 87 76 88 /* 77 /* 89 * all active swap_info_structs 78 * all active swap_info_structs 90 * protected with swap_lock, and ordered by pr 79 * protected with swap_lock, and ordered by priority. 91 */ 80 */ 92 static PLIST_HEAD(swap_active_head); 81 static PLIST_HEAD(swap_active_head); 93 82 94 /* 83 /* 95 * all available (active, not full) swap_info_ 84 * all available (active, not full) swap_info_structs 96 * protected with swap_avail_lock, ordered by 85 * protected with swap_avail_lock, ordered by priority. 97 * This is used by folio_alloc_swap() instead 86 * This is used by folio_alloc_swap() instead of swap_active_head 98 * because swap_active_head includes all swap_ 87 * because swap_active_head includes all swap_info_structs, 99 * but folio_alloc_swap() doesn't need to look 88 * but folio_alloc_swap() doesn't need to look at full ones. 100 * This uses its own lock instead of swap_lock 89 * This uses its own lock instead of swap_lock because when a 101 * swap_info_struct changes between not-full/f 90 * swap_info_struct changes between not-full/full, it needs to 102 * add/remove itself to/from this list, but th 91 * add/remove itself to/from this list, but the swap_info_struct->lock 103 * is held and the locking order requires swap 92 * is held and the locking order requires swap_lock to be taken 104 * before any swap_info_struct->lock. 93 * before any swap_info_struct->lock. 105 */ 94 */ 106 static struct plist_head *swap_avail_heads; 95 static struct plist_head *swap_avail_heads; 107 static DEFINE_SPINLOCK(swap_avail_lock); 96 static DEFINE_SPINLOCK(swap_avail_lock); 108 97 109 static struct swap_info_struct *swap_info[MAX_ !! 
98 struct swap_info_struct *swap_info[MAX_SWAPFILES]; 110 99 111 static DEFINE_MUTEX(swapon_mutex); 100 static DEFINE_MUTEX(swapon_mutex); 112 101 113 static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait) 102 static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); 114 /* Activity counter to indicate that a swapon 103 /* Activity counter to indicate that a swapon or swapoff has occurred */ 115 static atomic_t proc_poll_event = ATOMIC_INIT( 104 static atomic_t proc_poll_event = ATOMIC_INIT(0); 116 105 117 atomic_t nr_rotate_swap = ATOMIC_INIT(0); 106 atomic_t nr_rotate_swap = ATOMIC_INIT(0); 118 107 119 static struct swap_info_struct *swap_type_to_s 108 static struct swap_info_struct *swap_type_to_swap_info(int type) 120 { 109 { 121 if (type >= MAX_SWAPFILES) 110 if (type >= MAX_SWAPFILES) 122 return NULL; 111 return NULL; 123 112 124 return READ_ONCE(swap_info[type]); /* 113 return READ_ONCE(swap_info[type]); /* rcu_dereference() */ 125 } 114 } 126 115 127 static inline unsigned char swap_count(unsigne 116 static inline unsigned char swap_count(unsigned char ent) 128 { 117 { 129 return ent & ~SWAP_HAS_CACHE; /* may 118 return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ 130 } 119 } 131 120 132 /* Reclaim the swap entry anyway if possible * 121 /* Reclaim the swap entry anyway if possible */ 133 #define TTRS_ANYWAY 0x1 122 #define TTRS_ANYWAY 0x1 134 /* 123 /* 135 * Reclaim the swap entry if there are no more 124 * Reclaim the swap entry if there are no more mappings of the 136 * corresponding page 125 * corresponding page 137 */ 126 */ 138 #define TTRS_UNMAPPED 0x2 127 #define TTRS_UNMAPPED 0x2 139 /* Reclaim the swap entry if swap is getting f !! 128 /* Reclaim the swap entry if swap is getting full*/ 140 #define TTRS_FULL 0x4 129 #define TTRS_FULL 0x4 141 /* Reclaim directly, bypass the slot cache and << 142 #define TTRS_DIRECT 0x8 << 143 << 144 static bool swap_is_has_cache(struct swap_info << 145 unsigned long of << 146 { << 147 unsigned char *map = si->swap_map + of << 148 unsigned char *map_end = map + nr_page << 149 << 150 do { << 151 VM_BUG_ON(!(*map & SWAP_HAS_CA << 152 if (*map != SWAP_HAS_CACHE) << 153 return false; << 154 } while (++map < map_end); << 155 << 156 return true; << 157 } << 158 << 159 static bool swap_is_last_map(struct swap_info_ << 160 unsigned long offset, int nr_p << 161 { << 162 unsigned char *map = si->swap_map + of << 163 unsigned char *map_end = map + nr_page << 164 unsigned char count = *map; << 165 << 166 if (swap_count(count) != 1) << 167 return false; << 168 << 169 while (++map < map_end) { << 170 if (*map != count) << 171 return false; << 172 } << 173 130 174 *has_cache = !!(count & SWAP_HAS_CACHE !! 131 /* returns 1 if swap entry is freed */ 175 return true; << 176 } << 177 << 178 /* << 179 * returns number of pages in the folio that b << 180 * the folio was reclaimed. If negative, the f << 181 * folio was associated with the swap entry. << 182 */ << 183 static int __try_to_reclaim_swap(struct swap_i 132 static int __try_to_reclaim_swap(struct swap_info_struct *si, 184 unsigned long 133 unsigned long offset, unsigned long flags) 185 { 134 { 186 swp_entry_t entry = swp_entry(si->type 135 swp_entry_t entry = swp_entry(si->type, offset); 187 struct address_space *address_space = << 188 struct swap_cluster_info *ci; << 189 struct folio *folio; 136 struct folio *folio; 190 int ret, nr_pages; !! 137 int ret = 0; 191 bool need_reclaim; << 192 138 193 folio = filemap_get_folio(address_spac !! 
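/*
 * Editor's illustrative sketch (not part of swapfile.c): each swap_map byte
 * packs a usage count in the low bits plus a "page is in the swap cache"
 * flag, which swap_count() above strips off, and the TTRS_* flags decide how
 * aggressively __try_to_reclaim_swap() may drop such cache-only entries.
 * Minimal userspace model of the byte encoding; demo_* names and the flag
 * value are assumptions, not the kernel's definitions.
 */
#include <stdbool.h>

#define DEMO_SWAP_HAS_CACHE 0x40	/* demo "in swap cache" flag bit */

static inline unsigned char demo_swap_count(unsigned char ent)
{
	return ent & ~DEMO_SWAP_HAS_CACHE;	/* usage count without the cache flag */
}

static inline bool demo_cache_only(unsigned char ent)
{
	/* slot pinned only by the swap cache, no page-table references left */
	return ent == DEMO_SWAP_HAS_CACHE;
}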
139 folio = filemap_get_folio(swap_address_space(entry), offset); 194 if (IS_ERR(folio)) 140 if (IS_ERR(folio)) 195 return 0; 141 return 0; 196 << 197 nr_pages = folio_nr_pages(folio); << 198 ret = -nr_pages; << 199 << 200 /* 142 /* 201 * When this function is called from s 143 * When this function is called from scan_swap_map_slots() and it's 202 * called by vmscan.c at reclaiming fo 144 * called by vmscan.c at reclaiming folios. So we hold a folio lock 203 * here. We have to use trylock for av 145 * here. We have to use trylock for avoiding deadlock. This is a special 204 * case and you should use folio_free_ 146 * case and you should use folio_free_swap() with explicit folio_lock() 205 * in usual operations. 147 * in usual operations. 206 */ 148 */ 207 if (!folio_trylock(folio)) !! 149 if (folio_trylock(folio)) { 208 goto out; !! 150 if ((flags & TTRS_ANYWAY) || 209 !! 151 ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || 210 /* offset could point to the middle of !! 152 ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))) 211 entry = folio->swap; !! 153 ret = folio_free_swap(folio); 212 offset = swp_offset(entry); !! 154 folio_unlock(folio); 213 << 214 need_reclaim = ((flags & TTRS_ANYWAY) << 215 ((flags & TTRS_UNMAPPE << 216 ((flags & TTRS_FULL) & << 217 if (!need_reclaim || !folio_swapcache_ << 218 goto out_unlock; << 219 << 220 /* << 221 * It's safe to delete the folio from << 222 * swap_map is HAS_CACHE only, which m << 223 * reference or pending writeback, and << 224 */ << 225 ci = lock_cluster_or_swap_info(si, off << 226 need_reclaim = swap_is_has_cache(si, o << 227 unlock_cluster_or_swap_info(si, ci); << 228 if (!need_reclaim) << 229 goto out_unlock; << 230 << 231 if (!(flags & TTRS_DIRECT)) { << 232 /* Free through slot cache */ << 233 delete_from_swap_cache(folio); << 234 folio_set_dirty(folio); << 235 ret = nr_pages; << 236 goto out_unlock; << 237 } 155 } 238 << 239 xa_lock_irq(&address_space->i_pages); << 240 __delete_from_swap_cache(folio, entry, << 241 xa_unlock_irq(&address_space->i_pages) << 242 folio_ref_sub(folio, nr_pages); << 243 folio_set_dirty(folio); << 244 << 245 spin_lock(&si->lock); << 246 /* Only sinple page folio can be backe << 247 if (nr_pages == 1) << 248 zswap_invalidate(entry); << 249 swap_entry_range_free(si, entry, nr_pa << 250 spin_unlock(&si->lock); << 251 ret = nr_pages; << 252 out_unlock: << 253 folio_unlock(folio); << 254 out: << 255 folio_put(folio); 156 folio_put(folio); 256 return ret; 157 return ret; 257 } 158 } 258 159 259 static inline struct swap_extent *first_se(str 160 static inline struct swap_extent *first_se(struct swap_info_struct *sis) 260 { 161 { 261 struct rb_node *rb = rb_first(&sis->sw 162 struct rb_node *rb = rb_first(&sis->swap_extent_root); 262 return rb_entry(rb, struct swap_extent 163 return rb_entry(rb, struct swap_extent, rb_node); 263 } 164 } 264 165 265 static inline struct swap_extent *next_se(stru 166 static inline struct swap_extent *next_se(struct swap_extent *se) 266 { 167 { 267 struct rb_node *rb = rb_next(&se->rb_n 168 struct rb_node *rb = rb_next(&se->rb_node); 268 return rb ? rb_entry(rb, struct swap_e 169 return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL; 269 } 170 } 270 171 271 /* 172 /* 272 * swapon tell device that all the old swap co 173 * swapon tell device that all the old swap contents can be discarded, 273 * to allow the swap device to optimize its we 174 * to allow the swap device to optimize its wear-levelling. 
274 */ 175 */ 275 static int discard_swap(struct swap_info_struc 176 static int discard_swap(struct swap_info_struct *si) 276 { 177 { 277 struct swap_extent *se; 178 struct swap_extent *se; 278 sector_t start_block; 179 sector_t start_block; 279 sector_t nr_blocks; 180 sector_t nr_blocks; 280 int err = 0; 181 int err = 0; 281 182 282 /* Do not discard the swap header page 183 /* Do not discard the swap header page! */ 283 se = first_se(si); 184 se = first_se(si); 284 start_block = (se->start_block + 1) << 185 start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); 285 nr_blocks = ((sector_t)se->nr_pages - 186 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); 286 if (nr_blocks) { 187 if (nr_blocks) { 287 err = blkdev_issue_discard(si- 188 err = blkdev_issue_discard(si->bdev, start_block, 288 nr_blocks, GFP 189 nr_blocks, GFP_KERNEL); 289 if (err) 190 if (err) 290 return err; 191 return err; 291 cond_resched(); 192 cond_resched(); 292 } 193 } 293 194 294 for (se = next_se(se); se; se = next_s 195 for (se = next_se(se); se; se = next_se(se)) { 295 start_block = se->start_block 196 start_block = se->start_block << (PAGE_SHIFT - 9); 296 nr_blocks = (sector_t)se->nr_p 197 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); 297 198 298 err = blkdev_issue_discard(si- 199 err = blkdev_issue_discard(si->bdev, start_block, 299 nr_blocks, GFP 200 nr_blocks, GFP_KERNEL); 300 if (err) 201 if (err) 301 break; 202 break; 302 203 303 cond_resched(); 204 cond_resched(); 304 } 205 } 305 return err; /* That will o 206 return err; /* That will often be -EOPNOTSUPP */ 306 } 207 } 307 208 308 static struct swap_extent * 209 static struct swap_extent * 309 offset_to_swap_extent(struct swap_info_struct 210 offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset) 310 { 211 { 311 struct swap_extent *se; 212 struct swap_extent *se; 312 struct rb_node *rb; 213 struct rb_node *rb; 313 214 314 rb = sis->swap_extent_root.rb_node; 215 rb = sis->swap_extent_root.rb_node; 315 while (rb) { 216 while (rb) { 316 se = rb_entry(rb, struct swap_ 217 se = rb_entry(rb, struct swap_extent, rb_node); 317 if (offset < se->start_page) 218 if (offset < se->start_page) 318 rb = rb->rb_left; 219 rb = rb->rb_left; 319 else if (offset >= se->start_p 220 else if (offset >= se->start_page + se->nr_pages) 320 rb = rb->rb_right; 221 rb = rb->rb_right; 321 else 222 else 322 return se; 223 return se; 323 } 224 } 324 /* It *must* be present */ 225 /* It *must* be present */ 325 BUG(); 226 BUG(); 326 } 227 } 327 228 328 sector_t swap_folio_sector(struct folio *folio !! 229 sector_t swap_page_sector(struct page *page) 329 { 230 { 330 struct swap_info_struct *sis = swp_swa !! 231 struct swap_info_struct *sis = page_swap_info(page); 331 struct swap_extent *se; 232 struct swap_extent *se; 332 sector_t sector; 233 sector_t sector; 333 pgoff_t offset; 234 pgoff_t offset; 334 235 335 offset = swp_offset(folio->swap); !! 236 offset = __page_file_index(page); 336 se = offset_to_swap_extent(sis, offset 237 se = offset_to_swap_extent(sis, offset); 337 sector = se->start_block + (offset - s 238 sector = se->start_block + (offset - se->start_page); 338 return sector << (PAGE_SHIFT - 9); 239 return sector << (PAGE_SHIFT - 9); 339 } 240 } 340 241 341 /* 242 /* 342 * swap allocation tell device that a cluster 243 * swap allocation tell device that a cluster of swap can now be discarded, 343 * to allow the swap device to optimize its we 244 * to allow the swap device to optimize its wear-levelling. 
344 */ 245 */ 345 static void discard_swap_cluster(struct swap_i 246 static void discard_swap_cluster(struct swap_info_struct *si, 346 pgoff_t start 247 pgoff_t start_page, pgoff_t nr_pages) 347 { 248 { 348 struct swap_extent *se = offset_to_swa 249 struct swap_extent *se = offset_to_swap_extent(si, start_page); 349 250 350 while (nr_pages) { 251 while (nr_pages) { 351 pgoff_t offset = start_page - 252 pgoff_t offset = start_page - se->start_page; 352 sector_t start_block = se->sta 253 sector_t start_block = se->start_block + offset; 353 sector_t nr_blocks = se->nr_pa 254 sector_t nr_blocks = se->nr_pages - offset; 354 255 355 if (nr_blocks > nr_pages) 256 if (nr_blocks > nr_pages) 356 nr_blocks = nr_pages; 257 nr_blocks = nr_pages; 357 start_page += nr_blocks; 258 start_page += nr_blocks; 358 nr_pages -= nr_blocks; 259 nr_pages -= nr_blocks; 359 260 360 start_block <<= PAGE_SHIFT - 9 261 start_block <<= PAGE_SHIFT - 9; 361 nr_blocks <<= PAGE_SHIFT - 9; 262 nr_blocks <<= PAGE_SHIFT - 9; 362 if (blkdev_issue_discard(si->b 263 if (blkdev_issue_discard(si->bdev, start_block, 363 nr_blo 264 nr_blocks, GFP_NOIO)) 364 break; 265 break; 365 266 366 se = next_se(se); 267 se = next_se(se); 367 } 268 } 368 } 269 } 369 270 370 #ifdef CONFIG_THP_SWAP 271 #ifdef CONFIG_THP_SWAP 371 #define SWAPFILE_CLUSTER HPAGE_PMD_NR 272 #define SWAPFILE_CLUSTER HPAGE_PMD_NR 372 273 373 #define swap_entry_order(order) (order) !! 274 #define swap_entry_size(size) (size) 374 #else 275 #else 375 #define SWAPFILE_CLUSTER 256 276 #define SWAPFILE_CLUSTER 256 376 277 377 /* 278 /* 378 * Define swap_entry_order() as constant to le !! 279 * Define swap_entry_size() as constant to let compiler to optimize 379 * out some code if !CONFIG_THP_SWAP 280 * out some code if !CONFIG_THP_SWAP 380 */ 281 */ 381 #define swap_entry_order(order) 0 !! 282 #define swap_entry_size(size) 1 382 #endif 283 #endif 383 #define LATENCY_LIMIT 256 284 #define LATENCY_LIMIT 256 384 285 >> 286 static inline void cluster_set_flag(struct swap_cluster_info *info, >> 287 unsigned int flag) >> 288 { >> 289 info->flags = flag; >> 290 } >> 291 >> 292 static inline unsigned int cluster_count(struct swap_cluster_info *info) >> 293 { >> 294 return info->data; >> 295 } >> 296 >> 297 static inline void cluster_set_count(struct swap_cluster_info *info, >> 298 unsigned int c) >> 299 { >> 300 info->data = c; >> 301 } >> 302 >> 303 static inline void cluster_set_count_flag(struct swap_cluster_info *info, >> 304 unsigned int c, unsigned int f) >> 305 { >> 306 info->flags = f; >> 307 info->data = c; >> 308 } >> 309 >> 310 static inline unsigned int cluster_next(struct swap_cluster_info *info) >> 311 { >> 312 return info->data; >> 313 } >> 314 >> 315 static inline void cluster_set_next(struct swap_cluster_info *info, >> 316 unsigned int n) >> 317 { >> 318 info->data = n; >> 319 } >> 320 >> 321 static inline void cluster_set_next_flag(struct swap_cluster_info *info, >> 322 unsigned int n, unsigned int f) >> 323 { >> 324 info->flags = f; >> 325 info->data = n; >> 326 } >> 327 385 static inline bool cluster_is_free(struct swap 328 static inline bool cluster_is_free(struct swap_cluster_info *info) 386 { 329 { 387 return info->flags & CLUSTER_FLAG_FREE 330 return info->flags & CLUSTER_FLAG_FREE; 388 } 331 } 389 332 390 static inline unsigned int cluster_index(struc !! 333 static inline bool cluster_is_null(struct swap_cluster_info *info) 391 struc << 392 { 334 { 393 return ci - si->cluster_info; !! 
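/*
 * Editor's illustrative sketch (not part of swapfile.c): swap slots are
 * grouped into clusters of SWAPFILE_CLUSTER entries, so a slot offset and
 * its cluster index convert with plain division/multiplication, which is
 * what cluster_index()/cluster_offset() above express via pointer
 * arithmetic on si->cluster_info.  Userspace model with hypothetical
 * demo_* names:
 */
#include <stdio.h>

#define DEMO_CLUSTER_SIZE 256	/* SWAPFILE_CLUSTER when !CONFIG_THP_SWAP */

static unsigned long demo_cluster_of(unsigned long offset)
{
	return offset / DEMO_CLUSTER_SIZE;
}

static unsigned long demo_first_offset(unsigned long cluster)
{
	return cluster * DEMO_CLUSTER_SIZE;
}

int main(void)
{
	unsigned long offset = 1000;
	unsigned long idx = demo_cluster_of(offset);

	/* offset 1000 lives in cluster 3, which starts at slot 768 */
	printf("offset %lu -> cluster %lu (starts at %lu)\n",
	       offset, idx, demo_first_offset(idx));
	return 0;
}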
335 return info->flags & CLUSTER_FLAG_NEXT_NULL; 394 } 336 } 395 337 396 static inline unsigned int cluster_offset(stru !! 338 static inline void cluster_set_null(struct swap_cluster_info *info) 397 stru << 398 { 339 { 399 return cluster_index(si, ci) * SWAPFIL !! 340 info->flags = CLUSTER_FLAG_NEXT_NULL; >> 341 info->data = 0; >> 342 } >> 343 >> 344 static inline bool cluster_is_huge(struct swap_cluster_info *info) >> 345 { >> 346 if (IS_ENABLED(CONFIG_THP_SWAP)) >> 347 return info->flags & CLUSTER_FLAG_HUGE; >> 348 return false; >> 349 } >> 350 >> 351 static inline void cluster_clear_huge(struct swap_cluster_info *info) >> 352 { >> 353 info->flags &= ~CLUSTER_FLAG_HUGE; 400 } 354 } 401 355 402 static inline struct swap_cluster_info *lock_c 356 static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, 403 357 unsigned long offset) 404 { 358 { 405 struct swap_cluster_info *ci; 359 struct swap_cluster_info *ci; 406 360 407 ci = si->cluster_info; 361 ci = si->cluster_info; 408 if (ci) { 362 if (ci) { 409 ci += offset / SWAPFILE_CLUSTE 363 ci += offset / SWAPFILE_CLUSTER; 410 spin_lock(&ci->lock); 364 spin_lock(&ci->lock); 411 } 365 } 412 return ci; 366 return ci; 413 } 367 } 414 368 415 static inline void unlock_cluster(struct swap_ 369 static inline void unlock_cluster(struct swap_cluster_info *ci) 416 { 370 { 417 if (ci) 371 if (ci) 418 spin_unlock(&ci->lock); 372 spin_unlock(&ci->lock); 419 } 373 } 420 374 421 /* 375 /* 422 * Determine the locking method in use for thi 376 * Determine the locking method in use for this device. Return 423 * swap_cluster_info if SSD-style cluster-base 377 * swap_cluster_info if SSD-style cluster-based locking is in place. 424 */ 378 */ 425 static inline struct swap_cluster_info *lock_c 379 static inline struct swap_cluster_info *lock_cluster_or_swap_info( 426 struct swap_info_struct *si, u 380 struct swap_info_struct *si, unsigned long offset) 427 { 381 { 428 struct swap_cluster_info *ci; 382 struct swap_cluster_info *ci; 429 383 430 /* Try to use fine-grained SSD-style l 384 /* Try to use fine-grained SSD-style locking if available: */ 431 ci = lock_cluster(si, offset); 385 ci = lock_cluster(si, offset); 432 /* Otherwise, fall back to traditional 386 /* Otherwise, fall back to traditional, coarse locking: */ 433 if (!ci) 387 if (!ci) 434 spin_lock(&si->lock); 388 spin_lock(&si->lock); 435 389 436 return ci; 390 return ci; 437 } 391 } 438 392 439 static inline void unlock_cluster_or_swap_info 393 static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si, 440 394 struct swap_cluster_info *ci) 441 { 395 { 442 if (ci) 396 if (ci) 443 unlock_cluster(ci); 397 unlock_cluster(ci); 444 else 398 else 445 spin_unlock(&si->lock); 399 spin_unlock(&si->lock); 446 } 400 } 447 401 >> 402 static inline bool cluster_list_empty(struct swap_cluster_list *list) >> 403 { >> 404 return cluster_is_null(&list->head); >> 405 } >> 406 >> 407 static inline unsigned int cluster_list_first(struct swap_cluster_list *list) >> 408 { >> 409 return cluster_next(&list->head); >> 410 } >> 411 >> 412 static void cluster_list_init(struct swap_cluster_list *list) >> 413 { >> 414 cluster_set_null(&list->head); >> 415 cluster_set_null(&list->tail); >> 416 } >> 417 >> 418 static void cluster_list_add_tail(struct swap_cluster_list *list, >> 419 struct swap_cluster_info *ci, >> 420 unsigned int idx) >> 421 { >> 422 if (cluster_list_empty(list)) { >> 423 cluster_set_next_flag(&list->head, idx, 0); >> 424 cluster_set_next_flag(&list->tail, idx, 0); >> 425 } else 
{ >> 426 struct swap_cluster_info *ci_tail; >> 427 unsigned int tail = cluster_next(&list->tail); >> 428 >> 429 /* >> 430 * Nested cluster lock, but both cluster locks are >> 431 * only acquired when we held swap_info_struct->lock >> 432 */ >> 433 ci_tail = ci + tail; >> 434 spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING); >> 435 cluster_set_next(ci_tail, idx); >> 436 spin_unlock(&ci_tail->lock); >> 437 cluster_set_next_flag(&list->tail, idx, 0); >> 438 } >> 439 } >> 440 >> 441 static unsigned int cluster_list_del_first(struct swap_cluster_list *list, >> 442 struct swap_cluster_info *ci) >> 443 { >> 444 unsigned int idx; >> 445 >> 446 idx = cluster_next(&list->head); >> 447 if (cluster_next(&list->tail) == idx) { >> 448 cluster_set_null(&list->head); >> 449 cluster_set_null(&list->tail); >> 450 } else >> 451 cluster_set_next_flag(&list->head, >> 452 cluster_next(&ci[idx]), 0); >> 453 >> 454 return idx; >> 455 } >> 456 448 /* Add a cluster to discard list and schedule 457 /* Add a cluster to discard list and schedule it to do discard */ 449 static void swap_cluster_schedule_discard(stru 458 static void swap_cluster_schedule_discard(struct swap_info_struct *si, 450 struct swap_cluster_info *ci) !! 459 unsigned int idx) 451 { 460 { 452 unsigned int idx = cluster_index(si, c << 453 /* 461 /* 454 * If scan_swap_map_slots() can't find 462 * If scan_swap_map_slots() can't find a free cluster, it will check 455 * si->swap_map directly. To make sure 463 * si->swap_map directly. To make sure the discarding cluster isn't 456 * taken by scan_swap_map_slots(), mar 464 * taken by scan_swap_map_slots(), mark the swap entries bad (occupied). 457 * It will be cleared after discard 465 * It will be cleared after discard 458 */ 466 */ 459 memset(si->swap_map + idx * SWAPFILE_C 467 memset(si->swap_map + idx * SWAPFILE_CLUSTER, 460 SWAP_MAP_BAD, SWAPFILE 468 SWAP_MAP_BAD, SWAPFILE_CLUSTER); 461 469 462 VM_BUG_ON(ci->flags & CLUSTER_FLAG_FRE !! 470 cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx); 463 list_move_tail(&ci->list, &si->discard !! 471 464 ci->flags = 0; << 465 schedule_work(&si->discard_work); 472 schedule_work(&si->discard_work); 466 } 473 } 467 474 468 static void __free_cluster(struct swap_info_st !! 475 static void __free_cluster(struct swap_info_struct *si, unsigned long idx) 469 { 476 { 470 lockdep_assert_held(&si->lock); !! 477 struct swap_cluster_info *ci = si->cluster_info; 471 lockdep_assert_held(&ci->lock); << 472 478 473 if (ci->flags) !! 479 cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE); 474 list_move_tail(&ci->list, &si- !! 480 cluster_list_add_tail(&si->free_clusters, ci, idx); 475 else << 476 list_add_tail(&ci->list, &si-> << 477 ci->flags = CLUSTER_FLAG_FREE; << 478 ci->order = 0; << 479 } 481 } 480 482 481 /* 483 /* 482 * Doing discard actually. After a cluster dis 484 * Doing discard actually. After a cluster discard is finished, the cluster 483 * will be added to free cluster list. caller 485 * will be added to free cluster list. caller should hold si->lock. 484 */ 486 */ 485 static void swap_do_scheduled_discard(struct s 487 static void swap_do_scheduled_discard(struct swap_info_struct *si) 486 { 488 { 487 struct swap_cluster_info *ci; !! 489 struct swap_cluster_info *info, *ci; 488 unsigned int idx; 490 unsigned int idx; 489 491 490 while (!list_empty(&si->discard_cluste !! 492 info = si->cluster_info; 491 ci = list_first_entry(&si->dis !! 493 492 list_del(&ci->list); !! 
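/*
 * Editor's illustrative sketch (not part of swapfile.c): the deferred
 * discard above follows a common pattern - take one item off the discard
 * list under the lock, drop the lock around the slow blocking discard I/O,
 * then retake the lock to finish the bookkeeping.  A minimal userspace
 * model of that lock-drop pattern only; demo_* names are hypothetical and
 * the sleep merely stands in for blkdev_issue_discard().
 */
#include <pthread.h>
#include <unistd.h>

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;
static int demo_pending;		/* stand-in for the discard list */

static void demo_slow_discard(void)
{
	usleep(1000);			/* models the blocking discard I/O */
}

void demo_do_scheduled_discard(void)
{
	pthread_mutex_lock(&demo_lock);
	while (demo_pending > 0) {
		demo_pending--;		/* "del first" while holding the lock */
		pthread_mutex_unlock(&demo_lock);
		demo_slow_discard();	/* must not block other lock holders */
		pthread_mutex_lock(&demo_lock);
		/* the real code re-marks the cluster free at this point */
	}
	pthread_mutex_unlock(&demo_lock);
}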
494 while (!cluster_list_empty(&si->discard_clusters)) { 493 idx = cluster_index(si, ci); !! 495 idx = cluster_list_del_first(&si->discard_clusters, info); 494 spin_unlock(&si->lock); 496 spin_unlock(&si->lock); 495 497 496 discard_swap_cluster(si, idx * 498 discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, 497 SWAPFILE_CLUST 499 SWAPFILE_CLUSTER); 498 500 499 spin_lock(&si->lock); 501 spin_lock(&si->lock); 500 spin_lock(&ci->lock); !! 502 ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); 501 __free_cluster(si, ci); !! 503 __free_cluster(si, idx); 502 memset(si->swap_map + idx * SW 504 memset(si->swap_map + idx * SWAPFILE_CLUSTER, 503 0, SWAPFILE_CL 505 0, SWAPFILE_CLUSTER); 504 spin_unlock(&ci->lock); !! 506 unlock_cluster(ci); 505 } 507 } 506 } 508 } 507 509 508 static void swap_discard_work(struct work_stru 510 static void swap_discard_work(struct work_struct *work) 509 { 511 { 510 struct swap_info_struct *si; 512 struct swap_info_struct *si; 511 513 512 si = container_of(work, struct swap_in 514 si = container_of(work, struct swap_info_struct, discard_work); 513 515 514 spin_lock(&si->lock); 516 spin_lock(&si->lock); 515 swap_do_scheduled_discard(si); 517 swap_do_scheduled_discard(si); 516 spin_unlock(&si->lock); 518 spin_unlock(&si->lock); 517 } 519 } 518 520 519 static void swap_users_ref_free(struct percpu_ 521 static void swap_users_ref_free(struct percpu_ref *ref) 520 { 522 { 521 struct swap_info_struct *si; 523 struct swap_info_struct *si; 522 524 523 si = container_of(ref, struct swap_inf 525 si = container_of(ref, struct swap_info_struct, users); 524 complete(&si->comp); 526 complete(&si->comp); 525 } 527 } 526 528 527 static void free_cluster(struct swap_info_stru !! 529 static void alloc_cluster(struct swap_info_struct *si, unsigned long idx) 528 { 530 { 529 VM_BUG_ON(ci->count != 0); !! 531 struct swap_cluster_info *ci = si->cluster_info; 530 lockdep_assert_held(&si->lock); << 531 lockdep_assert_held(&ci->lock); << 532 532 533 if (ci->flags & CLUSTER_FLAG_FRAG) !! 533 VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx); 534 si->frag_cluster_nr[ci->order] !! 534 cluster_list_del_first(&si->free_clusters, ci); >> 535 cluster_set_count_flag(ci + idx, 0, 0); >> 536 } >> 537 >> 538 static void free_cluster(struct swap_info_struct *si, unsigned long idx) >> 539 { >> 540 struct swap_cluster_info *ci = si->cluster_info + idx; 535 541 >> 542 VM_BUG_ON(cluster_count(ci) != 0); 536 /* 543 /* 537 * If the swap is discardable, prepare 544 * If the swap is discardable, prepare discard the cluster 538 * instead of free it immediately. The 545 * instead of free it immediately. The cluster will be freed 539 * after discard. 546 * after discard. 540 */ 547 */ 541 if ((si->flags & (SWP_WRITEOK | SWP_PA 548 if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == 542 (SWP_WRITEOK | SWP_PAGE_DISCARD)) 549 (SWP_WRITEOK | SWP_PAGE_DISCARD)) { 543 swap_cluster_schedule_discard( !! 550 swap_cluster_schedule_discard(si, idx); 544 return; 551 return; 545 } 552 } 546 553 547 __free_cluster(si, ci); !! 554 __free_cluster(si, idx); 548 } 555 } 549 556 550 /* 557 /* 551 * The cluster corresponding to page_nr will b !! 558 * The cluster corresponding to page_nr will be used. The cluster will be 552 * added to free cluster list and its usage co !! 559 * removed from free cluster list and its usage counter will be increased. 553 * Only used for initialization. << 554 */ 560 */ 555 static void inc_cluster_info_page(struct swap_ !! 
561 static void inc_cluster_info_page(struct swap_info_struct *p, 556 struct swap_cluster_info *cluster_info 562 struct swap_cluster_info *cluster_info, unsigned long page_nr) 557 { 563 { 558 unsigned long idx = page_nr / SWAPFILE 564 unsigned long idx = page_nr / SWAPFILE_CLUSTER; 559 struct swap_cluster_info *ci; << 560 565 561 if (!cluster_info) 566 if (!cluster_info) 562 return; 567 return; >> 568 if (cluster_is_free(&cluster_info[idx])) >> 569 alloc_cluster(p, idx); 563 570 564 ci = cluster_info + idx; !! 571 VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER); 565 ci->count++; !! 572 cluster_set_count(&cluster_info[idx], 566 !! 573 cluster_count(&cluster_info[idx]) + 1); 567 VM_BUG_ON(ci->count > SWAPFILE_CLUSTER << 568 VM_BUG_ON(ci->flags); << 569 } 574 } 570 575 571 /* 576 /* 572 * The cluster ci decreases @nr_pages usage. I !! 577 * The cluster corresponding to page_nr decreases one usage. If the usage 573 * which means no page in the cluster is in us !! 578 * counter becomes 0, which means no page in the cluster is in using, we can 574 * the cluster and add it to free cluster list !! 579 * optionally discard the cluster and add it to free cluster list. 575 */ 580 */ 576 static void dec_cluster_info_page(struct swap_ !! 581 static void dec_cluster_info_page(struct swap_info_struct *p, 577 struct swap_ !! 582 struct swap_cluster_info *cluster_info, unsigned long page_nr) 578 { 583 { 579 if (!si->cluster_info) !! 584 unsigned long idx = page_nr / SWAPFILE_CLUSTER; 580 return; << 581 << 582 VM_BUG_ON(ci->count < nr_pages); << 583 VM_BUG_ON(cluster_is_free(ci)); << 584 lockdep_assert_held(&si->lock); << 585 lockdep_assert_held(&ci->lock); << 586 ci->count -= nr_pages; << 587 585 588 if (!ci->count) { !! 586 if (!cluster_info) 589 free_cluster(si, ci); << 590 return; 587 return; 591 } << 592 << 593 if (!(ci->flags & CLUSTER_FLAG_NONFULL << 594 VM_BUG_ON(ci->flags & CLUSTER_ << 595 if (ci->flags & CLUSTER_FLAG_F << 596 si->frag_cluster_nr[ci << 597 list_move_tail(&ci->list, &si- << 598 ci->flags = CLUSTER_FLAG_NONFU << 599 } << 600 } << 601 588 602 static bool cluster_reclaim_range(struct swap_ !! 589 VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0); 603 struct swap_ !! 590 cluster_set_count(&cluster_info[idx], 604 unsigned lon !! 591 cluster_count(&cluster_info[idx]) - 1); 605 { << 606 unsigned char *map = si->swap_map; << 607 unsigned long offset; << 608 << 609 spin_unlock(&ci->lock); << 610 spin_unlock(&si->lock); << 611 592 612 for (offset = start; offset < end; off !! 593 if (cluster_count(&cluster_info[idx]) == 0) 613 switch (READ_ONCE(map[offset]) !! 594 free_cluster(p, idx); 614 case 0: << 615 continue; << 616 case SWAP_HAS_CACHE: << 617 if (__try_to_reclaim_s << 618 continue; << 619 goto out; << 620 default: << 621 goto out; << 622 } << 623 } << 624 out: << 625 spin_lock(&si->lock); << 626 spin_lock(&ci->lock); << 627 << 628 /* << 629 * Recheck the range no matter reclaim << 630 * could have been be freed while we a << 631 */ << 632 for (offset = start; offset < end; off << 633 if (READ_ONCE(map[offset])) << 634 return false; << 635 << 636 return true; << 637 } 595 } 638 596 639 static bool cluster_scan_range(struct swap_inf !! 597 /* 640 struct swap_clu !! 598 * It's possible scan_swap_map_slots() uses a free cluster in the middle of free 641 unsigned long s !! 599 * cluster list. Avoiding such abuse to avoid list corruption. 
>> 600 */ >> 601 static bool >> 602 scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, >> 603 unsigned long offset) 642 { 604 { 643 unsigned long offset, end = start + nr !! 605 struct percpu_cluster *percpu_cluster; 644 unsigned char *map = si->swap_map; !! 606 bool conflict; 645 bool need_reclaim = false; << 646 607 647 for (offset = start; offset < end; off !! 608 offset /= SWAPFILE_CLUSTER; 648 switch (READ_ONCE(map[offset]) !! 609 conflict = !cluster_list_empty(&si->free_clusters) && 649 case 0: !! 610 offset != cluster_list_first(&si->free_clusters) && 650 continue; !! 611 cluster_is_free(&si->cluster_info[offset]); 651 case SWAP_HAS_CACHE: << 652 if (!vm_swap_full()) << 653 return false; << 654 need_reclaim = true; << 655 continue; << 656 default: << 657 return false; << 658 } << 659 } << 660 612 661 if (need_reclaim) !! 613 if (!conflict) 662 return cluster_reclaim_range(s !! 614 return false; 663 615 >> 616 percpu_cluster = this_cpu_ptr(si->percpu_cluster); >> 617 cluster_set_null(&percpu_cluster->index); 664 return true; 618 return true; 665 } 619 } 666 620 667 static void cluster_alloc_range(struct swap_in << 668 unsigned int s << 669 unsigned int o << 670 { << 671 unsigned int nr_pages = 1 << order; << 672 << 673 if (cluster_is_free(ci)) { << 674 if (nr_pages < SWAPFILE_CLUSTE << 675 list_move_tail(&ci->li << 676 ci->flags = CLUSTER_FL << 677 } << 678 ci->order = order; << 679 } << 680 << 681 memset(si->swap_map + start, usage, nr << 682 swap_range_alloc(si, start, nr_pages); << 683 ci->count += nr_pages; << 684 << 685 if (ci->count == SWAPFILE_CLUSTER) { << 686 VM_BUG_ON(!(ci->flags & << 687 (CLUSTER_FLAG_FREE | << 688 if (ci->flags & CLUSTER_FLAG_F << 689 si->frag_cluster_nr[ci << 690 list_move_tail(&ci->list, &si- << 691 ci->flags = CLUSTER_FLAG_FULL; << 692 } << 693 } << 694 << 695 static unsigned int alloc_swap_scan_cluster(st << 696 un << 697 un << 698 { << 699 unsigned long start = offset & ~(SWAPF << 700 unsigned long end = min(start + SWAPFI << 701 unsigned int nr_pages = 1 << order; << 702 struct swap_cluster_info *ci; << 703 << 704 if (end < nr_pages) << 705 return SWAP_NEXT_INVALID; << 706 end -= nr_pages; << 707 << 708 ci = lock_cluster(si, offset); << 709 if (ci->count + nr_pages > SWAPFILE_CL << 710 offset = SWAP_NEXT_INVALID; << 711 goto done; << 712 } << 713 << 714 while (offset <= end) { << 715 if (cluster_scan_range(si, ci, << 716 cluster_alloc_range(si << 717 *foundp = offset; << 718 if (ci->count == SWAPF << 719 offset = SWAP_ << 720 goto done; << 721 } << 722 offset += nr_pages; << 723 break; << 724 } << 725 offset += nr_pages; << 726 } << 727 if (offset > end) << 728 offset = SWAP_NEXT_INVALID; << 729 done: << 730 unlock_cluster(ci); << 731 return offset; << 732 } << 733 << 734 /* Return true if reclaimed a whole cluster */ << 735 static void swap_reclaim_full_clusters(struct << 736 { << 737 long to_scan = 1; << 738 unsigned long offset, end; << 739 struct swap_cluster_info *ci; << 740 unsigned char *map = si->swap_map; << 741 int nr_reclaim; << 742 << 743 if (force) << 744 to_scan = si->inuse_pages / SW << 745 << 746 while (!list_empty(&si->full_clusters) << 747 ci = list_first_entry(&si->ful << 748 list_move_tail(&ci->list, &si- << 749 offset = cluster_offset(si, ci << 750 end = min(si->max, offset + SW << 751 to_scan--; << 752 << 753 spin_unlock(&si->lock); << 754 while (offset < end) { << 755 if (READ_ONCE(map[offs << 756 nr_reclaim = _ << 757 << 758 if (nr_reclaim << 759 offset << 760 contin << 761 } << 762 } << 763 offset++; << 764 } << 
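/*
 * Editor's illustrative sketch (not part of swapfile.c): cluster_scan_range()
 * above walks a candidate range of swap_map bytes; free slots (0) are usable
 * directly, cache-only slots are usable only after reclaim, and anything
 * else disqualifies the whole range.  Simplified userspace model with
 * hypothetical demo_* names and a demo flag value:
 */
#include <stdbool.h>
#include <stddef.h>

#define DEMO_HAS_CACHE 0x40	/* demo "in swap cache" flag bit */

/* Returns true if map[start..start+nr) could satisfy an allocation. */
bool demo_scan_range(const unsigned char *map, size_t start, size_t nr,
		     bool *need_reclaim)
{
	*need_reclaim = false;
	for (size_t i = start; i < start + nr; i++) {
		if (map[i] == 0)
			continue;		/* free slot */
		if (map[i] == DEMO_HAS_CACHE) {
			*need_reclaim = true;	/* reclaimable swap cache */
			continue;
		}
		return false;			/* slot has real users */
	}
	return true;
}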
765 spin_lock(&si->lock); << 766 << 767 if (to_scan <= 0) << 768 break; << 769 } << 770 } << 771 << 772 static void swap_reclaim_work(struct work_stru << 773 { << 774 struct swap_info_struct *si; << 775 << 776 si = container_of(work, struct swap_in << 777 << 778 spin_lock(&si->lock); << 779 swap_reclaim_full_clusters(si, true); << 780 spin_unlock(&si->lock); << 781 } << 782 << 783 /* 621 /* 784 * Try to get swap entries with specified orde !! 622 * Try to get a swap entry from current cpu's swap entry pool (a cluster). This 785 * pool (a cluster). This might involve alloca !! 623 * might involve allocating a new cluster for current CPU too. 786 * too. << 787 */ 624 */ 788 static unsigned long cluster_alloc_swap_entry( !! 625 static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, 789 !! 626 unsigned long *offset, unsigned long *scan_base) 790 { 627 { 791 struct percpu_cluster *cluster; 628 struct percpu_cluster *cluster; 792 struct swap_cluster_info *ci; 629 struct swap_cluster_info *ci; 793 unsigned int offset, found = 0; !! 630 unsigned long tmp, max; 794 631 795 new_cluster: 632 new_cluster: 796 lockdep_assert_held(&si->lock); << 797 cluster = this_cpu_ptr(si->percpu_clus 633 cluster = this_cpu_ptr(si->percpu_cluster); 798 offset = cluster->next[order]; !! 634 if (cluster_is_null(&cluster->index)) { 799 if (offset) { !! 635 if (!cluster_list_empty(&si->free_clusters)) { 800 offset = alloc_swap_scan_clust !! 636 cluster->index = si->free_clusters.head; 801 if (found) !! 637 cluster->next = cluster_next(&cluster->index) * 802 goto done; !! 638 SWAPFILE_CLUSTER; 803 } !! 639 } else if (!cluster_list_empty(&si->discard_clusters)) { 804 !! 640 /* 805 if (!list_empty(&si->free_clusters)) { !! 641 * we don't have free cluster but have some clusters in 806 ci = list_first_entry(&si->fre !! 642 * discarding, do discard now and reclaim them, then 807 offset = alloc_swap_scan_clust !! 643 * reread cluster_next_cpu since we dropped si->lock 808 VM_BUG_ON(!found); !! 644 */ 809 goto done; !! 645 swap_do_scheduled_discard(si); >> 646 *scan_base = this_cpu_read(*si->cluster_next_cpu); >> 647 *offset = *scan_base; >> 648 goto new_cluster; >> 649 } else >> 650 return false; 810 } 651 } 811 652 812 /* Try reclaim from full clusters if f !! 653 /* 813 if (vm_swap_full()) !! 654 * Other CPUs can use our cluster if they can't find a free cluster, 814 swap_reclaim_full_clusters(si, !! 655 * check if there is still free entry in the cluster 815 !! 656 */ 816 if (order < PMD_ORDER) { !! 657 tmp = cluster->next; 817 unsigned int frags = 0; !! 658 max = min_t(unsigned long, si->max, 818 !! 659 (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER); 819 while (!list_empty(&si->nonful !! 660 if (tmp < max) { 820 ci = list_first_entry( !! 661 ci = lock_cluster(si, tmp); 821 !! 662 while (tmp < max) { 822 list_move_tail(&ci->li !! 663 if (!si->swap_map[tmp]) 823 ci->flags = CLUSTER_FL << 824 si->frag_cluster_nr[or << 825 offset = alloc_swap_sc << 826 << 827 frags++; << 828 if (found) << 829 break; 664 break; >> 665 tmp++; 830 } 666 } 831 !! 667 unlock_cluster(ci); 832 if (!found) { << 833 /* << 834 * Nonfull clusters ar << 835 * here, count them to << 836 */ << 837 while (frags < si->fra << 838 ci = list_firs << 839 << 840 /* << 841 * Rotate the << 842 * high order << 843 * this help k << 844 */ << 845 list_move_tail << 846 offset = alloc << 847 << 848 frags++; << 849 if (found) << 850 break; << 851 } << 852 } << 853 } 668 } 854 !! 669 if (tmp >= max) { 855 if (found) !! 
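/*
 * Editor's illustrative sketch (not part of swapfile.c): the per-cpu cluster
 * used above is an allocation cache - each CPU remembers its current cluster
 * and a "next slot" cursor, so consecutive swap-outs from that CPU land in
 * adjacent slots without all CPUs fighting over one global cursor.  Userspace
 * model of the cursor logic only; demo_* names are hypothetical.
 */
#include <stdbool.h>

#define DEMO_CLUSTER_SIZE 256

struct demo_percpu_cluster {
	long cluster;		/* current cluster index, -1 if none cached */
	unsigned long next;	/* next slot to try inside that cluster */
};

/* Hand out the next free-looking slot from the cached cluster, if any. */
bool demo_alloc_from_cache(struct demo_percpu_cluster *pc,
			   const unsigned char *swap_map, unsigned long *slot)
{
	if (pc->cluster < 0)
		return false;	/* caller must pick a fresh cluster */

	unsigned long end = (pc->cluster + 1) * DEMO_CLUSTER_SIZE;

	while (pc->next < end) {
		unsigned long off = pc->next++;

		if (!swap_map[off]) {
			*slot = off;
			return true;
		}
	}
	pc->cluster = -1;	/* cache exhausted, fall back to slow path */
	return false;
}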
670 cluster_set_null(&cluster->index); 856 goto done; << 857 << 858 if (!list_empty(&si->discard_clusters) << 859 /* << 860 * we don't have free cluster << 861 * discarding, do discard now << 862 * reread cluster_next_cpu sin << 863 */ << 864 swap_do_scheduled_discard(si); << 865 goto new_cluster; 671 goto new_cluster; 866 } 672 } 867 !! 673 cluster->next = tmp + 1; 868 if (order) !! 674 *offset = tmp; 869 goto done; !! 675 *scan_base = tmp; 870 !! 676 return true; 871 /* Order 0 stealing from higher order << 872 for (int o = 1; o < SWAP_NR_ORDERS; o+ << 873 /* << 874 * Clusters here have at least << 875 * allocation, but reclaim may << 876 */ << 877 while (!list_empty(&si->frag_c << 878 ci = list_first_entry( << 879 << 880 offset = alloc_swap_sc << 881 << 882 if (found) << 883 goto done; << 884 } << 885 << 886 while (!list_empty(&si->nonful << 887 ci = list_first_entry( << 888 << 889 offset = alloc_swap_sc << 890 << 891 if (found) << 892 goto done; << 893 } << 894 } << 895 << 896 done: << 897 cluster->next[order] = offset; << 898 return found; << 899 } 677 } 900 678 901 static void __del_from_avail_list(struct swap_ !! 679 static void __del_from_avail_list(struct swap_info_struct *p) 902 { 680 { 903 int nid; 681 int nid; 904 682 905 assert_spin_locked(&si->lock); !! 683 assert_spin_locked(&p->lock); 906 for_each_node(nid) 684 for_each_node(nid) 907 plist_del(&si->avail_lists[nid !! 685 plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]); 908 } 686 } 909 687 910 static void del_from_avail_list(struct swap_in !! 688 static void del_from_avail_list(struct swap_info_struct *p) 911 { 689 { 912 spin_lock(&swap_avail_lock); 690 spin_lock(&swap_avail_lock); 913 __del_from_avail_list(si); !! 691 __del_from_avail_list(p); 914 spin_unlock(&swap_avail_lock); 692 spin_unlock(&swap_avail_lock); 915 } 693 } 916 694 917 static void swap_range_alloc(struct swap_info_ 695 static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, 918 unsigned int nr_e 696 unsigned int nr_entries) 919 { 697 { 920 unsigned int end = offset + nr_entries 698 unsigned int end = offset + nr_entries - 1; 921 699 922 if (offset == si->lowest_bit) 700 if (offset == si->lowest_bit) 923 si->lowest_bit += nr_entries; 701 si->lowest_bit += nr_entries; 924 if (end == si->highest_bit) 702 if (end == si->highest_bit) 925 WRITE_ONCE(si->highest_bit, si 703 WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries); 926 WRITE_ONCE(si->inuse_pages, si->inuse_ 704 WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries); 927 if (si->inuse_pages == si->pages) { 705 if (si->inuse_pages == si->pages) { 928 si->lowest_bit = si->max; 706 si->lowest_bit = si->max; 929 si->highest_bit = 0; 707 si->highest_bit = 0; 930 del_from_avail_list(si); 708 del_from_avail_list(si); 931 << 932 if (vm_swap_full()) << 933 schedule_work(&si->rec << 934 } 709 } 935 } 710 } 936 711 937 static void add_to_avail_list(struct swap_info !! 712 static void add_to_avail_list(struct swap_info_struct *p) 938 { 713 { 939 int nid; 714 int nid; 940 715 941 spin_lock(&swap_avail_lock); 716 spin_lock(&swap_avail_lock); 942 for_each_node(nid) !! 717 for_each_node(nid) { 943 plist_add(&si->avail_lists[nid !! 
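/*
 * Editor's illustrative sketch (not part of swapfile.c): swap_range_alloc()
 * above maintains a cheap [lowest_bit, highest_bit] search window and the
 * in-use count; once every slot is allocated the device drops off the
 * per-node "available" lists so allocators stop scanning it.  Simplified
 * model of that bookkeeping with hypothetical demo_* names:
 */
#include <stdbool.h>

struct demo_swapdev {
	unsigned long lowest, highest;	/* search window, inclusive */
	unsigned long inuse, pages;
	bool on_avail_list;
};

void demo_range_alloc(struct demo_swapdev *d, unsigned long off,
		      unsigned long nr)
{
	unsigned long end = off + nr - 1;

	if (off == d->lowest)
		d->lowest += nr;	/* window only shrinks from below */
	if (end == d->highest)
		d->highest -= nr;	/* ... or from above */
	d->inuse += nr;
	if (d->inuse == d->pages)
		d->on_avail_list = false;	/* full: hide from allocators */
}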
718 WARN_ON(!plist_node_empty(&p->avail_lists[nid])); >> 719 plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]); >> 720 } 944 spin_unlock(&swap_avail_lock); 721 spin_unlock(&swap_avail_lock); 945 } 722 } 946 723 947 static void swap_range_free(struct swap_info_s 724 static void swap_range_free(struct swap_info_struct *si, unsigned long offset, 948 unsigned int nr_en 725 unsigned int nr_entries) 949 { 726 { 950 unsigned long begin = offset; 727 unsigned long begin = offset; 951 unsigned long end = offset + nr_entrie 728 unsigned long end = offset + nr_entries - 1; 952 void (*swap_slot_free_notify)(struct b 729 void (*swap_slot_free_notify)(struct block_device *, unsigned long); 953 unsigned int i; << 954 << 955 /* << 956 * Use atomic clear_bit operations onl << 957 * bitmap_clear to prevent adjacent bi << 958 */ << 959 for (i = 0; i < nr_entries; i++) << 960 clear_bit(offset + i, si->zero << 961 730 962 if (offset < si->lowest_bit) 731 if (offset < si->lowest_bit) 963 si->lowest_bit = offset; 732 si->lowest_bit = offset; 964 if (end > si->highest_bit) { 733 if (end > si->highest_bit) { 965 bool was_full = !si->highest_b 734 bool was_full = !si->highest_bit; 966 735 967 WRITE_ONCE(si->highest_bit, en 736 WRITE_ONCE(si->highest_bit, end); 968 if (was_full && (si->flags & S 737 if (was_full && (si->flags & SWP_WRITEOK)) 969 add_to_avail_list(si); 738 add_to_avail_list(si); 970 } 739 } >> 740 atomic_long_add(nr_entries, &nr_swap_pages); >> 741 WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); 971 if (si->flags & SWP_BLKDEV) 742 if (si->flags & SWP_BLKDEV) 972 swap_slot_free_notify = 743 swap_slot_free_notify = 973 si->bdev->bd_disk->fop 744 si->bdev->bd_disk->fops->swap_slot_free_notify; 974 else 745 else 975 swap_slot_free_notify = NULL; 746 swap_slot_free_notify = NULL; 976 while (offset <= end) { 747 while (offset <= end) { 977 arch_swap_invalidate_page(si-> 748 arch_swap_invalidate_page(si->type, offset); >> 749 frontswap_invalidate_page(si->type, offset); 978 if (swap_slot_free_notify) 750 if (swap_slot_free_notify) 979 swap_slot_free_notify( 751 swap_slot_free_notify(si->bdev, offset); 980 offset++; 752 offset++; 981 } 753 } 982 clear_shadow_from_swap_cache(si->type, 754 clear_shadow_from_swap_cache(si->type, begin, end); 983 << 984 /* << 985 * Make sure that try_to_unuse() obser << 986 * only after the above cleanups are d << 987 */ << 988 smp_wmb(); << 989 atomic_long_add(nr_entries, &nr_swap_p << 990 WRITE_ONCE(si->inuse_pages, si->inuse_ << 991 } 755 } 992 756 993 static void set_cluster_next(struct swap_info_ 757 static void set_cluster_next(struct swap_info_struct *si, unsigned long next) 994 { 758 { 995 unsigned long prev; 759 unsigned long prev; 996 760 997 if (!(si->flags & SWP_SOLIDSTATE)) { 761 if (!(si->flags & SWP_SOLIDSTATE)) { 998 si->cluster_next = next; 762 si->cluster_next = next; 999 return; 763 return; 1000 } 764 } 1001 765 1002 prev = this_cpu_read(*si->cluster_nex 766 prev = this_cpu_read(*si->cluster_next_cpu); 1003 /* 767 /* 1004 * Cross the swap address space size 768 * Cross the swap address space size aligned trunk, choose 1005 * another trunk randomly to avoid lo 769 * another trunk randomly to avoid lock contention on swap 1006 * address space if possible. 770 * address space if possible. 
1007 */ 771 */ 1008 if ((prev >> SWAP_ADDRESS_SPACE_SHIFT 772 if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) != 1009 (next >> SWAP_ADDRESS_SPACE_SHIFT 773 (next >> SWAP_ADDRESS_SPACE_SHIFT)) { 1010 /* No free swap slots availab 774 /* No free swap slots available */ 1011 if (si->highest_bit <= si->lo 775 if (si->highest_bit <= si->lowest_bit) 1012 return; 776 return; 1013 next = get_random_u32_inclusi 777 next = get_random_u32_inclusive(si->lowest_bit, si->highest_bit); 1014 next = ALIGN_DOWN(next, SWAP_ 778 next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES); 1015 next = max_t(unsigned int, ne 779 next = max_t(unsigned int, next, si->lowest_bit); 1016 } 780 } 1017 this_cpu_write(*si->cluster_next_cpu, 781 this_cpu_write(*si->cluster_next_cpu, next); 1018 } 782 } 1019 783 1020 static bool swap_offset_available_and_locked( 784 static bool swap_offset_available_and_locked(struct swap_info_struct *si, 1021 785 unsigned long offset) 1022 { 786 { 1023 if (data_race(!si->swap_map[offset])) 787 if (data_race(!si->swap_map[offset])) { 1024 spin_lock(&si->lock); 788 spin_lock(&si->lock); 1025 return true; 789 return true; 1026 } 790 } 1027 791 1028 if (vm_swap_full() && READ_ONCE(si->s 792 if (vm_swap_full() && READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { 1029 spin_lock(&si->lock); 793 spin_lock(&si->lock); 1030 return true; 794 return true; 1031 } 795 } 1032 796 1033 return false; 797 return false; 1034 } 798 } 1035 799 1036 static int cluster_alloc_swap(struct swap_inf << 1037 unsigned char us << 1038 swp_entry_t slot << 1039 { << 1040 int n_ret = 0; << 1041 << 1042 VM_BUG_ON(!si->cluster_info); << 1043 << 1044 while (n_ret < nr) { << 1045 unsigned long offset = cluste << 1046 << 1047 if (!offset) << 1048 break; << 1049 slots[n_ret++] = swp_entry(si << 1050 } << 1051 << 1052 return n_ret; << 1053 } << 1054 << 1055 static int scan_swap_map_slots(struct swap_in 800 static int scan_swap_map_slots(struct swap_info_struct *si, 1056 unsigned char 801 unsigned char usage, int nr, 1057 swp_entry_t sl !! 802 swp_entry_t slots[]) 1058 { 803 { >> 804 struct swap_cluster_info *ci; 1059 unsigned long offset; 805 unsigned long offset; 1060 unsigned long scan_base; 806 unsigned long scan_base; 1061 unsigned long last_in_cluster = 0; 807 unsigned long last_in_cluster = 0; 1062 int latency_ration = LATENCY_LIMIT; 808 int latency_ration = LATENCY_LIMIT; 1063 unsigned int nr_pages = 1 << order; << 1064 int n_ret = 0; 809 int n_ret = 0; 1065 bool scanned_many = false; 810 bool scanned_many = false; 1066 811 1067 /* 812 /* 1068 * We try to cluster swap pages by al 813 * We try to cluster swap pages by allocating them sequentially 1069 * in swap. Once we've allocated SWA 814 * in swap. Once we've allocated SWAPFILE_CLUSTER pages this 1070 * way, however, we resort to first-f 815 * way, however, we resort to first-free allocation, starting 1071 * a new cluster. This prevents us f 816 * a new cluster. This prevents us from scattering swap pages 1072 * all over the entire swap partition 817 * all over the entire swap partition, so that we reduce 1073 * overall disk seek times between sw 818 * overall disk seek times between swap pages. -- sct 1074 * But we do now try to find an empty 819 * But we do now try to find an empty cluster. -Andrea 1075 * And we let swap pages go all over 820 * And we let swap pages go all over an SSD partition. Hugh 1076 */ 821 */ 1077 822 1078 if (order > 0) { << 1079 /* << 1080 * Should not even be attempt << 1081 * page swap is disabled. 
Wa << 1082 */ << 1083 if (!IS_ENABLED(CONFIG_THP_SW << 1084 nr_pages > SWAPFILE_CLUST << 1085 VM_WARN_ON_ONCE(1); << 1086 return 0; << 1087 } << 1088 << 1089 /* << 1090 * Swapfile is not block devi << 1091 * to allocate large entries. << 1092 */ << 1093 if (!(si->flags & SWP_BLKDEV) << 1094 return 0; << 1095 } << 1096 << 1097 if (si->cluster_info) << 1098 return cluster_alloc_swap(si, << 1099 << 1100 si->flags += SWP_SCANNING; 823 si->flags += SWP_SCANNING; 1101 !! 824 /* 1102 /* For HDD, sequential access is more !! 825 * Use percpu scan base for SSD to reduce lock contention on 1103 scan_base = si->cluster_next; !! 826 * cluster and swap cache. For HDD, sequential access is more >> 827 * important. >> 828 */ >> 829 if (si->flags & SWP_SOLIDSTATE) >> 830 scan_base = this_cpu_read(*si->cluster_next_cpu); >> 831 else >> 832 scan_base = si->cluster_next; 1104 offset = scan_base; 833 offset = scan_base; 1105 834 1106 if (unlikely(!si->cluster_nr--)) { !! 835 /* SSD algorithm */ >> 836 if (si->cluster_info) { >> 837 if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) >> 838 goto scan; >> 839 } else if (unlikely(!si->cluster_nr--)) { 1107 if (si->pages - si->inuse_pag 840 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { 1108 si->cluster_nr = SWAP 841 si->cluster_nr = SWAPFILE_CLUSTER - 1; 1109 goto checks; 842 goto checks; 1110 } 843 } 1111 844 1112 spin_unlock(&si->lock); 845 spin_unlock(&si->lock); 1113 846 1114 /* 847 /* 1115 * If seek is expensive, star 848 * If seek is expensive, start searching for new cluster from 1116 * start of partition, to min 849 * start of partition, to minimize the span of allocated swap. >> 850 * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info >> 851 * case, just handled by scan_swap_map_try_ssd_cluster() above. 1117 */ 852 */ 1118 scan_base = offset = si->lowe 853 scan_base = offset = si->lowest_bit; 1119 last_in_cluster = offset + SW 854 last_in_cluster = offset + SWAPFILE_CLUSTER - 1; 1120 855 1121 /* Locate the first empty (un 856 /* Locate the first empty (unaligned) cluster */ 1122 for (; last_in_cluster <= REA !! 
857 for (; last_in_cluster <= si->highest_bit; offset++) { 1123 if (si->swap_map[offs 858 if (si->swap_map[offset]) 1124 last_in_clust 859 last_in_cluster = offset + SWAPFILE_CLUSTER; 1125 else if (offset == la 860 else if (offset == last_in_cluster) { 1126 spin_lock(&si 861 spin_lock(&si->lock); 1127 offset -= SWA 862 offset -= SWAPFILE_CLUSTER - 1; 1128 si->cluster_n 863 si->cluster_next = offset; 1129 si->cluster_n 864 si->cluster_nr = SWAPFILE_CLUSTER - 1; 1130 goto checks; 865 goto checks; 1131 } 866 } 1132 if (unlikely(--latenc 867 if (unlikely(--latency_ration < 0)) { 1133 cond_resched( 868 cond_resched(); 1134 latency_ratio 869 latency_ration = LATENCY_LIMIT; 1135 } 870 } 1136 } 871 } 1137 872 1138 offset = scan_base; 873 offset = scan_base; 1139 spin_lock(&si->lock); 874 spin_lock(&si->lock); 1140 si->cluster_nr = SWAPFILE_CLU 875 si->cluster_nr = SWAPFILE_CLUSTER - 1; 1141 } 876 } 1142 877 1143 checks: 878 checks: >> 879 if (si->cluster_info) { >> 880 while (scan_swap_map_ssd_cluster_conflict(si, offset)) { >> 881 /* take a break if we already got some slots */ >> 882 if (n_ret) >> 883 goto done; >> 884 if (!scan_swap_map_try_ssd_cluster(si, &offset, >> 885 &scan_base)) >> 886 goto scan; >> 887 } >> 888 } 1144 if (!(si->flags & SWP_WRITEOK)) 889 if (!(si->flags & SWP_WRITEOK)) 1145 goto no_page; 890 goto no_page; 1146 if (!si->highest_bit) 891 if (!si->highest_bit) 1147 goto no_page; 892 goto no_page; 1148 if (offset > si->highest_bit) 893 if (offset > si->highest_bit) 1149 scan_base = offset = si->lowe 894 scan_base = offset = si->lowest_bit; 1150 895 >> 896 ci = lock_cluster(si, offset); 1151 /* reuse swap entry of cache-only swa 897 /* reuse swap entry of cache-only swap if not busy. */ 1152 if (vm_swap_full() && si->swap_map[of 898 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 1153 int swap_was_freed; 899 int swap_was_freed; >> 900 unlock_cluster(ci); 1154 spin_unlock(&si->lock); 901 spin_unlock(&si->lock); 1155 swap_was_freed = __try_to_rec !! 902 swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); 1156 spin_lock(&si->lock); 903 spin_lock(&si->lock); 1157 /* entry was freed successful 904 /* entry was freed successfully, try to use this again */ 1158 if (swap_was_freed > 0) !! 905 if (swap_was_freed) 1159 goto checks; 906 goto checks; 1160 goto scan; /* check next one 907 goto scan; /* check next one */ 1161 } 908 } 1162 909 1163 if (si->swap_map[offset]) { 910 if (si->swap_map[offset]) { >> 911 unlock_cluster(ci); 1164 if (!n_ret) 912 if (!n_ret) 1165 goto scan; 913 goto scan; 1166 else 914 else 1167 goto done; 915 goto done; 1168 } 916 } 1169 memset(si->swap_map + offset, usage, !! 917 WRITE_ONCE(si->swap_map[offset], usage); >> 918 inc_cluster_info_page(si, si->cluster_info, offset); >> 919 unlock_cluster(ci); 1170 920 1171 swap_range_alloc(si, offset, nr_pages !! 921 swap_range_alloc(si, offset, 1); 1172 slots[n_ret++] = swp_entry(si->type, 922 slots[n_ret++] = swp_entry(si->type, offset); 1173 923 1174 /* got enough slots or reach max slot 924 /* got enough slots or reach max slots? */ 1175 if ((n_ret == nr) || (offset >= si->h 925 if ((n_ret == nr) || (offset >= si->highest_bit)) 1176 goto done; 926 goto done; 1177 927 1178 /* search for next available slot */ 928 /* search for next available slot */ 1179 929 1180 /* time to take a break? */ 930 /* time to take a break? 
*/ 1181 if (unlikely(--latency_ration < 0)) { 931 if (unlikely(--latency_ration < 0)) { 1182 if (n_ret) 932 if (n_ret) 1183 goto done; 933 goto done; 1184 spin_unlock(&si->lock); 934 spin_unlock(&si->lock); 1185 cond_resched(); 935 cond_resched(); 1186 spin_lock(&si->lock); 936 spin_lock(&si->lock); 1187 latency_ration = LATENCY_LIMI 937 latency_ration = LATENCY_LIMIT; 1188 } 938 } 1189 939 1190 if (si->cluster_nr && !si->swap_map[+ !! 940 /* try to get more slots in cluster */ >> 941 if (si->cluster_info) { >> 942 if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) >> 943 goto checks; >> 944 } else if (si->cluster_nr && !si->swap_map[++offset]) { 1191 /* non-ssd case, still more s 945 /* non-ssd case, still more slots in cluster? */ 1192 --si->cluster_nr; 946 --si->cluster_nr; 1193 goto checks; 947 goto checks; 1194 } 948 } 1195 949 1196 /* 950 /* 1197 * Even if there's no free clusters a 951 * Even if there's no free clusters available (fragmented), 1198 * try to scan a little more quickly 952 * try to scan a little more quickly with lock held unless we 1199 * have scanned too many slots alread 953 * have scanned too many slots already. 1200 */ 954 */ 1201 if (!scanned_many) { 955 if (!scanned_many) { 1202 unsigned long scan_limit; 956 unsigned long scan_limit; 1203 957 1204 if (offset < scan_base) 958 if (offset < scan_base) 1205 scan_limit = scan_bas 959 scan_limit = scan_base; 1206 else 960 else 1207 scan_limit = si->high 961 scan_limit = si->highest_bit; 1208 for (; offset <= scan_limit & 962 for (; offset <= scan_limit && --latency_ration > 0; 1209 offset++) { 963 offset++) { 1210 if (!si->swap_map[off 964 if (!si->swap_map[offset]) 1211 goto checks; 965 goto checks; 1212 } 966 } 1213 } 967 } 1214 968 1215 done: 969 done: 1216 if (order == 0) !! 970 set_cluster_next(si, offset + 1); 1217 set_cluster_next(si, offset + << 1218 si->flags -= SWP_SCANNING; 971 si->flags -= SWP_SCANNING; 1219 return n_ret; 972 return n_ret; 1220 973 1221 scan: 974 scan: 1222 VM_WARN_ON(order > 0); << 1223 spin_unlock(&si->lock); 975 spin_unlock(&si->lock); 1224 while (++offset <= READ_ONCE(si->high 976 while (++offset <= READ_ONCE(si->highest_bit)) { 1225 if (unlikely(--latency_ration 977 if (unlikely(--latency_ration < 0)) { 1226 cond_resched(); 978 cond_resched(); 1227 latency_ration = LATE 979 latency_ration = LATENCY_LIMIT; 1228 scanned_many = true; 980 scanned_many = true; 1229 } 981 } 1230 if (swap_offset_available_and 982 if (swap_offset_available_and_locked(si, offset)) 1231 goto checks; 983 goto checks; 1232 } 984 } 1233 offset = si->lowest_bit; 985 offset = si->lowest_bit; 1234 while (offset < scan_base) { 986 while (offset < scan_base) { 1235 if (unlikely(--latency_ration 987 if (unlikely(--latency_ration < 0)) { 1236 cond_resched(); 988 cond_resched(); 1237 latency_ration = LATE 989 latency_ration = LATENCY_LIMIT; 1238 scanned_many = true; 990 scanned_many = true; 1239 } 991 } 1240 if (swap_offset_available_and 992 if (swap_offset_available_and_locked(si, offset)) 1241 goto checks; 993 goto checks; 1242 offset++; 994 offset++; 1243 } 995 } 1244 spin_lock(&si->lock); 996 spin_lock(&si->lock); 1245 997 1246 no_page: 998 no_page: 1247 si->flags -= SWP_SCANNING; 999 si->flags -= SWP_SCANNING; 1248 return n_ret; 1000 return n_ret; 1249 } 1001 } 1250 1002 1251 int get_swap_pages(int n_goal, swp_entry_t sw !! 
1003 static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) >> 1004 { >> 1005 unsigned long idx; >> 1006 struct swap_cluster_info *ci; >> 1007 unsigned long offset; >> 1008 >> 1009 /* >> 1010 * Should not even be attempting cluster allocations when huge >> 1011 * page swap is disabled. Warn and fail the allocation. >> 1012 */ >> 1013 if (!IS_ENABLED(CONFIG_THP_SWAP)) { >> 1014 VM_WARN_ON_ONCE(1); >> 1015 return 0; >> 1016 } >> 1017 >> 1018 if (cluster_list_empty(&si->free_clusters)) >> 1019 return 0; >> 1020 >> 1021 idx = cluster_list_first(&si->free_clusters); >> 1022 offset = idx * SWAPFILE_CLUSTER; >> 1023 ci = lock_cluster(si, offset); >> 1024 alloc_cluster(si, idx); >> 1025 cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE); >> 1026 >> 1027 memset(si->swap_map + offset, SWAP_HAS_CACHE, SWAPFILE_CLUSTER); >> 1028 unlock_cluster(ci); >> 1029 swap_range_alloc(si, offset, SWAPFILE_CLUSTER); >> 1030 *slot = swp_entry(si->type, offset); >> 1031 >> 1032 return 1; >> 1033 } >> 1034 >> 1035 static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) 1252 { 1036 { 1253 int order = swap_entry_order(entry_or !! 1037 unsigned long offset = idx * SWAPFILE_CLUSTER; 1254 unsigned long size = 1 << order; !! 1038 struct swap_cluster_info *ci; >> 1039 >> 1040 ci = lock_cluster(si, offset); >> 1041 memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); >> 1042 cluster_set_count_flag(ci, 0, 0); >> 1043 free_cluster(si, idx); >> 1044 unlock_cluster(ci); >> 1045 swap_range_free(si, offset, SWAPFILE_CLUSTER); >> 1046 } >> 1047 >> 1048 int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) >> 1049 { >> 1050 unsigned long size = swap_entry_size(entry_size); 1255 struct swap_info_struct *si, *next; 1051 struct swap_info_struct *si, *next; 1256 long avail_pgs; 1052 long avail_pgs; 1257 int n_ret = 0; 1053 int n_ret = 0; 1258 int node; 1054 int node; 1259 1055 >> 1056 /* Only single cluster request supported */ >> 1057 WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER); >> 1058 1260 spin_lock(&swap_avail_lock); 1059 spin_lock(&swap_avail_lock); 1261 1060 1262 avail_pgs = atomic_long_read(&nr_swap 1061 avail_pgs = atomic_long_read(&nr_swap_pages) / size; 1263 if (avail_pgs <= 0) { 1062 if (avail_pgs <= 0) { 1264 spin_unlock(&swap_avail_lock) 1063 spin_unlock(&swap_avail_lock); 1265 goto noswap; 1064 goto noswap; 1266 } 1065 } 1267 1066 1268 n_goal = min3((long)n_goal, (long)SWA 1067 n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs); 1269 1068 1270 atomic_long_sub(n_goal * size, &nr_sw 1069 atomic_long_sub(n_goal * size, &nr_swap_pages); 1271 1070 1272 start_over: 1071 start_over: 1273 node = numa_node_id(); 1072 node = numa_node_id(); 1274 plist_for_each_entry_safe(si, next, & 1073 plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { 1275 /* requeue si to after same-p 1074 /* requeue si to after same-priority siblings */ 1276 plist_requeue(&si->avail_list 1075 plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); 1277 spin_unlock(&swap_avail_lock) 1076 spin_unlock(&swap_avail_lock); 1278 spin_lock(&si->lock); 1077 spin_lock(&si->lock); 1279 if (!si->highest_bit || !(si- 1078 if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { 1280 spin_lock(&swap_avail 1079 spin_lock(&swap_avail_lock); 1281 if (plist_node_empty( 1080 if (plist_node_empty(&si->avail_lists[node])) { 1282 spin_unlock(& 1081 spin_unlock(&si->lock); 1283 goto nextsi; 1082 goto nextsi; 1284 } 1083 } 1285 WARN(!si->highest_bit 1084 
WARN(!si->highest_bit, 1286 "swap_info %d in 1085 "swap_info %d in list but !highest_bit\n", 1287 si->type); 1086 si->type); 1288 WARN(!(si->flags & SW 1087 WARN(!(si->flags & SWP_WRITEOK), 1289 "swap_info %d in 1088 "swap_info %d in list but !SWP_WRITEOK\n", 1290 si->type); 1089 si->type); 1291 __del_from_avail_list 1090 __del_from_avail_list(si); 1292 spin_unlock(&si->lock 1091 spin_unlock(&si->lock); 1293 goto nextsi; 1092 goto nextsi; 1294 } 1093 } 1295 n_ret = scan_swap_map_slots(s !! 1094 if (size == SWAPFILE_CLUSTER) { 1296 n !! 1095 if (si->flags & SWP_BLKDEV) >> 1096 n_ret = swap_alloc_cluster(si, swp_entries); >> 1097 } else >> 1098 n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, >> 1099 n_goal, swp_entries); 1297 spin_unlock(&si->lock); 1100 spin_unlock(&si->lock); 1298 if (n_ret || size > 1) !! 1101 if (n_ret || size == SWAPFILE_CLUSTER) 1299 goto check_out; 1102 goto check_out; 1300 cond_resched(); 1103 cond_resched(); 1301 1104 1302 spin_lock(&swap_avail_lock); 1105 spin_lock(&swap_avail_lock); 1303 nextsi: 1106 nextsi: 1304 /* 1107 /* 1305 * if we got here, it's likel 1108 * if we got here, it's likely that si was almost full before, 1306 * and since scan_swap_map_sl 1109 * and since scan_swap_map_slots() can drop the si->lock, 1307 * multiple callers probably 1110 * multiple callers probably all tried to get a page from the 1308 * same si and it filled up b 1111 * same si and it filled up before we could get one; or, the si 1309 * filled up between us dropp 1112 * filled up between us dropping swap_avail_lock and taking 1310 * si->lock. Since we dropped 1113 * si->lock. Since we dropped the swap_avail_lock, the 1311 * swap_avail_head list may h 1114 * swap_avail_head list may have been modified; so if next is 1312 * still in the swap_avail_he 1115 * still in the swap_avail_head list then try it, otherwise 1313 * start over if we have not 1116 * start over if we have not gotten any slots. 1314 */ 1117 */ 1315 if (plist_node_empty(&next->a 1118 if (plist_node_empty(&next->avail_lists[node])) 1316 goto start_over; 1119 goto start_over; 1317 } 1120 } 1318 1121 1319 spin_unlock(&swap_avail_lock); 1122 spin_unlock(&swap_avail_lock); 1320 1123 1321 check_out: 1124 check_out: 1322 if (n_ret < n_goal) 1125 if (n_ret < n_goal) 1323 atomic_long_add((long)(n_goal 1126 atomic_long_add((long)(n_goal - n_ret) * size, 1324 &nr_swap_page 1127 &nr_swap_pages); 1325 noswap: 1128 noswap: 1326 return n_ret; 1129 return n_ret; 1327 } 1130 } 1328 1131 1329 static struct swap_info_struct *_swap_info_ge 1132 static struct swap_info_struct *_swap_info_get(swp_entry_t entry) 1330 { 1133 { 1331 struct swap_info_struct *si; !! 1134 struct swap_info_struct *p; 1332 unsigned long offset; 1135 unsigned long offset; 1333 1136 1334 if (!entry.val) 1137 if (!entry.val) 1335 goto out; 1138 goto out; 1336 si = swp_swap_info(entry); !! 1139 p = swp_swap_info(entry); 1337 if (!si) !! 1140 if (!p) 1338 goto bad_nofile; 1141 goto bad_nofile; 1339 if (data_race(!(si->flags & SWP_USED) !! 1142 if (data_race(!(p->flags & SWP_USED))) 1340 goto bad_device; 1143 goto bad_device; 1341 offset = swp_offset(entry); 1144 offset = swp_offset(entry); 1342 if (offset >= si->max) !! 1145 if (offset >= p->max) 1343 goto bad_offset; 1146 goto bad_offset; 1344 if (data_race(!si->swap_map[swp_offse !! 1147 if (data_race(!p->swap_map[swp_offset(entry)])) 1345 goto bad_free; 1148 goto bad_free; 1346 return si; !! 
1149 return p; 1347 1150 1348 bad_free: 1151 bad_free: 1349 pr_err("%s: %s%08lx\n", __func__, Unu 1152 pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val); 1350 goto out; 1153 goto out; 1351 bad_offset: 1154 bad_offset: 1352 pr_err("%s: %s%08lx\n", __func__, Bad 1155 pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val); 1353 goto out; 1156 goto out; 1354 bad_device: 1157 bad_device: 1355 pr_err("%s: %s%08lx\n", __func__, Unu 1158 pr_err("%s: %s%08lx\n", __func__, Unused_file, entry.val); 1356 goto out; 1159 goto out; 1357 bad_nofile: 1160 bad_nofile: 1358 pr_err("%s: %s%08lx\n", __func__, Bad 1161 pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); 1359 out: 1162 out: 1360 return NULL; 1163 return NULL; 1361 } 1164 } 1362 1165 1363 static struct swap_info_struct *swap_info_get 1166 static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, 1364 struc 1167 struct swap_info_struct *q) 1365 { 1168 { 1366 struct swap_info_struct *p; 1169 struct swap_info_struct *p; 1367 1170 1368 p = _swap_info_get(entry); 1171 p = _swap_info_get(entry); 1369 1172 1370 if (p != q) { 1173 if (p != q) { 1371 if (q != NULL) 1174 if (q != NULL) 1372 spin_unlock(&q->lock) 1175 spin_unlock(&q->lock); 1373 if (p != NULL) 1176 if (p != NULL) 1374 spin_lock(&p->lock); 1177 spin_lock(&p->lock); 1375 } 1178 } 1376 return p; 1179 return p; 1377 } 1180 } 1378 1181 1379 static unsigned char __swap_entry_free_locked !! 1182 static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, 1380 1183 unsigned long offset, 1381 1184 unsigned char usage) 1382 { 1185 { 1383 unsigned char count; 1186 unsigned char count; 1384 unsigned char has_cache; 1187 unsigned char has_cache; 1385 1188 1386 count = si->swap_map[offset]; !! 1189 count = p->swap_map[offset]; 1387 1190 1388 has_cache = count & SWAP_HAS_CACHE; 1191 has_cache = count & SWAP_HAS_CACHE; 1389 count &= ~SWAP_HAS_CACHE; 1192 count &= ~SWAP_HAS_CACHE; 1390 1193 1391 if (usage == SWAP_HAS_CACHE) { 1194 if (usage == SWAP_HAS_CACHE) { 1392 VM_BUG_ON(!has_cache); 1195 VM_BUG_ON(!has_cache); 1393 has_cache = 0; 1196 has_cache = 0; 1394 } else if (count == SWAP_MAP_SHMEM) { 1197 } else if (count == SWAP_MAP_SHMEM) { 1395 /* 1198 /* 1396 * Or we could insist on shme 1199 * Or we could insist on shmem.c using a special 1397 * swap_shmem_free() and free 1200 * swap_shmem_free() and free_shmem_swap_and_cache()... 1398 */ 1201 */ 1399 count = 0; 1202 count = 0; 1400 } else if ((count & ~COUNT_CONTINUED) 1203 } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { 1401 if (count == COUNT_CONTINUED) 1204 if (count == COUNT_CONTINUED) { 1402 if (swap_count_contin !! 1205 if (swap_count_continued(p, offset, count)) 1403 count = SWAP_ 1206 count = SWAP_MAP_MAX | COUNT_CONTINUED; 1404 else 1207 else 1405 count = SWAP_ 1208 count = SWAP_MAP_MAX; 1406 } else 1209 } else 1407 count--; 1210 count--; 1408 } 1211 } 1409 1212 1410 usage = count | has_cache; 1213 usage = count | has_cache; 1411 if (usage) 1214 if (usage) 1412 WRITE_ONCE(si->swap_map[offse !! 1215 WRITE_ONCE(p->swap_map[offset], usage); 1413 else 1216 else 1414 WRITE_ONCE(si->swap_map[offse !! 1217 WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE); 1415 1218 1416 return usage; 1219 return usage; 1417 } 1220 } 1418 1221 1419 /* 1222 /* 1420 * When we get a swap entry, if there aren't 1223 * When we get a swap entry, if there aren't some other ways to 1421 * prevent swapoff, such as the folio in swap !! 
1224 * prevent swapoff, such as the folio in swap cache is locked, page 1422 * reader side is locked, etc., the swap entr !! 1225 * table lock is held, etc., the swap entry may become invalid because 1423 * because of swapoff. Then, we need to encl !! 1226 * of swapoff. Then, we need to enclose all swap related functions 1424 * functions with get_swap_device() and put_s !! 1227 * with get_swap_device() and put_swap_device(), unless the swap 1425 * swap functions call get/put_swap_device() !! 1228 * functions call get/put_swap_device() by themselves. 1426 * << 1427 * RCU reader side lock (including any spinlo << 1428 * prevent swapoff, because synchronize_rcu() << 1429 * before freeing data structures. << 1430 * 1229 * 1431 * Check whether swap entry is valid in the s 1230 * Check whether swap entry is valid in the swap device. If so, 1432 * return pointer to swap_info_struct, and ke 1231 * return pointer to swap_info_struct, and keep the swap entry valid 1433 * via preventing the swap device from being 1232 * via preventing the swap device from being swapoff, until 1434 * put_swap_device() is called. Otherwise re 1233 * put_swap_device() is called. Otherwise return NULL. 1435 * 1234 * 1436 * Notice that swapoff or swapoff+swapon can 1235 * Notice that swapoff or swapoff+swapon can still happen before the 1437 * percpu_ref_tryget_live() in get_swap_devic 1236 * percpu_ref_tryget_live() in get_swap_device() or after the 1438 * percpu_ref_put() in put_swap_device() if t 1237 * percpu_ref_put() in put_swap_device() if there isn't any other way 1439 * to prevent swapoff. The caller must be pr 1238 * to prevent swapoff. The caller must be prepared for that. For 1440 * example, the following situation is possib 1239 * example, the following situation is possible. 1441 * 1240 * 1442 * CPU1 CPU2 1241 * CPU1 CPU2 1443 * do_swap_page() 1242 * do_swap_page() 1444 * ... swapo 1243 * ... swapoff+swapon 1445 * __read_swap_cache_async() 1244 * __read_swap_cache_async() 1446 * swapcache_prepare() 1245 * swapcache_prepare() 1447 * __swap_duplicate() 1246 * __swap_duplicate() 1448 * // check swap_map 1247 * // check swap_map 1449 * // verify PTE not changed 1248 * // verify PTE not changed 1450 * 1249 * 1451 * In __swap_duplicate(), the swap_map need t 1250 * In __swap_duplicate(), the swap_map need to be checked before 1452 * changing partly because the specified swap 1251 * changing partly because the specified swap entry may be for another 1453 * swap device which has been swapoff. And i 1252 * swap device which has been swapoff. And in do_swap_page(), after 1454 * the page is read from the swap device, the 1253 * the page is read from the swap device, the PTE is verified not 1455 * changed with the page table locked to chec 1254 * changed with the page table locked to check whether the swap device 1456 * has been swapoff or swapoff+swapon. 1255 * has been swapoff or swapoff+swapon. 
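 *
 * (Illustrative sketch, not taken from mm/swapfile.c: a minimal caller
 * of the pairing described above, using only helpers that appear in
 * this file.)
 *
 *	si = get_swap_device(entry);
 *	if (!si)
 *		return;
 *	... access si->swap_map[swp_offset(entry)] or other fields safely ...
 *	put_swap_device(si);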
1457 */ 1256 */ 1458 struct swap_info_struct *get_swap_device(swp_ 1257 struct swap_info_struct *get_swap_device(swp_entry_t entry) 1459 { 1258 { 1460 struct swap_info_struct *si; 1259 struct swap_info_struct *si; 1461 unsigned long offset; 1260 unsigned long offset; 1462 1261 1463 if (!entry.val) 1262 if (!entry.val) 1464 goto out; 1263 goto out; 1465 si = swp_swap_info(entry); 1264 si = swp_swap_info(entry); 1466 if (!si) 1265 if (!si) 1467 goto bad_nofile; 1266 goto bad_nofile; 1468 if (!percpu_ref_tryget_live(&si->user 1267 if (!percpu_ref_tryget_live(&si->users)) 1469 goto out; 1268 goto out; 1470 /* 1269 /* 1471 * Guarantee the si->users are checke 1270 * Guarantee the si->users are checked before accessing other 1472 * fields of swap_info_struct. 1271 * fields of swap_info_struct. 1473 * 1272 * 1474 * Paired with the spin_unlock() afte 1273 * Paired with the spin_unlock() after setup_swap_info() in 1475 * enable_swap_info(). 1274 * enable_swap_info(). 1476 */ 1275 */ 1477 smp_rmb(); 1276 smp_rmb(); 1478 offset = swp_offset(entry); 1277 offset = swp_offset(entry); 1479 if (offset >= si->max) 1278 if (offset >= si->max) 1480 goto put_out; 1279 goto put_out; 1481 1280 1482 return si; 1281 return si; 1483 bad_nofile: 1282 bad_nofile: 1484 pr_err("%s: %s%08lx\n", __func__, Bad 1283 pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); 1485 out: 1284 out: 1486 return NULL; 1285 return NULL; 1487 put_out: 1286 put_out: 1488 pr_err("%s: %s%08lx\n", __func__, Bad 1287 pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val); 1489 percpu_ref_put(&si->users); 1288 percpu_ref_put(&si->users); 1490 return NULL; 1289 return NULL; 1491 } 1290 } 1492 1291 1493 static unsigned char __swap_entry_free(struct !! 1292 static unsigned char __swap_entry_free(struct swap_info_struct *p, 1494 swp_en 1293 swp_entry_t entry) 1495 { 1294 { 1496 struct swap_cluster_info *ci; 1295 struct swap_cluster_info *ci; 1497 unsigned long offset = swp_offset(ent 1296 unsigned long offset = swp_offset(entry); 1498 unsigned char usage; 1297 unsigned char usage; 1499 1298 1500 ci = lock_cluster_or_swap_info(si, of !! 1299 ci = lock_cluster_or_swap_info(p, offset); 1501 usage = __swap_entry_free_locked(si, !! 1300 usage = __swap_entry_free_locked(p, offset, 1); 1502 unlock_cluster_or_swap_info(si, ci); !! 1301 unlock_cluster_or_swap_info(p, ci); 1503 if (!usage) 1302 if (!usage) 1504 free_swap_slot(entry); 1303 free_swap_slot(entry); 1505 1304 1506 return usage; 1305 return usage; 1507 } 1306 } 1508 1307 1509 static bool __swap_entries_free(struct swap_i !! 
1308 static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) 1510 swp_entry_t entry, int nr) << 1511 { 1309 { 1512 unsigned long offset = swp_offset(ent << 1513 unsigned int type = swp_type(entry); << 1514 struct swap_cluster_info *ci; 1310 struct swap_cluster_info *ci; 1515 bool has_cache = false; << 1516 unsigned char count; << 1517 int i; << 1518 << 1519 if (nr <= 1 || swap_count(data_race(s << 1520 goto fallback; << 1521 /* cross into another cluster */ << 1522 if (nr > SWAPFILE_CLUSTER - offset % << 1523 goto fallback; << 1524 << 1525 ci = lock_cluster_or_swap_info(si, of << 1526 if (!swap_is_last_map(si, offset, nr, << 1527 unlock_cluster_or_swap_info(s << 1528 goto fallback; << 1529 } << 1530 for (i = 0; i < nr; i++) << 1531 WRITE_ONCE(si->swap_map[offse << 1532 unlock_cluster_or_swap_info(si, ci); << 1533 << 1534 if (!has_cache) { << 1535 for (i = 0; i < nr; i++) << 1536 zswap_invalidate(swp_ << 1537 spin_lock(&si->lock); << 1538 swap_entry_range_free(si, ent << 1539 spin_unlock(&si->lock); << 1540 } << 1541 return has_cache; << 1542 << 1543 fallback: << 1544 for (i = 0; i < nr; i++) { << 1545 if (data_race(si->swap_map[of << 1546 count = __swap_entry_ << 1547 if (count == SWAP_HAS << 1548 has_cache = t << 1549 } else { << 1550 WARN_ON_ONCE(1); << 1551 } << 1552 } << 1553 return has_cache; << 1554 } << 1555 << 1556 /* << 1557 * Drop the last HAS_CACHE flag of swap entri << 1558 * ensure all entries belong to the same cgro << 1559 */ << 1560 static void swap_entry_range_free(struct swap << 1561 unsigned in << 1562 { << 1563 unsigned long offset = swp_offset(ent 1311 unsigned long offset = swp_offset(entry); 1564 unsigned char *map = si->swap_map + o !! 1312 unsigned char count; 1565 unsigned char *map_end = map + nr_pag << 1566 struct swap_cluster_info *ci; << 1567 1313 1568 ci = lock_cluster(si, offset); !! 1314 ci = lock_cluster(p, offset); 1569 do { !! 1315 count = p->swap_map[offset]; 1570 VM_BUG_ON(*map != SWAP_HAS_CA !! 1316 VM_BUG_ON(count != SWAP_HAS_CACHE); 1571 *map = 0; !! 1317 p->swap_map[offset] = 0; 1572 } while (++map < map_end); !! 1318 dec_cluster_info_page(p, p->cluster_info, offset); 1573 dec_cluster_info_page(si, ci, nr_page << 1574 unlock_cluster(ci); 1319 unlock_cluster(ci); 1575 1320 1576 mem_cgroup_uncharge_swap(entry, nr_pa !! 1321 mem_cgroup_uncharge_swap(entry, 1); 1577 swap_range_free(si, offset, nr_pages) !! 1322 swap_range_free(p, offset, 1); 1578 } << 1579 << 1580 static void cluster_swap_free_nr(struct swap_ << 1581 unsigned long offset, int nr_ << 1582 unsigned char usage) << 1583 { << 1584 struct swap_cluster_info *ci; << 1585 DECLARE_BITMAP(to_free, BITS_PER_LONG << 1586 int i, nr; << 1587 << 1588 ci = lock_cluster_or_swap_info(si, of << 1589 while (nr_pages) { << 1590 nr = min(BITS_PER_LONG, nr_pa << 1591 for (i = 0; i < nr; i++) { << 1592 if (!__swap_entry_fre << 1593 bitmap_set(to << 1594 } << 1595 if (!bitmap_empty(to_free, BI << 1596 unlock_cluster_or_swa << 1597 for_each_set_bit(i, t << 1598 free_swap_slo << 1599 if (nr == nr_pages) << 1600 return; << 1601 bitmap_clear(to_free, << 1602 ci = lock_cluster_or_ << 1603 } << 1604 offset += nr; << 1605 nr_pages -= nr; << 1606 } << 1607 unlock_cluster_or_swap_info(si, ci); << 1608 } 1323 } 1609 1324 1610 /* 1325 /* 1611 * Caller has made sure that the swap device 1326 * Caller has made sure that the swap device corresponding to entry 1612 * is still around or has not been recycled. 1327 * is still around or has not been recycled. 
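 *
 * (Illustrative note, not from the original source: "made sure" is
 * typically one of the guarantees described above get_swap_device(),
 * e.g. the folio is still locked in the swap cache, or the caller holds
 * a get_swap_device() reference of its own.)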
1613 */ 1328 */ 1614 void swap_free_nr(swp_entry_t entry, int nr_p !! 1329 void swap_free(swp_entry_t entry) 1615 { 1330 { 1616 int nr; !! 1331 struct swap_info_struct *p; 1617 struct swap_info_struct *sis; << 1618 unsigned long offset = swp_offset(ent << 1619 << 1620 sis = _swap_info_get(entry); << 1621 if (!sis) << 1622 return; << 1623 1332 1624 while (nr_pages) { !! 1333 p = _swap_info_get(entry); 1625 nr = min_t(int, nr_pages, SWA !! 1334 if (p) 1626 cluster_swap_free_nr(sis, off !! 1335 __swap_entry_free(p, entry); 1627 offset += nr; << 1628 nr_pages -= nr; << 1629 } << 1630 } 1336 } 1631 1337 1632 /* 1338 /* 1633 * Called after dropping swapcache to decreas 1339 * Called after dropping swapcache to decrease refcnt to swap entries. 1634 */ 1340 */ 1635 void put_swap_folio(struct folio *folio, swp_ 1341 void put_swap_folio(struct folio *folio, swp_entry_t entry) 1636 { 1342 { 1637 unsigned long offset = swp_offset(ent 1343 unsigned long offset = swp_offset(entry); >> 1344 unsigned long idx = offset / SWAPFILE_CLUSTER; 1638 struct swap_cluster_info *ci; 1345 struct swap_cluster_info *ci; 1639 struct swap_info_struct *si; 1346 struct swap_info_struct *si; 1640 int size = 1 << swap_entry_order(foli !! 1347 unsigned char *map; >> 1348 unsigned int i, free_entries = 0; >> 1349 unsigned char val; >> 1350 int size = swap_entry_size(folio_nr_pages(folio)); 1641 1351 1642 si = _swap_info_get(entry); 1352 si = _swap_info_get(entry); 1643 if (!si) 1353 if (!si) 1644 return; 1354 return; 1645 1355 1646 ci = lock_cluster_or_swap_info(si, of 1356 ci = lock_cluster_or_swap_info(si, offset); 1647 if (size > 1 && swap_is_has_cache(si, !! 1357 if (size == SWAPFILE_CLUSTER) { 1648 unlock_cluster_or_swap_info(s !! 1358 VM_BUG_ON(!cluster_is_huge(ci)); 1649 spin_lock(&si->lock); !! 1359 map = si->swap_map + offset; 1650 swap_entry_range_free(si, ent !! 1360 for (i = 0; i < SWAPFILE_CLUSTER; i++) { 1651 spin_unlock(&si->lock); !! 1361 val = map[i]; 1652 return; !! 1362 VM_BUG_ON(!(val & SWAP_HAS_CACHE)); >> 1363 if (val == SWAP_HAS_CACHE) >> 1364 free_entries++; >> 1365 } >> 1366 cluster_clear_huge(ci); >> 1367 if (free_entries == SWAPFILE_CLUSTER) { >> 1368 unlock_cluster_or_swap_info(si, ci); >> 1369 spin_lock(&si->lock); >> 1370 mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); >> 1371 swap_free_cluster(si, idx); >> 1372 spin_unlock(&si->lock); >> 1373 return; >> 1374 } 1653 } 1375 } 1654 for (int i = 0; i < size; i++, entry. !! 
1376 for (i = 0; i < size; i++, entry.val++) { 1655 if (!__swap_entry_free_locked 1377 if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { 1656 unlock_cluster_or_swa 1378 unlock_cluster_or_swap_info(si, ci); 1657 free_swap_slot(entry) 1379 free_swap_slot(entry); 1658 if (i == size - 1) 1380 if (i == size - 1) 1659 return; 1381 return; 1660 lock_cluster_or_swap_ 1382 lock_cluster_or_swap_info(si, offset); 1661 } 1383 } 1662 } 1384 } 1663 unlock_cluster_or_swap_info(si, ci); 1385 unlock_cluster_or_swap_info(si, ci); 1664 } 1386 } 1665 1387 >> 1388 #ifdef CONFIG_THP_SWAP >> 1389 int split_swap_cluster(swp_entry_t entry) >> 1390 { >> 1391 struct swap_info_struct *si; >> 1392 struct swap_cluster_info *ci; >> 1393 unsigned long offset = swp_offset(entry); >> 1394 >> 1395 si = _swap_info_get(entry); >> 1396 if (!si) >> 1397 return -EBUSY; >> 1398 ci = lock_cluster(si, offset); >> 1399 cluster_clear_huge(ci); >> 1400 unlock_cluster(ci); >> 1401 return 0; >> 1402 } >> 1403 #endif >> 1404 1666 static int swp_entry_cmp(const void *ent1, co 1405 static int swp_entry_cmp(const void *ent1, const void *ent2) 1667 { 1406 { 1668 const swp_entry_t *e1 = ent1, *e2 = e 1407 const swp_entry_t *e1 = ent1, *e2 = ent2; 1669 1408 1670 return (int)swp_type(*e1) - (int)swp_ 1409 return (int)swp_type(*e1) - (int)swp_type(*e2); 1671 } 1410 } 1672 1411 1673 void swapcache_free_entries(swp_entry_t *entr 1412 void swapcache_free_entries(swp_entry_t *entries, int n) 1674 { 1413 { 1675 struct swap_info_struct *p, *prev; 1414 struct swap_info_struct *p, *prev; 1676 int i; 1415 int i; 1677 1416 1678 if (n <= 0) 1417 if (n <= 0) 1679 return; 1418 return; 1680 1419 1681 prev = NULL; 1420 prev = NULL; 1682 p = NULL; 1421 p = NULL; 1683 1422 1684 /* 1423 /* 1685 * Sort swap entries by swap device, 1424 * Sort swap entries by swap device, so each lock is only taken once. 1686 * nr_swapfiles isn't absolutely corr 1425 * nr_swapfiles isn't absolutely correct, but the overhead of sort() is 1687 * so low that it isn't necessary to 1426 * so low that it isn't necessary to optimize further. 1688 */ 1427 */ 1689 if (nr_swapfiles > 1) 1428 if (nr_swapfiles > 1) 1690 sort(entries, n, sizeof(entri 1429 sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL); 1691 for (i = 0; i < n; ++i) { 1430 for (i = 0; i < n; ++i) { 1692 p = swap_info_get_cont(entrie 1431 p = swap_info_get_cont(entries[i], prev); 1693 if (p) 1432 if (p) 1694 swap_entry_range_free !! 1433 swap_entry_free(p, entries[i]); 1695 prev = p; 1434 prev = p; 1696 } 1435 } 1697 if (p) 1436 if (p) 1698 spin_unlock(&p->lock); 1437 spin_unlock(&p->lock); 1699 } 1438 } 1700 1439 1701 int __swap_count(swp_entry_t entry) 1440 int __swap_count(swp_entry_t entry) 1702 { 1441 { 1703 struct swap_info_struct *si = swp_swa 1442 struct swap_info_struct *si = swp_swap_info(entry); 1704 pgoff_t offset = swp_offset(entry); 1443 pgoff_t offset = swp_offset(entry); 1705 1444 1706 return swap_count(si->swap_map[offset 1445 return swap_count(si->swap_map[offset]); 1707 } 1446 } 1708 1447 1709 /* 1448 /* 1710 * How many references to @entry are currentl 1449 * How many references to @entry are currently swapped out? 1711 * This does not give an exact answer when sw 1450 * This does not give an exact answer when swap count is continued, 1712 * but does include the high COUNT_CONTINUED 1451 * but does include the high COUNT_CONTINUED flag to allow for that. 
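 *
 * (Illustrative note, not from the original source: a value such as
 * SWAP_MAP_MAX | COUNT_CONTINUED therefore only means "at least
 * SWAP_MAP_MAX references, the rest live in continuation pages";
 * swp_swapcount() below walks those pages when the exact count is
 * needed.)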
1713 */ 1452 */ 1714 int swap_swapcount(struct swap_info_struct *s 1453 int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) 1715 { 1454 { 1716 pgoff_t offset = swp_offset(entry); 1455 pgoff_t offset = swp_offset(entry); 1717 struct swap_cluster_info *ci; 1456 struct swap_cluster_info *ci; 1718 int count; 1457 int count; 1719 1458 1720 ci = lock_cluster_or_swap_info(si, of 1459 ci = lock_cluster_or_swap_info(si, offset); 1721 count = swap_count(si->swap_map[offse 1460 count = swap_count(si->swap_map[offset]); 1722 unlock_cluster_or_swap_info(si, ci); 1461 unlock_cluster_or_swap_info(si, ci); 1723 return count; 1462 return count; 1724 } 1463 } 1725 1464 1726 /* 1465 /* 1727 * How many references to @entry are currentl 1466 * How many references to @entry are currently swapped out? 1728 * This considers COUNT_CONTINUED so it retur 1467 * This considers COUNT_CONTINUED so it returns exact answer. 1729 */ 1468 */ 1730 int swp_swapcount(swp_entry_t entry) 1469 int swp_swapcount(swp_entry_t entry) 1731 { 1470 { 1732 int count, tmp_count, n; 1471 int count, tmp_count, n; 1733 struct swap_info_struct *si; !! 1472 struct swap_info_struct *p; 1734 struct swap_cluster_info *ci; 1473 struct swap_cluster_info *ci; 1735 struct page *page; 1474 struct page *page; 1736 pgoff_t offset; 1475 pgoff_t offset; 1737 unsigned char *map; 1476 unsigned char *map; 1738 1477 1739 si = _swap_info_get(entry); !! 1478 p = _swap_info_get(entry); 1740 if (!si) !! 1479 if (!p) 1741 return 0; 1480 return 0; 1742 1481 1743 offset = swp_offset(entry); 1482 offset = swp_offset(entry); 1744 1483 1745 ci = lock_cluster_or_swap_info(si, of !! 1484 ci = lock_cluster_or_swap_info(p, offset); 1746 1485 1747 count = swap_count(si->swap_map[offse !! 1486 count = swap_count(p->swap_map[offset]); 1748 if (!(count & COUNT_CONTINUED)) 1487 if (!(count & COUNT_CONTINUED)) 1749 goto out; 1488 goto out; 1750 1489 1751 count &= ~COUNT_CONTINUED; 1490 count &= ~COUNT_CONTINUED; 1752 n = SWAP_MAP_MAX + 1; 1491 n = SWAP_MAP_MAX + 1; 1753 1492 1754 page = vmalloc_to_page(si->swap_map + !! 1493 page = vmalloc_to_page(p->swap_map + offset); 1755 offset &= ~PAGE_MASK; 1494 offset &= ~PAGE_MASK; 1756 VM_BUG_ON(page_private(page) != SWP_C 1495 VM_BUG_ON(page_private(page) != SWP_CONTINUED); 1757 1496 1758 do { 1497 do { 1759 page = list_next_entry(page, 1498 page = list_next_entry(page, lru); 1760 map = kmap_local_page(page); !! 1499 map = kmap_atomic(page); 1761 tmp_count = map[offset]; 1500 tmp_count = map[offset]; 1762 kunmap_local(map); !! 1501 kunmap_atomic(map); 1763 1502 1764 count += (tmp_count & ~COUNT_ 1503 count += (tmp_count & ~COUNT_CONTINUED) * n; 1765 n *= (SWAP_CONT_MAX + 1); 1504 n *= (SWAP_CONT_MAX + 1); 1766 } while (tmp_count & COUNT_CONTINUED) 1505 } while (tmp_count & COUNT_CONTINUED); 1767 out: 1506 out: 1768 unlock_cluster_or_swap_info(si, ci); !! 1507 unlock_cluster_or_swap_info(p, ci); 1769 return count; 1508 return count; 1770 } 1509 } 1771 1510 1772 static bool swap_page_trans_huge_swapped(stru 1511 static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, 1773 swp_ !! 1512 swp_entry_t entry) 1774 { 1513 { 1775 struct swap_cluster_info *ci; 1514 struct swap_cluster_info *ci; 1776 unsigned char *map = si->swap_map; 1515 unsigned char *map = si->swap_map; 1777 unsigned int nr_pages = 1 << order; << 1778 unsigned long roffset = swp_offset(en 1516 unsigned long roffset = swp_offset(entry); 1779 unsigned long offset = round_down(rof !! 
1517 unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER); 1780 int i; 1518 int i; 1781 bool ret = false; 1519 bool ret = false; 1782 1520 1783 ci = lock_cluster_or_swap_info(si, of 1521 ci = lock_cluster_or_swap_info(si, offset); 1784 if (!ci || nr_pages == 1) { !! 1522 if (!ci || !cluster_is_huge(ci)) { 1785 if (swap_count(map[roffset])) 1523 if (swap_count(map[roffset])) 1786 ret = true; 1524 ret = true; 1787 goto unlock_out; 1525 goto unlock_out; 1788 } 1526 } 1789 for (i = 0; i < nr_pages; i++) { !! 1527 for (i = 0; i < SWAPFILE_CLUSTER; i++) { 1790 if (swap_count(map[offset + i 1528 if (swap_count(map[offset + i])) { 1791 ret = true; 1529 ret = true; 1792 break; 1530 break; 1793 } 1531 } 1794 } 1532 } 1795 unlock_out: 1533 unlock_out: 1796 unlock_cluster_or_swap_info(si, ci); 1534 unlock_cluster_or_swap_info(si, ci); 1797 return ret; 1535 return ret; 1798 } 1536 } 1799 1537 1800 static bool folio_swapped(struct folio *folio 1538 static bool folio_swapped(struct folio *folio) 1801 { 1539 { 1802 swp_entry_t entry = folio->swap; !! 1540 swp_entry_t entry = folio_swap_entry(folio); 1803 struct swap_info_struct *si = _swap_i 1541 struct swap_info_struct *si = _swap_info_get(entry); 1804 1542 1805 if (!si) 1543 if (!si) 1806 return false; 1544 return false; 1807 1545 1808 if (!IS_ENABLED(CONFIG_THP_SWAP) || l 1546 if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio))) 1809 return swap_swapcount(si, ent 1547 return swap_swapcount(si, entry) != 0; 1810 1548 1811 return swap_page_trans_huge_swapped(s !! 1549 return swap_page_trans_huge_swapped(si, entry); 1812 } 1550 } 1813 1551 1814 static bool folio_swapcache_freeable(struct f !! 1552 /** >> 1553 * folio_free_swap() - Free the swap space used for this folio. >> 1554 * @folio: The folio to remove. >> 1555 * >> 1556 * If swap is getting full, or if there are no more mappings of this folio, >> 1557 * then call folio_free_swap to free its swap space. >> 1558 * >> 1559 * Return: true if we were able to release the swap space. >> 1560 */ >> 1561 bool folio_free_swap(struct folio *folio) 1815 { 1562 { 1816 VM_BUG_ON_FOLIO(!folio_test_locked(fo 1563 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 1817 1564 1818 if (!folio_test_swapcache(folio)) 1565 if (!folio_test_swapcache(folio)) 1819 return false; 1566 return false; 1820 if (folio_test_writeback(folio)) 1567 if (folio_test_writeback(folio)) 1821 return false; 1568 return false; >> 1569 if (folio_swapped(folio)) >> 1570 return false; 1822 1571 1823 /* 1572 /* 1824 * Once hibernation has begun to crea 1573 * Once hibernation has begun to create its image of memory, 1825 * there's a danger that one of the c 1574 * there's a danger that one of the calls to folio_free_swap() 1826 * - most probably a call from __try_ 1575 * - most probably a call from __try_to_reclaim_swap() while 1827 * hibernation is allocating its own 1576 * hibernation is allocating its own swap pages for the image, 1828 * but conceivably even a call from m 1577 * but conceivably even a call from memory reclaim - will free 1829 * the swap from a folio which has al 1578 * the swap from a folio which has already been recorded in the 1830 * image as a clean swapcache folio, 1579 * image as a clean swapcache folio, and then reuse its swap for 1831 * another page of the image. On wak 1580 * another page of the image. 
On waking from hibernation, the 1832 * original folio might be freed unde 1581 * original folio might be freed under memory pressure, then 1833 * later read back in from swap, now 1582 * later read back in from swap, now with the wrong data. 1834 * 1583 * 1835 * Hibernation suspends storage while 1584 * Hibernation suspends storage while it is writing the image 1836 * to disk so check that here. 1585 * to disk so check that here. 1837 */ 1586 */ 1838 if (pm_suspended_storage()) 1587 if (pm_suspended_storage()) 1839 return false; 1588 return false; 1840 1589 1841 return true; << 1842 } << 1843 << 1844 /** << 1845 * folio_free_swap() - Free the swap space us << 1846 * @folio: The folio to remove. << 1847 * << 1848 * If swap is getting full, or if there are n << 1849 * then call folio_free_swap to free its swap << 1850 * << 1851 * Return: true if we were able to release th << 1852 */ << 1853 bool folio_free_swap(struct folio *folio) << 1854 { << 1855 if (!folio_swapcache_freeable(folio)) << 1856 return false; << 1857 if (folio_swapped(folio)) << 1858 return false; << 1859 << 1860 delete_from_swap_cache(folio); 1590 delete_from_swap_cache(folio); 1861 folio_set_dirty(folio); 1591 folio_set_dirty(folio); 1862 return true; 1592 return true; 1863 } 1593 } 1864 1594 1865 /** !! 1595 /* 1866 * free_swap_and_cache_nr() - Release referen !! 1596 * Free the swap entry like above, but also try to 1867 * reclaim their c !! 1597 * free the page cache entry if it is the last user. 1868 * @entry: First entry of range. << 1869 * @nr: Number of entries in range. << 1870 * << 1871 * For each swap entry in the contiguous rang << 1872 * entries become free, try to reclaim their << 1873 * offset range is defined by [entry.offset, << 1874 */ 1598 */ 1875 void free_swap_and_cache_nr(swp_entry_t entry !! 1599 int free_swap_and_cache(swp_entry_t entry) 1876 { 1600 { 1877 const unsigned long start_offset = sw !! 1601 struct swap_info_struct *p; 1878 const unsigned long end_offset = star !! 1602 unsigned char count; 1879 struct swap_info_struct *si; << 1880 bool any_only_cache = false; << 1881 unsigned long offset; << 1882 1603 1883 if (non_swap_entry(entry)) 1604 if (non_swap_entry(entry)) 1884 return; !! 1605 return 1; 1885 << 1886 si = get_swap_device(entry); << 1887 if (!si) << 1888 return; << 1889 << 1890 if (WARN_ON(end_offset > si->max)) << 1891 goto out; << 1892 << 1893 /* << 1894 * First free all entries in the rang << 1895 */ << 1896 any_only_cache = __swap_entries_free( << 1897 << 1898 /* << 1899 * Short-circuit the below loop if no << 1900 * reference drop to zero. << 1901 */ << 1902 if (!any_only_cache) << 1903 goto out; << 1904 1606 1905 /* !! 1607 p = _swap_info_get(entry); 1906 * Now go back over the range trying !! 1608 if (p) { 1907 * more efficient for large folios be !! 1609 count = __swap_entry_free(p, entry); 1908 * the swap once per folio in the com !! 1610 if (count == SWAP_HAS_CACHE && 1909 * __swap_entry_free() and __try_to_r !! 1611 !swap_page_trans_huge_swapped(p, entry)) 1910 * latter will get a reference and lo !! 1612 __try_to_reclaim_swap(p, swp_offset(entry), 1911 * page but will only succeed once th !! 1613 TTRS_UNMAPPED | TTRS_FULL); 1912 * zero. << 1913 */ << 1914 for (offset = start_offset; offset < << 1915 nr = 1; << 1916 if (READ_ONCE(si->swap_map[of << 1917 /* << 1918 * Folios are always << 1919 * advance forward to << 1920 * folio was found fo << 1921 * in this case. 
Nega << 1922 * but could not be r << 1923 * to the next bounda << 1924 */ << 1925 nr = __try_to_reclaim << 1926 << 1927 if (nr == 0) << 1928 nr = 1; << 1929 else if (nr < 0) << 1930 nr = -nr; << 1931 nr = ALIGN(offset + 1 << 1932 } << 1933 } 1614 } 1934 !! 1615 return p != NULL; 1935 out: << 1936 put_swap_device(si); << 1937 } 1616 } 1938 1617 1939 #ifdef CONFIG_HIBERNATION 1618 #ifdef CONFIG_HIBERNATION 1940 1619 1941 swp_entry_t get_swap_page_of_type(int type) 1620 swp_entry_t get_swap_page_of_type(int type) 1942 { 1621 { 1943 struct swap_info_struct *si = swap_ty 1622 struct swap_info_struct *si = swap_type_to_swap_info(type); 1944 swp_entry_t entry = {0}; 1623 swp_entry_t entry = {0}; 1945 1624 1946 if (!si) 1625 if (!si) 1947 goto fail; 1626 goto fail; 1948 1627 1949 /* This is called for allocating swap 1628 /* This is called for allocating swap entry, not cache */ 1950 spin_lock(&si->lock); 1629 spin_lock(&si->lock); 1951 if ((si->flags & SWP_WRITEOK) && scan !! 1630 if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry)) 1952 atomic_long_dec(&nr_swap_page 1631 atomic_long_dec(&nr_swap_pages); 1953 spin_unlock(&si->lock); 1632 spin_unlock(&si->lock); 1954 fail: 1633 fail: 1955 return entry; 1634 return entry; 1956 } 1635 } 1957 1636 1958 /* 1637 /* 1959 * Find the swap type that corresponds to giv 1638 * Find the swap type that corresponds to given device (if any). 1960 * 1639 * 1961 * @offset - number of the PAGE_SIZE-sized bl 1640 * @offset - number of the PAGE_SIZE-sized block of the device, starting 1962 * from 0, in which the swap header is expect 1641 * from 0, in which the swap header is expected to be located. 1963 * 1642 * 1964 * This is needed for the suspend to disk (ak 1643 * This is needed for the suspend to disk (aka swsusp). 
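 *
 * (Illustrative sketch, not from the original source: the hibernation
 * core resolves its resume device along the lines of
 *
 *	type = swap_type_of(swsusp_resume_device, swsusp_resume_block);
 *
 * treating a negative return as "no matching swap device"; the two
 * variable names come from kernel/power and are an assumption here.)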
1965 */ 1644 */ 1966 int swap_type_of(dev_t device, sector_t offse 1645 int swap_type_of(dev_t device, sector_t offset) 1967 { 1646 { 1968 int type; 1647 int type; 1969 1648 1970 if (!device) 1649 if (!device) 1971 return -1; 1650 return -1; 1972 1651 1973 spin_lock(&swap_lock); 1652 spin_lock(&swap_lock); 1974 for (type = 0; type < nr_swapfiles; t 1653 for (type = 0; type < nr_swapfiles; type++) { 1975 struct swap_info_struct *sis 1654 struct swap_info_struct *sis = swap_info[type]; 1976 1655 1977 if (!(sis->flags & SWP_WRITEO 1656 if (!(sis->flags & SWP_WRITEOK)) 1978 continue; 1657 continue; 1979 1658 1980 if (device == sis->bdev->bd_d 1659 if (device == sis->bdev->bd_dev) { 1981 struct swap_extent *s 1660 struct swap_extent *se = first_se(sis); 1982 1661 1983 if (se->start_block = 1662 if (se->start_block == offset) { 1984 spin_unlock(& 1663 spin_unlock(&swap_lock); 1985 return type; 1664 return type; 1986 } 1665 } 1987 } 1666 } 1988 } 1667 } 1989 spin_unlock(&swap_lock); 1668 spin_unlock(&swap_lock); 1990 return -ENODEV; 1669 return -ENODEV; 1991 } 1670 } 1992 1671 1993 int find_first_swap(dev_t *device) 1672 int find_first_swap(dev_t *device) 1994 { 1673 { 1995 int type; 1674 int type; 1996 1675 1997 spin_lock(&swap_lock); 1676 spin_lock(&swap_lock); 1998 for (type = 0; type < nr_swapfiles; t 1677 for (type = 0; type < nr_swapfiles; type++) { 1999 struct swap_info_struct *sis 1678 struct swap_info_struct *sis = swap_info[type]; 2000 1679 2001 if (!(sis->flags & SWP_WRITEO 1680 if (!(sis->flags & SWP_WRITEOK)) 2002 continue; 1681 continue; 2003 *device = sis->bdev->bd_dev; 1682 *device = sis->bdev->bd_dev; 2004 spin_unlock(&swap_lock); 1683 spin_unlock(&swap_lock); 2005 return type; 1684 return type; 2006 } 1685 } 2007 spin_unlock(&swap_lock); 1686 spin_unlock(&swap_lock); 2008 return -ENODEV; 1687 return -ENODEV; 2009 } 1688 } 2010 1689 2011 /* 1690 /* 2012 * Get the (PAGE_SIZE) block corresponding to 1691 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev 2013 * corresponding to given index in swap_info 1692 * corresponding to given index in swap_info (swap type). 
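 *
 * (Illustrative note, not from the original source: inside one extent
 * the mapping below is plain arithmetic,
 *
 *	block = se->start_block + (offset - se->start_page);
 *
 * so an extent with start_page 100 and start_block 5000 maps offset 103
 * to block 5003.)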
2014 */ 1693 */ 2015 sector_t swapdev_block(int type, pgoff_t offs 1694 sector_t swapdev_block(int type, pgoff_t offset) 2016 { 1695 { 2017 struct swap_info_struct *si = swap_ty 1696 struct swap_info_struct *si = swap_type_to_swap_info(type); 2018 struct swap_extent *se; 1697 struct swap_extent *se; 2019 1698 2020 if (!si || !(si->flags & SWP_WRITEOK) 1699 if (!si || !(si->flags & SWP_WRITEOK)) 2021 return 0; 1700 return 0; 2022 se = offset_to_swap_extent(si, offset 1701 se = offset_to_swap_extent(si, offset); 2023 return se->start_block + (offset - se 1702 return se->start_block + (offset - se->start_page); 2024 } 1703 } 2025 1704 2026 /* 1705 /* 2027 * Return either the total number of swap pag 1706 * Return either the total number of swap pages of given type, or the number 2028 * of free pages of that type (depending on @ 1707 * of free pages of that type (depending on @free) 2029 * 1708 * 2030 * This is needed for software suspend 1709 * This is needed for software suspend 2031 */ 1710 */ 2032 unsigned int count_swap_pages(int type, int f 1711 unsigned int count_swap_pages(int type, int free) 2033 { 1712 { 2034 unsigned int n = 0; 1713 unsigned int n = 0; 2035 1714 2036 spin_lock(&swap_lock); 1715 spin_lock(&swap_lock); 2037 if ((unsigned int)type < nr_swapfiles 1716 if ((unsigned int)type < nr_swapfiles) { 2038 struct swap_info_struct *sis 1717 struct swap_info_struct *sis = swap_info[type]; 2039 1718 2040 spin_lock(&sis->lock); 1719 spin_lock(&sis->lock); 2041 if (sis->flags & SWP_WRITEOK) 1720 if (sis->flags & SWP_WRITEOK) { 2042 n = sis->pages; 1721 n = sis->pages; 2043 if (free) 1722 if (free) 2044 n -= sis->inu 1723 n -= sis->inuse_pages; 2045 } 1724 } 2046 spin_unlock(&sis->lock); 1725 spin_unlock(&sis->lock); 2047 } 1726 } 2048 spin_unlock(&swap_lock); 1727 spin_unlock(&swap_lock); 2049 return n; 1728 return n; 2050 } 1729 } 2051 #endif /* CONFIG_HIBERNATION */ 1730 #endif /* CONFIG_HIBERNATION */ 2052 1731 2053 static inline int pte_same_as_swp(pte_t pte, 1732 static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte) 2054 { 1733 { 2055 return pte_same(pte_swp_clear_flags(p 1734 return pte_same(pte_swp_clear_flags(pte), swp_pte); 2056 } 1735 } 2057 1736 2058 /* 1737 /* 2059 * No need to decide whether this PTE shares 1738 * No need to decide whether this PTE shares the swap entry with others, 2060 * just let do_wp_page work it out if a write 1739 * just let do_wp_page work it out if a write is requested later - to 2061 * force COW, vm_page_prot omits write permis 1740 * force COW, vm_page_prot omits write permission from any private vma. 2062 */ 1741 */ 2063 static int unuse_pte(struct vm_area_struct *v 1742 static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, 2064 unsigned long addr, swp_entry 1743 unsigned long addr, swp_entry_t entry, struct folio *folio) 2065 { 1744 { 2066 struct page *page; !! 1745 struct page *page = folio_file_page(folio, swp_offset(entry)); 2067 struct folio *swapcache; !! 1746 struct page *swapcache; 2068 spinlock_t *ptl; 1747 spinlock_t *ptl; 2069 pte_t *pte, new_pte, old_pte; 1748 pte_t *pte, new_pte, old_pte; 2070 bool hwpoisoned = false; !! 1749 bool hwpoisoned = PageHWPoison(page); 2071 int ret = 1; 1750 int ret = 1; 2072 1751 2073 swapcache = folio; !! 1752 swapcache = page; 2074 folio = ksm_might_need_to_copy(folio, !! 1753 page = ksm_might_need_to_copy(page, vma, addr); 2075 if (unlikely(!folio)) !! 1754 if (unlikely(!page)) 2076 return -ENOMEM; 1755 return -ENOMEM; 2077 else if (unlikely(folio == ERR_PTR(-E !! 
1756 else if (unlikely(PTR_ERR(page) == -EHWPOISON)) 2078 hwpoisoned = true; << 2079 folio = swapcache; << 2080 } << 2081 << 2082 page = folio_file_page(folio, swp_off << 2083 if (PageHWPoison(page)) << 2084 hwpoisoned = true; 1757 hwpoisoned = true; 2085 1758 2086 pte = pte_offset_map_lock(vma->vm_mm, 1759 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 2087 if (unlikely(!pte || !pte_same_as_swp 1760 if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte), 2088 1761 swp_entry_to_pte(entry)))) { 2089 ret = 0; 1762 ret = 0; 2090 goto out; 1763 goto out; 2091 } 1764 } 2092 1765 2093 old_pte = ptep_get(pte); 1766 old_pte = ptep_get(pte); 2094 1767 2095 if (unlikely(hwpoisoned || !folio_tes !! 1768 if (unlikely(hwpoisoned || !PageUptodate(page))) { 2096 swp_entry_t swp_entry; 1769 swp_entry_t swp_entry; 2097 1770 2098 dec_mm_counter(vma->vm_mm, MM 1771 dec_mm_counter(vma->vm_mm, MM_SWAPENTS); 2099 if (hwpoisoned) { 1772 if (hwpoisoned) { 2100 swp_entry = make_hwpo !! 1773 swp_entry = make_hwpoison_entry(swapcache); >> 1774 page = swapcache; 2101 } else { 1775 } else { 2102 swp_entry = make_pois !! 1776 swp_entry = make_swapin_error_entry(); 2103 } 1777 } 2104 new_pte = swp_entry_to_pte(sw 1778 new_pte = swp_entry_to_pte(swp_entry); 2105 ret = 0; 1779 ret = 0; 2106 goto setpte; 1780 goto setpte; 2107 } 1781 } 2108 1782 2109 /* !! 1783 /* See do_swap_page() */ 2110 * Some architectures may have to res !! 1784 BUG_ON(!PageAnon(page) && PageMappedToDisk(page)); 2111 * when reading from swap. This metad !! 1785 BUG_ON(PageAnon(page) && PageAnonExclusive(page)); 2112 * so this must be called before swap << 2113 */ << 2114 arch_swap_restore(folio_swap(entry, f << 2115 1786 2116 dec_mm_counter(vma->vm_mm, MM_SWAPENT 1787 dec_mm_counter(vma->vm_mm, MM_SWAPENTS); 2117 inc_mm_counter(vma->vm_mm, MM_ANONPAG 1788 inc_mm_counter(vma->vm_mm, MM_ANONPAGES); 2118 folio_get(folio); !! 1789 get_page(page); 2119 if (folio == swapcache) { !! 1790 if (page == swapcache) { 2120 rmap_t rmap_flags = RMAP_NONE 1791 rmap_t rmap_flags = RMAP_NONE; 2121 1792 2122 /* 1793 /* 2123 * See do_swap_page(): writeb !! 1794 * See do_swap_page(): PageWriteback() would be problematic. 2124 * However, we do a folio_wai !! 1795 * However, we do a wait_on_page_writeback() just before this 2125 * call and have the folio lo !! 1796 * call and have the page locked. 2126 */ 1797 */ 2127 VM_BUG_ON_FOLIO(folio_test_wr !! 1798 VM_BUG_ON_PAGE(PageWriteback(page), page); 2128 if (pte_swp_exclusive(old_pte 1799 if (pte_swp_exclusive(old_pte)) 2129 rmap_flags |= RMAP_EX 1800 rmap_flags |= RMAP_EXCLUSIVE; 2130 /* !! 1801 2131 * We currently only expect s !! 1802 page_add_anon_rmap(page, vma, addr, rmap_flags); 2132 * fully exclusive or fully s << 2133 * here, we have to be carefu << 2134 */ << 2135 if (!folio_test_anon(folio)) << 2136 VM_WARN_ON_ONCE(folio << 2137 VM_WARN_ON_FOLIO(!fol << 2138 folio_add_new_anon_rm << 2139 } else { << 2140 folio_add_anon_rmap_p << 2141 } << 2142 } else { /* ksm created a completely 1803 } else { /* ksm created a completely new copy */ 2143 folio_add_new_anon_rmap(folio !! 1804 page_add_new_anon_rmap(page, vma, addr); 2144 folio_add_lru_vma(folio, vma) !! 
1805 lru_cache_add_inactive_or_unevictable(page, vma); 2145 } 1806 } 2146 new_pte = pte_mkold(mk_pte(page, vma- 1807 new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot)); 2147 if (pte_swp_soft_dirty(old_pte)) 1808 if (pte_swp_soft_dirty(old_pte)) 2148 new_pte = pte_mksoft_dirty(ne 1809 new_pte = pte_mksoft_dirty(new_pte); 2149 if (pte_swp_uffd_wp(old_pte)) 1810 if (pte_swp_uffd_wp(old_pte)) 2150 new_pte = pte_mkuffd_wp(new_p 1811 new_pte = pte_mkuffd_wp(new_pte); 2151 setpte: 1812 setpte: 2152 set_pte_at(vma->vm_mm, addr, pte, new 1813 set_pte_at(vma->vm_mm, addr, pte, new_pte); 2153 swap_free(entry); 1814 swap_free(entry); 2154 out: 1815 out: 2155 if (pte) 1816 if (pte) 2156 pte_unmap_unlock(pte, ptl); 1817 pte_unmap_unlock(pte, ptl); 2157 if (folio != swapcache) { !! 1818 if (page != swapcache) { 2158 folio_unlock(folio); !! 1819 unlock_page(page); 2159 folio_put(folio); !! 1820 put_page(page); 2160 } 1821 } 2161 return ret; 1822 return ret; 2162 } 1823 } 2163 1824 2164 static int unuse_pte_range(struct vm_area_str 1825 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 2165 unsigned long addr, u 1826 unsigned long addr, unsigned long end, 2166 unsigned int type) 1827 unsigned int type) 2167 { 1828 { 2168 pte_t *pte = NULL; 1829 pte_t *pte = NULL; 2169 struct swap_info_struct *si; 1830 struct swap_info_struct *si; 2170 1831 2171 si = swap_info[type]; 1832 si = swap_info[type]; 2172 do { 1833 do { 2173 struct folio *folio; 1834 struct folio *folio; 2174 unsigned long offset; 1835 unsigned long offset; 2175 unsigned char swp_count; 1836 unsigned char swp_count; 2176 swp_entry_t entry; 1837 swp_entry_t entry; 2177 int ret; 1838 int ret; 2178 pte_t ptent; 1839 pte_t ptent; 2179 1840 2180 if (!pte++) { 1841 if (!pte++) { 2181 pte = pte_offset_map( 1842 pte = pte_offset_map(pmd, addr); 2182 if (!pte) 1843 if (!pte) 2183 break; 1844 break; 2184 } 1845 } 2185 1846 2186 ptent = ptep_get_lockless(pte 1847 ptent = ptep_get_lockless(pte); 2187 1848 2188 if (!is_swap_pte(ptent)) 1849 if (!is_swap_pte(ptent)) 2189 continue; 1850 continue; 2190 1851 2191 entry = pte_to_swp_entry(pten 1852 entry = pte_to_swp_entry(ptent); 2192 if (swp_type(entry) != type) 1853 if (swp_type(entry) != type) 2193 continue; 1854 continue; 2194 1855 2195 offset = swp_offset(entry); 1856 offset = swp_offset(entry); 2196 pte_unmap(pte); 1857 pte_unmap(pte); 2197 pte = NULL; 1858 pte = NULL; 2198 1859 2199 folio = swap_cache_get_folio( 1860 folio = swap_cache_get_folio(entry, vma, addr); 2200 if (!folio) { 1861 if (!folio) { >> 1862 struct page *page; 2201 struct vm_fault vmf = 1863 struct vm_fault vmf = { 2202 .vma = vma, 1864 .vma = vma, 2203 .address = ad 1865 .address = addr, 2204 .real_address 1866 .real_address = addr, 2205 .pmd = pmd, 1867 .pmd = pmd, 2206 }; 1868 }; 2207 1869 2208 folio = swapin_readah !! 
1870 page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, 2209 1871 &vmf); >> 1872 if (page) >> 1873 folio = page_folio(page); 2210 } 1874 } 2211 if (!folio) { 1875 if (!folio) { 2212 swp_count = READ_ONCE 1876 swp_count = READ_ONCE(si->swap_map[offset]); 2213 if (swp_count == 0 || 1877 if (swp_count == 0 || swp_count == SWAP_MAP_BAD) 2214 continue; 1878 continue; 2215 return -ENOMEM; 1879 return -ENOMEM; 2216 } 1880 } 2217 1881 2218 folio_lock(folio); 1882 folio_lock(folio); 2219 folio_wait_writeback(folio); 1883 folio_wait_writeback(folio); 2220 ret = unuse_pte(vma, pmd, add 1884 ret = unuse_pte(vma, pmd, addr, entry, folio); 2221 if (ret < 0) { 1885 if (ret < 0) { 2222 folio_unlock(folio); 1886 folio_unlock(folio); 2223 folio_put(folio); 1887 folio_put(folio); 2224 return ret; 1888 return ret; 2225 } 1889 } 2226 1890 2227 folio_free_swap(folio); 1891 folio_free_swap(folio); 2228 folio_unlock(folio); 1892 folio_unlock(folio); 2229 folio_put(folio); 1893 folio_put(folio); 2230 } while (addr += PAGE_SIZE, addr != e 1894 } while (addr += PAGE_SIZE, addr != end); 2231 1895 2232 if (pte) 1896 if (pte) 2233 pte_unmap(pte); 1897 pte_unmap(pte); 2234 return 0; 1898 return 0; 2235 } 1899 } 2236 1900 2237 static inline int unuse_pmd_range(struct vm_a 1901 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, 2238 unsigned long 1902 unsigned long addr, unsigned long end, 2239 unsigned int 1903 unsigned int type) 2240 { 1904 { 2241 pmd_t *pmd; 1905 pmd_t *pmd; 2242 unsigned long next; 1906 unsigned long next; 2243 int ret; 1907 int ret; 2244 1908 2245 pmd = pmd_offset(pud, addr); 1909 pmd = pmd_offset(pud, addr); 2246 do { 1910 do { 2247 cond_resched(); 1911 cond_resched(); 2248 next = pmd_addr_end(addr, end 1912 next = pmd_addr_end(addr, end); 2249 ret = unuse_pte_range(vma, pm 1913 ret = unuse_pte_range(vma, pmd, addr, next, type); 2250 if (ret) 1914 if (ret) 2251 return ret; 1915 return ret; 2252 } while (pmd++, addr = next, addr != 1916 } while (pmd++, addr = next, addr != end); 2253 return 0; 1917 return 0; 2254 } 1918 } 2255 1919 2256 static inline int unuse_pud_range(struct vm_a 1920 static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, 2257 unsigned long 1921 unsigned long addr, unsigned long end, 2258 unsigned int 1922 unsigned int type) 2259 { 1923 { 2260 pud_t *pud; 1924 pud_t *pud; 2261 unsigned long next; 1925 unsigned long next; 2262 int ret; 1926 int ret; 2263 1927 2264 pud = pud_offset(p4d, addr); 1928 pud = pud_offset(p4d, addr); 2265 do { 1929 do { 2266 next = pud_addr_end(addr, end 1930 next = pud_addr_end(addr, end); 2267 if (pud_none_or_clear_bad(pud 1931 if (pud_none_or_clear_bad(pud)) 2268 continue; 1932 continue; 2269 ret = unuse_pmd_range(vma, pu 1933 ret = unuse_pmd_range(vma, pud, addr, next, type); 2270 if (ret) 1934 if (ret) 2271 return ret; 1935 return ret; 2272 } while (pud++, addr = next, addr != 1936 } while (pud++, addr = next, addr != end); 2273 return 0; 1937 return 0; 2274 } 1938 } 2275 1939 2276 static inline int unuse_p4d_range(struct vm_a 1940 static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, 2277 unsigned long 1941 unsigned long addr, unsigned long end, 2278 unsigned int 1942 unsigned int type) 2279 { 1943 { 2280 p4d_t *p4d; 1944 p4d_t *p4d; 2281 unsigned long next; 1945 unsigned long next; 2282 int ret; 1946 int ret; 2283 1947 2284 p4d = p4d_offset(pgd, addr); 1948 p4d = p4d_offset(pgd, addr); 2285 do { 1949 do { 2286 next = p4d_addr_end(addr, end 1950 next = p4d_addr_end(addr, end); 2287 if 
(p4d_none_or_clear_bad(p4d 1951 if (p4d_none_or_clear_bad(p4d)) 2288 continue; 1952 continue; 2289 ret = unuse_pud_range(vma, p4 1953 ret = unuse_pud_range(vma, p4d, addr, next, type); 2290 if (ret) 1954 if (ret) 2291 return ret; 1955 return ret; 2292 } while (p4d++, addr = next, addr != 1956 } while (p4d++, addr = next, addr != end); 2293 return 0; 1957 return 0; 2294 } 1958 } 2295 1959 2296 static int unuse_vma(struct vm_area_struct *v 1960 static int unuse_vma(struct vm_area_struct *vma, unsigned int type) 2297 { 1961 { 2298 pgd_t *pgd; 1962 pgd_t *pgd; 2299 unsigned long addr, end, next; 1963 unsigned long addr, end, next; 2300 int ret; 1964 int ret; 2301 1965 2302 addr = vma->vm_start; 1966 addr = vma->vm_start; 2303 end = vma->vm_end; 1967 end = vma->vm_end; 2304 1968 2305 pgd = pgd_offset(vma->vm_mm, addr); 1969 pgd = pgd_offset(vma->vm_mm, addr); 2306 do { 1970 do { 2307 next = pgd_addr_end(addr, end 1971 next = pgd_addr_end(addr, end); 2308 if (pgd_none_or_clear_bad(pgd 1972 if (pgd_none_or_clear_bad(pgd)) 2309 continue; 1973 continue; 2310 ret = unuse_p4d_range(vma, pg 1974 ret = unuse_p4d_range(vma, pgd, addr, next, type); 2311 if (ret) 1975 if (ret) 2312 return ret; 1976 return ret; 2313 } while (pgd++, addr = next, addr != 1977 } while (pgd++, addr = next, addr != end); 2314 return 0; 1978 return 0; 2315 } 1979 } 2316 1980 2317 static int unuse_mm(struct mm_struct *mm, uns 1981 static int unuse_mm(struct mm_struct *mm, unsigned int type) 2318 { 1982 { 2319 struct vm_area_struct *vma; 1983 struct vm_area_struct *vma; 2320 int ret = 0; 1984 int ret = 0; 2321 VMA_ITERATOR(vmi, mm, 0); 1985 VMA_ITERATOR(vmi, mm, 0); 2322 1986 2323 mmap_read_lock(mm); 1987 mmap_read_lock(mm); 2324 for_each_vma(vmi, vma) { 1988 for_each_vma(vmi, vma) { 2325 if (vma->anon_vma && !is_vm_h !! 1989 if (vma->anon_vma) { 2326 ret = unuse_vma(vma, 1990 ret = unuse_vma(vma, type); 2327 if (ret) 1991 if (ret) 2328 break; 1992 break; 2329 } 1993 } 2330 1994 2331 cond_resched(); 1995 cond_resched(); 2332 } 1996 } 2333 mmap_read_unlock(mm); 1997 mmap_read_unlock(mm); 2334 return ret; 1998 return ret; 2335 } 1999 } 2336 2000 2337 /* 2001 /* 2338 * Scan swap_map from current position to nex 2002 * Scan swap_map from current position to next entry still in use. 2339 * Return 0 if there are no inuse entries aft 2003 * Return 0 if there are no inuse entries after prev till end of 2340 * the map. 2004 * the map. 2341 */ 2005 */ 2342 static unsigned int find_next_to_unuse(struct 2006 static unsigned int find_next_to_unuse(struct swap_info_struct *si, 2343 unsig 2007 unsigned int prev) 2344 { 2008 { 2345 unsigned int i; 2009 unsigned int i; 2346 unsigned char count; 2010 unsigned char count; 2347 2011 2348 /* 2012 /* 2349 * No need for swap_lock here: we're 2013 * No need for swap_lock here: we're just looking 2350 * for whether an entry is in use, no 2014 * for whether an entry is in use, not modifying it; false 2351 * hits are okay, and sys_swapoff() h 2015 * hits are okay, and sys_swapoff() has already prevented new 2352 * allocations from this area (while 2016 * allocations from this area (while holding swap_lock). 
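 *
 * (Illustrative note, not from the original source: try_to_unuse()
 * below simply loops with i = find_next_to_unuse(si, i) and relies on
 * the wrap-around return of 0 as its termination condition.)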
2353 */ 2017 */ 2354 for (i = prev + 1; i < si->max; i++) 2018 for (i = prev + 1; i < si->max; i++) { 2355 count = READ_ONCE(si->swap_ma 2019 count = READ_ONCE(si->swap_map[i]); 2356 if (count && swap_count(count 2020 if (count && swap_count(count) != SWAP_MAP_BAD) 2357 break; 2021 break; 2358 if ((i % LATENCY_LIMIT) == 0) 2022 if ((i % LATENCY_LIMIT) == 0) 2359 cond_resched(); 2023 cond_resched(); 2360 } 2024 } 2361 2025 2362 if (i == si->max) 2026 if (i == si->max) 2363 i = 0; 2027 i = 0; 2364 2028 2365 return i; 2029 return i; 2366 } 2030 } 2367 2031 2368 static int try_to_unuse(unsigned int type) 2032 static int try_to_unuse(unsigned int type) 2369 { 2033 { 2370 struct mm_struct *prev_mm; 2034 struct mm_struct *prev_mm; 2371 struct mm_struct *mm; 2035 struct mm_struct *mm; 2372 struct list_head *p; 2036 struct list_head *p; 2373 int retval = 0; 2037 int retval = 0; 2374 struct swap_info_struct *si = swap_in 2038 struct swap_info_struct *si = swap_info[type]; 2375 struct folio *folio; 2039 struct folio *folio; 2376 swp_entry_t entry; 2040 swp_entry_t entry; 2377 unsigned int i; 2041 unsigned int i; 2378 2042 2379 if (!READ_ONCE(si->inuse_pages)) 2043 if (!READ_ONCE(si->inuse_pages)) 2380 goto success; !! 2044 return 0; 2381 2045 2382 retry: 2046 retry: 2383 retval = shmem_unuse(type); 2047 retval = shmem_unuse(type); 2384 if (retval) 2048 if (retval) 2385 return retval; 2049 return retval; 2386 2050 2387 prev_mm = &init_mm; 2051 prev_mm = &init_mm; 2388 mmget(prev_mm); 2052 mmget(prev_mm); 2389 2053 2390 spin_lock(&mmlist_lock); 2054 spin_lock(&mmlist_lock); 2391 p = &init_mm.mmlist; 2055 p = &init_mm.mmlist; 2392 while (READ_ONCE(si->inuse_pages) && 2056 while (READ_ONCE(si->inuse_pages) && 2393 !signal_pending(current) && 2057 !signal_pending(current) && 2394 (p = p->next) != &init_mm.mmli 2058 (p = p->next) != &init_mm.mmlist) { 2395 2059 2396 mm = list_entry(p, struct mm_ 2060 mm = list_entry(p, struct mm_struct, mmlist); 2397 if (!mmget_not_zero(mm)) 2061 if (!mmget_not_zero(mm)) 2398 continue; 2062 continue; 2399 spin_unlock(&mmlist_lock); 2063 spin_unlock(&mmlist_lock); 2400 mmput(prev_mm); 2064 mmput(prev_mm); 2401 prev_mm = mm; 2065 prev_mm = mm; 2402 retval = unuse_mm(mm, type); 2066 retval = unuse_mm(mm, type); 2403 if (retval) { 2067 if (retval) { 2404 mmput(prev_mm); 2068 mmput(prev_mm); 2405 return retval; 2069 return retval; 2406 } 2070 } 2407 2071 2408 /* 2072 /* 2409 * Make sure that we aren't c 2073 * Make sure that we aren't completely killing 2410 * interactive performance. 2074 * interactive performance. 2411 */ 2075 */ 2412 cond_resched(); 2076 cond_resched(); 2413 spin_lock(&mmlist_lock); 2077 spin_lock(&mmlist_lock); 2414 } 2078 } 2415 spin_unlock(&mmlist_lock); 2079 spin_unlock(&mmlist_lock); 2416 2080 2417 mmput(prev_mm); 2081 mmput(prev_mm); 2418 2082 2419 i = 0; 2083 i = 0; 2420 while (READ_ONCE(si->inuse_pages) && 2084 while (READ_ONCE(si->inuse_pages) && 2421 !signal_pending(current) && 2085 !signal_pending(current) && 2422 (i = find_next_to_unuse(si, i) 2086 (i = find_next_to_unuse(si, i)) != 0) { 2423 2087 2424 entry = swp_entry(type, i); 2088 entry = swp_entry(type, i); 2425 folio = filemap_get_folio(swa !! 
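		/*
		 * Second pass over the swap map: the mmlist walk above has
		 * already faulted the entries it found back into the page
		 * tables, so anything still marked in use here is typically
		 * held only by the swap cache (or by a racing task, which the
		 * retry logic further down copes with).  Look each remaining
		 * entry up and drop its swap-cache folio.
		 */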
2089 folio = filemap_get_folio(swap_address_space(entry), i); 2426 if (IS_ERR(folio)) 2090 if (IS_ERR(folio)) 2427 continue; 2091 continue; 2428 2092 2429 /* 2093 /* 2430 * It is conceivable that a r 2094 * It is conceivable that a racing task removed this folio from 2431 * swap cache just before we 2095 * swap cache just before we acquired the page lock. The folio 2432 * might even be back in swap 2096 * might even be back in swap cache on another swap area. But 2433 * that is okay, folio_free_s 2097 * that is okay, folio_free_swap() only removes stale folios. 2434 */ 2098 */ 2435 folio_lock(folio); 2099 folio_lock(folio); 2436 folio_wait_writeback(folio); 2100 folio_wait_writeback(folio); 2437 folio_free_swap(folio); 2101 folio_free_swap(folio); 2438 folio_unlock(folio); 2102 folio_unlock(folio); 2439 folio_put(folio); 2103 folio_put(folio); 2440 } 2104 } 2441 2105 2442 /* 2106 /* 2443 * Lets check again to see if there a 2107 * Lets check again to see if there are still swap entries in the map. 2444 * If yes, we would need to do retry 2108 * If yes, we would need to do retry the unuse logic again. 2445 * Under global memory pressure, swap 2109 * Under global memory pressure, swap entries can be reinserted back 2446 * into process space after the mmlis 2110 * into process space after the mmlist loop above passes over them. 2447 * 2111 * 2448 * Limit the number of retries? No: w 2112 * Limit the number of retries? No: when mmget_not_zero() 2449 * above fails, that mm is likely to 2113 * above fails, that mm is likely to be freeing swap from 2450 * exit_mmap(), which proceeds at its 2114 * exit_mmap(), which proceeds at its own independent pace; 2451 * and even shmem_writepage() could h 2115 * and even shmem_writepage() could have been preempted after 2452 * folio_alloc_swap(), temporarily hi 2116 * folio_alloc_swap(), temporarily hiding that swap. It's easy 2453 * and robust (though cpu-intensive) 2117 * and robust (though cpu-intensive) just to keep retrying. 2454 */ 2118 */ 2455 if (READ_ONCE(si->inuse_pages)) { 2119 if (READ_ONCE(si->inuse_pages)) { 2456 if (!signal_pending(current)) 2120 if (!signal_pending(current)) 2457 goto retry; 2121 goto retry; 2458 return -EINTR; 2122 return -EINTR; 2459 } 2123 } 2460 2124 2461 success: << 2462 /* << 2463 * Make sure that further cleanups af << 2464 * after swap_range_free() reduces si << 2465 */ << 2466 smp_mb(); << 2467 return 0; 2125 return 0; 2468 } 2126 } 2469 2127 2470 /* 2128 /* 2471 * After a successful try_to_unuse, if no swa 2129 * After a successful try_to_unuse, if no swap is now in use, we know 2472 * we can empty the mmlist. swap_lock must b 2130 * we can empty the mmlist. swap_lock must be held on entry and exit. 2473 * Note that mmlist_lock nests inside swap_lo 2131 * Note that mmlist_lock nests inside swap_lock, and an mm must be 2474 * added to the mmlist just after page_duplic 2132 * added to the mmlist just after page_duplicate - before would be racy. 2475 */ 2133 */ 2476 static void drain_mmlist(void) 2134 static void drain_mmlist(void) 2477 { 2135 { 2478 struct list_head *p, *next; 2136 struct list_head *p, *next; 2479 unsigned int type; 2137 unsigned int type; 2480 2138 2481 for (type = 0; type < nr_swapfiles; t 2139 for (type = 0; type < nr_swapfiles; type++) 2482 if (swap_info[type]->inuse_pa 2140 if (swap_info[type]->inuse_pages) 2483 return; 2141 return; 2484 spin_lock(&mmlist_lock); 2142 spin_lock(&mmlist_lock); 2485 list_for_each_safe(p, next, &init_mm. 
2143 list_for_each_safe(p, next, &init_mm.mmlist) 2486 list_del_init(p); 2144 list_del_init(p); 2487 spin_unlock(&mmlist_lock); 2145 spin_unlock(&mmlist_lock); 2488 } 2146 } 2489 2147 2490 /* 2148 /* 2491 * Free all of a swapdev's extent information 2149 * Free all of a swapdev's extent information 2492 */ 2150 */ 2493 static void destroy_swap_extents(struct swap_ 2151 static void destroy_swap_extents(struct swap_info_struct *sis) 2494 { 2152 { 2495 while (!RB_EMPTY_ROOT(&sis->swap_exte 2153 while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) { 2496 struct rb_node *rb = sis->swa 2154 struct rb_node *rb = sis->swap_extent_root.rb_node; 2497 struct swap_extent *se = rb_e 2155 struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node); 2498 2156 2499 rb_erase(rb, &sis->swap_exten 2157 rb_erase(rb, &sis->swap_extent_root); 2500 kfree(se); 2158 kfree(se); 2501 } 2159 } 2502 2160 2503 if (sis->flags & SWP_ACTIVATED) { 2161 if (sis->flags & SWP_ACTIVATED) { 2504 struct file *swap_file = sis- 2162 struct file *swap_file = sis->swap_file; 2505 struct address_space *mapping 2163 struct address_space *mapping = swap_file->f_mapping; 2506 2164 2507 sis->flags &= ~SWP_ACTIVATED; 2165 sis->flags &= ~SWP_ACTIVATED; 2508 if (mapping->a_ops->swap_deac 2166 if (mapping->a_ops->swap_deactivate) 2509 mapping->a_ops->swap_ 2167 mapping->a_ops->swap_deactivate(swap_file); 2510 } 2168 } 2511 } 2169 } 2512 2170 2513 /* 2171 /* 2514 * Add a block range (and the corresponding p 2172 * Add a block range (and the corresponding page range) into this swapdev's 2515 * extent tree. 2173 * extent tree. 2516 * 2174 * 2517 * This function rather assumes that it is ca 2175 * This function rather assumes that it is called in ascending page order. 2518 */ 2176 */ 2519 int 2177 int 2520 add_swap_extent(struct swap_info_struct *sis, 2178 add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, 2521 unsigned long nr_pages, secto 2179 unsigned long nr_pages, sector_t start_block) 2522 { 2180 { 2523 struct rb_node **link = &sis->swap_ex 2181 struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL; 2524 struct swap_extent *se; 2182 struct swap_extent *se; 2525 struct swap_extent *new_se; 2183 struct swap_extent *new_se; 2526 2184 2527 /* 2185 /* 2528 * place the new node at the right mo 2186 * place the new node at the right most since the 2529 * function is called in ascending pa 2187 * function is called in ascending page order. 2530 */ 2188 */ 2531 while (*link) { 2189 while (*link) { 2532 parent = *link; 2190 parent = *link; 2533 link = &parent->rb_right; 2191 link = &parent->rb_right; 2534 } 2192 } 2535 2193 2536 if (parent) { 2194 if (parent) { 2537 se = rb_entry(parent, struct 2195 se = rb_entry(parent, struct swap_extent, rb_node); 2538 BUG_ON(se->start_page + se->n 2196 BUG_ON(se->start_page + se->nr_pages != start_page); 2539 if (se->start_block + se->nr_ 2197 if (se->start_block + se->nr_pages == start_block) { 2540 /* Merge it */ 2198 /* Merge it */ 2541 se->nr_pages += nr_pa 2199 se->nr_pages += nr_pages; 2542 return 0; 2200 return 0; 2543 } 2201 } 2544 } 2202 } 2545 2203 2546 /* No merge, insert a new extent. */ 2204 /* No merge, insert a new extent. 
*/ 2547 new_se = kmalloc(sizeof(*se), GFP_KER 2205 new_se = kmalloc(sizeof(*se), GFP_KERNEL); 2548 if (new_se == NULL) 2206 if (new_se == NULL) 2549 return -ENOMEM; 2207 return -ENOMEM; 2550 new_se->start_page = start_page; 2208 new_se->start_page = start_page; 2551 new_se->nr_pages = nr_pages; 2209 new_se->nr_pages = nr_pages; 2552 new_se->start_block = start_block; 2210 new_se->start_block = start_block; 2553 2211 2554 rb_link_node(&new_se->rb_node, parent 2212 rb_link_node(&new_se->rb_node, parent, link); 2555 rb_insert_color(&new_se->rb_node, &si 2213 rb_insert_color(&new_se->rb_node, &sis->swap_extent_root); 2556 return 1; 2214 return 1; 2557 } 2215 } 2558 EXPORT_SYMBOL_GPL(add_swap_extent); 2216 EXPORT_SYMBOL_GPL(add_swap_extent); 2559 2217 2560 /* 2218 /* 2561 * A `swap extent' is a simple thing which ma 2219 * A `swap extent' is a simple thing which maps a contiguous range of pages 2562 * onto a contiguous range of disk blocks. A 2220 * onto a contiguous range of disk blocks. A rbtree of swap extents is 2563 * built at swapon time and is then used at s !! 2221 * built at swapon time and is then used at swap_writepage/swap_readpage 2564 * time for locating where on disk a page bel 2222 * time for locating where on disk a page belongs. 2565 * 2223 * 2566 * If the swapfile is an S_ISBLK block device 2224 * If the swapfile is an S_ISBLK block device, a single extent is installed. 2567 * This is done so that the main operating co 2225 * This is done so that the main operating code can treat S_ISBLK and S_ISREG 2568 * swap files identically. 2226 * swap files identically. 2569 * 2227 * 2570 * Whether the swapdev is an S_ISREG file or 2228 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap 2571 * extent rbtree operates in PAGE_SIZE disk b 2229 * extent rbtree operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK 2572 * swapfiles are handled *identically* after 2230 * swapfiles are handled *identically* after swapon time. 2573 * 2231 * 2574 * For S_ISREG swapfiles, setup_swap_extents( 2232 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks 2575 * and will parse them into a rbtree, in PAGE 2233 * and will parse them into a rbtree, in PAGE_SIZE chunks. If some stray 2576 * blocks are found which do not fall within 2234 * blocks are found which do not fall within the PAGE_SIZE alignment 2577 * requirements, they are simply tossed out - 2235 * requirements, they are simply tossed out - we will never use those blocks 2578 * for swapping. 2236 * for swapping. 2579 * 2237 * 2580 * For all swap devices we set S_SWAPFILE acr 2238 * For all swap devices we set S_SWAPFILE across the life of the swapon. This 2581 * prevents users from writing to the swap de 2239 * prevents users from writing to the swap device, which will corrupt memory. 2582 * 2240 * 2583 * The amount of disk space which a single sw 2241 * The amount of disk space which a single swap extent represents varies. 2584 * Typically it is in the 1-4 megabyte range. 2242 * Typically it is in the 1-4 megabyte range. So we can have hundreds of 2585 * extents in the rbtree. - akpm. 2243 * extents in the rbtree. - akpm. 
2586 */ 2244 */ 2587 static int setup_swap_extents(struct swap_inf 2245 static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) 2588 { 2246 { 2589 struct file *swap_file = sis->swap_fi 2247 struct file *swap_file = sis->swap_file; 2590 struct address_space *mapping = swap_ 2248 struct address_space *mapping = swap_file->f_mapping; 2591 struct inode *inode = mapping->host; 2249 struct inode *inode = mapping->host; 2592 int ret; 2250 int ret; 2593 2251 2594 if (S_ISBLK(inode->i_mode)) { 2252 if (S_ISBLK(inode->i_mode)) { 2595 ret = add_swap_extent(sis, 0, 2253 ret = add_swap_extent(sis, 0, sis->max, 0); 2596 *span = sis->pages; 2254 *span = sis->pages; 2597 return ret; 2255 return ret; 2598 } 2256 } 2599 2257 2600 if (mapping->a_ops->swap_activate) { 2258 if (mapping->a_ops->swap_activate) { 2601 ret = mapping->a_ops->swap_ac 2259 ret = mapping->a_ops->swap_activate(sis, swap_file, span); 2602 if (ret < 0) 2260 if (ret < 0) 2603 return ret; 2261 return ret; 2604 sis->flags |= SWP_ACTIVATED; 2262 sis->flags |= SWP_ACTIVATED; 2605 if ((sis->flags & SWP_FS_OPS) 2263 if ((sis->flags & SWP_FS_OPS) && 2606 sio_pool_init() != 0) { 2264 sio_pool_init() != 0) { 2607 destroy_swap_extents( 2265 destroy_swap_extents(sis); 2608 return -ENOMEM; 2266 return -ENOMEM; 2609 } 2267 } 2610 return ret; 2268 return ret; 2611 } 2269 } 2612 2270 2613 return generic_swapfile_activate(sis, 2271 return generic_swapfile_activate(sis, swap_file, span); 2614 } 2272 } 2615 2273 2616 static int swap_node(struct swap_info_struct !! 2274 static int swap_node(struct swap_info_struct *p) 2617 { 2275 { 2618 struct block_device *bdev; 2276 struct block_device *bdev; 2619 2277 2620 if (si->bdev) !! 2278 if (p->bdev) 2621 bdev = si->bdev; !! 2279 bdev = p->bdev; 2622 else 2280 else 2623 bdev = si->swap_file->f_inode !! 2281 bdev = p->swap_file->f_inode->i_sb->s_bdev; 2624 2282 2625 return bdev ? bdev->bd_disk->node_id 2283 return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; 2626 } 2284 } 2627 2285 2628 static void setup_swap_info(struct swap_info_ !! 2286 static void setup_swap_info(struct swap_info_struct *p, int prio, 2629 unsigned char *sw 2287 unsigned char *swap_map, 2630 struct swap_clust !! 2288 struct swap_cluster_info *cluster_info) 2631 unsigned long *ze << 2632 { 2289 { 2633 int i; 2290 int i; 2634 2291 2635 if (prio >= 0) 2292 if (prio >= 0) 2636 si->prio = prio; !! 2293 p->prio = prio; 2637 else 2294 else 2638 si->prio = --least_priority; !! 2295 p->prio = --least_priority; 2639 /* 2296 /* 2640 * the plist prio is negated because 2297 * the plist prio is negated because plist ordering is 2641 * low-to-high, while swap ordering i 2298 * low-to-high, while swap ordering is high-to-low 2642 */ 2299 */ 2643 si->list.prio = -si->prio; !! 2300 p->list.prio = -p->prio; 2644 for_each_node(i) { 2301 for_each_node(i) { 2645 if (si->prio >= 0) !! 2302 if (p->prio >= 0) 2646 si->avail_lists[i].pr !! 2303 p->avail_lists[i].prio = -p->prio; 2647 else { 2304 else { 2648 if (swap_node(si) == !! 2305 if (swap_node(p) == i) 2649 si->avail_lis !! 2306 p->avail_lists[i].prio = 1; 2650 else 2307 else 2651 si->avail_lis !! 2308 p->avail_lists[i].prio = -p->prio; 2652 } 2309 } 2653 } 2310 } 2654 si->swap_map = swap_map; !! 2311 p->swap_map = swap_map; 2655 si->cluster_info = cluster_info; !! 2312 p->cluster_info = cluster_info; 2656 si->zeromap = zeromap; << 2657 } 2313 } 2658 2314 2659 static void _enable_swap_info(struct swap_inf !! 
2315 static void _enable_swap_info(struct swap_info_struct *p) 2660 { 2316 { 2661 si->flags |= SWP_WRITEOK; !! 2317 p->flags |= SWP_WRITEOK; 2662 atomic_long_add(si->pages, &nr_swap_p !! 2318 atomic_long_add(p->pages, &nr_swap_pages); 2663 total_swap_pages += si->pages; !! 2319 total_swap_pages += p->pages; 2664 2320 2665 assert_spin_locked(&swap_lock); 2321 assert_spin_locked(&swap_lock); 2666 /* 2322 /* 2667 * both lists are plists, and thus pr 2323 * both lists are plists, and thus priority ordered. 2668 * swap_active_head needs to be prior 2324 * swap_active_head needs to be priority ordered for swapoff(), 2669 * which on removal of any swap_info_ 2325 * which on removal of any swap_info_struct with an auto-assigned 2670 * (i.e. negative) priority increment 2326 * (i.e. negative) priority increments the auto-assigned priority 2671 * of any lower-priority swap_info_st 2327 * of any lower-priority swap_info_structs. 2672 * swap_avail_head needs to be priori 2328 * swap_avail_head needs to be priority ordered for folio_alloc_swap(), 2673 * which allocates swap pages from th 2329 * which allocates swap pages from the highest available priority 2674 * swap_info_struct. 2330 * swap_info_struct. 2675 */ 2331 */ 2676 plist_add(&si->list, &swap_active_hea !! 2332 plist_add(&p->list, &swap_active_head); 2677 !! 2333 add_to_avail_list(p); 2678 /* add to available list iff swap dev << 2679 if (si->highest_bit) << 2680 add_to_avail_list(si); << 2681 } 2334 } 2682 2335 2683 static void enable_swap_info(struct swap_info !! 2336 static void enable_swap_info(struct swap_info_struct *p, int prio, 2684 unsigned char 2337 unsigned char *swap_map, 2685 struct swap_c 2338 struct swap_cluster_info *cluster_info, 2686 unsigned long !! 2339 unsigned long *frontswap_map) 2687 { 2340 { >> 2341 if (IS_ENABLED(CONFIG_FRONTSWAP)) >> 2342 frontswap_init(p->type, frontswap_map); 2688 spin_lock(&swap_lock); 2343 spin_lock(&swap_lock); 2689 spin_lock(&si->lock); !! 2344 spin_lock(&p->lock); 2690 setup_swap_info(si, prio, swap_map, c !! 2345 setup_swap_info(p, prio, swap_map, cluster_info); 2691 spin_unlock(&si->lock); !! 2346 spin_unlock(&p->lock); 2692 spin_unlock(&swap_lock); 2347 spin_unlock(&swap_lock); 2693 /* 2348 /* 2694 * Finished initializing swap device, 2349 * Finished initializing swap device, now it's safe to reference it. 2695 */ 2350 */ 2696 percpu_ref_resurrect(&si->users); !! 2351 percpu_ref_resurrect(&p->users); 2697 spin_lock(&swap_lock); 2352 spin_lock(&swap_lock); 2698 spin_lock(&si->lock); !! 2353 spin_lock(&p->lock); 2699 _enable_swap_info(si); !! 2354 _enable_swap_info(p); 2700 spin_unlock(&si->lock); !! 2355 spin_unlock(&p->lock); 2701 spin_unlock(&swap_lock); 2356 spin_unlock(&swap_lock); 2702 } 2357 } 2703 2358 2704 static void reinsert_swap_info(struct swap_in !! 2359 static void reinsert_swap_info(struct swap_info_struct *p) 2705 { 2360 { 2706 spin_lock(&swap_lock); 2361 spin_lock(&swap_lock); 2707 spin_lock(&si->lock); !! 2362 spin_lock(&p->lock); 2708 setup_swap_info(si, si->prio, si->swa !! 2363 setup_swap_info(p, p->prio, p->swap_map, p->cluster_info); 2709 _enable_swap_info(si); !! 2364 _enable_swap_info(p); 2710 spin_unlock(&si->lock); !! 2365 spin_unlock(&p->lock); 2711 spin_unlock(&swap_lock); 2366 spin_unlock(&swap_lock); 2712 } 2367 } 2713 2368 2714 static bool __has_usable_swap(void) << 2715 { << 2716 return !plist_head_empty(&swap_active << 2717 } << 2718 << 2719 bool has_usable_swap(void) 2369 bool has_usable_swap(void) 2720 { 2370 { 2721 bool ret; !! 
2371 bool ret = true; 2722 2372 2723 spin_lock(&swap_lock); 2373 spin_lock(&swap_lock); 2724 ret = __has_usable_swap(); !! 2374 if (plist_head_empty(&swap_active_head)) >> 2375 ret = false; 2725 spin_unlock(&swap_lock); 2376 spin_unlock(&swap_lock); 2726 return ret; 2377 return ret; 2727 } 2378 } 2728 2379 2729 SYSCALL_DEFINE1(swapoff, const char __user *, 2380 SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) 2730 { 2381 { 2731 struct swap_info_struct *p = NULL; 2382 struct swap_info_struct *p = NULL; 2732 unsigned char *swap_map; 2383 unsigned char *swap_map; 2733 unsigned long *zeromap; << 2734 struct swap_cluster_info *cluster_inf 2384 struct swap_cluster_info *cluster_info; >> 2385 unsigned long *frontswap_map; 2735 struct file *swap_file, *victim; 2386 struct file *swap_file, *victim; 2736 struct address_space *mapping; 2387 struct address_space *mapping; 2737 struct inode *inode; 2388 struct inode *inode; 2738 struct filename *pathname; 2389 struct filename *pathname; 2739 int err, found = 0; 2390 int err, found = 0; >> 2391 unsigned int old_block_size; 2740 2392 2741 if (!capable(CAP_SYS_ADMIN)) 2393 if (!capable(CAP_SYS_ADMIN)) 2742 return -EPERM; 2394 return -EPERM; 2743 2395 2744 BUG_ON(!current->mm); 2396 BUG_ON(!current->mm); 2745 2397 2746 pathname = getname(specialfile); 2398 pathname = getname(specialfile); 2747 if (IS_ERR(pathname)) 2399 if (IS_ERR(pathname)) 2748 return PTR_ERR(pathname); 2400 return PTR_ERR(pathname); 2749 2401 2750 victim = file_open_name(pathname, O_R 2402 victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0); 2751 err = PTR_ERR(victim); 2403 err = PTR_ERR(victim); 2752 if (IS_ERR(victim)) 2404 if (IS_ERR(victim)) 2753 goto out; 2405 goto out; 2754 2406 2755 mapping = victim->f_mapping; 2407 mapping = victim->f_mapping; 2756 spin_lock(&swap_lock); 2408 spin_lock(&swap_lock); 2757 plist_for_each_entry(p, &swap_active_ 2409 plist_for_each_entry(p, &swap_active_head, list) { 2758 if (p->flags & SWP_WRITEOK) { 2410 if (p->flags & SWP_WRITEOK) { 2759 if (p->swap_file->f_m 2411 if (p->swap_file->f_mapping == mapping) { 2760 found = 1; 2412 found = 1; 2761 break; 2413 break; 2762 } 2414 } 2763 } 2415 } 2764 } 2416 } 2765 if (!found) { 2417 if (!found) { 2766 err = -EINVAL; 2418 err = -EINVAL; 2767 spin_unlock(&swap_lock); 2419 spin_unlock(&swap_lock); 2768 goto out_dput; 2420 goto out_dput; 2769 } 2421 } 2770 if (!security_vm_enough_memory_mm(cur 2422 if (!security_vm_enough_memory_mm(current->mm, p->pages)) 2771 vm_unacct_memory(p->pages); 2423 vm_unacct_memory(p->pages); 2772 else { 2424 else { 2773 err = -ENOMEM; 2425 err = -ENOMEM; 2774 spin_unlock(&swap_lock); 2426 spin_unlock(&swap_lock); 2775 goto out_dput; 2427 goto out_dput; 2776 } 2428 } 2777 spin_lock(&p->lock); 2429 spin_lock(&p->lock); 2778 del_from_avail_list(p); 2430 del_from_avail_list(p); 2779 if (p->prio < 0) { 2431 if (p->prio < 0) { 2780 struct swap_info_struct *si = 2432 struct swap_info_struct *si = p; 2781 int nid; 2433 int nid; 2782 2434 2783 plist_for_each_entry_continue 2435 plist_for_each_entry_continue(si, &swap_active_head, list) { 2784 si->prio++; 2436 si->prio++; 2785 si->list.prio--; 2437 si->list.prio--; 2786 for_each_node(nid) { 2438 for_each_node(nid) { 2787 if (si->avail 2439 if (si->avail_lists[nid].prio != 1) 2788 si->a 2440 si->avail_lists[nid].prio--; 2789 } 2441 } 2790 } 2442 } 2791 least_priority++; 2443 least_priority++; 2792 } 2444 } 2793 plist_del(&p->list, &swap_active_head 2445 plist_del(&p->list, &swap_active_head); 2794 atomic_long_sub(p->pages, 
&nr_swap_pa 2446 atomic_long_sub(p->pages, &nr_swap_pages); 2795 total_swap_pages -= p->pages; 2447 total_swap_pages -= p->pages; 2796 p->flags &= ~SWP_WRITEOK; 2448 p->flags &= ~SWP_WRITEOK; 2797 spin_unlock(&p->lock); 2449 spin_unlock(&p->lock); 2798 spin_unlock(&swap_lock); 2450 spin_unlock(&swap_lock); 2799 2451 2800 disable_swap_slots_cache_lock(); 2452 disable_swap_slots_cache_lock(); 2801 2453 2802 set_current_oom_origin(); 2454 set_current_oom_origin(); 2803 err = try_to_unuse(p->type); 2455 err = try_to_unuse(p->type); 2804 clear_current_oom_origin(); 2456 clear_current_oom_origin(); 2805 2457 2806 if (err) { 2458 if (err) { 2807 /* re-insert swap space back 2459 /* re-insert swap space back into swap_list */ 2808 reinsert_swap_info(p); 2460 reinsert_swap_info(p); 2809 reenable_swap_slots_cache_unl 2461 reenable_swap_slots_cache_unlock(); 2810 goto out_dput; 2462 goto out_dput; 2811 } 2463 } 2812 2464 2813 reenable_swap_slots_cache_unlock(); 2465 reenable_swap_slots_cache_unlock(); 2814 2466 2815 /* 2467 /* 2816 * Wait for swap operations protected 2468 * Wait for swap operations protected by get/put_swap_device() 2817 * to complete. Because of synchroni !! 2469 * to complete. 2818 * operations protected by RCU reader !! 2470 * 2819 * spinlock) will be waited too. Thi !! 2471 * We need synchronize_rcu() here to protect the accessing to 2820 * prevent folio_test_swapcache() and !! 2472 * the swap cache data structure. 2821 * operations from racing with swapof << 2822 */ 2473 */ 2823 percpu_ref_kill(&p->users); 2474 percpu_ref_kill(&p->users); 2824 synchronize_rcu(); 2475 synchronize_rcu(); 2825 wait_for_completion(&p->comp); 2476 wait_for_completion(&p->comp); 2826 2477 2827 flush_work(&p->discard_work); 2478 flush_work(&p->discard_work); 2828 flush_work(&p->reclaim_work); << 2829 2479 2830 destroy_swap_extents(p); 2480 destroy_swap_extents(p); 2831 if (p->flags & SWP_CONTINUED) 2481 if (p->flags & SWP_CONTINUED) 2832 free_swap_count_continuations 2482 free_swap_count_continuations(p); 2833 2483 2834 if (!p->bdev || !bdev_nonrot(p->bdev) 2484 if (!p->bdev || !bdev_nonrot(p->bdev)) 2835 atomic_dec(&nr_rotate_swap); 2485 atomic_dec(&nr_rotate_swap); 2836 2486 2837 mutex_lock(&swapon_mutex); 2487 mutex_lock(&swapon_mutex); 2838 spin_lock(&swap_lock); 2488 spin_lock(&swap_lock); 2839 spin_lock(&p->lock); 2489 spin_lock(&p->lock); 2840 drain_mmlist(); 2490 drain_mmlist(); 2841 2491 2842 /* wait for anyone still in scan_swap 2492 /* wait for anyone still in scan_swap_map_slots */ 2843 p->highest_bit = 0; /* cu 2493 p->highest_bit = 0; /* cuts scans short */ 2844 while (p->flags >= SWP_SCANNING) { 2494 while (p->flags >= SWP_SCANNING) { 2845 spin_unlock(&p->lock); 2495 spin_unlock(&p->lock); 2846 spin_unlock(&swap_lock); 2496 spin_unlock(&swap_lock); 2847 schedule_timeout_uninterrupti 2497 schedule_timeout_uninterruptible(1); 2848 spin_lock(&swap_lock); 2498 spin_lock(&swap_lock); 2849 spin_lock(&p->lock); 2499 spin_lock(&p->lock); 2850 } 2500 } 2851 2501 2852 swap_file = p->swap_file; 2502 swap_file = p->swap_file; >> 2503 old_block_size = p->old_block_size; 2853 p->swap_file = NULL; 2504 p->swap_file = NULL; 2854 p->max = 0; 2505 p->max = 0; 2855 swap_map = p->swap_map; 2506 swap_map = p->swap_map; 2856 p->swap_map = NULL; 2507 p->swap_map = NULL; 2857 zeromap = p->zeromap; << 2858 p->zeromap = NULL; << 2859 cluster_info = p->cluster_info; 2508 cluster_info = p->cluster_info; 2860 p->cluster_info = NULL; 2509 p->cluster_info = NULL; >> 2510 frontswap_map = frontswap_map_get(p); 2861 
spin_unlock(&p->lock); 2511 spin_unlock(&p->lock); 2862 spin_unlock(&swap_lock); 2512 spin_unlock(&swap_lock); 2863 arch_swap_invalidate_area(p->type); 2513 arch_swap_invalidate_area(p->type); 2864 zswap_swapoff(p->type); !! 2514 frontswap_invalidate_area(p->type); >> 2515 frontswap_map_set(p, NULL); 2865 mutex_unlock(&swapon_mutex); 2516 mutex_unlock(&swapon_mutex); 2866 free_percpu(p->percpu_cluster); 2517 free_percpu(p->percpu_cluster); 2867 p->percpu_cluster = NULL; 2518 p->percpu_cluster = NULL; 2868 free_percpu(p->cluster_next_cpu); 2519 free_percpu(p->cluster_next_cpu); 2869 p->cluster_next_cpu = NULL; 2520 p->cluster_next_cpu = NULL; 2870 vfree(swap_map); 2521 vfree(swap_map); 2871 kvfree(zeromap); << 2872 kvfree(cluster_info); 2522 kvfree(cluster_info); >> 2523 kvfree(frontswap_map); 2873 /* Destroy swap account information * 2524 /* Destroy swap account information */ 2874 swap_cgroup_swapoff(p->type); 2525 swap_cgroup_swapoff(p->type); 2875 exit_swap_address_space(p->type); 2526 exit_swap_address_space(p->type); 2876 2527 2877 inode = mapping->host; 2528 inode = mapping->host; >> 2529 if (S_ISBLK(inode->i_mode)) { >> 2530 struct block_device *bdev = I_BDEV(inode); >> 2531 >> 2532 set_blocksize(bdev, old_block_size); >> 2533 blkdev_put(bdev, p); >> 2534 } 2878 2535 2879 inode_lock(inode); 2536 inode_lock(inode); 2880 inode->i_flags &= ~S_SWAPFILE; 2537 inode->i_flags &= ~S_SWAPFILE; 2881 inode_unlock(inode); 2538 inode_unlock(inode); 2882 filp_close(swap_file, NULL); 2539 filp_close(swap_file, NULL); 2883 2540 2884 /* 2541 /* 2885 * Clear the SWP_USED flag after all 2542 * Clear the SWP_USED flag after all resources are freed so that swapon 2886 * can reuse this swap_info in alloc_ 2543 * can reuse this swap_info in alloc_swap_info() safely. It is ok to 2887 * not hold p->lock after we cleared 2544 * not hold p->lock after we cleared its SWP_WRITEOK. 
2888 */ 2545 */ 2889 spin_lock(&swap_lock); 2546 spin_lock(&swap_lock); 2890 p->flags = 0; 2547 p->flags = 0; 2891 spin_unlock(&swap_lock); 2548 spin_unlock(&swap_lock); 2892 2549 2893 err = 0; 2550 err = 0; 2894 atomic_inc(&proc_poll_event); 2551 atomic_inc(&proc_poll_event); 2895 wake_up_interruptible(&proc_poll_wait 2552 wake_up_interruptible(&proc_poll_wait); 2896 2553 2897 out_dput: 2554 out_dput: 2898 filp_close(victim, NULL); 2555 filp_close(victim, NULL); 2899 out: 2556 out: 2900 putname(pathname); 2557 putname(pathname); 2901 return err; 2558 return err; 2902 } 2559 } 2903 2560 2904 #ifdef CONFIG_PROC_FS 2561 #ifdef CONFIG_PROC_FS 2905 static __poll_t swaps_poll(struct file *file, 2562 static __poll_t swaps_poll(struct file *file, poll_table *wait) 2906 { 2563 { 2907 struct seq_file *seq = file->private_ 2564 struct seq_file *seq = file->private_data; 2908 2565 2909 poll_wait(file, &proc_poll_wait, wait 2566 poll_wait(file, &proc_poll_wait, wait); 2910 2567 2911 if (seq->poll_event != atomic_read(&p 2568 if (seq->poll_event != atomic_read(&proc_poll_event)) { 2912 seq->poll_event = atomic_read 2569 seq->poll_event = atomic_read(&proc_poll_event); 2913 return EPOLLIN | EPOLLRDNORM 2570 return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI; 2914 } 2571 } 2915 2572 2916 return EPOLLIN | EPOLLRDNORM; 2573 return EPOLLIN | EPOLLRDNORM; 2917 } 2574 } 2918 2575 2919 /* iterator */ 2576 /* iterator */ 2920 static void *swap_start(struct seq_file *swap 2577 static void *swap_start(struct seq_file *swap, loff_t *pos) 2921 { 2578 { 2922 struct swap_info_struct *si; 2579 struct swap_info_struct *si; 2923 int type; 2580 int type; 2924 loff_t l = *pos; 2581 loff_t l = *pos; 2925 2582 2926 mutex_lock(&swapon_mutex); 2583 mutex_lock(&swapon_mutex); 2927 2584 2928 if (!l) 2585 if (!l) 2929 return SEQ_START_TOKEN; 2586 return SEQ_START_TOKEN; 2930 2587 2931 for (type = 0; (si = swap_type_to_swa 2588 for (type = 0; (si = swap_type_to_swap_info(type)); type++) { 2932 if (!(si->flags & SWP_USED) | 2589 if (!(si->flags & SWP_USED) || !si->swap_map) 2933 continue; 2590 continue; 2934 if (!--l) 2591 if (!--l) 2935 return si; 2592 return si; 2936 } 2593 } 2937 2594 2938 return NULL; 2595 return NULL; 2939 } 2596 } 2940 2597 2941 static void *swap_next(struct seq_file *swap, 2598 static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) 2942 { 2599 { 2943 struct swap_info_struct *si = v; 2600 struct swap_info_struct *si = v; 2944 int type; 2601 int type; 2945 2602 2946 if (v == SEQ_START_TOKEN) 2603 if (v == SEQ_START_TOKEN) 2947 type = 0; 2604 type = 0; 2948 else 2605 else 2949 type = si->type + 1; 2606 type = si->type + 1; 2950 2607 2951 ++(*pos); 2608 ++(*pos); 2952 for (; (si = swap_type_to_swap_info(t 2609 for (; (si = swap_type_to_swap_info(type)); type++) { 2953 if (!(si->flags & SWP_USED) | 2610 if (!(si->flags & SWP_USED) || !si->swap_map) 2954 continue; 2611 continue; 2955 return si; 2612 return si; 2956 } 2613 } 2957 2614 2958 return NULL; 2615 return NULL; 2959 } 2616 } 2960 2617 2961 static void swap_stop(struct seq_file *swap, 2618 static void swap_stop(struct seq_file *swap, void *v) 2962 { 2619 { 2963 mutex_unlock(&swapon_mutex); 2620 mutex_unlock(&swapon_mutex); 2964 } 2621 } 2965 2622 2966 static int swap_show(struct seq_file *swap, v 2623 static int swap_show(struct seq_file *swap, void *v) 2967 { 2624 { 2968 struct swap_info_struct *si = v; 2625 struct swap_info_struct *si = v; 2969 struct file *file; 2626 struct file *file; 2970 int len; 2627 int len; 2971 unsigned long bytes, 
inuse; 2628 unsigned long bytes, inuse; 2972 2629 2973 if (si == SEQ_START_TOKEN) { 2630 if (si == SEQ_START_TOKEN) { 2974 seq_puts(swap, "Filename\t\t\ 2631 seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); 2975 return 0; 2632 return 0; 2976 } 2633 } 2977 2634 2978 bytes = K(si->pages); !! 2635 bytes = si->pages << (PAGE_SHIFT - 10); 2979 inuse = K(READ_ONCE(si->inuse_pages)) !! 2636 inuse = READ_ONCE(si->inuse_pages) << (PAGE_SHIFT - 10); 2980 2637 2981 file = si->swap_file; 2638 file = si->swap_file; 2982 len = seq_file_path(swap, file, " \t\ 2639 len = seq_file_path(swap, file, " \t\n\\"); 2983 seq_printf(swap, "%*s%s\t%lu\t%s%lu\t 2640 seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n", 2984 len < 40 ? 40 - len : 2641 len < 40 ? 40 - len : 1, " ", 2985 S_ISBLK(file_inode(fi 2642 S_ISBLK(file_inode(file)->i_mode) ? 2986 "partition" : 2643 "partition" : "file\t", 2987 bytes, bytes < 100000 2644 bytes, bytes < 10000000 ? "\t" : "", 2988 inuse, inuse < 100000 2645 inuse, inuse < 10000000 ? "\t" : "", 2989 si->prio); 2646 si->prio); 2990 return 0; 2647 return 0; 2991 } 2648 } 2992 2649 2993 static const struct seq_operations swaps_op = 2650 static const struct seq_operations swaps_op = { 2994 .start = swap_start, 2651 .start = swap_start, 2995 .next = swap_next, 2652 .next = swap_next, 2996 .stop = swap_stop, 2653 .stop = swap_stop, 2997 .show = swap_show 2654 .show = swap_show 2998 }; 2655 }; 2999 2656 3000 static int swaps_open(struct inode *inode, st 2657 static int swaps_open(struct inode *inode, struct file *file) 3001 { 2658 { 3002 struct seq_file *seq; 2659 struct seq_file *seq; 3003 int ret; 2660 int ret; 3004 2661 3005 ret = seq_open(file, &swaps_op); 2662 ret = seq_open(file, &swaps_op); 3006 if (ret) 2663 if (ret) 3007 return ret; 2664 return ret; 3008 2665 3009 seq = file->private_data; 2666 seq = file->private_data; 3010 seq->poll_event = atomic_read(&proc_p 2667 seq->poll_event = atomic_read(&proc_poll_event); 3011 return 0; 2668 return 0; 3012 } 2669 } 3013 2670 3014 static const struct proc_ops swaps_proc_ops = 2671 static const struct proc_ops swaps_proc_ops = { 3015 .proc_flags = PROC_ENTRY_PERMANEN 2672 .proc_flags = PROC_ENTRY_PERMANENT, 3016 .proc_open = swaps_open, 2673 .proc_open = swaps_open, 3017 .proc_read = seq_read, 2674 .proc_read = seq_read, 3018 .proc_lseek = seq_lseek, 2675 .proc_lseek = seq_lseek, 3019 .proc_release = seq_release, 2676 .proc_release = seq_release, 3020 .proc_poll = swaps_poll, 2677 .proc_poll = swaps_poll, 3021 }; 2678 }; 3022 2679 3023 static int __init procswaps_init(void) 2680 static int __init procswaps_init(void) 3024 { 2681 { 3025 proc_create("swaps", 0, NULL, &swaps_ 2682 proc_create("swaps", 0, NULL, &swaps_proc_ops); 3026 return 0; 2683 return 0; 3027 } 2684 } 3028 __initcall(procswaps_init); 2685 __initcall(procswaps_init); 3029 #endif /* CONFIG_PROC_FS */ 2686 #endif /* CONFIG_PROC_FS */ 3030 2687 3031 #ifdef MAX_SWAPFILES_CHECK 2688 #ifdef MAX_SWAPFILES_CHECK 3032 static int __init max_swapfiles_check(void) 2689 static int __init max_swapfiles_check(void) 3033 { 2690 { 3034 MAX_SWAPFILES_CHECK(); 2691 MAX_SWAPFILES_CHECK(); 3035 return 0; 2692 return 0; 3036 } 2693 } 3037 late_initcall(max_swapfiles_check); 2694 late_initcall(max_swapfiles_check); 3038 #endif 2695 #endif 3039 2696 3040 static struct swap_info_struct *alloc_swap_in 2697 static struct swap_info_struct *alloc_swap_info(void) 3041 { 2698 { 3042 struct swap_info_struct *p; 2699 struct swap_info_struct *p; 3043 struct swap_info_struct *defer = NULL 2700 
struct swap_info_struct *defer = NULL; 3044 unsigned int type; 2701 unsigned int type; 3045 int i; 2702 int i; 3046 2703 3047 p = kvzalloc(struct_size(p, avail_lis 2704 p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL); 3048 if (!p) 2705 if (!p) 3049 return ERR_PTR(-ENOMEM); 2706 return ERR_PTR(-ENOMEM); 3050 2707 3051 if (percpu_ref_init(&p->users, swap_u 2708 if (percpu_ref_init(&p->users, swap_users_ref_free, 3052 PERCPU_REF_INIT_D 2709 PERCPU_REF_INIT_DEAD, GFP_KERNEL)) { 3053 kvfree(p); 2710 kvfree(p); 3054 return ERR_PTR(-ENOMEM); 2711 return ERR_PTR(-ENOMEM); 3055 } 2712 } 3056 2713 3057 spin_lock(&swap_lock); 2714 spin_lock(&swap_lock); 3058 for (type = 0; type < nr_swapfiles; t 2715 for (type = 0; type < nr_swapfiles; type++) { 3059 if (!(swap_info[type]->flags 2716 if (!(swap_info[type]->flags & SWP_USED)) 3060 break; 2717 break; 3061 } 2718 } 3062 if (type >= MAX_SWAPFILES) { 2719 if (type >= MAX_SWAPFILES) { 3063 spin_unlock(&swap_lock); 2720 spin_unlock(&swap_lock); 3064 percpu_ref_exit(&p->users); 2721 percpu_ref_exit(&p->users); 3065 kvfree(p); 2722 kvfree(p); 3066 return ERR_PTR(-EPERM); 2723 return ERR_PTR(-EPERM); 3067 } 2724 } 3068 if (type >= nr_swapfiles) { 2725 if (type >= nr_swapfiles) { 3069 p->type = type; 2726 p->type = type; 3070 /* 2727 /* 3071 * Publish the swap_info_stru 2728 * Publish the swap_info_struct after initializing it. 3072 * Note that kvzalloc() above 2729 * Note that kvzalloc() above zeroes all its fields. 3073 */ 2730 */ 3074 smp_store_release(&swap_info[ 2731 smp_store_release(&swap_info[type], p); /* rcu_assign_pointer() */ 3075 nr_swapfiles++; 2732 nr_swapfiles++; 3076 } else { 2733 } else { 3077 defer = p; 2734 defer = p; 3078 p = swap_info[type]; 2735 p = swap_info[type]; 3079 /* 2736 /* 3080 * Do not memset this entry: 2737 * Do not memset this entry: a racing procfs swap_next() 3081 * would be relying on p->typ 2738 * would be relying on p->type to remain valid. 3082 */ 2739 */ 3083 } 2740 } 3084 p->swap_extent_root = RB_ROOT; 2741 p->swap_extent_root = RB_ROOT; 3085 plist_node_init(&p->list, 0); 2742 plist_node_init(&p->list, 0); 3086 for_each_node(i) 2743 for_each_node(i) 3087 plist_node_init(&p->avail_lis 2744 plist_node_init(&p->avail_lists[i], 0); 3088 p->flags = SWP_USED; 2745 p->flags = SWP_USED; 3089 spin_unlock(&swap_lock); 2746 spin_unlock(&swap_lock); 3090 if (defer) { 2747 if (defer) { 3091 percpu_ref_exit(&defer->users 2748 percpu_ref_exit(&defer->users); 3092 kvfree(defer); 2749 kvfree(defer); 3093 } 2750 } 3094 spin_lock_init(&p->lock); 2751 spin_lock_init(&p->lock); 3095 spin_lock_init(&p->cont_lock); 2752 spin_lock_init(&p->cont_lock); 3096 init_completion(&p->comp); 2753 init_completion(&p->comp); 3097 2754 3098 return p; 2755 return p; 3099 } 2756 } 3100 2757 3101 static int claim_swapfile(struct swap_info_st !! 2758 static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) 3102 { 2759 { >> 2760 int error; >> 2761 3103 if (S_ISBLK(inode->i_mode)) { 2762 if (S_ISBLK(inode->i_mode)) { 3104 si->bdev = I_BDEV(inode); !! 
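		/*
		 * In the older version (right-hand column) the block device is
		 * opened here with blkdev_get_by_dev() and its block size is
		 * forced to PAGE_SIZE; old_block_size is remembered so that
		 * swapoff can restore it with set_blocksize() before dropping
		 * the reference via blkdev_put().
		 */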
2763 p->bdev = blkdev_get_by_dev(inode->i_rdev, >> 2764 BLK_OPEN_READ | BLK_OPEN_WRITE, p, NULL); >> 2765 if (IS_ERR(p->bdev)) { >> 2766 error = PTR_ERR(p->bdev); >> 2767 p->bdev = NULL; >> 2768 return error; >> 2769 } >> 2770 p->old_block_size = block_size(p->bdev); >> 2771 error = set_blocksize(p->bdev, PAGE_SIZE); >> 2772 if (error < 0) >> 2773 return error; 3105 /* 2774 /* 3106 * Zoned block devices contai 2775 * Zoned block devices contain zones that have a sequential 3107 * write only restriction. H 2776 * write only restriction. Hence zoned block devices are not 3108 * suitable for swapping. Di 2777 * suitable for swapping. Disallow them here. 3109 */ 2778 */ 3110 if (bdev_is_zoned(si->bdev)) !! 2779 if (bdev_is_zoned(p->bdev)) 3111 return -EINVAL; 2780 return -EINVAL; 3112 si->flags |= SWP_BLKDEV; !! 2781 p->flags |= SWP_BLKDEV; 3113 } else if (S_ISREG(inode->i_mode)) { 2782 } else if (S_ISREG(inode->i_mode)) { 3114 si->bdev = inode->i_sb->s_bde !! 2783 p->bdev = inode->i_sb->s_bdev; 3115 } 2784 } 3116 2785 3117 return 0; 2786 return 0; 3118 } 2787 } 3119 2788 3120 2789 3121 /* 2790 /* 3122 * Find out how many pages are allowed for a 2791 * Find out how many pages are allowed for a single swap device. There 3123 * are two limiting factors: 2792 * are two limiting factors: 3124 * 1) the number of bits for the swap offset 2793 * 1) the number of bits for the swap offset in the swp_entry_t type, and 3125 * 2) the number of bits in the swap pte, as 2794 * 2) the number of bits in the swap pte, as defined by the different 3126 * architectures. 2795 * architectures. 3127 * 2796 * 3128 * In order to find the largest possible bit 2797 * In order to find the largest possible bit mask, a swap entry with 3129 * swap type 0 and swap offset ~0UL is create 2798 * swap type 0 and swap offset ~0UL is created, encoded to a swap pte, 3130 * decoded to a swp_entry_t again, and finall 2799 * decoded to a swp_entry_t again, and finally the swap offset is 3131 * extracted. 2800 * extracted. 3132 * 2801 * 3133 * This will mask all the bits from the initi 2802 * This will mask all the bits from the initial ~0UL mask that can't 3134 * be encoded in either the swp_entry_t or th 2803 * be encoded in either the swp_entry_t or the architecture definition 3135 * of a swap pte. 2804 * of a swap pte. 3136 */ 2805 */ 3137 unsigned long generic_max_swapfile_size(void) 2806 unsigned long generic_max_swapfile_size(void) 3138 { 2807 { 3139 return swp_offset(pte_to_swp_entry( 2808 return swp_offset(pte_to_swp_entry( 3140 swp_entry_to_pte(swp_ 2809 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; 3141 } 2810 } 3142 2811 3143 /* Can be overridden by an architecture for a 2812 /* Can be overridden by an architecture for additional checks. */ 3144 __weak unsigned long arch_max_swapfile_size(v 2813 __weak unsigned long arch_max_swapfile_size(void) 3145 { 2814 { 3146 return generic_max_swapfile_size(); 2815 return generic_max_swapfile_size(); 3147 } 2816 } 3148 2817 3149 static unsigned long read_swap_header(struct !! 
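/*
 * Parse and sanity-check the on-disk swap header that was just read from
 * page 0 of the swap file: verify the "SWAPSPACE2" signature, byte-swap the
 * header fields if they were written with the other endianness, reject any
 * sub-version other than 1, and clamp the usable size to both the
 * architecture's maximum and the last_page recorded in the header.
 * Returns the resulting maximum number of pages, or 0 if the header is
 * unusable.
 */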
2818 static unsigned long read_swap_header(struct swap_info_struct *p, 3150 union 2819 union swap_header *swap_header, 3151 struc 2820 struct inode *inode) 3152 { 2821 { 3153 int i; 2822 int i; 3154 unsigned long maxpages; 2823 unsigned long maxpages; 3155 unsigned long swapfilepages; 2824 unsigned long swapfilepages; 3156 unsigned long last_page; 2825 unsigned long last_page; 3157 2826 3158 if (memcmp("SWAPSPACE2", swap_header- 2827 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { 3159 pr_err("Unable to find swap-s 2828 pr_err("Unable to find swap-space signature\n"); 3160 return 0; 2829 return 0; 3161 } 2830 } 3162 2831 3163 /* swap partition endianness hack... 2832 /* swap partition endianness hack... */ 3164 if (swab32(swap_header->info.version) 2833 if (swab32(swap_header->info.version) == 1) { 3165 swab32s(&swap_header->info.ve 2834 swab32s(&swap_header->info.version); 3166 swab32s(&swap_header->info.la 2835 swab32s(&swap_header->info.last_page); 3167 swab32s(&swap_header->info.nr 2836 swab32s(&swap_header->info.nr_badpages); 3168 if (swap_header->info.nr_badp 2837 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 3169 return 0; 2838 return 0; 3170 for (i = 0; i < swap_header-> 2839 for (i = 0; i < swap_header->info.nr_badpages; i++) 3171 swab32s(&swap_header- 2840 swab32s(&swap_header->info.badpages[i]); 3172 } 2841 } 3173 /* Check the swap header's sub-versio 2842 /* Check the swap header's sub-version */ 3174 if (swap_header->info.version != 1) { 2843 if (swap_header->info.version != 1) { 3175 pr_warn("Unable to handle swa 2844 pr_warn("Unable to handle swap header version %d\n", 3176 swap_header->info.ver 2845 swap_header->info.version); 3177 return 0; 2846 return 0; 3178 } 2847 } 3179 2848 3180 si->lowest_bit = 1; !! 2849 p->lowest_bit = 1; 3181 si->cluster_next = 1; !! 2850 p->cluster_next = 1; 3182 si->cluster_nr = 0; !! 2851 p->cluster_nr = 0; 3183 2852 3184 maxpages = swapfile_maximum_size; 2853 maxpages = swapfile_maximum_size; 3185 last_page = swap_header->info.last_pa 2854 last_page = swap_header->info.last_page; 3186 if (!last_page) { 2855 if (!last_page) { 3187 pr_warn("Empty swap-file\n"); 2856 pr_warn("Empty swap-file\n"); 3188 return 0; 2857 return 0; 3189 } 2858 } 3190 if (last_page > maxpages) { 2859 if (last_page > maxpages) { 3191 pr_warn("Truncating oversized 2860 pr_warn("Truncating oversized swap area, only using %luk out of %luk\n", 3192 K(maxpages), K(last_p !! 2861 maxpages << (PAGE_SHIFT - 10), >> 2862 last_page << (PAGE_SHIFT - 10)); 3193 } 2863 } 3194 if (maxpages > last_page) { 2864 if (maxpages > last_page) { 3195 maxpages = last_page + 1; 2865 maxpages = last_page + 1; 3196 /* p->max is an unsigned int: 2866 /* p->max is an unsigned int: don't overflow it */ 3197 if ((unsigned int)maxpages == 2867 if ((unsigned int)maxpages == 0) 3198 maxpages = UINT_MAX; 2868 maxpages = UINT_MAX; 3199 } 2869 } 3200 si->highest_bit = maxpages - 1; !! 
2870 p->highest_bit = maxpages - 1; 3201 2871 3202 if (!maxpages) 2872 if (!maxpages) 3203 return 0; 2873 return 0; 3204 swapfilepages = i_size_read(inode) >> 2874 swapfilepages = i_size_read(inode) >> PAGE_SHIFT; 3205 if (swapfilepages && maxpages > swapf 2875 if (swapfilepages && maxpages > swapfilepages) { 3206 pr_warn("Swap area shorter th 2876 pr_warn("Swap area shorter than signature indicates\n"); 3207 return 0; 2877 return 0; 3208 } 2878 } 3209 if (swap_header->info.nr_badpages && 2879 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) 3210 return 0; 2880 return 0; 3211 if (swap_header->info.nr_badpages > M 2881 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 3212 return 0; 2882 return 0; 3213 2883 3214 return maxpages; 2884 return maxpages; 3215 } 2885 } 3216 2886 3217 #define SWAP_CLUSTER_INFO_COLS 2887 #define SWAP_CLUSTER_INFO_COLS \ 3218 DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(s 2888 DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info)) 3219 #define SWAP_CLUSTER_SPACE_COLS 2889 #define SWAP_CLUSTER_SPACE_COLS \ 3220 DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES 2890 DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER) 3221 #define SWAP_CLUSTER_COLS 2891 #define SWAP_CLUSTER_COLS \ 3222 max_t(unsigned int, SWAP_CLUSTER_INFO 2892 max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS) 3223 2893 3224 static int setup_swap_map_and_extents(struct !! 2894 static int setup_swap_map_and_extents(struct swap_info_struct *p, 3225 union 2895 union swap_header *swap_header, 3226 unsig 2896 unsigned char *swap_map, >> 2897 struct swap_cluster_info *cluster_info, 3227 unsig 2898 unsigned long maxpages, 3228 secto 2899 sector_t *span) 3229 { 2900 { >> 2901 unsigned int j, k; 3230 unsigned int nr_good_pages; 2902 unsigned int nr_good_pages; 3231 unsigned long i; << 3232 int nr_extents; 2903 int nr_extents; >> 2904 unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); >> 2905 unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS; >> 2906 unsigned long i, idx; 3233 2907 3234 nr_good_pages = maxpages - 1; /* om 2908 nr_good_pages = maxpages - 1; /* omit header page */ 3235 2909 >> 2910 cluster_list_init(&p->free_clusters); >> 2911 cluster_list_init(&p->discard_clusters); >> 2912 3236 for (i = 0; i < swap_header->info.nr_ 2913 for (i = 0; i < swap_header->info.nr_badpages; i++) { 3237 unsigned int page_nr = swap_h 2914 unsigned int page_nr = swap_header->info.badpages[i]; 3238 if (page_nr == 0 || page_nr > 2915 if (page_nr == 0 || page_nr > swap_header->info.last_page) 3239 return -EINVAL; 2916 return -EINVAL; 3240 if (page_nr < maxpages) { 2917 if (page_nr < maxpages) { 3241 swap_map[page_nr] = S 2918 swap_map[page_nr] = SWAP_MAP_BAD; 3242 nr_good_pages--; 2919 nr_good_pages--; >> 2920 /* >> 2921 * Haven't marked the cluster free yet, no list >> 2922 * operation involved >> 2923 */ >> 2924 inc_cluster_info_page(p, cluster_info, page_nr); 3243 } 2925 } 3244 } 2926 } 3245 2927 >> 2928 /* Haven't marked the cluster free yet, no list operation involved */ >> 2929 for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) >> 2930 inc_cluster_info_page(p, cluster_info, i); >> 2931 3246 if (nr_good_pages) { 2932 if (nr_good_pages) { 3247 swap_map[0] = SWAP_MAP_BAD; 2933 swap_map[0] = SWAP_MAP_BAD; 3248 si->max = maxpages; !! 2934 /* 3249 si->pages = nr_good_pages; !! 2935 * Not mark the cluster free yet, no list 3250 nr_extents = setup_swap_exten !! 
2936 * operation involved >> 2937 */ >> 2938 inc_cluster_info_page(p, cluster_info, 0); >> 2939 p->max = maxpages; >> 2940 p->pages = nr_good_pages; >> 2941 nr_extents = setup_swap_extents(p, span); 3251 if (nr_extents < 0) 2942 if (nr_extents < 0) 3252 return nr_extents; 2943 return nr_extents; 3253 nr_good_pages = si->pages; !! 2944 nr_good_pages = p->pages; 3254 } 2945 } 3255 if (!nr_good_pages) { 2946 if (!nr_good_pages) { 3256 pr_warn("Empty swap-file\n"); 2947 pr_warn("Empty swap-file\n"); 3257 return -EINVAL; 2948 return -EINVAL; 3258 } 2949 } 3259 2950 3260 return nr_extents; << 3261 } << 3262 << 3263 static struct swap_cluster_info *setup_cluste << 3264 << 3265 << 3266 { << 3267 unsigned long nr_clusters = DIV_ROUND << 3268 unsigned long col = si->cluster_next << 3269 struct swap_cluster_info *cluster_inf << 3270 unsigned long i, j, k, idx; << 3271 int cpu, err = -ENOMEM; << 3272 << 3273 cluster_info = kvcalloc(nr_clusters, << 3274 if (!cluster_info) 2951 if (!cluster_info) 3275 goto err; !! 2952 return nr_extents; 3276 << 3277 for (i = 0; i < nr_clusters; i++) << 3278 spin_lock_init(&cluster_info[ << 3279 << 3280 si->cluster_next_cpu = alloc_percpu(u << 3281 if (!si->cluster_next_cpu) << 3282 goto err_free; << 3283 << 3284 /* Random start position to help with << 3285 for_each_possible_cpu(cpu) << 3286 per_cpu(*si->cluster_next_cpu << 3287 get_random_u32_inclusive(1, s << 3288 << 3289 si->percpu_cluster = alloc_percpu(str << 3290 if (!si->percpu_cluster) << 3291 goto err_free; << 3292 << 3293 for_each_possible_cpu(cpu) { << 3294 struct percpu_cluster *cluste << 3295 << 3296 cluster = per_cpu_ptr(si->per << 3297 for (i = 0; i < SWAP_NR_ORDER << 3298 cluster->next[i] = SW << 3299 } << 3300 << 3301 /* << 3302 * Mark unusable pages as unavailable << 3303 * marked free yet, so no list operat << 3304 * << 3305 * See setup_swap_map_and_extents(): << 3306 * and the EOF part of the last clust << 3307 */ << 3308 inc_cluster_info_page(si, cluster_inf << 3309 for (i = 0; i < swap_header->info.nr_ << 3310 inc_cluster_info_page(si, clu << 3311 swap_he << 3312 for (i = maxpages; i < round_up(maxpa << 3313 inc_cluster_info_page(si, clu << 3314 2953 3315 INIT_LIST_HEAD(&si->free_clusters); << 3316 INIT_LIST_HEAD(&si->full_clusters); << 3317 INIT_LIST_HEAD(&si->discard_clusters) << 3318 << 3319 for (i = 0; i < SWAP_NR_ORDERS; i++) << 3320 INIT_LIST_HEAD(&si->nonfull_c << 3321 INIT_LIST_HEAD(&si->frag_clus << 3322 si->frag_cluster_nr[i] = 0; << 3323 } << 3324 2954 3325 /* 2955 /* 3326 * Reduce false cache line sharing be 2956 * Reduce false cache line sharing between cluster_info and 3327 * sharing same address space. 2957 * sharing same address space. 3328 */ 2958 */ 3329 for (k = 0; k < SWAP_CLUSTER_COLS; k+ 2959 for (k = 0; k < SWAP_CLUSTER_COLS; k++) { 3330 j = (k + col) % SWAP_CLUSTER_ 2960 j = (k + col) % SWAP_CLUSTER_COLS; 3331 for (i = 0; i < DIV_ROUND_UP( 2961 for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { 3332 struct swap_cluster_i << 3333 idx = i * SWAP_CLUSTE 2962 idx = i * SWAP_CLUSTER_COLS + j; 3334 ci = cluster_info + i << 3335 if (idx >= nr_cluster 2963 if (idx >= nr_clusters) 3336 continue; 2964 continue; 3337 if (ci->count) { !! 2965 if (cluster_count(&cluster_info[idx])) 3338 ci->flags = C << 3339 list_add_tail << 3340 continue; 2966 continue; 3341 } !! 2967 cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); 3342 ci->flags = CLUSTER_F !! 2968 cluster_list_add_tail(&p->free_clusters, cluster_info, 3343 list_add_tail(&ci->li !! 
2969 idx); 3344 } 2970 } 3345 } 2971 } 3346 !! 2972 return nr_extents; 3347 return cluster_info; << 3348 << 3349 err_free: << 3350 kvfree(cluster_info); << 3351 err: << 3352 return ERR_PTR(err); << 3353 } 2973 } 3354 2974 3355 SYSCALL_DEFINE2(swapon, const char __user *, 2975 SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) 3356 { 2976 { 3357 struct swap_info_struct *si; !! 2977 struct swap_info_struct *p; 3358 struct filename *name; 2978 struct filename *name; 3359 struct file *swap_file = NULL; 2979 struct file *swap_file = NULL; 3360 struct address_space *mapping; 2980 struct address_space *mapping; 3361 struct dentry *dentry; 2981 struct dentry *dentry; 3362 int prio; 2982 int prio; 3363 int error; 2983 int error; 3364 union swap_header *swap_header; 2984 union swap_header *swap_header; 3365 int nr_extents; 2985 int nr_extents; 3366 sector_t span; 2986 sector_t span; 3367 unsigned long maxpages; 2987 unsigned long maxpages; 3368 unsigned char *swap_map = NULL; 2988 unsigned char *swap_map = NULL; 3369 unsigned long *zeromap = NULL; << 3370 struct swap_cluster_info *cluster_inf 2989 struct swap_cluster_info *cluster_info = NULL; 3371 struct folio *folio = NULL; !! 2990 unsigned long *frontswap_map = NULL; >> 2991 struct page *page = NULL; 3372 struct inode *inode = NULL; 2992 struct inode *inode = NULL; 3373 bool inced_nr_rotate_swap = false; 2993 bool inced_nr_rotate_swap = false; 3374 2994 3375 if (swap_flags & ~SWAP_FLAGS_VALID) 2995 if (swap_flags & ~SWAP_FLAGS_VALID) 3376 return -EINVAL; 2996 return -EINVAL; 3377 2997 3378 if (!capable(CAP_SYS_ADMIN)) 2998 if (!capable(CAP_SYS_ADMIN)) 3379 return -EPERM; 2999 return -EPERM; 3380 3000 3381 if (!swap_avail_heads) 3001 if (!swap_avail_heads) 3382 return -ENOMEM; 3002 return -ENOMEM; 3383 3003 3384 si = alloc_swap_info(); !! 3004 p = alloc_swap_info(); 3385 if (IS_ERR(si)) !! 3005 if (IS_ERR(p)) 3386 return PTR_ERR(si); !! 3006 return PTR_ERR(p); 3387 3007 3388 INIT_WORK(&si->discard_work, swap_dis !! 3008 INIT_WORK(&p->discard_work, swap_discard_work); 3389 INIT_WORK(&si->reclaim_work, swap_rec << 3390 3009 3391 name = getname(specialfile); 3010 name = getname(specialfile); 3392 if (IS_ERR(name)) { 3011 if (IS_ERR(name)) { 3393 error = PTR_ERR(name); 3012 error = PTR_ERR(name); 3394 name = NULL; 3013 name = NULL; 3395 goto bad_swap; 3014 goto bad_swap; 3396 } 3015 } 3397 swap_file = file_open_name(name, O_RD !! 3016 swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0); 3398 if (IS_ERR(swap_file)) { 3017 if (IS_ERR(swap_file)) { 3399 error = PTR_ERR(swap_file); 3018 error = PTR_ERR(swap_file); 3400 swap_file = NULL; 3019 swap_file = NULL; 3401 goto bad_swap; 3020 goto bad_swap; 3402 } 3021 } 3403 3022 3404 si->swap_file = swap_file; !! 3023 p->swap_file = swap_file; 3405 mapping = swap_file->f_mapping; 3024 mapping = swap_file->f_mapping; 3406 dentry = swap_file->f_path.dentry; 3025 dentry = swap_file->f_path.dentry; 3407 inode = mapping->host; 3026 inode = mapping->host; 3408 3027 3409 error = claim_swapfile(si, inode); !! 
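	/*
	 * What follows, in outline: lock the inode and reject files that are
	 * unlinked, unmountable or already in use as swap; read page 0 and
	 * validate it with read_swap_header(); allocate the swap_map and
	 * apply the bad-block list; set up per-cpu cluster state for
	 * non-rotational devices; honour the SWAP_FLAG_DISCARD* policy; and
	 * finally publish the device with enable_swap_info() under
	 * swapon_mutex.
	 *
	 * From userspace this path is normally reached through the swapon(2)
	 * wrapper; a minimal, purely illustrative caller looks like:
	 *
	 *	#include <sys/swap.h>
	 *
	 *	int prio = 5;	/* example priority */
	 *	swapon("/dev/example-swap", SWAP_FLAG_PREFER |
	 *	       ((prio << SWAP_FLAG_PRIO_SHIFT) & SWAP_FLAG_PRIO_MASK));
	 */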
3028 error = claim_swapfile(p, inode); 3410 if (unlikely(error)) 3029 if (unlikely(error)) 3411 goto bad_swap; 3030 goto bad_swap; 3412 3031 3413 inode_lock(inode); 3032 inode_lock(inode); 3414 if (d_unlinked(dentry) || cant_mount( 3033 if (d_unlinked(dentry) || cant_mount(dentry)) { 3415 error = -ENOENT; 3034 error = -ENOENT; 3416 goto bad_swap_unlock_inode; 3035 goto bad_swap_unlock_inode; 3417 } 3036 } 3418 if (IS_SWAPFILE(inode)) { 3037 if (IS_SWAPFILE(inode)) { 3419 error = -EBUSY; 3038 error = -EBUSY; 3420 goto bad_swap_unlock_inode; 3039 goto bad_swap_unlock_inode; 3421 } 3040 } 3422 3041 3423 /* 3042 /* 3424 * Read the swap header. 3043 * Read the swap header. 3425 */ 3044 */ 3426 if (!mapping->a_ops->read_folio) { 3045 if (!mapping->a_ops->read_folio) { 3427 error = -EINVAL; 3046 error = -EINVAL; 3428 goto bad_swap_unlock_inode; 3047 goto bad_swap_unlock_inode; 3429 } 3048 } 3430 folio = read_mapping_folio(mapping, 0 !! 3049 page = read_mapping_page(mapping, 0, swap_file); 3431 if (IS_ERR(folio)) { !! 3050 if (IS_ERR(page)) { 3432 error = PTR_ERR(folio); !! 3051 error = PTR_ERR(page); 3433 goto bad_swap_unlock_inode; 3052 goto bad_swap_unlock_inode; 3434 } 3053 } 3435 swap_header = kmap_local_folio(folio, !! 3054 swap_header = kmap(page); 3436 3055 3437 maxpages = read_swap_header(si, swap_ !! 3056 maxpages = read_swap_header(p, swap_header, inode); 3438 if (unlikely(!maxpages)) { 3057 if (unlikely(!maxpages)) { 3439 error = -EINVAL; 3058 error = -EINVAL; 3440 goto bad_swap_unlock_inode; 3059 goto bad_swap_unlock_inode; 3441 } 3060 } 3442 3061 3443 /* OK, set up the swap map and apply 3062 /* OK, set up the swap map and apply the bad block list */ 3444 swap_map = vzalloc(maxpages); 3063 swap_map = vzalloc(maxpages); 3445 if (!swap_map) { 3064 if (!swap_map) { 3446 error = -ENOMEM; 3065 error = -ENOMEM; 3447 goto bad_swap_unlock_inode; 3066 goto bad_swap_unlock_inode; 3448 } 3067 } 3449 3068 3450 error = swap_cgroup_swapon(si->type, !! 3069 if (p->bdev && bdev_stable_writes(p->bdev)) 3451 if (error) !! 3070 p->flags |= SWP_STABLE_WRITES; 3452 goto bad_swap_unlock_inode; << 3453 << 3454 nr_extents = setup_swap_map_and_exten << 3455 << 3456 if (unlikely(nr_extents < 0)) { << 3457 error = nr_extents; << 3458 goto bad_swap_unlock_inode; << 3459 } << 3460 << 3461 /* << 3462 * Use kvmalloc_array instead of bitm << 3463 * be above MAX_PAGE_ORDER incase of << 3464 */ << 3465 zeromap = kvmalloc_array(BITS_TO_LONG << 3466 GFP_KERNE << 3467 if (!zeromap) { << 3468 error = -ENOMEM; << 3469 goto bad_swap_unlock_inode; << 3470 } << 3471 3071 3472 if (si->bdev && bdev_stable_writes(si !! 3072 if (p->bdev && bdev_synchronous(p->bdev)) 3473 si->flags |= SWP_STABLE_WRITE !! 3073 p->flags |= SWP_SYNCHRONOUS_IO; 3474 3074 3475 if (si->bdev && bdev_synchronous(si-> !! 3075 if (p->bdev && bdev_nonrot(p->bdev)) { 3476 si->flags |= SWP_SYNCHRONOUS_ !! 
3076 int cpu; >> 3077 unsigned long ci, nr_cluster; >> 3078 >> 3079 p->flags |= SWP_SOLIDSTATE; >> 3080 p->cluster_next_cpu = alloc_percpu(unsigned int); >> 3081 if (!p->cluster_next_cpu) { >> 3082 error = -ENOMEM; >> 3083 goto bad_swap_unlock_inode; >> 3084 } >> 3085 /* >> 3086 * select a random position to start with to help wear leveling >> 3087 * SSD >> 3088 */ >> 3089 for_each_possible_cpu(cpu) { >> 3090 per_cpu(*p->cluster_next_cpu, cpu) = >> 3091 get_random_u32_inclusive(1, p->highest_bit); >> 3092 } >> 3093 nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); >> 3094 >> 3095 cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info), >> 3096 GFP_KERNEL); >> 3097 if (!cluster_info) { >> 3098 error = -ENOMEM; >> 3099 goto bad_swap_unlock_inode; >> 3100 } 3477 3101 3478 if (si->bdev && bdev_nonrot(si->bdev) !! 3102 for (ci = 0; ci < nr_cluster; ci++) 3479 si->flags |= SWP_SOLIDSTATE; !! 3103 spin_lock_init(&((cluster_info + ci)->lock)); 3480 3104 3481 cluster_info = setup_clusters !! 3105 p->percpu_cluster = alloc_percpu(struct percpu_cluster); 3482 if (IS_ERR(cluster_info)) { !! 3106 if (!p->percpu_cluster) { 3483 error = PTR_ERR(clust !! 3107 error = -ENOMEM; 3484 cluster_info = NULL; << 3485 goto bad_swap_unlock_ 3108 goto bad_swap_unlock_inode; 3486 } 3109 } >> 3110 for_each_possible_cpu(cpu) { >> 3111 struct percpu_cluster *cluster; >> 3112 cluster = per_cpu_ptr(p->percpu_cluster, cpu); >> 3113 cluster_set_null(&cluster->index); >> 3114 } 3487 } else { 3115 } else { 3488 atomic_inc(&nr_rotate_swap); 3116 atomic_inc(&nr_rotate_swap); 3489 inced_nr_rotate_swap = true; 3117 inced_nr_rotate_swap = true; 3490 } 3118 } 3491 3119 >> 3120 error = swap_cgroup_swapon(p->type, maxpages); >> 3121 if (error) >> 3122 goto bad_swap_unlock_inode; >> 3123 >> 3124 nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, >> 3125 cluster_info, maxpages, &span); >> 3126 if (unlikely(nr_extents < 0)) { >> 3127 error = nr_extents; >> 3128 goto bad_swap_unlock_inode; >> 3129 } >> 3130 /* frontswap enabled? set up bit-per-page map for frontswap */ >> 3131 if (IS_ENABLED(CONFIG_FRONTSWAP)) >> 3132 frontswap_map = kvcalloc(BITS_TO_LONGS(maxpages), >> 3133 sizeof(long), >> 3134 GFP_KERNEL); >> 3135 3492 if ((swap_flags & SWAP_FLAG_DISCARD) 3136 if ((swap_flags & SWAP_FLAG_DISCARD) && 3493 si->bdev && bdev_max_discard_sect !! 3137 p->bdev && bdev_max_discard_sectors(p->bdev)) { 3494 /* 3138 /* 3495 * When discard is enabled fo 3139 * When discard is enabled for swap with no particular 3496 * policy flagged, we set all 3140 * policy flagged, we set all swap discard flags here in 3497 * order to sustain backward 3141 * order to sustain backward compatibility with older 3498 * swapon(8) releases. 3142 * swapon(8) releases. 3499 */ 3143 */ 3500 si->flags |= (SWP_DISCARDABLE !! 3144 p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | 3501 SWP_PAGE_DISCARD 3145 SWP_PAGE_DISCARD); 3502 3146 3503 /* 3147 /* 3504 * By flagging sys_swapon, a 3148 * By flagging sys_swapon, a sysadmin can tell us to 3505 * either do single-time area 3149 * either do single-time area discards only, or to just 3506 * perform discards for relea 3150 * perform discards for released swap page-clusters. 3507 * Now it's time to adjust th 3151 * Now it's time to adjust the p->flags accordingly. 3508 */ 3152 */ 3509 if (swap_flags & SWAP_FLAG_DI 3153 if (swap_flags & SWAP_FLAG_DISCARD_ONCE) 3510 si->flags &= ~SWP_PAG !! 
3154 p->flags &= ~SWP_PAGE_DISCARD; 3511 else if (swap_flags & SWAP_FL 3155 else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) 3512 si->flags &= ~SWP_ARE !! 3156 p->flags &= ~SWP_AREA_DISCARD; 3513 3157 3514 /* issue a swapon-time discar 3158 /* issue a swapon-time discard if it's still required */ 3515 if (si->flags & SWP_AREA_DISC !! 3159 if (p->flags & SWP_AREA_DISCARD) { 3516 int err = discard_swa !! 3160 int err = discard_swap(p); 3517 if (unlikely(err)) 3161 if (unlikely(err)) 3518 pr_err("swapo 3162 pr_err("swapon: discard_swap(%p): %d\n", 3519 si, e !! 3163 p, err); 3520 } 3164 } 3521 } 3165 } 3522 3166 3523 error = init_swap_address_space(si->t !! 3167 error = init_swap_address_space(p->type, maxpages); 3524 if (error) 3168 if (error) 3525 goto bad_swap_unlock_inode; 3169 goto bad_swap_unlock_inode; 3526 3170 3527 error = zswap_swapon(si->type, maxpag << 3528 if (error) << 3529 goto free_swap_address_space; << 3530 << 3531 /* 3171 /* 3532 * Flush any pending IO and dirty map 3172 * Flush any pending IO and dirty mappings before we start using this 3533 * swap device. 3173 * swap device. 3534 */ 3174 */ 3535 inode->i_flags |= S_SWAPFILE; 3175 inode->i_flags |= S_SWAPFILE; 3536 error = inode_drain_writes(inode); 3176 error = inode_drain_writes(inode); 3537 if (error) { 3177 if (error) { 3538 inode->i_flags &= ~S_SWAPFILE 3178 inode->i_flags &= ~S_SWAPFILE; 3539 goto free_swap_zswap; !! 3179 goto free_swap_address_space; 3540 } 3180 } 3541 3181 3542 mutex_lock(&swapon_mutex); 3182 mutex_lock(&swapon_mutex); 3543 prio = -1; 3183 prio = -1; 3544 if (swap_flags & SWAP_FLAG_PREFER) 3184 if (swap_flags & SWAP_FLAG_PREFER) 3545 prio = 3185 prio = 3546 (swap_flags & SWAP_FLAG_PRI 3186 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; 3547 enable_swap_info(si, prio, swap_map, !! 3187 enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map); 3548 3188 3549 pr_info("Adding %uk swap on %s. Prio !! 3189 pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", 3550 K(si->pages), name->name, si- !! 3190 p->pages<<(PAGE_SHIFT-10), name->name, p->prio, 3551 K((unsigned long long)span), !! 3191 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 3552 (si->flags & SWP_SOLIDSTATE) !! 3192 (p->flags & SWP_SOLIDSTATE) ? "SS" : "", 3553 (si->flags & SWP_DISCARDABLE) !! 3193 (p->flags & SWP_DISCARDABLE) ? "D" : "", 3554 (si->flags & SWP_AREA_DISCARD !! 3194 (p->flags & SWP_AREA_DISCARD) ? "s" : "", 3555 (si->flags & SWP_PAGE_DISCARD !! 3195 (p->flags & SWP_PAGE_DISCARD) ? "c" : "", >> 3196 (frontswap_map) ? "FS" : ""); 3556 3197 3557 mutex_unlock(&swapon_mutex); 3198 mutex_unlock(&swapon_mutex); 3558 atomic_inc(&proc_poll_event); 3199 atomic_inc(&proc_poll_event); 3559 wake_up_interruptible(&proc_poll_wait 3200 wake_up_interruptible(&proc_poll_wait); 3560 3201 3561 error = 0; 3202 error = 0; 3562 goto out; 3203 goto out; 3563 free_swap_zswap: << 3564 zswap_swapoff(si->type); << 3565 free_swap_address_space: 3204 free_swap_address_space: 3566 exit_swap_address_space(si->type); !! 3205 exit_swap_address_space(p->type); 3567 bad_swap_unlock_inode: 3206 bad_swap_unlock_inode: 3568 inode_unlock(inode); 3207 inode_unlock(inode); 3569 bad_swap: 3208 bad_swap: 3570 free_percpu(si->percpu_cluster); !! 3209 free_percpu(p->percpu_cluster); 3571 si->percpu_cluster = NULL; !! 3210 p->percpu_cluster = NULL; 3572 free_percpu(si->cluster_next_cpu); !! 3211 free_percpu(p->cluster_next_cpu); 3573 si->cluster_next_cpu = NULL; !! 
3212 p->cluster_next_cpu = NULL; >> 3213 if (inode && S_ISBLK(inode->i_mode) && p->bdev) { >> 3214 set_blocksize(p->bdev, p->old_block_size); >> 3215 blkdev_put(p->bdev, p); >> 3216 } 3574 inode = NULL; 3217 inode = NULL; 3575 destroy_swap_extents(si); !! 3218 destroy_swap_extents(p); 3576 swap_cgroup_swapoff(si->type); !! 3219 swap_cgroup_swapoff(p->type); 3577 spin_lock(&swap_lock); 3220 spin_lock(&swap_lock); 3578 si->swap_file = NULL; !! 3221 p->swap_file = NULL; 3579 si->flags = 0; !! 3222 p->flags = 0; 3580 spin_unlock(&swap_lock); 3223 spin_unlock(&swap_lock); 3581 vfree(swap_map); 3224 vfree(swap_map); 3582 kvfree(zeromap); << 3583 kvfree(cluster_info); 3225 kvfree(cluster_info); >> 3226 kvfree(frontswap_map); 3584 if (inced_nr_rotate_swap) 3227 if (inced_nr_rotate_swap) 3585 atomic_dec(&nr_rotate_swap); 3228 atomic_dec(&nr_rotate_swap); 3586 if (swap_file) 3229 if (swap_file) 3587 filp_close(swap_file, NULL); 3230 filp_close(swap_file, NULL); 3588 out: 3231 out: 3589 if (!IS_ERR_OR_NULL(folio)) !! 3232 if (page && !IS_ERR(page)) { 3590 folio_release_kmap(folio, swa !! 3233 kunmap(page); >> 3234 put_page(page); >> 3235 } 3591 if (name) 3236 if (name) 3592 putname(name); 3237 putname(name); 3593 if (inode) 3238 if (inode) 3594 inode_unlock(inode); 3239 inode_unlock(inode); 3595 if (!error) 3240 if (!error) 3596 enable_swap_slots_cache(); 3241 enable_swap_slots_cache(); 3597 return error; 3242 return error; 3598 } 3243 } 3599 3244 3600 void si_swapinfo(struct sysinfo *val) 3245 void si_swapinfo(struct sysinfo *val) 3601 { 3246 { 3602 unsigned int type; 3247 unsigned int type; 3603 unsigned long nr_to_be_unused = 0; 3248 unsigned long nr_to_be_unused = 0; 3604 3249 3605 spin_lock(&swap_lock); 3250 spin_lock(&swap_lock); 3606 for (type = 0; type < nr_swapfiles; t 3251 for (type = 0; type < nr_swapfiles; type++) { 3607 struct swap_info_struct *si = 3252 struct swap_info_struct *si = swap_info[type]; 3608 3253 3609 if ((si->flags & SWP_USED) && 3254 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) 3610 nr_to_be_unused += RE 3255 nr_to_be_unused += READ_ONCE(si->inuse_pages); 3611 } 3256 } 3612 val->freeswap = atomic_long_read(&nr_ 3257 val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; 3613 val->totalswap = total_swap_pages + n 3258 val->totalswap = total_swap_pages + nr_to_be_unused; 3614 spin_unlock(&swap_lock); 3259 spin_unlock(&swap_lock); 3615 } 3260 } 3616 3261 3617 /* 3262 /* 3618 * Verify that nr swap entries are valid and !! 3263 * Verify that a swap entry is valid and increment its swap map count. 3619 * 3264 * 3620 * Returns error code in following case. 3265 * Returns error code in following case. 3621 * - success -> 0 3266 * - success -> 0 3622 * - swp_entry is invalid -> EINVAL 3267 * - swp_entry is invalid -> EINVAL 3623 * - swp_entry is migration entry -> EINVAL 3268 * - swp_entry is migration entry -> EINVAL 3624 * - swap-cache reference is requested but th 3269 * - swap-cache reference is requested but there is already one. -> EEXIST 3625 * - swap-cache reference is requested but th 3270 * - swap-cache reference is requested but the entry is not used. -> ENOENT 3626 * - swap-mapped reference requested but need 3271 * - swap-mapped reference requested but needs continued swap count. -> ENOMEM 3627 */ 3272 */ 3628 static int __swap_duplicate(swp_entry_t entry !! 3273 static int __swap_duplicate(swp_entry_t entry, unsigned char usage) 3629 { 3274 { 3630 struct swap_info_struct *si; !! 
3275 struct swap_info_struct *p; 3631 struct swap_cluster_info *ci; 3276 struct swap_cluster_info *ci; 3632 unsigned long offset; 3277 unsigned long offset; 3633 unsigned char count; 3278 unsigned char count; 3634 unsigned char has_cache; 3279 unsigned char has_cache; 3635 int err, i; !! 3280 int err; 3636 3281 3637 si = swp_swap_info(entry); !! 3282 p = swp_swap_info(entry); 3638 3283 3639 offset = swp_offset(entry); 3284 offset = swp_offset(entry); 3640 VM_WARN_ON(nr > SWAPFILE_CLUSTER - of !! 3285 ci = lock_cluster_or_swap_info(p, offset); 3641 VM_WARN_ON(usage == 1 && nr > 1); << 3642 ci = lock_cluster_or_swap_info(si, of << 3643 3286 3644 err = 0; !! 3287 count = p->swap_map[offset]; 3645 for (i = 0; i < nr; i++) { << 3646 count = si->swap_map[offset + << 3647 3288 3648 /* !! 3289 /* 3649 * swapin_readahead() doesn't !! 3290 * swapin_readahead() doesn't check if a swap entry is valid, so the 3650 * swap entry could be SWAP_M !! 3291 * swap entry could be SWAP_MAP_BAD. Check here with lock held. 3651 */ !! 3292 */ 3652 if (unlikely(swap_count(count !! 3293 if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { 3653 err = -ENOENT; !! 3294 err = -ENOENT; 3654 goto unlock_out; !! 3295 goto unlock_out; 3655 } !! 3296 } 3656 3297 3657 has_cache = count & SWAP_HAS_ !! 3298 has_cache = count & SWAP_HAS_CACHE; 3658 count &= ~SWAP_HAS_CACHE; !! 3299 count &= ~SWAP_HAS_CACHE; >> 3300 err = 0; 3659 3301 3660 if (!count && !has_cache) { !! 3302 if (usage == SWAP_HAS_CACHE) { 3661 err = -ENOENT; << 3662 } else if (usage == SWAP_HAS_ << 3663 if (has_cache) << 3664 err = -EEXIST << 3665 } else if ((count & ~COUNT_CO << 3666 err = -EINVAL; << 3667 } << 3668 3303 3669 if (err) !! 3304 /* set SWAP_HAS_CACHE if there is no cache and entry is used */ 3670 goto unlock_out; !! 3305 if (!has_cache && count) 3671 } !! 3306 has_cache = SWAP_HAS_CACHE; >> 3307 else if (has_cache) /* someone else added cache */ >> 3308 err = -EEXIST; >> 3309 else /* no users remaining */ >> 3310 err = -ENOENT; 3672 3311 3673 for (i = 0; i < nr; i++) { !! 3312 } else if (count || has_cache) { 3674 count = si->swap_map[offset + << 3675 has_cache = count & SWAP_HAS_ << 3676 count &= ~SWAP_HAS_CACHE; << 3677 3313 3678 if (usage == SWAP_HAS_CACHE) !! 3314 if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) 3679 has_cache = SWAP_HAS_ << 3680 else if ((count & ~COUNT_CONT << 3681 count += usage; 3315 count += usage; 3682 else if (swap_count_continued !! 3316 else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) >> 3317 err = -EINVAL; >> 3318 else if (swap_count_continued(p, offset, count)) 3683 count = COUNT_CONTINU 3319 count = COUNT_CONTINUED; 3684 else { !! 3320 else 3685 /* << 3686 * Don't need to roll << 3687 * usage == 1, there << 3688 */ << 3689 err = -ENOMEM; 3321 err = -ENOMEM; 3690 goto unlock_out; !! 3322 } else 3691 } !! 3323 err = -ENOENT; /* unused swap entry */ 3692 3324 3693 WRITE_ONCE(si->swap_map[offse !! 3325 WRITE_ONCE(p->swap_map[offset], count | has_cache); 3694 } << 3695 3326 3696 unlock_out: 3327 unlock_out: 3697 unlock_cluster_or_swap_info(si, ci); !! 3328 unlock_cluster_or_swap_info(p, ci); 3698 return err; 3329 return err; 3699 } 3330 } 3700 3331 3701 /* 3332 /* 3702 * Help swapoff by noting that swap entry bel 3333 * Help swapoff by noting that swap entry belongs to shmem/tmpfs 3703 * (in which case its reference count is neve 3334 * (in which case its reference count is never incremented). 3704 */ 3335 */ 3705 void swap_shmem_alloc(swp_entry_t entry, int !! 
3336 void swap_shmem_alloc(swp_entry_t entry) 3706 { 3337 { 3707 __swap_duplicate(entry, SWAP_MAP_SHME !! 3338 __swap_duplicate(entry, SWAP_MAP_SHMEM); 3708 } 3339 } 3709 3340 3710 /* 3341 /* 3711 * Increase reference count of swap entry by 3342 * Increase reference count of swap entry by 1. 3712 * Returns 0 for success, or -ENOMEM if a swa 3343 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required 3713 * but could not be atomically allocated. Re 3344 * but could not be atomically allocated. Returns 0, just as if it succeeded, 3714 * if __swap_duplicate() fails for another re 3345 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which 3715 * might occur if a page table entry has got 3346 * might occur if a page table entry has got corrupted. 3716 */ 3347 */ 3717 int swap_duplicate(swp_entry_t entry) 3348 int swap_duplicate(swp_entry_t entry) 3718 { 3349 { 3719 int err = 0; 3350 int err = 0; 3720 3351 3721 while (!err && __swap_duplicate(entry !! 3352 while (!err && __swap_duplicate(entry, 1) == -ENOMEM) 3722 err = add_swap_count_continua 3353 err = add_swap_count_continuation(entry, GFP_ATOMIC); 3723 return err; 3354 return err; 3724 } 3355 } 3725 3356 3726 /* 3357 /* 3727 * @entry: first swap entry from which we all !! 3358 * @entry: swap entry for which we allocate swap cache. 3728 * 3359 * 3729 * Called when allocating swap cache for exis !! 3360 * Called when allocating swap cache for existing swap entry, 3730 * This can return error codes. Returns 0 at 3361 * This can return error codes. Returns 0 at success. 3731 * -EEXIST means there is a swap cache. 3362 * -EEXIST means there is a swap cache. 3732 * Note: return code is different from swap_d 3363 * Note: return code is different from swap_duplicate(). 3733 */ 3364 */ 3734 int swapcache_prepare(swp_entry_t entry, int !! 3365 int swapcache_prepare(swp_entry_t entry) 3735 { 3366 { 3736 return __swap_duplicate(entry, SWAP_H !! 3367 return __swap_duplicate(entry, SWAP_HAS_CACHE); 3737 } 3368 } 3738 3369 3739 void swapcache_clear(struct swap_info_struct !! 3370 struct swap_info_struct *swp_swap_info(swp_entry_t entry) 3740 { 3371 { 3741 unsigned long offset = swp_offset(ent !! 3372 return swap_type_to_swap_info(swp_type(entry)); 3742 << 3743 cluster_swap_free_nr(si, offset, nr, << 3744 } 3373 } 3745 3374 3746 struct swap_info_struct *swp_swap_info(swp_en !! 3375 struct swap_info_struct *page_swap_info(struct page *page) 3747 { 3376 { 3748 return swap_type_to_swap_info(swp_typ !! 3377 swp_entry_t entry = { .val = page_private(page) }; >> 3378 return swp_swap_info(entry); 3749 } 3379 } 3750 3380 3751 /* 3381 /* 3752 * out-of-line methods to avoid include hell. 3382 * out-of-line methods to avoid include hell. 3753 */ 3383 */ 3754 struct address_space *swapcache_mapping(struc 3384 struct address_space *swapcache_mapping(struct folio *folio) 3755 { 3385 { 3756 return swp_swap_info(folio->swap)->sw !! 3386 return page_swap_info(&folio->page)->swap_file->f_mapping; 3757 } 3387 } 3758 EXPORT_SYMBOL_GPL(swapcache_mapping); 3388 EXPORT_SYMBOL_GPL(swapcache_mapping); 3759 3389 3760 pgoff_t __folio_swap_cache_index(struct folio !! 3390 pgoff_t __page_file_index(struct page *page) 3761 { 3391 { 3762 return swap_cache_index(folio->swap); !! 3392 swp_entry_t swap = { .val = page_private(page) }; >> 3393 return swp_offset(swap); 3763 } 3394 } 3764 EXPORT_SYMBOL_GPL(__folio_swap_cache_index); !! 
3395 EXPORT_SYMBOL_GPL(__page_file_index); 3765 3396 3766 /* 3397 /* 3767 * add_swap_count_continuation - called when 3398 * add_swap_count_continuation - called when a swap count is duplicated 3768 * beyond SWAP_MAP_MAX, it allocates a new pa 3399 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's 3769 * page of the original vmalloc'ed swap_map, 3400 * page of the original vmalloc'ed swap_map, to hold the continuation count 3770 * (for that entry and for its neighbouring P 3401 * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called 3771 * again when count is duplicated beyond SWAP 3402 * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc. 3772 * 3403 * 3773 * These continuation pages are seldom refere 3404 * These continuation pages are seldom referenced: the common paths all work 3774 * on the original swap_map, only referring t 3405 * on the original swap_map, only referring to a continuation page when the 3775 * low "digit" of a count is incremented or d 3406 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. 3776 * 3407 * 3777 * add_swap_count_continuation(, GFP_ATOMIC) 3408 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding 3778 * page table locks; if it fails, add_swap_co 3409 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL) 3779 * can be called after dropping locks. 3410 * can be called after dropping locks. 3780 */ 3411 */ 3781 int add_swap_count_continuation(swp_entry_t e 3412 int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) 3782 { 3413 { 3783 struct swap_info_struct *si; 3414 struct swap_info_struct *si; 3784 struct swap_cluster_info *ci; 3415 struct swap_cluster_info *ci; 3785 struct page *head; 3416 struct page *head; 3786 struct page *page; 3417 struct page *page; 3787 struct page *list_page; 3418 struct page *list_page; 3788 pgoff_t offset; 3419 pgoff_t offset; 3789 unsigned char count; 3420 unsigned char count; 3790 int ret = 0; 3421 int ret = 0; 3791 3422 3792 /* 3423 /* 3793 * When debugging, it's easier to use 3424 * When debugging, it's easier to use __GFP_ZERO here; but it's better 3794 * for latency not to zero a page whi 3425 * for latency not to zero a page while GFP_ATOMIC and holding locks. 3795 */ 3426 */ 3796 page = alloc_page(gfp_mask | __GFP_HI 3427 page = alloc_page(gfp_mask | __GFP_HIGHMEM); 3797 3428 3798 si = get_swap_device(entry); 3429 si = get_swap_device(entry); 3799 if (!si) { 3430 if (!si) { 3800 /* 3431 /* 3801 * An acceptable race has occ 3432 * An acceptable race has occurred since the failing 3802 * __swap_duplicate(): the sw 3433 * __swap_duplicate(): the swap device may be swapoff 3803 */ 3434 */ 3804 goto outer; 3435 goto outer; 3805 } 3436 } 3806 spin_lock(&si->lock); 3437 spin_lock(&si->lock); 3807 3438 3808 offset = swp_offset(entry); 3439 offset = swp_offset(entry); 3809 3440 3810 ci = lock_cluster(si, offset); 3441 ci = lock_cluster(si, offset); 3811 3442 3812 count = swap_count(si->swap_map[offse 3443 count = swap_count(si->swap_map[offset]); 3813 3444 3814 if ((count & ~COUNT_CONTINUED) != SWA 3445 if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { 3815 /* 3446 /* 3816 * The higher the swap count, 3447 * The higher the swap count, the more likely it is that tasks 3817 * will race to add swap coun 3448 * will race to add swap count continuation: we need to avoid 3818 * over-provisioning. 3449 * over-provisioning. 
3819 */ 3450 */ 3820 goto out; 3451 goto out; 3821 } 3452 } 3822 3453 3823 if (!page) { 3454 if (!page) { 3824 ret = -ENOMEM; 3455 ret = -ENOMEM; 3825 goto out; 3456 goto out; 3826 } 3457 } 3827 3458 3828 head = vmalloc_to_page(si->swap_map + 3459 head = vmalloc_to_page(si->swap_map + offset); 3829 offset &= ~PAGE_MASK; 3460 offset &= ~PAGE_MASK; 3830 3461 3831 spin_lock(&si->cont_lock); 3462 spin_lock(&si->cont_lock); 3832 /* 3463 /* 3833 * Page allocation does not initializ 3464 * Page allocation does not initialize the page's lru field, 3834 * but it does always reset its priva 3465 * but it does always reset its private field. 3835 */ 3466 */ 3836 if (!page_private(head)) { 3467 if (!page_private(head)) { 3837 BUG_ON(count & COUNT_CONTINUE 3468 BUG_ON(count & COUNT_CONTINUED); 3838 INIT_LIST_HEAD(&head->lru); 3469 INIT_LIST_HEAD(&head->lru); 3839 set_page_private(head, SWP_CO 3470 set_page_private(head, SWP_CONTINUED); 3840 si->flags |= SWP_CONTINUED; 3471 si->flags |= SWP_CONTINUED; 3841 } 3472 } 3842 3473 3843 list_for_each_entry(list_page, &head- 3474 list_for_each_entry(list_page, &head->lru, lru) { 3844 unsigned char *map; 3475 unsigned char *map; 3845 3476 3846 /* 3477 /* 3847 * If the previous map said n 3478 * If the previous map said no continuation, but we've found 3848 * a continuation page, free 3479 * a continuation page, free our allocation and use this one. 3849 */ 3480 */ 3850 if (!(count & COUNT_CONTINUED 3481 if (!(count & COUNT_CONTINUED)) 3851 goto out_unlock_cont; 3482 goto out_unlock_cont; 3852 3483 3853 map = kmap_local_page(list_pa !! 3484 map = kmap_atomic(list_page) + offset; 3854 count = *map; 3485 count = *map; 3855 kunmap_local(map); !! 3486 kunmap_atomic(map); 3856 3487 3857 /* 3488 /* 3858 * If this continuation count 3489 * If this continuation count now has some space in it, 3859 * free our allocation and us 3490 * free our allocation and use this one. 3860 */ 3491 */ 3861 if ((count & ~COUNT_CONTINUED 3492 if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX) 3862 goto out_unlock_cont; 3493 goto out_unlock_cont; 3863 } 3494 } 3864 3495 3865 list_add_tail(&page->lru, &head->lru) 3496 list_add_tail(&page->lru, &head->lru); 3866 page = NULL; /* no 3497 page = NULL; /* now it's attached, don't free it */ 3867 out_unlock_cont: 3498 out_unlock_cont: 3868 spin_unlock(&si->cont_lock); 3499 spin_unlock(&si->cont_lock); 3869 out: 3500 out: 3870 unlock_cluster(ci); 3501 unlock_cluster(ci); 3871 spin_unlock(&si->lock); 3502 spin_unlock(&si->lock); 3872 put_swap_device(si); 3503 put_swap_device(si); 3873 outer: 3504 outer: 3874 if (page) 3505 if (page) 3875 __free_page(page); 3506 __free_page(page); 3876 return ret; 3507 return ret; 3877 } 3508 } 3878 3509 3879 /* 3510 /* 3880 * swap_count_continued - when the original s 3511 * swap_count_continued - when the original swap_map count is incremented 3881 * from SWAP_MAP_MAX, check if there is alrea 3512 * from SWAP_MAP_MAX, check if there is already a continuation page to carry 3882 * into, carry if so, or else fail until a ne 3513 * into, carry if so, or else fail until a new continuation page is allocated; 3883 * when the original swap_map count is decrem 3514 * when the original swap_map count is decremented from 0 with continuation, 3884 * borrow from the continuation and report wh 3515 * borrow from the continuation and report whether it still holds more. 3885 * Called while __swap_duplicate() or swap_en 3516 * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster 3886 * lock. 3517 * lock. 
3887 */ 3518 */ 3888 static bool swap_count_continued(struct swap_ 3519 static bool swap_count_continued(struct swap_info_struct *si, 3889 pgoff_t offs 3520 pgoff_t offset, unsigned char count) 3890 { 3521 { 3891 struct page *head; 3522 struct page *head; 3892 struct page *page; 3523 struct page *page; 3893 unsigned char *map; 3524 unsigned char *map; 3894 bool ret; 3525 bool ret; 3895 3526 3896 head = vmalloc_to_page(si->swap_map + 3527 head = vmalloc_to_page(si->swap_map + offset); 3897 if (page_private(head) != SWP_CONTINU 3528 if (page_private(head) != SWP_CONTINUED) { 3898 BUG_ON(count & COUNT_CONTINUE 3529 BUG_ON(count & COUNT_CONTINUED); 3899 return false; /* ne 3530 return false; /* need to add count continuation */ 3900 } 3531 } 3901 3532 3902 spin_lock(&si->cont_lock); 3533 spin_lock(&si->cont_lock); 3903 offset &= ~PAGE_MASK; 3534 offset &= ~PAGE_MASK; 3904 page = list_next_entry(head, lru); 3535 page = list_next_entry(head, lru); 3905 map = kmap_local_page(page) + offset; !! 3536 map = kmap_atomic(page) + offset; 3906 3537 3907 if (count == SWAP_MAP_MAX) /* in 3538 if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ 3908 goto init_map; /* ju 3539 goto init_map; /* jump over SWAP_CONT_MAX checks */ 3909 3540 3910 if (count == (SWAP_MAP_MAX | COUNT_CO 3541 if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */ 3911 /* 3542 /* 3912 * Think of how you add 1 to 3543 * Think of how you add 1 to 999 3913 */ 3544 */ 3914 while (*map == (SWAP_CONT_MAX 3545 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { 3915 kunmap_local(map); !! 3546 kunmap_atomic(map); 3916 page = list_next_entr 3547 page = list_next_entry(page, lru); 3917 BUG_ON(page == head); 3548 BUG_ON(page == head); 3918 map = kmap_local_page !! 3549 map = kmap_atomic(page) + offset; 3919 } 3550 } 3920 if (*map == SWAP_CONT_MAX) { 3551 if (*map == SWAP_CONT_MAX) { 3921 kunmap_local(map); !! 3552 kunmap_atomic(map); 3922 page = list_next_entr 3553 page = list_next_entry(page, lru); 3923 if (page == head) { 3554 if (page == head) { 3924 ret = false; 3555 ret = false; /* add count continuation */ 3925 goto out; 3556 goto out; 3926 } 3557 } 3927 map = kmap_local_page !! 3558 map = kmap_atomic(page) + offset; 3928 init_map: *map = 0; 3559 init_map: *map = 0; /* we didn't zero the page */ 3929 } 3560 } 3930 *map += 1; 3561 *map += 1; 3931 kunmap_local(map); !! 3562 kunmap_atomic(map); 3932 while ((page = list_prev_entr 3563 while ((page = list_prev_entry(page, lru)) != head) { 3933 map = kmap_local_page !! 3564 map = kmap_atomic(page) + offset; 3934 *map = COUNT_CONTINUE 3565 *map = COUNT_CONTINUED; 3935 kunmap_local(map); !! 3566 kunmap_atomic(map); 3936 } 3567 } 3937 ret = true; 3568 ret = true; /* incremented */ 3938 3569 3939 } else { 3570 } else { /* decrementing */ 3940 /* 3571 /* 3941 * Think of how you subtract 3572 * Think of how you subtract 1 from 1000 3942 */ 3573 */ 3943 BUG_ON(count != COUNT_CONTINU 3574 BUG_ON(count != COUNT_CONTINUED); 3944 while (*map == COUNT_CONTINUE 3575 while (*map == COUNT_CONTINUED) { 3945 kunmap_local(map); !! 3576 kunmap_atomic(map); 3946 page = list_next_entr 3577 page = list_next_entry(page, lru); 3947 BUG_ON(page == head); 3578 BUG_ON(page == head); 3948 map = kmap_local_page !! 3579 map = kmap_atomic(page) + offset; 3949 } 3580 } 3950 BUG_ON(*map == 0); 3581 BUG_ON(*map == 0); 3951 *map -= 1; 3582 *map -= 1; 3952 if (*map == 0) 3583 if (*map == 0) 3953 count = 0; 3584 count = 0; 3954 kunmap_local(map); !! 
3585 kunmap_atomic(map); 3955 while ((page = list_prev_entr 3586 while ((page = list_prev_entry(page, lru)) != head) { 3956 map = kmap_local_page !! 3587 map = kmap_atomic(page) + offset; 3957 *map = SWAP_CONT_MAX 3588 *map = SWAP_CONT_MAX | count; 3958 count = COUNT_CONTINU 3589 count = COUNT_CONTINUED; 3959 kunmap_local(map); !! 3590 kunmap_atomic(map); 3960 } 3591 } 3961 ret = count == COUNT_CONTINUE 3592 ret = count == COUNT_CONTINUED; 3962 } 3593 } 3963 out: 3594 out: 3964 spin_unlock(&si->cont_lock); 3595 spin_unlock(&si->cont_lock); 3965 return ret; 3596 return ret; 3966 } 3597 } 3967 3598 3968 /* 3599 /* 3969 * free_swap_count_continuations - swapoff fr 3600 * free_swap_count_continuations - swapoff free all the continuation pages 3970 * appended to the swap_map, after swap_map i 3601 * appended to the swap_map, after swap_map is quiesced, before vfree'ing it. 3971 */ 3602 */ 3972 static void free_swap_count_continuations(str 3603 static void free_swap_count_continuations(struct swap_info_struct *si) 3973 { 3604 { 3974 pgoff_t offset; 3605 pgoff_t offset; 3975 3606 3976 for (offset = 0; offset < si->max; of 3607 for (offset = 0; offset < si->max; offset += PAGE_SIZE) { 3977 struct page *head; 3608 struct page *head; 3978 head = vmalloc_to_page(si->sw 3609 head = vmalloc_to_page(si->swap_map + offset); 3979 if (page_private(head)) { 3610 if (page_private(head)) { 3980 struct page *page, *n 3611 struct page *page, *next; 3981 3612 3982 list_for_each_entry_s 3613 list_for_each_entry_safe(page, next, &head->lru, lru) { 3983 list_del(&pag 3614 list_del(&page->lru); 3984 __free_page(p 3615 __free_page(page); 3985 } 3616 } 3986 } 3617 } 3987 } 3618 } 3988 } 3619 } 3989 3620 3990 #if defined(CONFIG_MEMCG) && defined(CONFIG_B 3621 #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) 3991 void __folio_throttle_swaprate(struct folio * 3622 void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) 3992 { 3623 { 3993 struct swap_info_struct *si, *next; 3624 struct swap_info_struct *si, *next; 3994 int nid = folio_nid(folio); 3625 int nid = folio_nid(folio); 3995 3626 3996 if (!(gfp & __GFP_IO)) 3627 if (!(gfp & __GFP_IO)) 3997 return; << 3998 << 3999 if (!__has_usable_swap()) << 4000 return; 3628 return; 4001 3629 4002 if (!blk_cgroup_congested()) 3630 if (!blk_cgroup_congested()) 4003 return; 3631 return; 4004 3632 4005 /* 3633 /* 4006 * We've already scheduled a throttle 3634 * We've already scheduled a throttle, avoid taking the global swap 4007 * lock. 3635 * lock. 
4008 */ 3636
4009 if (current->throttle_disk) 3637 if (current->throttle_disk)
4010 return; 3638 return;
4011 3639
4012 spin_lock(&swap_avail_lock); 3640 spin_lock(&swap_avail_lock);
4013 plist_for_each_entry_safe(si, next, & 3641 plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
4014 avail_lists 3642 avail_lists[nid]) {
4015 if (si->bdev) { 3643 if (si->bdev) {
4016 blkcg_schedule_thrott 3644 blkcg_schedule_throttle(si->bdev->bd_disk, true);
4017 break; 3645 break;
4018 } 3646 }
4019 } 3647 }
4020 spin_unlock(&swap_avail_lock); 3648 spin_unlock(&swap_avail_lock);
4021 } 3649 }
4022 #endif 3650 #endif
4023 3651
4024 static int __init swapfile_init(void) 3652 static int __init swapfile_init(void)
4025 { 3653 {
4026 int nid; 3654 int nid;
4027 3655
4028 swap_avail_heads = kmalloc_array(nr_n 3656 swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
4029 GFP_ 3657 GFP_KERNEL);
4030 if (!swap_avail_heads) { 3658 if (!swap_avail_heads) {
4031 pr_emerg("Not enough memory f 3659 pr_emerg("Not enough memory for swap heads, swap is disabled\n");
4032 return -ENOMEM; 3660 return -ENOMEM;
4033 } 3661 }
4034 3662
4035 for_each_node(nid) 3663 for_each_node(nid)
4036 plist_head_init(&swap_avail_h 3664 plist_head_init(&swap_avail_heads[nid]);
4037 3665
4038 swapfile_maximum_size = arch_max_swap 3666 swapfile_maximum_size = arch_max_swapfile_size();
4039 3667
4040 #ifdef CONFIG_MIGRATION 3668 #ifdef CONFIG_MIGRATION
4041 if (swapfile_maximum_size >= (1UL << 3669 if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS))
4042 swap_migration_ad_supported = 3670 swap_migration_ad_supported = true;
4043 #endif /* CONFIG_MIGRATION */ 3671 #endif /* CONFIG_MIGRATION */
4044 3672
4045 return 0; 3673 return 0;
4046 } 3674 }
4047 subsys_initcall(swapfile_init); 3675 subsys_initcall(swapfile_init);
4048 3676
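For reference, below is a minimal user-space sketch of how the swap_flags value decoded by the swapon syscall shown above is typically assembled by a caller. It is illustrative only and not part of swapfile.c: the device path is hypothetical, the priority value is arbitrary, and the SWAP_FLAG_* constants and the swapon()/swapoff() wrappers are assumed to come from glibc's <sys/swap.h> (with a guarded fallback for SWAP_FLAG_DISCARD in case an older header lacks it).

/*
 * Illustrative sketch (not kernel code): build a swap_flags word the way
 * sys_swapon() above expects it -- SWAP_FLAG_PREFER plus a priority encoded
 * through SWAP_FLAG_PRIO_SHIFT/SWAP_FLAG_PRIO_MASK, optionally with
 * SWAP_FLAG_DISCARD, which the syscall expands into area and page discard.
 */
#include <stdio.h>
#include <sys/swap.h>

#ifndef SWAP_FLAG_DISCARD
#define SWAP_FLAG_DISCARD 0x10000	/* same value tested by sys_swapon() */
#endif

int main(void)
{
	const char *dev = "/dev/example-swap";	/* hypothetical swap device */
	int prio = 10;				/* becomes the area's priority */
	int flags = SWAP_FLAG_PREFER |
		    ((prio << SWAP_FLAG_PRIO_SHIFT) & SWAP_FLAG_PRIO_MASK) |
		    SWAP_FLAG_DISCARD;

	/* requires CAP_SYS_ADMIN, mirroring the capable() check in sys_swapon() */
	if (swapon(dev, flags) != 0) {
		perror("swapon");
		return 1;
	}

	/* ... the area is now active on the swap priority list ... */

	if (swapoff(dev) != 0) {
		perror("swapoff");
		return 1;
	}
	return 0;
}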