1 // SPDX-License-Identifier: GPL-2.0-only << 2 /* 1 /* 3 * linux/mm/swapfile.c 2 * linux/mm/swapfile.c 4 * 3 * 5 * Copyright (C) 1991, 1992, 1993, 1994 Linu 4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 6 * Swap reorganised 29.12.95, Stephen Tweedie 5 * Swap reorganised 29.12.95, Stephen Tweedie 7 */ 6 */ 8 7 9 #include <linux/blkdev.h> << 10 #include <linux/mm.h> 8 #include <linux/mm.h> 11 #include <linux/sched/mm.h> << 12 #include <linux/sched/task.h> << 13 #include <linux/hugetlb.h> 9 #include <linux/hugetlb.h> 14 #include <linux/mman.h> 10 #include <linux/mman.h> 15 #include <linux/slab.h> 11 #include <linux/slab.h> 16 #include <linux/kernel_stat.h> 12 #include <linux/kernel_stat.h> 17 #include <linux/swap.h> 13 #include <linux/swap.h> 18 #include <linux/vmalloc.h> 14 #include <linux/vmalloc.h> 19 #include <linux/pagemap.h> 15 #include <linux/pagemap.h> 20 #include <linux/namei.h> 16 #include <linux/namei.h> 21 #include <linux/shmem_fs.h> 17 #include <linux/shmem_fs.h> 22 #include <linux/blk-cgroup.h> !! 18 #include <linux/blkdev.h> 23 #include <linux/random.h> 19 #include <linux/random.h> 24 #include <linux/writeback.h> 20 #include <linux/writeback.h> 25 #include <linux/proc_fs.h> 21 #include <linux/proc_fs.h> 26 #include <linux/seq_file.h> 22 #include <linux/seq_file.h> 27 #include <linux/init.h> 23 #include <linux/init.h> 28 #include <linux/ksm.h> 24 #include <linux/ksm.h> 29 #include <linux/rmap.h> 25 #include <linux/rmap.h> 30 #include <linux/security.h> 26 #include <linux/security.h> 31 #include <linux/backing-dev.h> 27 #include <linux/backing-dev.h> 32 #include <linux/mutex.h> 28 #include <linux/mutex.h> 33 #include <linux/capability.h> 29 #include <linux/capability.h> 34 #include <linux/syscalls.h> 30 #include <linux/syscalls.h> 35 #include <linux/memcontrol.h> 31 #include <linux/memcontrol.h> 36 #include <linux/poll.h> 32 #include <linux/poll.h> 37 #include <linux/oom.h> 33 #include <linux/oom.h> >> 34 #include <linux/frontswap.h> 38 #include <linux/swapfile.h> 35 #include <linux/swapfile.h> 39 #include <linux/export.h> 36 #include <linux/export.h> 40 #include <linux/swap_slots.h> << 41 #include <linux/sort.h> << 42 #include <linux/completion.h> << 43 #include <linux/suspend.h> << 44 #include <linux/zswap.h> << 45 #include <linux/plist.h> << 46 37 >> 38 #include <asm/pgtable.h> 47 #include <asm/tlbflush.h> 39 #include <asm/tlbflush.h> 48 #include <linux/swapops.h> 40 #include <linux/swapops.h> 49 #include <linux/swap_cgroup.h> 41 #include <linux/swap_cgroup.h> 50 #include "internal.h" << 51 #include "swap.h" << 52 42 53 static bool swap_count_continued(struct swap_i 43 static bool swap_count_continued(struct swap_info_struct *, pgoff_t, 54 unsigned char 44 unsigned char); 55 static void free_swap_count_continuations(stru 45 static void free_swap_count_continuations(struct swap_info_struct *); 56 static void swap_entry_range_free(struct swap_ !! 46 static sector_t map_swap_entry(swp_entry_t, struct block_device**); 57 unsigned int << 58 static void swap_range_alloc(struct swap_info_ << 59 unsigned int nr_e << 60 static bool folio_swapcache_freeable(struct fo << 61 static struct swap_cluster_info *lock_cluster_ << 62 struct swap_info_struct *si, u << 63 static void unlock_cluster_or_swap_info(struct << 64 struct << 65 47 66 static DEFINE_SPINLOCK(swap_lock); !! 
48 DEFINE_SPINLOCK(swap_lock); 67 static unsigned int nr_swapfiles; 49 static unsigned int nr_swapfiles; 68 atomic_long_t nr_swap_pages; 50 atomic_long_t nr_swap_pages; 69 /* << 70 * Some modules use swappable objects and may << 71 * memory pressure (via the shrinker). Before << 72 * check to see if any swap space is available << 73 */ << 74 EXPORT_SYMBOL_GPL(nr_swap_pages); << 75 /* protected with swap_lock. reading in vm_swa 51 /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ 76 long total_swap_pages; 52 long total_swap_pages; 77 static int least_priority = -1; !! 53 static int least_priority; 78 unsigned long swapfile_maximum_size; << 79 #ifdef CONFIG_MIGRATION << 80 bool swap_migration_ad_supported; << 81 #endif /* CONFIG_MIGRATION */ << 82 54 83 static const char Bad_file[] = "Bad swap file 55 static const char Bad_file[] = "Bad swap file entry "; 84 static const char Unused_file[] = "Unused swap 56 static const char Unused_file[] = "Unused swap file entry "; 85 static const char Bad_offset[] = "Bad swap off 57 static const char Bad_offset[] = "Bad swap offset entry "; 86 static const char Unused_offset[] = "Unused sw 58 static const char Unused_offset[] = "Unused swap offset entry "; 87 59 88 /* 60 /* 89 * all active swap_info_structs 61 * all active swap_info_structs 90 * protected with swap_lock, and ordered by pr 62 * protected with swap_lock, and ordered by priority. 91 */ 63 */ 92 static PLIST_HEAD(swap_active_head); !! 64 PLIST_HEAD(swap_active_head); 93 65 94 /* 66 /* 95 * all available (active, not full) swap_info_ 67 * all available (active, not full) swap_info_structs 96 * protected with swap_avail_lock, ordered by 68 * protected with swap_avail_lock, ordered by priority. 97 * This is used by folio_alloc_swap() instead !! 69 * This is used by get_swap_page() instead of swap_active_head 98 * because swap_active_head includes all swap_ 70 * because swap_active_head includes all swap_info_structs, 99 * but folio_alloc_swap() doesn't need to look !! 71 * but get_swap_page() doesn't need to look at full ones. 100 * This uses its own lock instead of swap_lock 72 * This uses its own lock instead of swap_lock because when a 101 * swap_info_struct changes between not-full/f 73 * swap_info_struct changes between not-full/full, it needs to 102 * add/remove itself to/from this list, but th 74 * add/remove itself to/from this list, but the swap_info_struct->lock 103 * is held and the locking order requires swap 75 * is held and the locking order requires swap_lock to be taken 104 * before any swap_info_struct->lock. 76 * before any swap_info_struct->lock. 105 */ 77 */ 106 static struct plist_head *swap_avail_heads; !! 78 static PLIST_HEAD(swap_avail_head); 107 static DEFINE_SPINLOCK(swap_avail_lock); 79 static DEFINE_SPINLOCK(swap_avail_lock); 108 80 109 static struct swap_info_struct *swap_info[MAX_ !! 
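The swap_count() helper above strips SWAP_HAS_CACHE from a swap_map byte: each byte packs a reference count together with a "page is in the swap cache" bit and a continuation bit for counts that overflow. A minimal userspace sketch of that packing; the constant values are modelled on <linux/swap.h> and are illustrative, not authoritative:

/* Sketch of the swap_map byte layout read by swap_count() above.
 * Constants are illustrative, modelled on <linux/swap.h>. */
#include <stdio.h>

#define SWAP_MAP_MAX     0x3e  /* largest count stored in the byte itself */
#define SWAP_HAS_CACHE   0x40  /* entry also has a page in the swap cache */
#define COUNT_CONTINUED  0x80  /* count continued in a continuation page */

static unsigned char swap_count(unsigned char ent)
{
        return ent & ~SWAP_HAS_CACHE;   /* may still include COUNT_CONTINUED */
}

int main(void)
{
        unsigned char ent = 3 | SWAP_HAS_CACHE; /* 3 references + swap cache */

        printf("raw byte  : 0x%02x\n", ent);
        printf("map count : %d\n", swap_count(ent));
        printf("has cache : %s\n", (ent & SWAP_HAS_CACHE) ? "yes" : "no");
        return 0;
}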
81 struct swap_info_struct *swap_info[MAX_SWAPFILES]; 110 82 111 static DEFINE_MUTEX(swapon_mutex); 83 static DEFINE_MUTEX(swapon_mutex); 112 84 113 static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait) 85 static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); 114 /* Activity counter to indicate that a swapon 86 /* Activity counter to indicate that a swapon or swapoff has occurred */ 115 static atomic_t proc_poll_event = ATOMIC_INIT( 87 static atomic_t proc_poll_event = ATOMIC_INIT(0); 116 88 117 atomic_t nr_rotate_swap = ATOMIC_INIT(0); << 118 << 119 static struct swap_info_struct *swap_type_to_s << 120 { << 121 if (type >= MAX_SWAPFILES) << 122 return NULL; << 123 << 124 return READ_ONCE(swap_info[type]); /* << 125 } << 126 << 127 static inline unsigned char swap_count(unsigne 89 static inline unsigned char swap_count(unsigned char ent) 128 { 90 { 129 return ent & ~SWAP_HAS_CACHE; /* may !! 91 return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ 130 } 92 } 131 93 132 /* Reclaim the swap entry anyway if possible * !! 94 /* returns 1 if swap entry is freed */ 133 #define TTRS_ANYWAY 0x1 !! 95 static int 134 /* !! 96 __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) 135 * Reclaim the swap entry if there are no more << 136 * corresponding page << 137 */ << 138 #define TTRS_UNMAPPED 0x2 << 139 /* Reclaim the swap entry if swap is getting f << 140 #define TTRS_FULL 0x4 << 141 /* Reclaim directly, bypass the slot cache and << 142 #define TTRS_DIRECT 0x8 << 143 << 144 static bool swap_is_has_cache(struct swap_info << 145 unsigned long of << 146 { << 147 unsigned char *map = si->swap_map + of << 148 unsigned char *map_end = map + nr_page << 149 << 150 do { << 151 VM_BUG_ON(!(*map & SWAP_HAS_CA << 152 if (*map != SWAP_HAS_CACHE) << 153 return false; << 154 } while (++map < map_end); << 155 << 156 return true; << 157 } << 158 << 159 static bool swap_is_last_map(struct swap_info_ << 160 unsigned long offset, int nr_p << 161 { << 162 unsigned char *map = si->swap_map + of << 163 unsigned char *map_end = map + nr_page << 164 unsigned char count = *map; << 165 << 166 if (swap_count(count) != 1) << 167 return false; << 168 << 169 while (++map < map_end) { << 170 if (*map != count) << 171 return false; << 172 } << 173 << 174 *has_cache = !!(count & SWAP_HAS_CACHE << 175 return true; << 176 } << 177 << 178 /* << 179 * returns number of pages in the folio that b << 180 * the folio was reclaimed. If negative, the f << 181 * folio was associated with the swap entry. << 182 */ << 183 static int __try_to_reclaim_swap(struct swap_i << 184 unsigned long << 185 { 97 { 186 swp_entry_t entry = swp_entry(si->type 98 swp_entry_t entry = swp_entry(si->type, offset); 187 struct address_space *address_space = !! 99 struct page *page; 188 struct swap_cluster_info *ci; !! 100 int ret = 0; 189 struct folio *folio; << 190 int ret, nr_pages; << 191 bool need_reclaim; << 192 101 193 folio = filemap_get_folio(address_spac !! 102 page = find_get_page(swap_address_space(entry), entry.val); 194 if (IS_ERR(folio)) !! 103 if (!page) 195 return 0; 104 return 0; 196 << 197 nr_pages = folio_nr_pages(folio); << 198 ret = -nr_pages; << 199 << 200 /* 105 /* 201 * When this function is called from s !! 106 * This function is called from scan_swap_map() and it's called 202 * called by vmscan.c at reclaiming fo !! 107 * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here. 203 * here. We have to use trylock for av !! 108 * We have to use trylock for avoiding deadlock. 
This is a special 204 * case and you should use folio_free_ !! 109 * case and you should use try_to_free_swap() with explicit lock_page() 205 * in usual operations. 110 * in usual operations. 206 */ 111 */ 207 if (!folio_trylock(folio)) !! 112 if (trylock_page(page)) { 208 goto out; !! 113 ret = try_to_free_swap(page); 209 !! 114 unlock_page(page); 210 /* offset could point to the middle of !! 115 } 211 entry = folio->swap; !! 116 page_cache_release(page); 212 offset = swp_offset(entry); << 213 << 214 need_reclaim = ((flags & TTRS_ANYWAY) << 215 ((flags & TTRS_UNMAPPE << 216 ((flags & TTRS_FULL) & << 217 if (!need_reclaim || !folio_swapcache_ << 218 goto out_unlock; << 219 << 220 /* << 221 * It's safe to delete the folio from << 222 * swap_map is HAS_CACHE only, which m << 223 * reference or pending writeback, and << 224 */ << 225 ci = lock_cluster_or_swap_info(si, off << 226 need_reclaim = swap_is_has_cache(si, o << 227 unlock_cluster_or_swap_info(si, ci); << 228 if (!need_reclaim) << 229 goto out_unlock; << 230 << 231 if (!(flags & TTRS_DIRECT)) { << 232 /* Free through slot cache */ << 233 delete_from_swap_cache(folio); << 234 folio_set_dirty(folio); << 235 ret = nr_pages; << 236 goto out_unlock; << 237 } << 238 << 239 xa_lock_irq(&address_space->i_pages); << 240 __delete_from_swap_cache(folio, entry, << 241 xa_unlock_irq(&address_space->i_pages) << 242 folio_ref_sub(folio, nr_pages); << 243 folio_set_dirty(folio); << 244 << 245 spin_lock(&si->lock); << 246 /* Only sinple page folio can be backe << 247 if (nr_pages == 1) << 248 zswap_invalidate(entry); << 249 swap_entry_range_free(si, entry, nr_pa << 250 spin_unlock(&si->lock); << 251 ret = nr_pages; << 252 out_unlock: << 253 folio_unlock(folio); << 254 out: << 255 folio_put(folio); << 256 return ret; 117 return ret; 257 } 118 } 258 119 259 static inline struct swap_extent *first_se(str << 260 { << 261 struct rb_node *rb = rb_first(&sis->sw << 262 return rb_entry(rb, struct swap_extent << 263 } << 264 << 265 static inline struct swap_extent *next_se(stru << 266 { << 267 struct rb_node *rb = rb_next(&se->rb_n << 268 return rb ? rb_entry(rb, struct swap_e << 269 } << 270 << 271 /* 120 /* 272 * swapon tell device that all the old swap co 121 * swapon tell device that all the old swap contents can be discarded, 273 * to allow the swap device to optimize its we 122 * to allow the swap device to optimize its wear-levelling. 274 */ 123 */ 275 static int discard_swap(struct swap_info_struc 124 static int discard_swap(struct swap_info_struct *si) 276 { 125 { 277 struct swap_extent *se; 126 struct swap_extent *se; 278 sector_t start_block; 127 sector_t start_block; 279 sector_t nr_blocks; 128 sector_t nr_blocks; 280 int err = 0; 129 int err = 0; 281 130 282 /* Do not discard the swap header page 131 /* Do not discard the swap header page! */ 283 se = first_se(si); !! 132 se = &si->first_swap_extent; 284 start_block = (se->start_block + 1) << 133 start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); 285 nr_blocks = ((sector_t)se->nr_pages - 134 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); 286 if (nr_blocks) { 135 if (nr_blocks) { 287 err = blkdev_issue_discard(si- 136 err = blkdev_issue_discard(si->bdev, start_block, 288 nr_blocks, GFP !! 137 nr_blocks, GFP_KERNEL, 0); 289 if (err) 138 if (err) 290 return err; 139 return err; 291 cond_resched(); 140 cond_resched(); 292 } 141 } 293 142 294 for (se = next_se(se); se; se = next_s !! 
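The comment above explains why __try_to_reclaim_swap() only trylocks the page: it runs from the swap allocation path, so sleeping on the page lock could deadlock, and the reclaim is simply skipped when the lock is busy. A minimal pthread sketch of that "trylock or back off" discipline; the lock names are invented for illustration:

/* Minimal sketch of the trylock-or-back-off discipline used above.
 * Build with: cc -pthread trylock.c */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t reclaim_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t page_lock    = PTHREAD_MUTEX_INITIALIZER;

/* Returns 1 if the "page" was reclaimed, 0 if we backed off. */
static int try_to_reclaim(void)
{
        int ret = 0;

        pthread_mutex_lock(&reclaim_lock);        /* already held in the real code */
        if (pthread_mutex_trylock(&page_lock) == 0) {
                ret = 1;                          /* ... free the swap entry ... */
                pthread_mutex_unlock(&page_lock);
        }
        /* On EBUSY we give up instead of risking a lock-order deadlock. */
        pthread_mutex_unlock(&reclaim_lock);
        return ret;
}

int main(void)
{
        printf("reclaimed: %d\n", try_to_reclaim());
        return 0;
}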
143 list_for_each_entry(se, &si->first_swap_extent.list, list) { 295 start_block = se->start_block 144 start_block = se->start_block << (PAGE_SHIFT - 9); 296 nr_blocks = (sector_t)se->nr_p 145 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); 297 146 298 err = blkdev_issue_discard(si- 147 err = blkdev_issue_discard(si->bdev, start_block, 299 nr_blocks, GFP !! 148 nr_blocks, GFP_KERNEL, 0); 300 if (err) 149 if (err) 301 break; 150 break; 302 151 303 cond_resched(); 152 cond_resched(); 304 } 153 } 305 return err; /* That will o 154 return err; /* That will often be -EOPNOTSUPP */ 306 } 155 } 307 156 308 static struct swap_extent * << 309 offset_to_swap_extent(struct swap_info_struct << 310 { << 311 struct swap_extent *se; << 312 struct rb_node *rb; << 313 << 314 rb = sis->swap_extent_root.rb_node; << 315 while (rb) { << 316 se = rb_entry(rb, struct swap_ << 317 if (offset < se->start_page) << 318 rb = rb->rb_left; << 319 else if (offset >= se->start_p << 320 rb = rb->rb_right; << 321 else << 322 return se; << 323 } << 324 /* It *must* be present */ << 325 BUG(); << 326 } << 327 << 328 sector_t swap_folio_sector(struct folio *folio << 329 { << 330 struct swap_info_struct *sis = swp_swa << 331 struct swap_extent *se; << 332 sector_t sector; << 333 pgoff_t offset; << 334 << 335 offset = swp_offset(folio->swap); << 336 se = offset_to_swap_extent(sis, offset << 337 sector = se->start_block + (offset - s << 338 return sector << (PAGE_SHIFT - 9); << 339 } << 340 << 341 /* 157 /* 342 * swap allocation tell device that a cluster 158 * swap allocation tell device that a cluster of swap can now be discarded, 343 * to allow the swap device to optimize its we 159 * to allow the swap device to optimize its wear-levelling. 344 */ 160 */ 345 static void discard_swap_cluster(struct swap_i 161 static void discard_swap_cluster(struct swap_info_struct *si, 346 pgoff_t start 162 pgoff_t start_page, pgoff_t nr_pages) 347 { 163 { 348 struct swap_extent *se = offset_to_swa !! 164 struct swap_extent *se = si->curr_swap_extent; >> 165 int found_extent = 0; 349 166 350 while (nr_pages) { 167 while (nr_pages) { 351 pgoff_t offset = start_page - !! 168 struct list_head *lh; 352 sector_t start_block = se->sta << 353 sector_t nr_blocks = se->nr_pa << 354 << 355 if (nr_blocks > nr_pages) << 356 nr_blocks = nr_pages; << 357 start_page += nr_blocks; << 358 nr_pages -= nr_blocks; << 359 << 360 start_block <<= PAGE_SHIFT - 9 << 361 nr_blocks <<= PAGE_SHIFT - 9; << 362 if (blkdev_issue_discard(si->b << 363 nr_blo << 364 break; << 365 169 366 se = next_se(se); !! 
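discard_swap() and discard_swap_cluster() above convert page-granular extents into 512-byte sectors before calling blkdev_issue_discard(); shifting by (PAGE_SHIFT - 9) works because a sector is 2^9 bytes. A small standalone sketch of that conversion, with PAGE_SHIFT fixed at 12 for illustration:

/* Sketch of the page -> 512-byte-sector conversion used by discard_swap(). */
#include <stdio.h>

#define PAGE_SHIFT   12                 /* 4 KiB pages, illustrative */
#define SECTOR_SHIFT 9                  /* 512-byte sectors */

typedef unsigned long long sector_t;

static void extent_to_sectors(unsigned long start_page, unsigned long nr_pages,
                              sector_t *start_sector, sector_t *nr_sectors)
{
        *start_sector = (sector_t)start_page << (PAGE_SHIFT - SECTOR_SHIFT);
        *nr_sectors   = (sector_t)nr_pages   << (PAGE_SHIFT - SECTOR_SHIFT);
}

int main(void)
{
        sector_t start, nr;

        /* Like discard_swap(): skip the swap header page of the first extent. */
        extent_to_sectors(0 + 1, 1024 - 1, &start, &nr);
        printf("discard sectors %llu..%llu\n", start, start + nr - 1);
        return 0;
}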
170 if (se->start_page <= start_page && >> 171 start_page < se->start_page + se->nr_pages) { >> 172 pgoff_t offset = start_page - se->start_page; >> 173 sector_t start_block = se->start_block + offset; >> 174 sector_t nr_blocks = se->nr_pages - offset; >> 175 >> 176 if (nr_blocks > nr_pages) >> 177 nr_blocks = nr_pages; >> 178 start_page += nr_blocks; >> 179 nr_pages -= nr_blocks; >> 180 >> 181 if (!found_extent++) >> 182 si->curr_swap_extent = se; >> 183 >> 184 start_block <<= PAGE_SHIFT - 9; >> 185 nr_blocks <<= PAGE_SHIFT - 9; >> 186 if (blkdev_issue_discard(si->bdev, start_block, >> 187 nr_blocks, GFP_NOIO, 0)) >> 188 break; >> 189 } >> 190 >> 191 lh = se->list.next; >> 192 se = list_entry(lh, struct swap_extent, list); 367 } 193 } 368 } 194 } 369 195 370 #ifdef CONFIG_THP_SWAP << 371 #define SWAPFILE_CLUSTER HPAGE_PMD_NR << 372 << 373 #define swap_entry_order(order) (order) << 374 #else << 375 #define SWAPFILE_CLUSTER 256 196 #define SWAPFILE_CLUSTER 256 376 << 377 /* << 378 * Define swap_entry_order() as constant to le << 379 * out some code if !CONFIG_THP_SWAP << 380 */ << 381 #define swap_entry_order(order) 0 << 382 #endif << 383 #define LATENCY_LIMIT 256 197 #define LATENCY_LIMIT 256 384 198 385 static inline bool cluster_is_free(struct swap !! 199 static inline void cluster_set_flag(struct swap_cluster_info *info, >> 200 unsigned int flag) 386 { 201 { 387 return info->flags & CLUSTER_FLAG_FREE !! 202 info->flags = flag; 388 } 203 } 389 204 390 static inline unsigned int cluster_index(struc !! 205 static inline unsigned int cluster_count(struct swap_cluster_info *info) 391 struc << 392 { 206 { 393 return ci - si->cluster_info; !! 207 return info->data; 394 } 208 } 395 209 396 static inline unsigned int cluster_offset(stru !! 210 static inline void cluster_set_count(struct swap_cluster_info *info, 397 stru !! 211 unsigned int c) 398 { 212 { 399 return cluster_index(si, ci) * SWAPFIL !! 213 info->data = c; 400 } 214 } 401 215 402 static inline struct swap_cluster_info *lock_c !! 216 static inline void cluster_set_count_flag(struct swap_cluster_info *info, 403 !! 217 unsigned int c, unsigned int f) 404 { 218 { 405 struct swap_cluster_info *ci; !! 219 info->flags = f; >> 220 info->data = c; >> 221 } 406 222 407 ci = si->cluster_info; !! 223 static inline unsigned int cluster_next(struct swap_cluster_info *info) 408 if (ci) { !! 224 { 409 ci += offset / SWAPFILE_CLUSTE !! 225 return info->data; 410 spin_lock(&ci->lock); << 411 } << 412 return ci; << 413 } 226 } 414 227 415 static inline void unlock_cluster(struct swap_ !! 228 static inline void cluster_set_next(struct swap_cluster_info *info, >> 229 unsigned int n) 416 { 230 { 417 if (ci) !! 231 info->data = n; 418 spin_unlock(&ci->lock); << 419 } 232 } 420 233 421 /* !! 234 static inline void cluster_set_next_flag(struct swap_cluster_info *info, 422 * Determine the locking method in use for thi !! 235 unsigned int n, unsigned int f) 423 * swap_cluster_info if SSD-style cluster-base << 424 */ << 425 static inline struct swap_cluster_info *lock_c << 426 struct swap_info_struct *si, u << 427 { 236 { 428 struct swap_cluster_info *ci; !! 237 info->flags = f; >> 238 info->data = n; >> 239 } 429 240 430 /* Try to use fine-grained SSD-style l !! 241 static inline bool cluster_is_free(struct swap_cluster_info *info) 431 ci = lock_cluster(si, offset); !! 242 { 432 /* Otherwise, fall back to traditional !! 243 return info->flags & CLUSTER_FLAG_FREE; 433 if (!ci) !! 244 } 434 spin_lock(&si->lock); << 435 245 436 return ci; !! 
246 static inline bool cluster_is_null(struct swap_cluster_info *info) >> 247 { >> 248 return info->flags & CLUSTER_FLAG_NEXT_NULL; 437 } 249 } 438 250 439 static inline void unlock_cluster_or_swap_info !! 251 static inline void cluster_set_null(struct swap_cluster_info *info) 440 << 441 { 252 { 442 if (ci) !! 253 info->flags = CLUSTER_FLAG_NEXT_NULL; 443 unlock_cluster(ci); !! 254 info->data = 0; 444 else << 445 spin_unlock(&si->lock); << 446 } 255 } 447 256 448 /* Add a cluster to discard list and schedule 257 /* Add a cluster to discard list and schedule it to do discard */ 449 static void swap_cluster_schedule_discard(stru 258 static void swap_cluster_schedule_discard(struct swap_info_struct *si, 450 struct swap_cluster_info *ci) !! 259 unsigned int idx) 451 { 260 { 452 unsigned int idx = cluster_index(si, c << 453 /* 261 /* 454 * If scan_swap_map_slots() can't find !! 262 * If scan_swap_map() can't find a free cluster, it will check 455 * si->swap_map directly. To make sure 263 * si->swap_map directly. To make sure the discarding cluster isn't 456 * taken by scan_swap_map_slots(), mar !! 264 * taken by scan_swap_map(), mark the swap entries bad (occupied). It 457 * It will be cleared after discard !! 265 * will be cleared after discard 458 */ 266 */ 459 memset(si->swap_map + idx * SWAPFILE_C 267 memset(si->swap_map + idx * SWAPFILE_CLUSTER, 460 SWAP_MAP_BAD, SWAPFILE 268 SWAP_MAP_BAD, SWAPFILE_CLUSTER); 461 269 462 VM_BUG_ON(ci->flags & CLUSTER_FLAG_FRE !! 270 if (cluster_is_null(&si->discard_cluster_head)) { 463 list_move_tail(&ci->list, &si->discard !! 271 cluster_set_next_flag(&si->discard_cluster_head, 464 ci->flags = 0; !! 272 idx, 0); 465 schedule_work(&si->discard_work); !! 273 cluster_set_next_flag(&si->discard_cluster_tail, 466 } !! 274 idx, 0); 467 !! 275 } else { 468 static void __free_cluster(struct swap_info_st !! 276 unsigned int tail = cluster_next(&si->discard_cluster_tail); 469 { !! 277 cluster_set_next(&si->cluster_info[tail], idx); 470 lockdep_assert_held(&si->lock); !! 278 cluster_set_next_flag(&si->discard_cluster_tail, 471 lockdep_assert_held(&ci->lock); !! 279 idx, 0); >> 280 } 472 281 473 if (ci->flags) !! 282 schedule_work(&si->discard_work); 474 list_move_tail(&ci->list, &si- << 475 else << 476 list_add_tail(&ci->list, &si-> << 477 ci->flags = CLUSTER_FLAG_FREE; << 478 ci->order = 0; << 479 } 283 } 480 284 481 /* 285 /* 482 * Doing discard actually. After a cluster dis 286 * Doing discard actually. After a cluster discard is finished, the cluster 483 * will be added to free cluster list. caller 287 * will be added to free cluster list. caller should hold si->lock. 484 */ 288 */ 485 static void swap_do_scheduled_discard(struct s 289 static void swap_do_scheduled_discard(struct swap_info_struct *si) 486 { 290 { 487 struct swap_cluster_info *ci; !! 291 struct swap_cluster_info *info; 488 unsigned int idx; 292 unsigned int idx; 489 293 490 while (!list_empty(&si->discard_cluste !! 294 info = si->cluster_info; 491 ci = list_first_entry(&si->dis !! 295 492 list_del(&ci->list); !! 296 while (!cluster_is_null(&si->discard_cluster_head)) { 493 idx = cluster_index(si, ci); !! 
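In the older code on the right-hand column, the free and discard cluster lists are not list_heads: each swap_cluster_info packs a flags field and a data field, and for a cluster sitting on a list the data field holds the index of the next cluster, with head and tail kept in dedicated swap_cluster_info slots. A minimal sketch of such an index-threaded list over an array (no locking, simplified flags):

/* Sketch of the index-threaded cluster list used by the older code:
 * a listed cluster's "data" field holds the index of the next member. */
#include <stdio.h>

#define NR_CLUSTERS        8
#define CLUSTER_FLAG_FREE  1
#define CLUSTER_FLAG_NULL  2            /* stands in for CLUSTER_FLAG_NEXT_NULL */

struct cluster {
        unsigned int flags;
        unsigned int data;              /* usage count, or next index when listed */
};

static struct cluster clusters[NR_CLUSTERS];
static struct cluster head = { CLUSTER_FLAG_NULL, 0 };
static struct cluster tail = { CLUSTER_FLAG_NULL, 0 };

static void list_add_tail_idx(unsigned int idx)
{
        clusters[idx].flags = CLUSTER_FLAG_FREE;
        if (head.flags & CLUSTER_FLAG_NULL) {
                head.flags = tail.flags = 0;
                head.data = tail.data = idx;
        } else {
                clusters[tail.data].data = idx;  /* thread through the array */
                tail.data = idx;
        }
}

static int list_pop_head_idx(void)
{
        int idx;

        if (head.flags & CLUSTER_FLAG_NULL)
                return -1;
        idx = head.data;
        if (tail.data == (unsigned int)idx)
                head.flags = tail.flags = CLUSTER_FLAG_NULL;
        else
                head.data = clusters[idx].data;
        return idx;
}

int main(void)
{
        list_add_tail_idx(3);
        list_add_tail_idx(5);
        printf("popped %d, then %d\n", list_pop_head_idx(), list_pop_head_idx());
        return 0;
}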
297 idx = cluster_next(&si->discard_cluster_head); >> 298 >> 299 cluster_set_next_flag(&si->discard_cluster_head, >> 300 cluster_next(&info[idx]), 0); >> 301 if (cluster_next(&si->discard_cluster_tail) == idx) { >> 302 cluster_set_null(&si->discard_cluster_head); >> 303 cluster_set_null(&si->discard_cluster_tail); >> 304 } 494 spin_unlock(&si->lock); 305 spin_unlock(&si->lock); 495 306 496 discard_swap_cluster(si, idx * 307 discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, 497 SWAPFILE_CLUST 308 SWAPFILE_CLUSTER); 498 309 499 spin_lock(&si->lock); 310 spin_lock(&si->lock); 500 spin_lock(&ci->lock); !! 311 cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE); 501 __free_cluster(si, ci); !! 312 if (cluster_is_null(&si->free_cluster_head)) { >> 313 cluster_set_next_flag(&si->free_cluster_head, >> 314 idx, 0); >> 315 cluster_set_next_flag(&si->free_cluster_tail, >> 316 idx, 0); >> 317 } else { >> 318 unsigned int tail; >> 319 >> 320 tail = cluster_next(&si->free_cluster_tail); >> 321 cluster_set_next(&info[tail], idx); >> 322 cluster_set_next_flag(&si->free_cluster_tail, >> 323 idx, 0); >> 324 } 502 memset(si->swap_map + idx * SW 325 memset(si->swap_map + idx * SWAPFILE_CLUSTER, 503 0, SWAPFILE_CL 326 0, SWAPFILE_CLUSTER); 504 spin_unlock(&ci->lock); << 505 } 327 } 506 } 328 } 507 329 508 static void swap_discard_work(struct work_stru 330 static void swap_discard_work(struct work_struct *work) 509 { 331 { 510 struct swap_info_struct *si; 332 struct swap_info_struct *si; 511 333 512 si = container_of(work, struct swap_in 334 si = container_of(work, struct swap_info_struct, discard_work); 513 335 514 spin_lock(&si->lock); 336 spin_lock(&si->lock); 515 swap_do_scheduled_discard(si); 337 swap_do_scheduled_discard(si); 516 spin_unlock(&si->lock); 338 spin_unlock(&si->lock); 517 } 339 } 518 340 519 static void swap_users_ref_free(struct percpu_ << 520 { << 521 struct swap_info_struct *si; << 522 << 523 si = container_of(ref, struct swap_inf << 524 complete(&si->comp); << 525 } << 526 << 527 static void free_cluster(struct swap_info_stru << 528 { << 529 VM_BUG_ON(ci->count != 0); << 530 lockdep_assert_held(&si->lock); << 531 lockdep_assert_held(&ci->lock); << 532 << 533 if (ci->flags & CLUSTER_FLAG_FRAG) << 534 si->frag_cluster_nr[ci->order] << 535 << 536 /* << 537 * If the swap is discardable, prepare << 538 * instead of free it immediately. The << 539 * after discard. << 540 */ << 541 if ((si->flags & (SWP_WRITEOK | SWP_PA << 542 (SWP_WRITEOK | SWP_PAGE_DISCARD)) << 543 swap_cluster_schedule_discard( << 544 return; << 545 } << 546 << 547 __free_cluster(si, ci); << 548 } << 549 << 550 /* 341 /* 551 * The cluster corresponding to page_nr will b !! 342 * The cluster corresponding to page_nr will be used. The cluster will be 552 * added to free cluster list and its usage co !! 343 * removed from free cluster list and its usage counter will be increased. 553 * Only used for initialization. << 554 */ 344 */ 555 static void inc_cluster_info_page(struct swap_ !! 
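swap_do_scheduled_discard() above drops si->lock around the actual discard and re-takes it before moving the cluster to the free list, because blkdev_issue_discard() can sleep. A minimal sketch of that "drop the lock around blocking work" pattern, with the discard reduced to a stub:

/* Sketch of the drop-the-lock-around-blocking-work pattern used by
 * swap_do_scheduled_discard().  Build with: cc -pthread discard.c */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t si_lock = PTHREAD_MUTEX_INITIALIZER;
static int discard_list[4] = { 2, 5, 7, -1 };   /* cluster indexes, -1 ends */
static int free_list[4], nr_free;

static void blocking_discard(int idx)           /* stands in for the bio path */
{
        usleep(1000);                            /* pretend the device is slow */
        printf("discarded cluster %d\n", idx);
}

static void do_scheduled_discard(void)
{
        int i, idx;

        pthread_mutex_lock(&si_lock);
        for (i = 0; (idx = discard_list[i]) != -1; i++) {
                pthread_mutex_unlock(&si_lock);  /* never sleep holding the lock */
                blocking_discard(idx);
                pthread_mutex_lock(&si_lock);
                free_list[nr_free++] = idx;      /* cluster is reusable again */
        }
        pthread_mutex_unlock(&si_lock);
}

int main(void)
{
        do_scheduled_discard();
        printf("%d clusters back on the free list\n", nr_free);
        return 0;
}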
345 static void inc_cluster_info_page(struct swap_info_struct *p, 556 struct swap_cluster_info *cluster_info 346 struct swap_cluster_info *cluster_info, unsigned long page_nr) 557 { 347 { 558 unsigned long idx = page_nr / SWAPFILE 348 unsigned long idx = page_nr / SWAPFILE_CLUSTER; 559 struct swap_cluster_info *ci; << 560 349 561 if (!cluster_info) 350 if (!cluster_info) 562 return; 351 return; >> 352 if (cluster_is_free(&cluster_info[idx])) { >> 353 VM_BUG_ON(cluster_next(&p->free_cluster_head) != idx); >> 354 cluster_set_next_flag(&p->free_cluster_head, >> 355 cluster_next(&cluster_info[idx]), 0); >> 356 if (cluster_next(&p->free_cluster_tail) == idx) { >> 357 cluster_set_null(&p->free_cluster_tail); >> 358 cluster_set_null(&p->free_cluster_head); >> 359 } >> 360 cluster_set_count_flag(&cluster_info[idx], 0, 0); >> 361 } 563 362 564 ci = cluster_info + idx; !! 363 VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER); 565 ci->count++; !! 364 cluster_set_count(&cluster_info[idx], 566 !! 365 cluster_count(&cluster_info[idx]) + 1); 567 VM_BUG_ON(ci->count > SWAPFILE_CLUSTER << 568 VM_BUG_ON(ci->flags); << 569 } 366 } 570 367 571 /* 368 /* 572 * The cluster ci decreases @nr_pages usage. I !! 369 * The cluster corresponding to page_nr decreases one usage. If the usage 573 * which means no page in the cluster is in us !! 370 * counter becomes 0, which means no page in the cluster is in using, we can 574 * the cluster and add it to free cluster list !! 371 * optionally discard the cluster and add it to free cluster list. 575 */ 372 */ 576 static void dec_cluster_info_page(struct swap_ !! 373 static void dec_cluster_info_page(struct swap_info_struct *p, 577 struct swap_ !! 374 struct swap_cluster_info *cluster_info, unsigned long page_nr) 578 { 375 { 579 if (!si->cluster_info) !! 376 unsigned long idx = page_nr / SWAPFILE_CLUSTER; 580 return; << 581 << 582 VM_BUG_ON(ci->count < nr_pages); << 583 VM_BUG_ON(cluster_is_free(ci)); << 584 lockdep_assert_held(&si->lock); << 585 lockdep_assert_held(&ci->lock); << 586 ci->count -= nr_pages; << 587 377 588 if (!ci->count) { !! 378 if (!cluster_info) 589 free_cluster(si, ci); << 590 return; 379 return; 591 } << 592 380 593 if (!(ci->flags & CLUSTER_FLAG_NONFULL !! 381 VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0); 594 VM_BUG_ON(ci->flags & CLUSTER_ !! 382 cluster_set_count(&cluster_info[idx], 595 if (ci->flags & CLUSTER_FLAG_F !! 383 cluster_count(&cluster_info[idx]) - 1); 596 si->frag_cluster_nr[ci !! 384 597 list_move_tail(&ci->list, &si- !! 385 if (cluster_count(&cluster_info[idx]) == 0) { 598 ci->flags = CLUSTER_FLAG_NONFU !! 386 /* 599 } !! 387 * If the swap is discardable, prepare discard the cluster 600 } !! 388 * instead of free it immediately. The cluster will be freed 601 !! 389 * after discard. 602 static bool cluster_reclaim_range(struct swap_ !! 390 */ 603 struct swap_ !! 391 if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == 604 unsigned lon !! 392 (SWP_WRITEOK | SWP_PAGE_DISCARD)) { 605 { !! 393 swap_cluster_schedule_discard(p, idx); 606 unsigned char *map = si->swap_map; !! 
394 return; 607 unsigned long offset; << 608 << 609 spin_unlock(&ci->lock); << 610 spin_unlock(&si->lock); << 611 << 612 for (offset = start; offset < end; off << 613 switch (READ_ONCE(map[offset]) << 614 case 0: << 615 continue; << 616 case SWAP_HAS_CACHE: << 617 if (__try_to_reclaim_s << 618 continue; << 619 goto out; << 620 default: << 621 goto out; << 622 } << 623 } << 624 out: << 625 spin_lock(&si->lock); << 626 spin_lock(&ci->lock); << 627 << 628 /* << 629 * Recheck the range no matter reclaim << 630 * could have been be freed while we a << 631 */ << 632 for (offset = start; offset < end; off << 633 if (READ_ONCE(map[offset])) << 634 return false; << 635 << 636 return true; << 637 } << 638 << 639 static bool cluster_scan_range(struct swap_inf << 640 struct swap_clu << 641 unsigned long s << 642 { << 643 unsigned long offset, end = start + nr << 644 unsigned char *map = si->swap_map; << 645 bool need_reclaim = false; << 646 << 647 for (offset = start; offset < end; off << 648 switch (READ_ONCE(map[offset]) << 649 case 0: << 650 continue; << 651 case SWAP_HAS_CACHE: << 652 if (!vm_swap_full()) << 653 return false; << 654 need_reclaim = true; << 655 continue; << 656 default: << 657 return false; << 658 } << 659 } << 660 << 661 if (need_reclaim) << 662 return cluster_reclaim_range(s << 663 << 664 return true; << 665 } << 666 << 667 static void cluster_alloc_range(struct swap_in << 668 unsigned int s << 669 unsigned int o << 670 { << 671 unsigned int nr_pages = 1 << order; << 672 << 673 if (cluster_is_free(ci)) { << 674 if (nr_pages < SWAPFILE_CLUSTE << 675 list_move_tail(&ci->li << 676 ci->flags = CLUSTER_FL << 677 } 395 } 678 ci->order = order; << 679 } << 680 << 681 memset(si->swap_map + start, usage, nr << 682 swap_range_alloc(si, start, nr_pages); << 683 ci->count += nr_pages; << 684 << 685 if (ci->count == SWAPFILE_CLUSTER) { << 686 VM_BUG_ON(!(ci->flags & << 687 (CLUSTER_FLAG_FREE | << 688 if (ci->flags & CLUSTER_FLAG_F << 689 si->frag_cluster_nr[ci << 690 list_move_tail(&ci->list, &si- << 691 ci->flags = CLUSTER_FLAG_FULL; << 692 } << 693 } << 694 << 695 static unsigned int alloc_swap_scan_cluster(st << 696 un << 697 un << 698 { << 699 unsigned long start = offset & ~(SWAPF << 700 unsigned long end = min(start + SWAPFI << 701 unsigned int nr_pages = 1 << order; << 702 struct swap_cluster_info *ci; << 703 396 704 if (end < nr_pages) !! 397 cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); 705 return SWAP_NEXT_INVALID; !! 398 if (cluster_is_null(&p->free_cluster_head)) { 706 end -= nr_pages; !! 399 cluster_set_next_flag(&p->free_cluster_head, idx, 0); 707 !! 400 cluster_set_next_flag(&p->free_cluster_tail, idx, 0); 708 ci = lock_cluster(si, offset); !! 401 } else { 709 if (ci->count + nr_pages > SWAPFILE_CL !! 402 unsigned int tail = cluster_next(&p->free_cluster_tail); 710 offset = SWAP_NEXT_INVALID; !! 403 cluster_set_next(&cluster_info[tail], idx); 711 goto done; !! 404 cluster_set_next_flag(&p->free_cluster_tail, idx, 0); 712 } << 713 << 714 while (offset <= end) { << 715 if (cluster_scan_range(si, ci, << 716 cluster_alloc_range(si << 717 *foundp = offset; << 718 if (ci->count == SWAPF << 719 offset = SWAP_ << 720 goto done; << 721 } << 722 offset += nr_pages; << 723 break; << 724 } 405 } 725 offset += nr_pages; << 726 } 406 } 727 if (offset > end) << 728 offset = SWAP_NEXT_INVALID; << 729 done: << 730 unlock_cluster(ci); << 731 return offset; << 732 } 407 } 733 408 734 /* Return true if reclaimed a whole cluster */ !! 
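inc_cluster_info_page() and dec_cluster_info_page() above keep a per-cluster count of in-use swap entries: the first allocation takes the cluster off the free list, and when the count drops back to zero the cluster is returned to the free list or queued for discard. A small sketch of that counting, with the free list and the discard decision reduced to a flag:

/* Sketch of per-cluster usage counting as in inc/dec_cluster_info_page(). */
#include <assert.h>
#include <stdio.h>

#define SWAPFILE_CLUSTER 256

struct cluster_info {
        unsigned int count;             /* in-use swap entries in this cluster */
        unsigned int free;              /* 1 while on the (conceptual) free list */
};

static struct cluster_info clusters[16];

static void inc_cluster_info_page(unsigned long page_nr)
{
        struct cluster_info *ci = &clusters[page_nr / SWAPFILE_CLUSTER];

        if (ci->free)                   /* first use: leave the free list */
                ci->free = 0;
        assert(ci->count < SWAPFILE_CLUSTER);
        ci->count++;
}

static void dec_cluster_info_page(unsigned long page_nr)
{
        struct cluster_info *ci = &clusters[page_nr / SWAPFILE_CLUSTER];

        assert(ci->count > 0);
        if (--ci->count == 0)
                ci->free = 1;           /* or schedule a discard, as above */
}

int main(void)
{
        clusters[0].free = 1;
        inc_cluster_info_page(0);
        inc_cluster_info_page(1);
        dec_cluster_info_page(0);
        dec_cluster_info_page(1);
        printf("cluster 0 free again: %u\n", clusters[0].free);
        return 0;
}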
409 /* 735 static void swap_reclaim_full_clusters(struct !! 410 * It's possible scan_swap_map() uses a free cluster in the middle of free >> 411 * cluster list. Avoiding such abuse to avoid list corruption. >> 412 */ >> 413 static bool >> 414 scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, >> 415 unsigned long offset) 736 { 416 { 737 long to_scan = 1; !! 417 struct percpu_cluster *percpu_cluster; 738 unsigned long offset, end; !! 418 bool conflict; 739 struct swap_cluster_info *ci; << 740 unsigned char *map = si->swap_map; << 741 int nr_reclaim; << 742 << 743 if (force) << 744 to_scan = si->inuse_pages / SW << 745 << 746 while (!list_empty(&si->full_clusters) << 747 ci = list_first_entry(&si->ful << 748 list_move_tail(&ci->list, &si- << 749 offset = cluster_offset(si, ci << 750 end = min(si->max, offset + SW << 751 to_scan--; << 752 << 753 spin_unlock(&si->lock); << 754 while (offset < end) { << 755 if (READ_ONCE(map[offs << 756 nr_reclaim = _ << 757 << 758 if (nr_reclaim << 759 offset << 760 contin << 761 } << 762 } << 763 offset++; << 764 } << 765 spin_lock(&si->lock); << 766 419 767 if (to_scan <= 0) !! 420 offset /= SWAPFILE_CLUSTER; 768 break; !! 421 conflict = !cluster_is_null(&si->free_cluster_head) && 769 } !! 422 offset != cluster_next(&si->free_cluster_head) && 770 } !! 423 cluster_is_free(&si->cluster_info[offset]); 771 << 772 static void swap_reclaim_work(struct work_stru << 773 { << 774 struct swap_info_struct *si; << 775 424 776 si = container_of(work, struct swap_in !! 425 if (!conflict) >> 426 return false; 777 427 778 spin_lock(&si->lock); !! 428 percpu_cluster = this_cpu_ptr(si->percpu_cluster); 779 swap_reclaim_full_clusters(si, true); !! 429 cluster_set_null(&percpu_cluster->index); 780 spin_unlock(&si->lock); !! 430 return true; 781 } 431 } 782 432 783 /* 433 /* 784 * Try to get swap entries with specified orde !! 434 * Try to get a swap entry from current cpu's swap entry pool (a cluster). This 785 * pool (a cluster). This might involve alloca !! 435 * might involve allocating a new cluster for current CPU too. 786 * too. << 787 */ 436 */ 788 static unsigned long cluster_alloc_swap_entry( !! 437 static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, 789 !! 438 unsigned long *offset, unsigned long *scan_base) 790 { 439 { 791 struct percpu_cluster *cluster; 440 struct percpu_cluster *cluster; 792 struct swap_cluster_info *ci; !! 441 bool found_free; 793 unsigned int offset, found = 0; !! 442 unsigned long tmp; 794 443 795 new_cluster: 444 new_cluster: 796 lockdep_assert_held(&si->lock); << 797 cluster = this_cpu_ptr(si->percpu_clus 445 cluster = this_cpu_ptr(si->percpu_cluster); 798 offset = cluster->next[order]; !! 446 if (cluster_is_null(&cluster->index)) { 799 if (offset) { !! 447 if (!cluster_is_null(&si->free_cluster_head)) { 800 offset = alloc_swap_scan_clust !! 448 cluster->index = si->free_cluster_head; 801 if (found) !! 449 cluster->next = cluster_next(&cluster->index) * 802 goto done; !! 450 SWAPFILE_CLUSTER; 803 } !! 
451 } else if (!cluster_is_null(&si->discard_cluster_head)) { 804 << 805 if (!list_empty(&si->free_clusters)) { << 806 ci = list_first_entry(&si->fre << 807 offset = alloc_swap_scan_clust << 808 VM_BUG_ON(!found); << 809 goto done; << 810 } << 811 << 812 /* Try reclaim from full clusters if f << 813 if (vm_swap_full()) << 814 swap_reclaim_full_clusters(si, << 815 << 816 if (order < PMD_ORDER) { << 817 unsigned int frags = 0; << 818 << 819 while (!list_empty(&si->nonful << 820 ci = list_first_entry( << 821 << 822 list_move_tail(&ci->li << 823 ci->flags = CLUSTER_FL << 824 si->frag_cluster_nr[or << 825 offset = alloc_swap_sc << 826 << 827 frags++; << 828 if (found) << 829 break; << 830 } << 831 << 832 if (!found) { << 833 /* 452 /* 834 * Nonfull clusters ar !! 453 * we don't have free cluster but have some clusters in 835 * here, count them to !! 454 * discarding, do discard now and reclaim them 836 */ 455 */ 837 while (frags < si->fra !! 456 swap_do_scheduled_discard(si); 838 ci = list_firs !! 457 *scan_base = *offset = si->cluster_next; 839 !! 458 goto new_cluster; 840 /* !! 459 } else 841 * Rotate the !! 460 return; 842 * high order << 843 * this help k << 844 */ << 845 list_move_tail << 846 offset = alloc << 847 << 848 frags++; << 849 if (found) << 850 break; << 851 } << 852 } << 853 } << 854 << 855 if (found) << 856 goto done; << 857 << 858 if (!list_empty(&si->discard_clusters) << 859 /* << 860 * we don't have free cluster << 861 * discarding, do discard now << 862 * reread cluster_next_cpu sin << 863 */ << 864 swap_do_scheduled_discard(si); << 865 goto new_cluster; << 866 } << 867 << 868 if (order) << 869 goto done; << 870 << 871 /* Order 0 stealing from higher order << 872 for (int o = 1; o < SWAP_NR_ORDERS; o+ << 873 /* << 874 * Clusters here have at least << 875 * allocation, but reclaim may << 876 */ << 877 while (!list_empty(&si->frag_c << 878 ci = list_first_entry( << 879 << 880 offset = alloc_swap_sc << 881 << 882 if (found) << 883 goto done; << 884 } << 885 << 886 while (!list_empty(&si->nonful << 887 ci = list_first_entry( << 888 << 889 offset = alloc_swap_sc << 890 << 891 if (found) << 892 goto done; << 893 } << 894 } << 895 << 896 done: << 897 cluster->next[order] = offset; << 898 return found; << 899 } << 900 << 901 static void __del_from_avail_list(struct swap_ << 902 { << 903 int nid; << 904 << 905 assert_spin_locked(&si->lock); << 906 for_each_node(nid) << 907 plist_del(&si->avail_lists[nid << 908 } << 909 << 910 static void del_from_avail_list(struct swap_in << 911 { << 912 spin_lock(&swap_avail_lock); << 913 __del_from_avail_list(si); << 914 spin_unlock(&swap_avail_lock); << 915 } << 916 << 917 static void swap_range_alloc(struct swap_info_ << 918 unsigned int nr_e << 919 { << 920 unsigned int end = offset + nr_entries << 921 << 922 if (offset == si->lowest_bit) << 923 si->lowest_bit += nr_entries; << 924 if (end == si->highest_bit) << 925 WRITE_ONCE(si->highest_bit, si << 926 WRITE_ONCE(si->inuse_pages, si->inuse_ << 927 if (si->inuse_pages == si->pages) { << 928 si->lowest_bit = si->max; << 929 si->highest_bit = 0; << 930 del_from_avail_list(si); << 931 << 932 if (vm_swap_full()) << 933 schedule_work(&si->rec << 934 } << 935 } << 936 << 937 static void add_to_avail_list(struct swap_info << 938 { << 939 int nid; << 940 << 941 spin_lock(&swap_avail_lock); << 942 for_each_node(nid) << 943 plist_add(&si->avail_lists[nid << 944 spin_unlock(&swap_avail_lock); << 945 } << 946 << 947 static void swap_range_free(struct swap_info_s << 948 unsigned int nr_en << 949 { << 950 
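scan_swap_map_try_ssd_cluster() above gives each CPU its own current cluster and next offset, so concurrent SSD allocations spread over different clusters instead of piling onto one region; when the cached cluster is used up, a fresh one is taken from the free list. A single-threaded sketch of that bookkeeping, with the per-CPU state reduced to a small array:

/* Sketch of per-CPU cluster allocation as in scan_swap_map_try_ssd_cluster(). */
#include <stdio.h>

#define SWAPFILE_CLUSTER 4              /* tiny, for illustration */
#define NR_CLUSTERS      4
#define NR_CPUS          2

static unsigned char swap_map[NR_CLUSTERS * SWAPFILE_CLUSTER];
static int free_clusters[NR_CLUSTERS] = { 0, 1, 2, 3 };
static int nr_free = NR_CLUSTERS;

struct percpu_cluster {
        int index;                      /* current cluster, -1 if none */
        unsigned long next;             /* next offset to try inside it */
};

static struct percpu_cluster pcp[NR_CPUS] = { { -1, 0 }, { -1, 0 } };

static long alloc_offset(int cpu)
{
        struct percpu_cluster *c = &pcp[cpu];
        unsigned long end;

        for (;;) {
                if (c->index < 0) {
                        if (!nr_free)
                                return -1;
                        c->index = free_clusters[--nr_free];
                        c->next = (unsigned long)c->index * SWAPFILE_CLUSTER;
                }
                end = (unsigned long)(c->index + 1) * SWAPFILE_CLUSTER;
                for (; c->next < end; c->next++) {
                        if (!swap_map[c->next]) {
                                swap_map[c->next] = 1;  /* usage byte */
                                return (long)c->next++;
                        }
                }
                c->index = -1;          /* cluster exhausted, get a new one */
        }
}

int main(void)
{
        long o0 = alloc_offset(0);
        long o1 = alloc_offset(1);
        long o2 = alloc_offset(0);

        printf("cpu0 got %ld, cpu1 got %ld, cpu0 got %ld\n", o0, o1, o2);
        return 0;
}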
unsigned long begin = offset; << 951 unsigned long end = offset + nr_entrie << 952 void (*swap_slot_free_notify)(struct b << 953 unsigned int i; << 954 << 955 /* << 956 * Use atomic clear_bit operations onl << 957 * bitmap_clear to prevent adjacent bi << 958 */ << 959 for (i = 0; i < nr_entries; i++) << 960 clear_bit(offset + i, si->zero << 961 << 962 if (offset < si->lowest_bit) << 963 si->lowest_bit = offset; << 964 if (end > si->highest_bit) { << 965 bool was_full = !si->highest_b << 966 << 967 WRITE_ONCE(si->highest_bit, en << 968 if (was_full && (si->flags & S << 969 add_to_avail_list(si); << 970 } << 971 if (si->flags & SWP_BLKDEV) << 972 swap_slot_free_notify = << 973 si->bdev->bd_disk->fop << 974 else << 975 swap_slot_free_notify = NULL; << 976 while (offset <= end) { << 977 arch_swap_invalidate_page(si-> << 978 if (swap_slot_free_notify) << 979 swap_slot_free_notify( << 980 offset++; << 981 } 461 } 982 clear_shadow_from_swap_cache(si->type, << 983 << 984 /* << 985 * Make sure that try_to_unuse() obser << 986 * only after the above cleanups are d << 987 */ << 988 smp_wmb(); << 989 atomic_long_add(nr_entries, &nr_swap_p << 990 WRITE_ONCE(si->inuse_pages, si->inuse_ << 991 } << 992 << 993 static void set_cluster_next(struct swap_info_ << 994 { << 995 unsigned long prev; << 996 462 997 if (!(si->flags & SWP_SOLIDSTATE)) { !! 463 found_free = false; 998 si->cluster_next = next; << 999 return; << 1000 } << 1001 464 1002 prev = this_cpu_read(*si->cluster_nex << 1003 /* 465 /* 1004 * Cross the swap address space size !! 466 * Other CPUs can use our cluster if they can't find a free cluster, 1005 * another trunk randomly to avoid lo !! 467 * check if there is still free entry in the cluster 1006 * address space if possible. << 1007 */ 468 */ 1008 if ((prev >> SWAP_ADDRESS_SPACE_SHIFT !! 469 tmp = cluster->next; 1009 (next >> SWAP_ADDRESS_SPACE_SHIFT !! 470 while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) * 1010 /* No free swap slots availab !! 471 SWAPFILE_CLUSTER) { 1011 if (si->highest_bit <= si->lo !! 472 if (!si->swap_map[tmp]) { 1012 return; !! 473 found_free = true; 1013 next = get_random_u32_inclusi << 1014 next = ALIGN_DOWN(next, SWAP_ << 1015 next = max_t(unsigned int, ne << 1016 } << 1017 this_cpu_write(*si->cluster_next_cpu, << 1018 } << 1019 << 1020 static bool swap_offset_available_and_locked( << 1021 << 1022 { << 1023 if (data_race(!si->swap_map[offset])) << 1024 spin_lock(&si->lock); << 1025 return true; << 1026 } << 1027 << 1028 if (vm_swap_full() && READ_ONCE(si->s << 1029 spin_lock(&si->lock); << 1030 return true; << 1031 } << 1032 << 1033 return false; << 1034 } << 1035 << 1036 static int cluster_alloc_swap(struct swap_inf << 1037 unsigned char us << 1038 swp_entry_t slot << 1039 { << 1040 int n_ret = 0; << 1041 << 1042 VM_BUG_ON(!si->cluster_info); << 1043 << 1044 while (n_ret < nr) { << 1045 unsigned long offset = cluste << 1046 << 1047 if (!offset) << 1048 break; 474 break; 1049 slots[n_ret++] = swp_entry(si !! 475 } >> 476 tmp++; 1050 } 477 } 1051 !! 478 if (!found_free) { 1052 return n_ret; !! 479 cluster_set_null(&cluster->index); >> 480 goto new_cluster; >> 481 } >> 482 cluster->next = tmp + 1; >> 483 *offset = tmp; >> 484 *scan_base = tmp; 1053 } 485 } 1054 486 1055 static int scan_swap_map_slots(struct swap_in !! 487 static unsigned long scan_swap_map(struct swap_info_struct *si, 1056 unsigned char !! 
488 unsigned char usage) 1057 swp_entry_t sl << 1058 { 489 { 1059 unsigned long offset; 490 unsigned long offset; 1060 unsigned long scan_base; 491 unsigned long scan_base; 1061 unsigned long last_in_cluster = 0; 492 unsigned long last_in_cluster = 0; 1062 int latency_ration = LATENCY_LIMIT; 493 int latency_ration = LATENCY_LIMIT; 1063 unsigned int nr_pages = 1 << order; << 1064 int n_ret = 0; << 1065 bool scanned_many = false; << 1066 494 1067 /* 495 /* 1068 * We try to cluster swap pages by al 496 * We try to cluster swap pages by allocating them sequentially 1069 * in swap. Once we've allocated SWA 497 * in swap. Once we've allocated SWAPFILE_CLUSTER pages this 1070 * way, however, we resort to first-f 498 * way, however, we resort to first-free allocation, starting 1071 * a new cluster. This prevents us f 499 * a new cluster. This prevents us from scattering swap pages 1072 * all over the entire swap partition 500 * all over the entire swap partition, so that we reduce 1073 * overall disk seek times between sw 501 * overall disk seek times between swap pages. -- sct 1074 * But we do now try to find an empty 502 * But we do now try to find an empty cluster. -Andrea 1075 * And we let swap pages go all over 503 * And we let swap pages go all over an SSD partition. Hugh 1076 */ 504 */ 1077 505 1078 if (order > 0) { << 1079 /* << 1080 * Should not even be attempt << 1081 * page swap is disabled. Wa << 1082 */ << 1083 if (!IS_ENABLED(CONFIG_THP_SW << 1084 nr_pages > SWAPFILE_CLUST << 1085 VM_WARN_ON_ONCE(1); << 1086 return 0; << 1087 } << 1088 << 1089 /* << 1090 * Swapfile is not block devi << 1091 * to allocate large entries. << 1092 */ << 1093 if (!(si->flags & SWP_BLKDEV) << 1094 return 0; << 1095 } << 1096 << 1097 if (si->cluster_info) << 1098 return cluster_alloc_swap(si, << 1099 << 1100 si->flags += SWP_SCANNING; 506 si->flags += SWP_SCANNING; >> 507 scan_base = offset = si->cluster_next; 1101 508 1102 /* For HDD, sequential access is more !! 509 /* SSD algorithm */ 1103 scan_base = si->cluster_next; !! 510 if (si->cluster_info) { 1104 offset = scan_base; !! 511 scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); >> 512 goto checks; >> 513 } 1105 514 1106 if (unlikely(!si->cluster_nr--)) { 515 if (unlikely(!si->cluster_nr--)) { 1107 if (si->pages - si->inuse_pag 516 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { 1108 si->cluster_nr = SWAP 517 si->cluster_nr = SWAPFILE_CLUSTER - 1; 1109 goto checks; 518 goto checks; 1110 } 519 } 1111 520 1112 spin_unlock(&si->lock); 521 spin_unlock(&si->lock); 1113 522 1114 /* 523 /* 1115 * If seek is expensive, star 524 * If seek is expensive, start searching for new cluster from 1116 * start of partition, to min 525 * start of partition, to minimize the span of allocated swap. >> 526 * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info >> 527 * case, just handled by scan_swap_map_try_ssd_cluster() above. 1117 */ 528 */ 1118 scan_base = offset = si->lowe 529 scan_base = offset = si->lowest_bit; 1119 last_in_cluster = offset + SW 530 last_in_cluster = offset + SWAPFILE_CLUSTER - 1; 1120 531 1121 /* Locate the first empty (un 532 /* Locate the first empty (unaligned) cluster */ 1122 for (; last_in_cluster <= REA !! 
533 for (; last_in_cluster <= si->highest_bit; offset++) { 1123 if (si->swap_map[offs 534 if (si->swap_map[offset]) 1124 last_in_clust 535 last_in_cluster = offset + SWAPFILE_CLUSTER; 1125 else if (offset == la 536 else if (offset == last_in_cluster) { 1126 spin_lock(&si 537 spin_lock(&si->lock); 1127 offset -= SWA 538 offset -= SWAPFILE_CLUSTER - 1; 1128 si->cluster_n 539 si->cluster_next = offset; 1129 si->cluster_n 540 si->cluster_nr = SWAPFILE_CLUSTER - 1; 1130 goto checks; 541 goto checks; 1131 } 542 } 1132 if (unlikely(--latenc 543 if (unlikely(--latency_ration < 0)) { 1133 cond_resched( 544 cond_resched(); 1134 latency_ratio 545 latency_ration = LATENCY_LIMIT; 1135 } 546 } 1136 } 547 } 1137 548 1138 offset = scan_base; 549 offset = scan_base; 1139 spin_lock(&si->lock); 550 spin_lock(&si->lock); 1140 si->cluster_nr = SWAPFILE_CLU 551 si->cluster_nr = SWAPFILE_CLUSTER - 1; 1141 } 552 } 1142 553 1143 checks: 554 checks: >> 555 if (si->cluster_info) { >> 556 while (scan_swap_map_ssd_cluster_conflict(si, offset)) >> 557 scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); >> 558 } 1144 if (!(si->flags & SWP_WRITEOK)) 559 if (!(si->flags & SWP_WRITEOK)) 1145 goto no_page; 560 goto no_page; 1146 if (!si->highest_bit) 561 if (!si->highest_bit) 1147 goto no_page; 562 goto no_page; 1148 if (offset > si->highest_bit) 563 if (offset > si->highest_bit) 1149 scan_base = offset = si->lowe 564 scan_base = offset = si->lowest_bit; 1150 565 1151 /* reuse swap entry of cache-only swa 566 /* reuse swap entry of cache-only swap if not busy. */ 1152 if (vm_swap_full() && si->swap_map[of 567 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 1153 int swap_was_freed; 568 int swap_was_freed; 1154 spin_unlock(&si->lock); 569 spin_unlock(&si->lock); 1155 swap_was_freed = __try_to_rec !! 570 swap_was_freed = __try_to_reclaim_swap(si, offset); 1156 spin_lock(&si->lock); 571 spin_lock(&si->lock); 1157 /* entry was freed successful 572 /* entry was freed successfully, try to use this again */ 1158 if (swap_was_freed > 0) !! 573 if (swap_was_freed) 1159 goto checks; 574 goto checks; 1160 goto scan; /* check next one 575 goto scan; /* check next one */ 1161 } 576 } 1162 577 1163 if (si->swap_map[offset]) { !! 578 if (si->swap_map[offset]) 1164 if (!n_ret) !! 579 goto scan; 1165 goto scan; << 1166 else << 1167 goto done; << 1168 } << 1169 memset(si->swap_map + offset, usage, << 1170 << 1171 swap_range_alloc(si, offset, nr_pages << 1172 slots[n_ret++] = swp_entry(si->type, << 1173 << 1174 /* got enough slots or reach max slot << 1175 if ((n_ret == nr) || (offset >= si->h << 1176 goto done; << 1177 << 1178 /* search for next available slot */ << 1179 << 1180 /* time to take a break? */ << 1181 if (unlikely(--latency_ration < 0)) { << 1182 if (n_ret) << 1183 goto done; << 1184 spin_unlock(&si->lock); << 1185 cond_resched(); << 1186 spin_lock(&si->lock); << 1187 latency_ration = LATENCY_LIMI << 1188 } << 1189 << 1190 if (si->cluster_nr && !si->swap_map[+ << 1191 /* non-ssd case, still more s << 1192 --si->cluster_nr; << 1193 goto checks; << 1194 } << 1195 << 1196 /* << 1197 * Even if there's no free clusters a << 1198 * try to scan a little more quickly << 1199 * have scanned too many slots alread << 1200 */ << 1201 if (!scanned_many) { << 1202 unsigned long scan_limit; << 1203 580 1204 if (offset < scan_base) !! 581 if (offset == si->lowest_bit) 1205 scan_limit = scan_bas !! 582 si->lowest_bit++; 1206 else !! 583 if (offset == si->highest_bit) 1207 scan_limit = si->high !! 
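When the rotating-disk path above exhausts its current cluster, it looks for a fresh run of SWAPFILE_CLUSTER free slots with the last_in_cluster trick: every occupied slot pushes the candidate end of the run forward, and a run is found when the scan offset catches up with it. A standalone sketch of just that scan, with a tiny run length:

/* Sketch of the "locate the first empty (unaligned) cluster" scan above. */
#include <stdio.h>

#define SWAPFILE_CLUSTER 4              /* small run length for the demo */

/* Return the first offset in [lo, hi] starting a run of SWAPFILE_CLUSTER
 * free (zero) slots, or -1 if there is none. */
static long find_free_run(const unsigned char *map, long lo, long hi)
{
        long offset = lo;
        long last_in_cluster = lo + SWAPFILE_CLUSTER - 1;

        for (; last_in_cluster <= hi; offset++) {
                if (map[offset])
                        last_in_cluster = offset + SWAPFILE_CLUSTER;
                else if (offset == last_in_cluster)
                        return offset - (SWAPFILE_CLUSTER - 1);
        }
        return -1;
}

int main(void)
{
        /*                     0  1  2  3  4  5  6  7  8  9 10 11 */
        unsigned char map[] = { 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0 };

        printf("free run starts at %ld\n", find_free_run(map, 0, 11));
        return 0;
}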
584 si->highest_bit--; 1208 for (; offset <= scan_limit & !! 585 si->inuse_pages++; 1209 offset++) { !! 586 if (si->inuse_pages == si->pages) { 1210 if (!si->swap_map[off !! 587 si->lowest_bit = si->max; 1211 goto checks; !! 588 si->highest_bit = 0; 1212 } !! 589 spin_lock(&swap_avail_lock); >> 590 plist_del(&si->avail_list, &swap_avail_head); >> 591 spin_unlock(&swap_avail_lock); 1213 } 592 } 1214 !! 593 si->swap_map[offset] = usage; 1215 done: !! 594 inc_cluster_info_page(si, si->cluster_info, offset); 1216 if (order == 0) !! 595 si->cluster_next = offset + 1; 1217 set_cluster_next(si, offset + << 1218 si->flags -= SWP_SCANNING; 596 si->flags -= SWP_SCANNING; 1219 return n_ret; !! 597 >> 598 return offset; 1220 599 1221 scan: 600 scan: 1222 VM_WARN_ON(order > 0); << 1223 spin_unlock(&si->lock); 601 spin_unlock(&si->lock); 1224 while (++offset <= READ_ONCE(si->high !! 602 while (++offset <= si->highest_bit) { >> 603 if (!si->swap_map[offset]) { >> 604 spin_lock(&si->lock); >> 605 goto checks; >> 606 } >> 607 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { >> 608 spin_lock(&si->lock); >> 609 goto checks; >> 610 } 1225 if (unlikely(--latency_ration 611 if (unlikely(--latency_ration < 0)) { 1226 cond_resched(); 612 cond_resched(); 1227 latency_ration = LATE 613 latency_ration = LATENCY_LIMIT; 1228 scanned_many = true; << 1229 } 614 } 1230 if (swap_offset_available_and << 1231 goto checks; << 1232 } 615 } 1233 offset = si->lowest_bit; 616 offset = si->lowest_bit; 1234 while (offset < scan_base) { 617 while (offset < scan_base) { >> 618 if (!si->swap_map[offset]) { >> 619 spin_lock(&si->lock); >> 620 goto checks; >> 621 } >> 622 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { >> 623 spin_lock(&si->lock); >> 624 goto checks; >> 625 } 1235 if (unlikely(--latency_ration 626 if (unlikely(--latency_ration < 0)) { 1236 cond_resched(); 627 cond_resched(); 1237 latency_ration = LATE 628 latency_ration = LATENCY_LIMIT; 1238 scanned_many = true; << 1239 } 629 } 1240 if (swap_offset_available_and << 1241 goto checks; << 1242 offset++; 630 offset++; 1243 } 631 } 1244 spin_lock(&si->lock); 632 spin_lock(&si->lock); 1245 633 1246 no_page: 634 no_page: 1247 si->flags -= SWP_SCANNING; 635 si->flags -= SWP_SCANNING; 1248 return n_ret; !! 636 return 0; 1249 } 637 } 1250 638 1251 int get_swap_pages(int n_goal, swp_entry_t sw !! 639 swp_entry_t get_swap_page(void) 1252 { 640 { 1253 int order = swap_entry_order(entry_or << 1254 unsigned long size = 1 << order; << 1255 struct swap_info_struct *si, *next; 641 struct swap_info_struct *si, *next; 1256 long avail_pgs; !! 642 pgoff_t offset; 1257 int n_ret = 0; << 1258 int node; << 1259 << 1260 spin_lock(&swap_avail_lock); << 1261 643 1262 avail_pgs = atomic_long_read(&nr_swap !! 644 if (atomic_long_read(&nr_swap_pages) <= 0) 1263 if (avail_pgs <= 0) { << 1264 spin_unlock(&swap_avail_lock) << 1265 goto noswap; 645 goto noswap; 1266 } !! 646 atomic_long_dec(&nr_swap_pages); 1267 647 1268 n_goal = min3((long)n_goal, (long)SWA !! 648 spin_lock(&swap_avail_lock); 1269 << 1270 atomic_long_sub(n_goal * size, &nr_sw << 1271 649 1272 start_over: 650 start_over: 1273 node = numa_node_id(); !! 651 plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { 1274 plist_for_each_entry_safe(si, next, & << 1275 /* requeue si to after same-p 652 /* requeue si to after same-priority siblings */ 1276 plist_requeue(&si->avail_list !! 
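The scan: fallback above walks the swap_map byte by byte, so it decrements latency_ration and calls cond_resched() every LATENCY_LIMIT steps to keep scheduling latency bounded. A userspace sketch of the same pattern, with sched_yield() standing in for cond_resched():

/* Sketch of the LATENCY_LIMIT / cond_resched() pattern in the scan loop. */
#include <sched.h>
#include <stdio.h>

#define LATENCY_LIMIT 256

static long scan(const unsigned char *map, long n)
{
        int latency_ration = LATENCY_LIMIT;
        long offset;

        for (offset = 0; offset < n; offset++) {
                if (!map[offset])
                        return offset;          /* found a free slot */
                if (--latency_ration < 0) {
                        sched_yield();          /* stand-in for cond_resched() */
                        latency_ration = LATENCY_LIMIT;
                }
        }
        return -1;
}

int main(void)
{
        static unsigned char map[100000];       /* zero-initialized: all free */

        map[0] = 1;                             /* first slot in use */
        printf("first free offset: %ld\n", scan(map, 100000));
        return 0;
}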
653 plist_requeue(&si->avail_list, &swap_avail_head); 1277 spin_unlock(&swap_avail_lock) 654 spin_unlock(&swap_avail_lock); 1278 spin_lock(&si->lock); 655 spin_lock(&si->lock); 1279 if (!si->highest_bit || !(si- 656 if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { 1280 spin_lock(&swap_avail 657 spin_lock(&swap_avail_lock); 1281 if (plist_node_empty( !! 658 if (plist_node_empty(&si->avail_list)) { 1282 spin_unlock(& 659 spin_unlock(&si->lock); 1283 goto nextsi; 660 goto nextsi; 1284 } 661 } 1285 WARN(!si->highest_bit 662 WARN(!si->highest_bit, 1286 "swap_info %d in 663 "swap_info %d in list but !highest_bit\n", 1287 si->type); 664 si->type); 1288 WARN(!(si->flags & SW 665 WARN(!(si->flags & SWP_WRITEOK), 1289 "swap_info %d in 666 "swap_info %d in list but !SWP_WRITEOK\n", 1290 si->type); 667 si->type); 1291 __del_from_avail_list !! 668 plist_del(&si->avail_list, &swap_avail_head); 1292 spin_unlock(&si->lock 669 spin_unlock(&si->lock); 1293 goto nextsi; 670 goto nextsi; 1294 } 671 } 1295 n_ret = scan_swap_map_slots(s << 1296 n << 1297 spin_unlock(&si->lock); << 1298 if (n_ret || size > 1) << 1299 goto check_out; << 1300 cond_resched(); << 1301 672 >> 673 /* This is called for allocating swap entry for cache */ >> 674 offset = scan_swap_map(si, SWAP_HAS_CACHE); >> 675 spin_unlock(&si->lock); >> 676 if (offset) >> 677 return swp_entry(si->type, offset); >> 678 pr_debug("scan_swap_map of si %d failed to find offset\n", >> 679 si->type); 1302 spin_lock(&swap_avail_lock); 680 spin_lock(&swap_avail_lock); 1303 nextsi: 681 nextsi: 1304 /* 682 /* 1305 * if we got here, it's likel 683 * if we got here, it's likely that si was almost full before, 1306 * and since scan_swap_map_sl !! 684 * and since scan_swap_map() can drop the si->lock, multiple 1307 * multiple callers probably !! 685 * callers probably all tried to get a page from the same si 1308 * same si and it filled up b !! 686 * and it filled up before we could get one; or, the si filled 1309 * filled up between us dropp !! 687 * up between us dropping swap_avail_lock and taking si->lock. 1310 * si->lock. Since we dropped !! 688 * Since we dropped the swap_avail_lock, the swap_avail_head 1311 * swap_avail_head list may h !! 689 * list may have been modified; so if next is still in the 1312 * still in the swap_avail_he !! 690 * swap_avail_head list then try it, otherwise start over. 1313 * start over if we have not << 1314 */ 691 */ 1315 if (plist_node_empty(&next->a !! 692 if (plist_node_empty(&next->avail_list)) 1316 goto start_over; 693 goto start_over; 1317 } 694 } 1318 695 1319 spin_unlock(&swap_avail_lock); 696 spin_unlock(&swap_avail_lock); 1320 697 1321 check_out: !! 698 atomic_long_inc(&nr_swap_pages); 1322 if (n_ret < n_goal) << 1323 atomic_long_add((long)(n_goal << 1324 &nr_swap_page << 1325 noswap: 699 noswap: 1326 return n_ret; !! 700 return (swp_entry_t) {0}; 1327 } 701 } 1328 702 1329 static struct swap_info_struct *_swap_info_ge !! 703 /* The only caller of this function is now suspend routine */ >> 704 swp_entry_t get_swap_page_of_type(int type) 1330 { 705 { 1331 struct swap_info_struct *si; 706 struct swap_info_struct *si; 1332 unsigned long offset; !! 
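get_swap_page() above walks swap_avail_head in priority order and requeues each device after its same-priority siblings before scanning it, so equal-priority swap devices are used round-robin while higher-priority ones are still preferred. A rough sketch of that rotation, with a sorted array standing in for the plist and the plist ordering rules simplified:

/* Sketch of round-robin among equal-priority swap devices, as done with
 * plist_requeue() above.  A sorted array stands in for the plist. */
#include <stdio.h>

struct dev { int prio; int id; };

/* Higher prio first; the picked device is moved after its equal-prio peers. */
static int pick_and_requeue(struct dev *v, int n)
{
        struct dev chosen = v[0];
        int i = 1;

        while (i < n && v[i].prio == chosen.prio) {
                v[i - 1] = v[i];        /* shift equal-priority peers forward */
                i++;
        }
        v[i - 1] = chosen;              /* requeue after its siblings */
        return chosen.id;
}

int main(void)
{
        struct dev avail[] = { { 5, 0 }, { 5, 1 }, { 1, 2 } };
        int k;

        for (k = 0; k < 4; k++)
                printf("allocate from device %d\n", pick_and_requeue(avail, 3));
        /* prints 0 1 0 1: the two prio-5 devices alternate, prio-1 is spare */
        return 0;
}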
707 pgoff_t offset; >> 708 >> 709 si = swap_info[type]; >> 710 spin_lock(&si->lock); >> 711 if (si && (si->flags & SWP_WRITEOK)) { >> 712 atomic_long_dec(&nr_swap_pages); >> 713 /* This is called for allocating swap entry, not cache */ >> 714 offset = scan_swap_map(si, 1); >> 715 if (offset) { >> 716 spin_unlock(&si->lock); >> 717 return swp_entry(type, offset); >> 718 } >> 719 atomic_long_inc(&nr_swap_pages); >> 720 } >> 721 spin_unlock(&si->lock); >> 722 return (swp_entry_t) {0}; >> 723 } >> 724 >> 725 static struct swap_info_struct *swap_info_get(swp_entry_t entry) >> 726 { >> 727 struct swap_info_struct *p; >> 728 unsigned long offset, type; 1333 729 1334 if (!entry.val) 730 if (!entry.val) 1335 goto out; 731 goto out; 1336 si = swp_swap_info(entry); !! 732 type = swp_type(entry); 1337 if (!si) !! 733 if (type >= nr_swapfiles) 1338 goto bad_nofile; 734 goto bad_nofile; 1339 if (data_race(!(si->flags & SWP_USED) !! 735 p = swap_info[type]; >> 736 if (!(p->flags & SWP_USED)) 1340 goto bad_device; 737 goto bad_device; 1341 offset = swp_offset(entry); 738 offset = swp_offset(entry); 1342 if (offset >= si->max) !! 739 if (offset >= p->max) 1343 goto bad_offset; 740 goto bad_offset; 1344 if (data_race(!si->swap_map[swp_offse !! 741 if (!p->swap_map[offset]) 1345 goto bad_free; 742 goto bad_free; 1346 return si; !! 743 spin_lock(&p->lock); >> 744 return p; 1347 745 1348 bad_free: 746 bad_free: 1349 pr_err("%s: %s%08lx\n", __func__, Unu !! 747 pr_err("swap_free: %s%08lx\n", Unused_offset, entry.val); 1350 goto out; 748 goto out; 1351 bad_offset: 749 bad_offset: 1352 pr_err("%s: %s%08lx\n", __func__, Bad !! 750 pr_err("swap_free: %s%08lx\n", Bad_offset, entry.val); 1353 goto out; 751 goto out; 1354 bad_device: 752 bad_device: 1355 pr_err("%s: %s%08lx\n", __func__, Unu !! 753 pr_err("swap_free: %s%08lx\n", Unused_file, entry.val); 1356 goto out; 754 goto out; 1357 bad_nofile: 755 bad_nofile: 1358 pr_err("%s: %s%08lx\n", __func__, Bad !! 756 pr_err("swap_free: %s%08lx\n", Bad_file, entry.val); 1359 out: 757 out: 1360 return NULL; 758 return NULL; 1361 } 759 } 1362 760 1363 static struct swap_info_struct *swap_info_get !! 761 static unsigned char swap_entry_free(struct swap_info_struct *p, 1364 struc !! 762 swp_entry_t entry, unsigned char usage) 1365 { << 1366 struct swap_info_struct *p; << 1367 << 1368 p = _swap_info_get(entry); << 1369 << 1370 if (p != q) { << 1371 if (q != NULL) << 1372 spin_unlock(&q->lock) << 1373 if (p != NULL) << 1374 spin_lock(&p->lock); << 1375 } << 1376 return p; << 1377 } << 1378 << 1379 static unsigned char __swap_entry_free_locked << 1380 << 1381 << 1382 { 763 { >> 764 unsigned long offset = swp_offset(entry); 1383 unsigned char count; 765 unsigned char count; 1384 unsigned char has_cache; 766 unsigned char has_cache; 1385 767 1386 count = si->swap_map[offset]; !! 768 count = p->swap_map[offset]; 1387 << 1388 has_cache = count & SWAP_HAS_CACHE; 769 has_cache = count & SWAP_HAS_CACHE; 1389 count &= ~SWAP_HAS_CACHE; 770 count &= ~SWAP_HAS_CACHE; 1390 771 1391 if (usage == SWAP_HAS_CACHE) { 772 if (usage == SWAP_HAS_CACHE) { 1392 VM_BUG_ON(!has_cache); 773 VM_BUG_ON(!has_cache); 1393 has_cache = 0; 774 has_cache = 0; 1394 } else if (count == SWAP_MAP_SHMEM) { 775 } else if (count == SWAP_MAP_SHMEM) { 1395 /* 776 /* 1396 * Or we could insist on shme 777 * Or we could insist on shmem.c using a special 1397 * swap_shmem_free() and free 778 * swap_shmem_free() and free_shmem_swap_and_cache()... 
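swap_info_get() above recovers the device type and offset from a swp_entry_t and rejects bad types, unused devices, out-of-range offsets and already-free entries. A sketch of the type-plus-offset packing and of those checks; the bit layout used below (type in the top bits) is illustrative rather than the exact kernel encoding:

/* Sketch of swp_entry_t packing and the validation done by swap_info_get().
 * The bit layout is illustrative; the kernel derives it from MAX_SWAPFILES. */
#include <stdio.h>

#define MAX_SWAPFILES_SHIFT 5
#define SWP_TYPE_SHIFT      (sizeof(unsigned long) * 8 - MAX_SWAPFILES_SHIFT)
#define MAX_SWAPFILES       (1UL << MAX_SWAPFILES_SHIFT)

typedef struct { unsigned long val; } swp_entry_t;

static swp_entry_t swp_entry(unsigned long type, unsigned long offset)
{
        swp_entry_t e = { (type << SWP_TYPE_SHIFT) | offset };
        return e;
}

static unsigned long swp_type(swp_entry_t e)   { return e.val >> SWP_TYPE_SHIFT; }
static unsigned long swp_offset(swp_entry_t e) { return e.val & ((1UL << SWP_TYPE_SHIFT) - 1); }

struct swap_info { int used; unsigned long max; unsigned char *swap_map; };

/* Mirrors the bad_nofile/bad_device/bad_offset/bad_free checks above. */
static struct swap_info *swap_info_get(struct swap_info **tbl, unsigned int nr,
                                       swp_entry_t entry)
{
        unsigned long type = swp_type(entry), offset = swp_offset(entry);

        if (type >= nr || !tbl[type] || !tbl[type]->used)
                return NULL;                    /* bad file / unused device */
        if (offset >= tbl[type]->max)
                return NULL;                    /* bad offset */
        if (!tbl[type]->swap_map[offset])
                return NULL;                    /* entry already free */
        return tbl[type];
}

int main(void)
{
        unsigned char map[8] = { 0, 2, 0, 0, 0, 0, 0, 0 };
        struct swap_info si = { 1, 8, map };
        struct swap_info *tbl[MAX_SWAPFILES] = { &si };

        printf("entry(0,1) valid: %d\n", swap_info_get(tbl, 1, swp_entry(0, 1)) != NULL);
        printf("entry(0,2) valid: %d\n", swap_info_get(tbl, 1, swp_entry(0, 2)) != NULL);
        return 0;
}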
1398 */ 779 */ 1399 count = 0; 780 count = 0; 1400 } else if ((count & ~COUNT_CONTINUED) 781 } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { 1401 if (count == COUNT_CONTINUED) 782 if (count == COUNT_CONTINUED) { 1402 if (swap_count_contin !! 783 if (swap_count_continued(p, offset, count)) 1403 count = SWAP_ 784 count = SWAP_MAP_MAX | COUNT_CONTINUED; 1404 else 785 else 1405 count = SWAP_ 786 count = SWAP_MAP_MAX; 1406 } else 787 } else 1407 count--; 788 count--; 1408 } 789 } 1409 790 1410 usage = count | has_cache; !! 791 if (!count) 1411 if (usage) !! 792 mem_cgroup_uncharge_swap(entry); 1412 WRITE_ONCE(si->swap_map[offse << 1413 else << 1414 WRITE_ONCE(si->swap_map[offse << 1415 << 1416 return usage; << 1417 } << 1418 << 1419 /* << 1420 * When we get a swap entry, if there aren't << 1421 * prevent swapoff, such as the folio in swap << 1422 * reader side is locked, etc., the swap entr << 1423 * because of swapoff. Then, we need to encl << 1424 * functions with get_swap_device() and put_s << 1425 * swap functions call get/put_swap_device() << 1426 * << 1427 * RCU reader side lock (including any spinlo << 1428 * prevent swapoff, because synchronize_rcu() << 1429 * before freeing data structures. << 1430 * << 1431 * Check whether swap entry is valid in the s << 1432 * return pointer to swap_info_struct, and ke << 1433 * via preventing the swap device from being << 1434 * put_swap_device() is called. Otherwise re << 1435 * << 1436 * Notice that swapoff or swapoff+swapon can << 1437 * percpu_ref_tryget_live() in get_swap_devic << 1438 * percpu_ref_put() in put_swap_device() if t << 1439 * to prevent swapoff. The caller must be pr << 1440 * example, the following situation is possib << 1441 * << 1442 * CPU1 CPU2 << 1443 * do_swap_page() << 1444 * ... swapo << 1445 * __read_swap_cache_async() << 1446 * swapcache_prepare() << 1447 * __swap_duplicate() << 1448 * // check swap_map << 1449 * // verify PTE not changed << 1450 * << 1451 * In __swap_duplicate(), the swap_map need t << 1452 * changing partly because the specified swap << 1453 * swap device which has been swapoff. And i << 1454 * the page is read from the swap device, the << 1455 * changed with the page table locked to chec << 1456 * has been swapoff or swapoff+swapon. << 1457 */ << 1458 struct swap_info_struct *get_swap_device(swp_ << 1459 { << 1460 struct swap_info_struct *si; << 1461 unsigned long offset; << 1462 793 1463 if (!entry.val) !! 794 usage = count | has_cache; 1464 goto out; !! 795 p->swap_map[offset] = usage; 1465 si = swp_swap_info(entry); << 1466 if (!si) << 1467 goto bad_nofile; << 1468 if (!percpu_ref_tryget_live(&si->user << 1469 goto out; << 1470 /* << 1471 * Guarantee the si->users are checke << 1472 * fields of swap_info_struct. << 1473 * << 1474 * Paired with the spin_unlock() afte << 1475 * enable_swap_info(). 
<< 1476 */ << 1477 smp_rmb(); << 1478 offset = swp_offset(entry); << 1479 if (offset >= si->max) << 1480 goto put_out; << 1481 << 1482 return si; << 1483 bad_nofile: << 1484 pr_err("%s: %s%08lx\n", __func__, Bad << 1485 out: << 1486 return NULL; << 1487 put_out: << 1488 pr_err("%s: %s%08lx\n", __func__, Bad << 1489 percpu_ref_put(&si->users); << 1490 return NULL; << 1491 } << 1492 << 1493 static unsigned char __swap_entry_free(struct << 1494 swp_en << 1495 { << 1496 struct swap_cluster_info *ci; << 1497 unsigned long offset = swp_offset(ent << 1498 unsigned char usage; << 1499 << 1500 ci = lock_cluster_or_swap_info(si, of << 1501 usage = __swap_entry_free_locked(si, << 1502 unlock_cluster_or_swap_info(si, ci); << 1503 if (!usage) << 1504 free_swap_slot(entry); << 1505 << 1506 return usage; << 1507 } << 1508 << 1509 static bool __swap_entries_free(struct swap_i << 1510 swp_entry_t entry, int nr) << 1511 { << 1512 unsigned long offset = swp_offset(ent << 1513 unsigned int type = swp_type(entry); << 1514 struct swap_cluster_info *ci; << 1515 bool has_cache = false; << 1516 unsigned char count; << 1517 int i; << 1518 << 1519 if (nr <= 1 || swap_count(data_race(s << 1520 goto fallback; << 1521 /* cross into another cluster */ << 1522 if (nr > SWAPFILE_CLUSTER - offset % << 1523 goto fallback; << 1524 << 1525 ci = lock_cluster_or_swap_info(si, of << 1526 if (!swap_is_last_map(si, offset, nr, << 1527 unlock_cluster_or_swap_info(s << 1528 goto fallback; << 1529 } << 1530 for (i = 0; i < nr; i++) << 1531 WRITE_ONCE(si->swap_map[offse << 1532 unlock_cluster_or_swap_info(si, ci); << 1533 << 1534 if (!has_cache) { << 1535 for (i = 0; i < nr; i++) << 1536 zswap_invalidate(swp_ << 1537 spin_lock(&si->lock); << 1538 swap_entry_range_free(si, ent << 1539 spin_unlock(&si->lock); << 1540 } << 1541 return has_cache; << 1542 796 1543 fallback: !! 797 /* free if no reference */ 1544 for (i = 0; i < nr; i++) { !! 798 if (!usage) { 1545 if (data_race(si->swap_map[of !! 799 dec_cluster_info_page(p, p->cluster_info, offset); 1546 count = __swap_entry_ !! 800 if (offset < p->lowest_bit) 1547 if (count == SWAP_HAS !! 801 p->lowest_bit = offset; 1548 has_cache = t !! 802 if (offset > p->highest_bit) { 1549 } else { !! 803 bool was_full = !p->highest_bit; 1550 WARN_ON_ONCE(1); !! 804 p->highest_bit = offset; >> 805 if (was_full && (p->flags & SWP_WRITEOK)) { >> 806 spin_lock(&swap_avail_lock); >> 807 WARN_ON(!plist_node_empty(&p->avail_list)); >> 808 if (plist_node_empty(&p->avail_list)) >> 809 plist_add(&p->avail_list, >> 810 &swap_avail_head); >> 811 spin_unlock(&swap_avail_lock); >> 812 } 1551 } 813 } 1552 } !! 814 atomic_long_inc(&nr_swap_pages); 1553 return has_cache; !! 815 p->inuse_pages--; 1554 } !! 816 frontswap_invalidate_page(p->type, offset); 1555 !! 817 if (p->flags & SWP_BLKDEV) { 1556 /* !! 818 struct gendisk *disk = p->bdev->bd_disk; 1557 * Drop the last HAS_CACHE flag of swap entri !! 819 if (disk->fops->swap_slot_free_notify) 1558 * ensure all entries belong to the same cgro !! 820 disk->fops->swap_slot_free_notify(p->bdev, 1559 */ !! 
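/*
 * [Editor's aside -- illustrative sketch, not part of swapfile.c]
 *
 * The comment above get_swap_device() says that code working on a swap
 * entry it has not otherwise pinned must bracket that work with
 * get_swap_device()/put_swap_device() and must tolerate failure, since a
 * concurrent swapoff (or swapoff+swapon) may already have torn the device
 * down or recycled the entry.  Below is a minimal kernel-context sketch of
 * that pattern.  The helper name swap_entry_peek_count() is hypothetical;
 * get_swap_device() and swap_swapcount() are defined in this file, and
 * put_swap_device() is the matching helper from <linux/swap.h>.
 */
static int swap_entry_peek_count(swp_entry_t entry)
{
	struct swap_info_struct *si;
	int count;

	si = get_swap_device(entry);	/* takes a percpu_ref on si->users */
	if (!si)
		return -ENODEV;		/* device already gone: caller must cope */
	count = swap_swapcount(si, entry);
	put_swap_device(si);		/* drops the reference taken above */
	return count;
}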
821 offset); 1560 static void swap_entry_range_free(struct swap << 1561 unsigned in << 1562 { << 1563 unsigned long offset = swp_offset(ent << 1564 unsigned char *map = si->swap_map + o << 1565 unsigned char *map_end = map + nr_pag << 1566 struct swap_cluster_info *ci; << 1567 << 1568 ci = lock_cluster(si, offset); << 1569 do { << 1570 VM_BUG_ON(*map != SWAP_HAS_CA << 1571 *map = 0; << 1572 } while (++map < map_end); << 1573 dec_cluster_info_page(si, ci, nr_page << 1574 unlock_cluster(ci); << 1575 << 1576 mem_cgroup_uncharge_swap(entry, nr_pa << 1577 swap_range_free(si, offset, nr_pages) << 1578 } << 1579 << 1580 static void cluster_swap_free_nr(struct swap_ << 1581 unsigned long offset, int nr_ << 1582 unsigned char usage) << 1583 { << 1584 struct swap_cluster_info *ci; << 1585 DECLARE_BITMAP(to_free, BITS_PER_LONG << 1586 int i, nr; << 1587 << 1588 ci = lock_cluster_or_swap_info(si, of << 1589 while (nr_pages) { << 1590 nr = min(BITS_PER_LONG, nr_pa << 1591 for (i = 0; i < nr; i++) { << 1592 if (!__swap_entry_fre << 1593 bitmap_set(to << 1594 } << 1595 if (!bitmap_empty(to_free, BI << 1596 unlock_cluster_or_swa << 1597 for_each_set_bit(i, t << 1598 free_swap_slo << 1599 if (nr == nr_pages) << 1600 return; << 1601 bitmap_clear(to_free, << 1602 ci = lock_cluster_or_ << 1603 } 822 } 1604 offset += nr; << 1605 nr_pages -= nr; << 1606 } 823 } 1607 unlock_cluster_or_swap_info(si, ci); !! 824 >> 825 return usage; 1608 } 826 } 1609 827 1610 /* 828 /* 1611 * Caller has made sure that the swap device 829 * Caller has made sure that the swap device corresponding to entry 1612 * is still around or has not been recycled. 830 * is still around or has not been recycled. 1613 */ 831 */ 1614 void swap_free_nr(swp_entry_t entry, int nr_p !! 832 void swap_free(swp_entry_t entry) 1615 { 833 { 1616 int nr; !! 834 struct swap_info_struct *p; 1617 struct swap_info_struct *sis; << 1618 unsigned long offset = swp_offset(ent << 1619 << 1620 sis = _swap_info_get(entry); << 1621 if (!sis) << 1622 return; << 1623 835 1624 while (nr_pages) { !! 836 p = swap_info_get(entry); 1625 nr = min_t(int, nr_pages, SWA !! 837 if (p) { 1626 cluster_swap_free_nr(sis, off !! 838 swap_entry_free(p, entry, 1); 1627 offset += nr; !! 839 spin_unlock(&p->lock); 1628 nr_pages -= nr; << 1629 } 840 } 1630 } 841 } 1631 842 1632 /* 843 /* 1633 * Called after dropping swapcache to decreas 844 * Called after dropping swapcache to decrease refcnt to swap entries. 1634 */ 845 */ 1635 void put_swap_folio(struct folio *folio, swp_ !! 846 void swapcache_free(swp_entry_t entry) 1636 { 847 { 1637 unsigned long offset = swp_offset(ent !! 848 struct swap_info_struct *p; 1638 struct swap_cluster_info *ci; << 1639 struct swap_info_struct *si; << 1640 int size = 1 << swap_entry_order(foli << 1641 << 1642 si = _swap_info_get(entry); << 1643 if (!si) << 1644 return; << 1645 << 1646 ci = lock_cluster_or_swap_info(si, of << 1647 if (size > 1 && swap_is_has_cache(si, << 1648 unlock_cluster_or_swap_info(s << 1649 spin_lock(&si->lock); << 1650 swap_entry_range_free(si, ent << 1651 spin_unlock(&si->lock); << 1652 return; << 1653 } << 1654 for (int i = 0; i < size; i++, entry. 
<< 1655 if (!__swap_entry_free_locked << 1656 unlock_cluster_or_swa << 1657 free_swap_slot(entry) << 1658 if (i == size - 1) << 1659 return; << 1660 lock_cluster_or_swap_ << 1661 } << 1662 } << 1663 unlock_cluster_or_swap_info(si, ci); << 1664 } << 1665 << 1666 static int swp_entry_cmp(const void *ent1, co << 1667 { << 1668 const swp_entry_t *e1 = ent1, *e2 = e << 1669 << 1670 return (int)swp_type(*e1) - (int)swp_ << 1671 } << 1672 << 1673 void swapcache_free_entries(swp_entry_t *entr << 1674 { << 1675 struct swap_info_struct *p, *prev; << 1676 int i; << 1677 << 1678 if (n <= 0) << 1679 return; << 1680 << 1681 prev = NULL; << 1682 p = NULL; << 1683 849 1684 /* !! 850 p = swap_info_get(entry); 1685 * Sort swap entries by swap device, !! 851 if (p) { 1686 * nr_swapfiles isn't absolutely corr !! 852 swap_entry_free(p, entry, SWAP_HAS_CACHE); 1687 * so low that it isn't necessary to << 1688 */ << 1689 if (nr_swapfiles > 1) << 1690 sort(entries, n, sizeof(entri << 1691 for (i = 0; i < n; ++i) { << 1692 p = swap_info_get_cont(entrie << 1693 if (p) << 1694 swap_entry_range_free << 1695 prev = p; << 1696 } << 1697 if (p) << 1698 spin_unlock(&p->lock); 853 spin_unlock(&p->lock); 1699 } !! 854 } 1700 << 1701 int __swap_count(swp_entry_t entry) << 1702 { << 1703 struct swap_info_struct *si = swp_swa << 1704 pgoff_t offset = swp_offset(entry); << 1705 << 1706 return swap_count(si->swap_map[offset << 1707 } 855 } 1708 856 1709 /* 857 /* 1710 * How many references to @entry are currentl !! 858 * How many references to page are currently swapped out? 1711 * This does not give an exact answer when sw 859 * This does not give an exact answer when swap count is continued, 1712 * but does include the high COUNT_CONTINUED 860 * but does include the high COUNT_CONTINUED flag to allow for that. 1713 */ 861 */ 1714 int swap_swapcount(struct swap_info_struct *s !! 862 int page_swapcount(struct page *page) 1715 { 863 { 1716 pgoff_t offset = swp_offset(entry); !! 864 int count = 0; 1717 struct swap_cluster_info *ci; !! 865 struct swap_info_struct *p; 1718 int count; !! 866 swp_entry_t entry; 1719 867 1720 ci = lock_cluster_or_swap_info(si, of !! 868 entry.val = page_private(page); 1721 count = swap_count(si->swap_map[offse !! 869 p = swap_info_get(entry); 1722 unlock_cluster_or_swap_info(si, ci); !! 870 if (p) { >> 871 count = swap_count(p->swap_map[swp_offset(entry)]); >> 872 spin_unlock(&p->lock); >> 873 } 1723 return count; 874 return count; 1724 } 875 } 1725 876 1726 /* 877 /* 1727 * How many references to @entry are currentl 878 * How many references to @entry are currently swapped out? 1728 * This considers COUNT_CONTINUED so it retur 879 * This considers COUNT_CONTINUED so it returns exact answer. 1729 */ 880 */ 1730 int swp_swapcount(swp_entry_t entry) 881 int swp_swapcount(swp_entry_t entry) 1731 { 882 { 1732 int count, tmp_count, n; 883 int count, tmp_count, n; 1733 struct swap_info_struct *si; !! 884 struct swap_info_struct *p; 1734 struct swap_cluster_info *ci; << 1735 struct page *page; 885 struct page *page; 1736 pgoff_t offset; 886 pgoff_t offset; 1737 unsigned char *map; 887 unsigned char *map; 1738 888 1739 si = _swap_info_get(entry); !! 889 p = swap_info_get(entry); 1740 if (!si) !! 890 if (!p) 1741 return 0; 891 return 0; 1742 892 1743 offset = swp_offset(entry); !! 
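/*
 * [Editor's aside -- illustrative sketch, not part of swapfile.c]
 *
 * swapcache_free_entries() above sorts the entry batch with swp_entry_cmp()
 * so that entries belonging to the same swap device end up adjacent, and the
 * per-device lock is then taken once per run of entries rather than once per
 * entry.  The standalone userspace sketch below shows that batching effect
 * with hypothetical toy types and qsort() standing in for the kernel's
 * sort().
 */
#include <stdio.h>
#include <stdlib.h>

struct toy_entry { int type; long offset; };	/* stand-in for swp_entry_t */

static int toy_entry_cmp(const void *a, const void *b)
{
	return ((const struct toy_entry *)a)->type -
	       ((const struct toy_entry *)b)->type;
}

int main(void)
{
	struct toy_entry e[] = { {1, 7}, {0, 3}, {1, 9}, {0, 5}, {2, 1} };
	int n = sizeof(e) / sizeof(e[0]), i, cur = -1;

	qsort(e, n, sizeof(e[0]), toy_entry_cmp);	/* mirrors sort(entries, n, ...) */
	for (i = 0; i < n; i++) {
		if (e[i].type != cur) {			/* "lock switch": once per device */
			cur = e[i].type;
			printf("take lock for device %d\n", cur);
		}
		printf("  free entry (%d, %ld)\n", e[i].type, e[i].offset);
	}
	return 0;
}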
893 count = swap_count(p->swap_map[swp_offset(entry)]); 1744 << 1745 ci = lock_cluster_or_swap_info(si, of << 1746 << 1747 count = swap_count(si->swap_map[offse << 1748 if (!(count & COUNT_CONTINUED)) 894 if (!(count & COUNT_CONTINUED)) 1749 goto out; 895 goto out; 1750 896 1751 count &= ~COUNT_CONTINUED; 897 count &= ~COUNT_CONTINUED; 1752 n = SWAP_MAP_MAX + 1; 898 n = SWAP_MAP_MAX + 1; 1753 899 1754 page = vmalloc_to_page(si->swap_map + !! 900 offset = swp_offset(entry); >> 901 page = vmalloc_to_page(p->swap_map + offset); 1755 offset &= ~PAGE_MASK; 902 offset &= ~PAGE_MASK; 1756 VM_BUG_ON(page_private(page) != SWP_C 903 VM_BUG_ON(page_private(page) != SWP_CONTINUED); 1757 904 1758 do { 905 do { 1759 page = list_next_entry(page, !! 906 page = list_entry(page->lru.next, struct page, lru); 1760 map = kmap_local_page(page); !! 907 map = kmap_atomic(page); 1761 tmp_count = map[offset]; 908 tmp_count = map[offset]; 1762 kunmap_local(map); !! 909 kunmap_atomic(map); 1763 910 1764 count += (tmp_count & ~COUNT_ 911 count += (tmp_count & ~COUNT_CONTINUED) * n; 1765 n *= (SWAP_CONT_MAX + 1); 912 n *= (SWAP_CONT_MAX + 1); 1766 } while (tmp_count & COUNT_CONTINUED) 913 } while (tmp_count & COUNT_CONTINUED); 1767 out: 914 out: 1768 unlock_cluster_or_swap_info(si, ci); !! 915 spin_unlock(&p->lock); 1769 return count; 916 return count; 1770 } 917 } 1771 918 1772 static bool swap_page_trans_huge_swapped(stru !! 919 /* 1773 swp_ !! 920 * We can write to an anon page without COW if there are no other references >> 921 * to it. And as a side-effect, free up its swap: because the old content >> 922 * on disk will never be read, and seeking back there to write new content >> 923 * later would only waste time away from clustering. >> 924 */ >> 925 int reuse_swap_page(struct page *page) 1774 { 926 { 1775 struct swap_cluster_info *ci; !! 927 int count; 1776 unsigned char *map = si->swap_map; << 1777 unsigned int nr_pages = 1 << order; << 1778 unsigned long roffset = swp_offset(en << 1779 unsigned long offset = round_down(rof << 1780 int i; << 1781 bool ret = false; << 1782 928 1783 ci = lock_cluster_or_swap_info(si, of !! 929 VM_BUG_ON_PAGE(!PageLocked(page), page); 1784 if (!ci || nr_pages == 1) { !! 930 if (unlikely(PageKsm(page))) 1785 if (swap_count(map[roffset])) !! 931 return 0; 1786 ret = true; !! 932 count = page_mapcount(page); 1787 goto unlock_out; !! 933 if (count <= 1 && PageSwapCache(page)) { 1788 } !! 934 count += page_swapcount(page); 1789 for (i = 0; i < nr_pages; i++) { !! 935 if (count == 1 && !PageWriteback(page)) { 1790 if (swap_count(map[offset + i !! 936 delete_from_swap_cache(page); 1791 ret = true; !! 937 SetPageDirty(page); 1792 break; << 1793 } 938 } 1794 } 939 } 1795 unlock_out: !! 940 return count <= 1; 1796 unlock_cluster_or_swap_info(si, ci); << 1797 return ret; << 1798 } << 1799 << 1800 static bool folio_swapped(struct folio *folio << 1801 { << 1802 swp_entry_t entry = folio->swap; << 1803 struct swap_info_struct *si = _swap_i << 1804 << 1805 if (!si) << 1806 return false; << 1807 << 1808 if (!IS_ENABLED(CONFIG_THP_SWAP) || l << 1809 return swap_swapcount(si, ent << 1810 << 1811 return swap_page_trans_huge_swapped(s << 1812 } 941 } 1813 942 1814 static bool folio_swapcache_freeable(struct f !! 943 /* >> 944 * If swap is getting full, or if there are no more mappings of this page, >> 945 * then try_to_free_swap is called to free its swap space. >> 946 */ >> 947 int try_to_free_swap(struct page *page) 1815 { 948 { 1816 VM_BUG_ON_FOLIO(!folio_test_locked(fo !! 
949 VM_BUG_ON_PAGE(!PageLocked(page), page); 1817 950 1818 if (!folio_test_swapcache(folio)) !! 951 if (!PageSwapCache(page)) 1819 return false; !! 952 return 0; 1820 if (folio_test_writeback(folio)) !! 953 if (PageWriteback(page)) 1821 return false; !! 954 return 0; >> 955 if (page_swapcount(page)) >> 956 return 0; 1822 957 1823 /* 958 /* 1824 * Once hibernation has begun to crea 959 * Once hibernation has begun to create its image of memory, 1825 * there's a danger that one of the c !! 960 * there's a danger that one of the calls to try_to_free_swap() 1826 * - most probably a call from __try_ 961 * - most probably a call from __try_to_reclaim_swap() while 1827 * hibernation is allocating its own 962 * hibernation is allocating its own swap pages for the image, 1828 * but conceivably even a call from m 963 * but conceivably even a call from memory reclaim - will free 1829 * the swap from a folio which has al !! 964 * the swap from a page which has already been recorded in the 1830 * image as a clean swapcache folio, !! 965 * image as a clean swapcache page, and then reuse its swap for 1831 * another page of the image. On wak 966 * another page of the image. On waking from hibernation, the 1832 * original folio might be freed unde !! 967 * original page might be freed under memory pressure, then 1833 * later read back in from swap, now 968 * later read back in from swap, now with the wrong data. 1834 * 969 * 1835 * Hibernation suspends storage while 970 * Hibernation suspends storage while it is writing the image 1836 * to disk so check that here. 971 * to disk so check that here. 1837 */ 972 */ 1838 if (pm_suspended_storage()) 973 if (pm_suspended_storage()) 1839 return false; !! 974 return 0; 1840 << 1841 return true; << 1842 } << 1843 << 1844 /** << 1845 * folio_free_swap() - Free the swap space us << 1846 * @folio: The folio to remove. << 1847 * << 1848 * If swap is getting full, or if there are n << 1849 * then call folio_free_swap to free its swap << 1850 * << 1851 * Return: true if we were able to release th << 1852 */ << 1853 bool folio_free_swap(struct folio *folio) << 1854 { << 1855 if (!folio_swapcache_freeable(folio)) << 1856 return false; << 1857 if (folio_swapped(folio)) << 1858 return false; << 1859 975 1860 delete_from_swap_cache(folio); !! 976 delete_from_swap_cache(page); 1861 folio_set_dirty(folio); !! 977 SetPageDirty(page); 1862 return true; !! 978 return 1; 1863 } 979 } 1864 980 1865 /** !! 981 /* 1866 * free_swap_and_cache_nr() - Release referen !! 982 * Free the swap entry like above, but also try to 1867 * reclaim their c !! 983 * free the page cache entry if it is the last user. 1868 * @entry: First entry of range. << 1869 * @nr: Number of entries in range. << 1870 * << 1871 * For each swap entry in the contiguous rang << 1872 * entries become free, try to reclaim their << 1873 * offset range is defined by [entry.offset, << 1874 */ 984 */ 1875 void free_swap_and_cache_nr(swp_entry_t entry !! 985 int free_swap_and_cache(swp_entry_t entry) 1876 { 986 { 1877 const unsigned long start_offset = sw !! 987 struct swap_info_struct *p; 1878 const unsigned long end_offset = star !! 988 struct page *page = NULL; 1879 struct swap_info_struct *si; << 1880 bool any_only_cache = false; << 1881 unsigned long offset; << 1882 989 1883 if (non_swap_entry(entry)) 990 if (non_swap_entry(entry)) 1884 return; !! 991 return 1; 1885 992 1886 si = get_swap_device(entry); !! 993 p = swap_info_get(entry); 1887 if (!si) !! 994 if (p) { 1888 return; !! 
995 if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { 1889 !! 996 page = find_get_page(swap_address_space(entry), 1890 if (WARN_ON(end_offset > si->max)) !! 997 entry.val); 1891 goto out; !! 998 if (page && !trylock_page(page)) { 1892 !! 999 page_cache_release(page); 1893 /* !! 1000 page = NULL; 1894 * First free all entries in the rang !! 1001 } 1895 */ << 1896 any_only_cache = __swap_entries_free( << 1897 << 1898 /* << 1899 * Short-circuit the below loop if no << 1900 * reference drop to zero. << 1901 */ << 1902 if (!any_only_cache) << 1903 goto out; << 1904 << 1905 /* << 1906 * Now go back over the range trying << 1907 * more efficient for large folios be << 1908 * the swap once per folio in the com << 1909 * __swap_entry_free() and __try_to_r << 1910 * latter will get a reference and lo << 1911 * page but will only succeed once th << 1912 * zero. << 1913 */ << 1914 for (offset = start_offset; offset < << 1915 nr = 1; << 1916 if (READ_ONCE(si->swap_map[of << 1917 /* << 1918 * Folios are always << 1919 * advance forward to << 1920 * folio was found fo << 1921 * in this case. Nega << 1922 * but could not be r << 1923 * to the next bounda << 1924 */ << 1925 nr = __try_to_reclaim << 1926 << 1927 if (nr == 0) << 1928 nr = 1; << 1929 else if (nr < 0) << 1930 nr = -nr; << 1931 nr = ALIGN(offset + 1 << 1932 } 1002 } >> 1003 spin_unlock(&p->lock); 1933 } 1004 } 1934 !! 1005 if (page) { 1935 out: !! 1006 /* 1936 put_swap_device(si); !! 1007 * Not mapped elsewhere, or swap space full? Free it! >> 1008 * Also recheck PageSwapCache now page is locked (above). >> 1009 */ >> 1010 if (PageSwapCache(page) && !PageWriteback(page) && >> 1011 (!page_mapped(page) || vm_swap_full())) { >> 1012 delete_from_swap_cache(page); >> 1013 SetPageDirty(page); >> 1014 } >> 1015 unlock_page(page); >> 1016 page_cache_release(page); >> 1017 } >> 1018 return p != NULL; 1937 } 1019 } 1938 1020 1939 #ifdef CONFIG_HIBERNATION 1021 #ifdef CONFIG_HIBERNATION 1940 << 1941 swp_entry_t get_swap_page_of_type(int type) << 1942 { << 1943 struct swap_info_struct *si = swap_ty << 1944 swp_entry_t entry = {0}; << 1945 << 1946 if (!si) << 1947 goto fail; << 1948 << 1949 /* This is called for allocating swap << 1950 spin_lock(&si->lock); << 1951 if ((si->flags & SWP_WRITEOK) && scan << 1952 atomic_long_dec(&nr_swap_page << 1953 spin_unlock(&si->lock); << 1954 fail: << 1955 return entry; << 1956 } << 1957 << 1958 /* 1022 /* 1959 * Find the swap type that corresponds to giv 1023 * Find the swap type that corresponds to given device (if any). 1960 * 1024 * 1961 * @offset - number of the PAGE_SIZE-sized bl 1025 * @offset - number of the PAGE_SIZE-sized block of the device, starting 1962 * from 0, in which the swap header is expect 1026 * from 0, in which the swap header is expected to be located. 1963 * 1027 * 1964 * This is needed for the suspend to disk (ak 1028 * This is needed for the suspend to disk (aka swsusp). 1965 */ 1029 */ 1966 int swap_type_of(dev_t device, sector_t offse !! 1030 int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) 1967 { 1031 { >> 1032 struct block_device *bdev = NULL; 1968 int type; 1033 int type; 1969 1034 1970 if (!device) !! 1035 if (device) 1971 return -1; !! 
1036 bdev = bdget(device); 1972 1037 1973 spin_lock(&swap_lock); 1038 spin_lock(&swap_lock); 1974 for (type = 0; type < nr_swapfiles; t 1039 for (type = 0; type < nr_swapfiles; type++) { 1975 struct swap_info_struct *sis 1040 struct swap_info_struct *sis = swap_info[type]; 1976 1041 1977 if (!(sis->flags & SWP_WRITEO 1042 if (!(sis->flags & SWP_WRITEOK)) 1978 continue; 1043 continue; 1979 1044 1980 if (device == sis->bdev->bd_d !! 1045 if (!bdev) { 1981 struct swap_extent *s !! 1046 if (bdev_p) >> 1047 *bdev_p = bdgrab(sis->bdev); >> 1048 >> 1049 spin_unlock(&swap_lock); >> 1050 return type; >> 1051 } >> 1052 if (bdev == sis->bdev) { >> 1053 struct swap_extent *se = &sis->first_swap_extent; 1982 1054 1983 if (se->start_block = 1055 if (se->start_block == offset) { >> 1056 if (bdev_p) >> 1057 *bdev_p = bdgrab(sis->bdev); >> 1058 1984 spin_unlock(& 1059 spin_unlock(&swap_lock); >> 1060 bdput(bdev); 1985 return type; 1061 return type; 1986 } 1062 } 1987 } 1063 } 1988 } 1064 } 1989 spin_unlock(&swap_lock); 1065 spin_unlock(&swap_lock); 1990 return -ENODEV; !! 1066 if (bdev) 1991 } !! 1067 bdput(bdev); 1992 << 1993 int find_first_swap(dev_t *device) << 1994 { << 1995 int type; << 1996 << 1997 spin_lock(&swap_lock); << 1998 for (type = 0; type < nr_swapfiles; t << 1999 struct swap_info_struct *sis << 2000 1068 2001 if (!(sis->flags & SWP_WRITEO << 2002 continue; << 2003 *device = sis->bdev->bd_dev; << 2004 spin_unlock(&swap_lock); << 2005 return type; << 2006 } << 2007 spin_unlock(&swap_lock); << 2008 return -ENODEV; 1069 return -ENODEV; 2009 } 1070 } 2010 1071 2011 /* 1072 /* 2012 * Get the (PAGE_SIZE) block corresponding to 1073 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev 2013 * corresponding to given index in swap_info 1074 * corresponding to given index in swap_info (swap type). 2014 */ 1075 */ 2015 sector_t swapdev_block(int type, pgoff_t offs 1076 sector_t swapdev_block(int type, pgoff_t offset) 2016 { 1077 { 2017 struct swap_info_struct *si = swap_ty !! 1078 struct block_device *bdev; 2018 struct swap_extent *se; << 2019 1079 2020 if (!si || !(si->flags & SWP_WRITEOK) !! 1080 if ((unsigned int)type >= nr_swapfiles) >> 1081 return 0; >> 1082 if (!(swap_info[type]->flags & SWP_WRITEOK)) 2021 return 0; 1083 return 0; 2022 se = offset_to_swap_extent(si, offset !! 
1084 return map_swap_entry(swp_entry(type, offset), &bdev); 2023 return se->start_block + (offset - se << 2024 } 1085 } 2025 1086 2026 /* 1087 /* 2027 * Return either the total number of swap pag 1088 * Return either the total number of swap pages of given type, or the number 2028 * of free pages of that type (depending on @ 1089 * of free pages of that type (depending on @free) 2029 * 1090 * 2030 * This is needed for software suspend 1091 * This is needed for software suspend 2031 */ 1092 */ 2032 unsigned int count_swap_pages(int type, int f 1093 unsigned int count_swap_pages(int type, int free) 2033 { 1094 { 2034 unsigned int n = 0; 1095 unsigned int n = 0; 2035 1096 2036 spin_lock(&swap_lock); 1097 spin_lock(&swap_lock); 2037 if ((unsigned int)type < nr_swapfiles 1098 if ((unsigned int)type < nr_swapfiles) { 2038 struct swap_info_struct *sis 1099 struct swap_info_struct *sis = swap_info[type]; 2039 1100 2040 spin_lock(&sis->lock); 1101 spin_lock(&sis->lock); 2041 if (sis->flags & SWP_WRITEOK) 1102 if (sis->flags & SWP_WRITEOK) { 2042 n = sis->pages; 1103 n = sis->pages; 2043 if (free) 1104 if (free) 2044 n -= sis->inu 1105 n -= sis->inuse_pages; 2045 } 1106 } 2046 spin_unlock(&sis->lock); 1107 spin_unlock(&sis->lock); 2047 } 1108 } 2048 spin_unlock(&swap_lock); 1109 spin_unlock(&swap_lock); 2049 return n; 1110 return n; 2050 } 1111 } 2051 #endif /* CONFIG_HIBERNATION */ 1112 #endif /* CONFIG_HIBERNATION */ 2052 1113 2053 static inline int pte_same_as_swp(pte_t pte, !! 1114 static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) 2054 { 1115 { 2055 return pte_same(pte_swp_clear_flags(p !! 1116 #ifdef CONFIG_MEM_SOFT_DIRTY >> 1117 /* >> 1118 * When pte keeps soft dirty bit the pte generated >> 1119 * from swap entry does not has it, still it's same >> 1120 * pte from logical point of view. >> 1121 */ >> 1122 pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte); >> 1123 return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty); >> 1124 #else >> 1125 return pte_same(pte, swp_pte); >> 1126 #endif 2056 } 1127 } 2057 1128 2058 /* 1129 /* 2059 * No need to decide whether this PTE shares 1130 * No need to decide whether this PTE shares the swap entry with others, 2060 * just let do_wp_page work it out if a write 1131 * just let do_wp_page work it out if a write is requested later - to 2061 * force COW, vm_page_prot omits write permis 1132 * force COW, vm_page_prot omits write permission from any private vma. 2062 */ 1133 */ 2063 static int unuse_pte(struct vm_area_struct *v 1134 static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, 2064 unsigned long addr, swp_entry !! 1135 unsigned long addr, swp_entry_t entry, struct page *page) 2065 { 1136 { 2066 struct page *page; !! 1137 struct page *swapcache; 2067 struct folio *swapcache; !! 1138 struct mem_cgroup *memcg; 2068 spinlock_t *ptl; 1139 spinlock_t *ptl; 2069 pte_t *pte, new_pte, old_pte; !! 1140 pte_t *pte; 2070 bool hwpoisoned = false; << 2071 int ret = 1; 1141 int ret = 1; 2072 1142 2073 swapcache = folio; !! 1143 swapcache = page; 2074 folio = ksm_might_need_to_copy(folio, !! 1144 page = ksm_might_need_to_copy(page, vma, addr); 2075 if (unlikely(!folio)) !! 1145 if (unlikely(!page)) 2076 return -ENOMEM; 1146 return -ENOMEM; 2077 else if (unlikely(folio == ERR_PTR(-E << 2078 hwpoisoned = true; << 2079 folio = swapcache; << 2080 } << 2081 1147 2082 page = folio_file_page(folio, swp_off !! 1148 if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) { 2083 if (PageHWPoison(page)) !! 
1149 ret = -ENOMEM; 2084 hwpoisoned = true; !! 1150 goto out_nolock; >> 1151 } 2085 1152 2086 pte = pte_offset_map_lock(vma->vm_mm, 1153 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 2087 if (unlikely(!pte || !pte_same_as_swp !! 1154 if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { 2088 !! 1155 mem_cgroup_cancel_charge(page, memcg); 2089 ret = 0; 1156 ret = 0; 2090 goto out; 1157 goto out; 2091 } 1158 } 2092 1159 2093 old_pte = ptep_get(pte); << 2094 << 2095 if (unlikely(hwpoisoned || !folio_tes << 2096 swp_entry_t swp_entry; << 2097 << 2098 dec_mm_counter(vma->vm_mm, MM << 2099 if (hwpoisoned) { << 2100 swp_entry = make_hwpo << 2101 } else { << 2102 swp_entry = make_pois << 2103 } << 2104 new_pte = swp_entry_to_pte(sw << 2105 ret = 0; << 2106 goto setpte; << 2107 } << 2108 << 2109 /* << 2110 * Some architectures may have to res << 2111 * when reading from swap. This metad << 2112 * so this must be called before swap << 2113 */ << 2114 arch_swap_restore(folio_swap(entry, f << 2115 << 2116 dec_mm_counter(vma->vm_mm, MM_SWAPENT 1160 dec_mm_counter(vma->vm_mm, MM_SWAPENTS); 2117 inc_mm_counter(vma->vm_mm, MM_ANONPAG 1161 inc_mm_counter(vma->vm_mm, MM_ANONPAGES); 2118 folio_get(folio); !! 1162 get_page(page); 2119 if (folio == swapcache) { !! 1163 set_pte_at(vma->vm_mm, addr, pte, 2120 rmap_t rmap_flags = RMAP_NONE !! 1164 pte_mkold(mk_pte(page, vma->vm_page_prot))); 2121 !! 1165 if (page == swapcache) { 2122 /* !! 1166 page_add_anon_rmap(page, vma, addr); 2123 * See do_swap_page(): writeb !! 1167 mem_cgroup_commit_charge(page, memcg, true); 2124 * However, we do a folio_wai << 2125 * call and have the folio lo << 2126 */ << 2127 VM_BUG_ON_FOLIO(folio_test_wr << 2128 if (pte_swp_exclusive(old_pte << 2129 rmap_flags |= RMAP_EX << 2130 /* << 2131 * We currently only expect s << 2132 * fully exclusive or fully s << 2133 * here, we have to be carefu << 2134 */ << 2135 if (!folio_test_anon(folio)) << 2136 VM_WARN_ON_ONCE(folio << 2137 VM_WARN_ON_FOLIO(!fol << 2138 folio_add_new_anon_rm << 2139 } else { << 2140 folio_add_anon_rmap_p << 2141 } << 2142 } else { /* ksm created a completely 1168 } else { /* ksm created a completely new copy */ 2143 folio_add_new_anon_rmap(folio !! 1169 page_add_new_anon_rmap(page, vma, addr); 2144 folio_add_lru_vma(folio, vma) !! 1170 mem_cgroup_commit_charge(page, memcg, false); >> 1171 lru_cache_add_active_or_unevictable(page, vma); 2145 } 1172 } 2146 new_pte = pte_mkold(mk_pte(page, vma- << 2147 if (pte_swp_soft_dirty(old_pte)) << 2148 new_pte = pte_mksoft_dirty(ne << 2149 if (pte_swp_uffd_wp(old_pte)) << 2150 new_pte = pte_mkuffd_wp(new_p << 2151 setpte: << 2152 set_pte_at(vma->vm_mm, addr, pte, new << 2153 swap_free(entry); 1173 swap_free(entry); >> 1174 /* >> 1175 * Move the page to the active list so it is not >> 1176 * immediately swapped out again after swapon. >> 1177 */ >> 1178 activate_page(page); 2154 out: 1179 out: 2155 if (pte) !! 1180 pte_unmap_unlock(pte, ptl); 2156 pte_unmap_unlock(pte, ptl); !! 1181 out_nolock: 2157 if (folio != swapcache) { !! 1182 if (page != swapcache) { 2158 folio_unlock(folio); !! 1183 unlock_page(page); 2159 folio_put(folio); !! 1184 put_page(page); 2160 } 1185 } 2161 return ret; 1186 return ret; 2162 } 1187 } 2163 1188 2164 static int unuse_pte_range(struct vm_area_str 1189 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 2165 unsigned long addr, u !! 1190 unsigned long addr, unsigned long end, 2166 unsigned int type) !! 
1191 swp_entry_t entry, struct page *page) 2167 { 1192 { 2168 pte_t *pte = NULL; !! 1193 pte_t swp_pte = swp_entry_to_pte(entry); 2169 struct swap_info_struct *si; !! 1194 pte_t *pte; >> 1195 int ret = 0; 2170 1196 2171 si = swap_info[type]; !! 1197 /* >> 1198 * We don't actually need pte lock while scanning for swp_pte: since >> 1199 * we hold page lock and mmap_sem, swp_pte cannot be inserted into the >> 1200 * page table while we're scanning; though it could get zapped, and on >> 1201 * some architectures (e.g. x86_32 with PAE) we might catch a glimpse >> 1202 * of unmatched parts which look like swp_pte, so unuse_pte must >> 1203 * recheck under pte lock. Scanning without pte lock lets it be >> 1204 * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. >> 1205 */ >> 1206 pte = pte_offset_map(pmd, addr); 2172 do { 1207 do { 2173 struct folio *folio; !! 1208 /* 2174 unsigned long offset; !! 1209 * swapoff spends a _lot_ of time in this loop! 2175 unsigned char swp_count; !! 1210 * Test inline before going to call unuse_pte. 2176 swp_entry_t entry; !! 1211 */ 2177 int ret; !! 1212 if (unlikely(maybe_same_pte(*pte, swp_pte))) { 2178 pte_t ptent; !! 1213 pte_unmap(pte); 2179 !! 1214 ret = unuse_pte(vma, pmd, addr, entry, page); 2180 if (!pte++) { !! 1215 if (ret) >> 1216 goto out; 2181 pte = pte_offset_map( 1217 pte = pte_offset_map(pmd, addr); 2182 if (!pte) << 2183 break; << 2184 } << 2185 << 2186 ptent = ptep_get_lockless(pte << 2187 << 2188 if (!is_swap_pte(ptent)) << 2189 continue; << 2190 << 2191 entry = pte_to_swp_entry(pten << 2192 if (swp_type(entry) != type) << 2193 continue; << 2194 << 2195 offset = swp_offset(entry); << 2196 pte_unmap(pte); << 2197 pte = NULL; << 2198 << 2199 folio = swap_cache_get_folio( << 2200 if (!folio) { << 2201 struct vm_fault vmf = << 2202 .vma = vma, << 2203 .address = ad << 2204 .real_address << 2205 .pmd = pmd, << 2206 }; << 2207 << 2208 folio = swapin_readah << 2209 << 2210 } << 2211 if (!folio) { << 2212 swp_count = READ_ONCE << 2213 if (swp_count == 0 || << 2214 continue; << 2215 return -ENOMEM; << 2216 } << 2217 << 2218 folio_lock(folio); << 2219 folio_wait_writeback(folio); << 2220 ret = unuse_pte(vma, pmd, add << 2221 if (ret < 0) { << 2222 folio_unlock(folio); << 2223 folio_put(folio); << 2224 return ret; << 2225 } 1218 } 2226 !! 1219 } while (pte++, addr += PAGE_SIZE, addr != end); 2227 folio_free_swap(folio); !! 1220 pte_unmap(pte - 1); 2228 folio_unlock(folio); !! 1221 out: 2229 folio_put(folio); !! 1222 return ret; 2230 } while (addr += PAGE_SIZE, addr != e << 2231 << 2232 if (pte) << 2233 pte_unmap(pte); << 2234 return 0; << 2235 } 1223 } 2236 1224 2237 static inline int unuse_pmd_range(struct vm_a 1225 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, 2238 unsigned long 1226 unsigned long addr, unsigned long end, 2239 unsigned int !! 1227 swp_entry_t entry, struct page *page) 2240 { 1228 { 2241 pmd_t *pmd; 1229 pmd_t *pmd; 2242 unsigned long next; 1230 unsigned long next; 2243 int ret; 1231 int ret; 2244 1232 2245 pmd = pmd_offset(pud, addr); 1233 pmd = pmd_offset(pud, addr); 2246 do { 1234 do { 2247 cond_resched(); << 2248 next = pmd_addr_end(addr, end 1235 next = pmd_addr_end(addr, end); 2249 ret = unuse_pte_range(vma, pm !! 
1236 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) >> 1237 continue; >> 1238 ret = unuse_pte_range(vma, pmd, addr, next, entry, page); 2250 if (ret) 1239 if (ret) 2251 return ret; 1240 return ret; 2252 } while (pmd++, addr = next, addr != 1241 } while (pmd++, addr = next, addr != end); 2253 return 0; 1242 return 0; 2254 } 1243 } 2255 1244 2256 static inline int unuse_pud_range(struct vm_a !! 1245 static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 2257 unsigned long 1246 unsigned long addr, unsigned long end, 2258 unsigned int !! 1247 swp_entry_t entry, struct page *page) 2259 { 1248 { 2260 pud_t *pud; 1249 pud_t *pud; 2261 unsigned long next; 1250 unsigned long next; 2262 int ret; 1251 int ret; 2263 1252 2264 pud = pud_offset(p4d, addr); !! 1253 pud = pud_offset(pgd, addr); 2265 do { 1254 do { 2266 next = pud_addr_end(addr, end 1255 next = pud_addr_end(addr, end); 2267 if (pud_none_or_clear_bad(pud 1256 if (pud_none_or_clear_bad(pud)) 2268 continue; 1257 continue; 2269 ret = unuse_pmd_range(vma, pu !! 1258 ret = unuse_pmd_range(vma, pud, addr, next, entry, page); 2270 if (ret) 1259 if (ret) 2271 return ret; 1260 return ret; 2272 } while (pud++, addr = next, addr != 1261 } while (pud++, addr = next, addr != end); 2273 return 0; 1262 return 0; 2274 } 1263 } 2275 1264 2276 static inline int unuse_p4d_range(struct vm_a !! 1265 static int unuse_vma(struct vm_area_struct *vma, 2277 unsigned long !! 1266 swp_entry_t entry, struct page *page) 2278 unsigned int << 2279 { << 2280 p4d_t *p4d; << 2281 unsigned long next; << 2282 int ret; << 2283 << 2284 p4d = p4d_offset(pgd, addr); << 2285 do { << 2286 next = p4d_addr_end(addr, end << 2287 if (p4d_none_or_clear_bad(p4d << 2288 continue; << 2289 ret = unuse_pud_range(vma, p4 << 2290 if (ret) << 2291 return ret; << 2292 } while (p4d++, addr = next, addr != << 2293 return 0; << 2294 } << 2295 << 2296 static int unuse_vma(struct vm_area_struct *v << 2297 { 1267 { 2298 pgd_t *pgd; 1268 pgd_t *pgd; 2299 unsigned long addr, end, next; 1269 unsigned long addr, end, next; 2300 int ret; 1270 int ret; 2301 1271 2302 addr = vma->vm_start; !! 1272 if (page_anon_vma(page)) { 2303 end = vma->vm_end; !! 1273 addr = page_address_in_vma(page, vma); >> 1274 if (addr == -EFAULT) >> 1275 return 0; >> 1276 else >> 1277 end = addr + PAGE_SIZE; >> 1278 } else { >> 1279 addr = vma->vm_start; >> 1280 end = vma->vm_end; >> 1281 } 2304 1282 2305 pgd = pgd_offset(vma->vm_mm, addr); 1283 pgd = pgd_offset(vma->vm_mm, addr); 2306 do { 1284 do { 2307 next = pgd_addr_end(addr, end 1285 next = pgd_addr_end(addr, end); 2308 if (pgd_none_or_clear_bad(pgd 1286 if (pgd_none_or_clear_bad(pgd)) 2309 continue; 1287 continue; 2310 ret = unuse_p4d_range(vma, pg !! 1288 ret = unuse_pud_range(vma, pgd, addr, next, entry, page); 2311 if (ret) 1289 if (ret) 2312 return ret; 1290 return ret; 2313 } while (pgd++, addr = next, addr != 1291 } while (pgd++, addr = next, addr != end); 2314 return 0; 1292 return 0; 2315 } 1293 } 2316 1294 2317 static int unuse_mm(struct mm_struct *mm, uns !! 1295 static int unuse_mm(struct mm_struct *mm, >> 1296 swp_entry_t entry, struct page *page) 2318 { 1297 { 2319 struct vm_area_struct *vma; 1298 struct vm_area_struct *vma; 2320 int ret = 0; 1299 int ret = 0; 2321 VMA_ITERATOR(vmi, mm, 0); << 2322 << 2323 mmap_read_lock(mm); << 2324 for_each_vma(vmi, vma) { << 2325 if (vma->anon_vma && !is_vm_h << 2326 ret = unuse_vma(vma, << 2327 if (ret) << 2328 break; << 2329 } << 2330 1300 2331 cond_resched(); !! 
1301 if (!down_read_trylock(&mm->mmap_sem)) { >> 1302 /* >> 1303 * Activate page so shrink_inactive_list is unlikely to unmap >> 1304 * its ptes while lock is dropped, so swapoff can make progress. >> 1305 */ >> 1306 activate_page(page); >> 1307 unlock_page(page); >> 1308 down_read(&mm->mmap_sem); >> 1309 lock_page(page); 2332 } 1310 } 2333 mmap_read_unlock(mm); !! 1311 for (vma = mm->mmap; vma; vma = vma->vm_next) { 2334 return ret; !! 1312 if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) >> 1313 break; >> 1314 } >> 1315 up_read(&mm->mmap_sem); >> 1316 return (ret < 0)? ret: 0; 2335 } 1317 } 2336 1318 2337 /* 1319 /* 2338 * Scan swap_map from current position to nex !! 1320 * Scan swap_map (or frontswap_map if frontswap parameter is true) 2339 * Return 0 if there are no inuse entries aft !! 1321 * from current position to next entry still in use. 2340 * the map. !! 1322 * Recycle to start on reaching the end, returning 0 when empty. 2341 */ 1323 */ 2342 static unsigned int find_next_to_unuse(struct 1324 static unsigned int find_next_to_unuse(struct swap_info_struct *si, 2343 unsig !! 1325 unsigned int prev, bool frontswap) 2344 { 1326 { 2345 unsigned int i; !! 1327 unsigned int max = si->max; >> 1328 unsigned int i = prev; 2346 unsigned char count; 1329 unsigned char count; 2347 1330 2348 /* 1331 /* 2349 * No need for swap_lock here: we're 1332 * No need for swap_lock here: we're just looking 2350 * for whether an entry is in use, no 1333 * for whether an entry is in use, not modifying it; false 2351 * hits are okay, and sys_swapoff() h 1334 * hits are okay, and sys_swapoff() has already prevented new 2352 * allocations from this area (while 1335 * allocations from this area (while holding swap_lock). 2353 */ 1336 */ 2354 for (i = prev + 1; i < si->max; i++) !! 1337 for (;;) { >> 1338 if (++i >= max) { >> 1339 if (!prev) { >> 1340 i = 0; >> 1341 break; >> 1342 } >> 1343 /* >> 1344 * No entries in use at top of swap_map, >> 1345 * loop back to start and recheck there. >> 1346 */ >> 1347 max = prev + 1; >> 1348 prev = 0; >> 1349 i = 1; >> 1350 } >> 1351 if (frontswap) { >> 1352 if (frontswap_test(si, i)) >> 1353 break; >> 1354 else >> 1355 continue; >> 1356 } 2355 count = READ_ONCE(si->swap_ma 1357 count = READ_ONCE(si->swap_map[i]); 2356 if (count && swap_count(count 1358 if (count && swap_count(count) != SWAP_MAP_BAD) 2357 break; 1359 break; 2358 if ((i % LATENCY_LIMIT) == 0) << 2359 cond_resched(); << 2360 } 1360 } 2361 << 2362 if (i == si->max) << 2363 i = 0; << 2364 << 2365 return i; 1361 return i; 2366 } 1362 } 2367 1363 2368 static int try_to_unuse(unsigned int type) !! 1364 /* >> 1365 * We completely avoid races by reading each swap page in advance, >> 1366 * and then search for the process using it. All the necessary >> 1367 * page table adjustments can then be made atomically. >> 1368 * >> 1369 * if the boolean frontswap is true, only unuse pages_to_unuse pages; >> 1370 * pages_to_unuse==0 means all pages; ignored if frontswap is false >> 1371 */ >> 1372 int try_to_unuse(unsigned int type, bool frontswap, >> 1373 unsigned long pages_to_unuse) 2369 { 1374 { 2370 struct mm_struct *prev_mm; << 2371 struct mm_struct *mm; << 2372 struct list_head *p; << 2373 int retval = 0; << 2374 struct swap_info_struct *si = swap_in 1375 struct swap_info_struct *si = swap_info[type]; 2375 struct folio *folio; !! 1376 struct mm_struct *start_mm; >> 1377 volatile unsigned char *swap_map; /* swap_map is accessed without >> 1378 * locking. 
Mark it as volatile >> 1379 * to prevent compiler doing >> 1380 * something odd. >> 1381 */ >> 1382 unsigned char swcount; >> 1383 struct page *page; 2376 swp_entry_t entry; 1384 swp_entry_t entry; 2377 unsigned int i; !! 1385 unsigned int i = 0; >> 1386 int retval = 0; 2378 1387 2379 if (!READ_ONCE(si->inuse_pages)) !! 1388 /* 2380 goto success; !! 1389 * When searching mms for an entry, a good strategy is to >> 1390 * start at the first mm we freed the previous entry from >> 1391 * (though actually we don't notice whether we or coincidence >> 1392 * freed the entry). Initialize this start_mm with a hold. >> 1393 * >> 1394 * A simpler strategy would be to start at the last mm we >> 1395 * freed the previous entry from; but that would take less >> 1396 * advantage of mmlist ordering, which clusters forked mms >> 1397 * together, child after parent. If we race with dup_mmap(), we >> 1398 * prefer to resolve parent before child, lest we miss entries >> 1399 * duplicated after we scanned child: using last mm would invert >> 1400 * that. >> 1401 */ >> 1402 start_mm = &init_mm; >> 1403 atomic_inc(&init_mm.mm_users); >> 1404 >> 1405 /* >> 1406 * Keep on scanning until all entries have gone. Usually, >> 1407 * one pass through swap_map is enough, but not necessarily: >> 1408 * there are races when an instance of an entry might be missed. >> 1409 */ >> 1410 while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { >> 1411 if (signal_pending(current)) { >> 1412 retval = -EINTR; >> 1413 break; >> 1414 } 2381 1415 2382 retry: !! 1416 /* 2383 retval = shmem_unuse(type); !! 1417 * Get a page for the entry, using the existing swap 2384 if (retval) !! 1418 * cache page if there is one. Otherwise, get a clean 2385 return retval; !! 1419 * page and read the swap into it. >> 1420 */ >> 1421 swap_map = &si->swap_map[i]; >> 1422 entry = swp_entry(type, i); >> 1423 page = read_swap_cache_async(entry, >> 1424 GFP_HIGHUSER_MOVABLE, NULL, 0); >> 1425 if (!page) { >> 1426 /* >> 1427 * Either swap_duplicate() failed because entry >> 1428 * has been freed independently, and will not be >> 1429 * reused since sys_swapoff() already disabled >> 1430 * allocation from here, or alloc_page() failed. >> 1431 */ >> 1432 swcount = *swap_map; >> 1433 /* >> 1434 * We don't hold lock here, so the swap entry could be >> 1435 * SWAP_MAP_BAD (when the cluster is discarding). >> 1436 * Instead of fail out, We can just skip the swap >> 1437 * entry because swapoff will wait for discarding >> 1438 * finish anyway. >> 1439 */ >> 1440 if (!swcount || swcount == SWAP_MAP_BAD) >> 1441 continue; >> 1442 retval = -ENOMEM; >> 1443 break; >> 1444 } 2386 1445 2387 prev_mm = &init_mm; !! 1446 /* 2388 mmget(prev_mm); !! 1447 * Don't hold on to start_mm if it looks like exiting. >> 1448 */ >> 1449 if (atomic_read(&start_mm->mm_users) == 1) { >> 1450 mmput(start_mm); >> 1451 start_mm = &init_mm; >> 1452 atomic_inc(&init_mm.mm_users); >> 1453 } 2389 1454 2390 spin_lock(&mmlist_lock); !! 1455 /* 2391 p = &init_mm.mmlist; !! 1456 * Wait for and lock page. When do_swap_page races with 2392 while (READ_ONCE(si->inuse_pages) && !! 1457 * try_to_unuse, do_swap_page can handle the fault much 2393 !signal_pending(current) && !! 1458 * faster than try_to_unuse can locate the entry. This 2394 (p = p->next) != &init_mm.mmli !! 1459 * apparently redundant "wait_on_page_locked" lets try_to_unuse >> 1460 * defer to do_swap_page in such a case - in some tests, >> 1461 * do_swap_page and try_to_unuse repeatedly compete. 
>> 1462 */ >> 1463 wait_on_page_locked(page); >> 1464 wait_on_page_writeback(page); >> 1465 lock_page(page); >> 1466 wait_on_page_writeback(page); 2395 1467 2396 mm = list_entry(p, struct mm_ !! 1468 /* 2397 if (!mmget_not_zero(mm)) !! 1469 * Remove all references to entry. >> 1470 */ >> 1471 swcount = *swap_map; >> 1472 if (swap_count(swcount) == SWAP_MAP_SHMEM) { >> 1473 retval = shmem_unuse(entry, page); >> 1474 /* page has already been unlocked and released */ >> 1475 if (retval < 0) >> 1476 break; 2398 continue; 1477 continue; 2399 spin_unlock(&mmlist_lock); !! 1478 } 2400 mmput(prev_mm); !! 1479 if (swap_count(swcount) && start_mm != &init_mm) 2401 prev_mm = mm; !! 1480 retval = unuse_mm(start_mm, entry, page); 2402 retval = unuse_mm(mm, type); !! 1481 2403 if (retval) { !! 1482 if (swap_count(*swap_map)) { >> 1483 int set_start_mm = (*swap_map >= swcount); >> 1484 struct list_head *p = &start_mm->mmlist; >> 1485 struct mm_struct *new_start_mm = start_mm; >> 1486 struct mm_struct *prev_mm = start_mm; >> 1487 struct mm_struct *mm; >> 1488 >> 1489 atomic_inc(&new_start_mm->mm_users); >> 1490 atomic_inc(&prev_mm->mm_users); >> 1491 spin_lock(&mmlist_lock); >> 1492 while (swap_count(*swap_map) && !retval && >> 1493 (p = p->next) != &start_mm->mmlist) { >> 1494 mm = list_entry(p, struct mm_struct, mmlist); >> 1495 if (!atomic_inc_not_zero(&mm->mm_users)) >> 1496 continue; >> 1497 spin_unlock(&mmlist_lock); >> 1498 mmput(prev_mm); >> 1499 prev_mm = mm; >> 1500 >> 1501 cond_resched(); >> 1502 >> 1503 swcount = *swap_map; >> 1504 if (!swap_count(swcount)) /* any usage ? */ >> 1505 ; >> 1506 else if (mm == &init_mm) >> 1507 set_start_mm = 1; >> 1508 else >> 1509 retval = unuse_mm(mm, entry, page); >> 1510 >> 1511 if (set_start_mm && *swap_map < swcount) { >> 1512 mmput(new_start_mm); >> 1513 atomic_inc(&mm->mm_users); >> 1514 new_start_mm = mm; >> 1515 set_start_mm = 0; >> 1516 } >> 1517 spin_lock(&mmlist_lock); >> 1518 } >> 1519 spin_unlock(&mmlist_lock); 2404 mmput(prev_mm); 1520 mmput(prev_mm); 2405 return retval; !! 1521 mmput(start_mm); >> 1522 start_mm = new_start_mm; >> 1523 } >> 1524 if (retval) { >> 1525 unlock_page(page); >> 1526 page_cache_release(page); >> 1527 break; 2406 } 1528 } 2407 1529 2408 /* 1530 /* 2409 * Make sure that we aren't c !! 1531 * If a reference remains (rare), we would like to leave 2410 * interactive performance. !! 1532 * the page in the swap cache; but try_to_unmap could 2411 */ !! 1533 * then re-duplicate the entry once we drop page lock, 2412 cond_resched(); !! 1534 * so we might loop indefinitely; also, that page could 2413 spin_lock(&mmlist_lock); !! 1535 * not be swapped out to other storage meanwhile. So: 2414 } !! 1536 * delete from cache even if there's another reference, 2415 spin_unlock(&mmlist_lock); !! 1537 * after ensuring that the data has been saved to disk - 2416 !! 1538 * since if the reference remains (rarer), it will be 2417 mmput(prev_mm); !! 1539 * read from disk into another page. Splitting into two >> 1540 * pages would be incorrect if swap supported "shared >> 1541 * private" pages, but they are handled by tmpfs files. >> 1542 * >> 1543 * Given how unuse_vma() targets one particular offset >> 1544 * in an anon_vma, once the anon_vma has been determined, >> 1545 * this splitting happens to be just what is needed to >> 1546 * handle where KSM pages have been swapped out: re-reading >> 1547 * is unnecessarily slow, but we can fix that later on. 
>> 1548 */ >> 1549 if (swap_count(*swap_map) && >> 1550 PageDirty(page) && PageSwapCache(page)) { >> 1551 struct writeback_control wbc = { >> 1552 .sync_mode = WB_SYNC_NONE, >> 1553 }; 2418 1554 2419 i = 0; !! 1555 swap_writepage(page, &wbc); 2420 while (READ_ONCE(si->inuse_pages) && !! 1556 lock_page(page); 2421 !signal_pending(current) && !! 1557 wait_on_page_writeback(page); 2422 (i = find_next_to_unuse(si, i) !! 1558 } 2423 1559 2424 entry = swp_entry(type, i); !! 1560 /* 2425 folio = filemap_get_folio(swa !! 1561 * It is conceivable that a racing task removed this page from 2426 if (IS_ERR(folio)) !! 1562 * swap cache just before we acquired the page lock at the top, 2427 continue; !! 1563 * or while we dropped it in unuse_mm(). The page might even >> 1564 * be back in swap cache on another swap area: that we must not >> 1565 * delete, since it may not have been written out to swap yet. >> 1566 */ >> 1567 if (PageSwapCache(page) && >> 1568 likely(page_private(page) == entry.val)) >> 1569 delete_from_swap_cache(page); 2428 1570 2429 /* 1571 /* 2430 * It is conceivable that a r !! 1572 * So we could skip searching mms once swap count went 2431 * swap cache just before we !! 1573 * to 1, we did not mark any present ptes as dirty: must 2432 * might even be back in swap !! 1574 * mark page dirty so shrink_page_list will preserve it. 2433 * that is okay, folio_free_s !! 1575 */ 2434 */ !! 1576 SetPageDirty(page); 2435 folio_lock(folio); !! 1577 unlock_page(page); 2436 folio_wait_writeback(folio); !! 1578 page_cache_release(page); 2437 folio_free_swap(folio); << 2438 folio_unlock(folio); << 2439 folio_put(folio); << 2440 } << 2441 1579 2442 /* !! 1580 /* 2443 * Lets check again to see if there a !! 1581 * Make sure that we aren't completely killing 2444 * If yes, we would need to do retry !! 1582 * interactive performance. 2445 * Under global memory pressure, swap !! 1583 */ 2446 * into process space after the mmlis !! 1584 cond_resched(); 2447 * !! 1585 if (frontswap && pages_to_unuse > 0) { 2448 * Limit the number of retries? No: w !! 1586 if (!--pages_to_unuse) 2449 * above fails, that mm is likely to !! 1587 break; 2450 * exit_mmap(), which proceeds at its !! 1588 } 2451 * and even shmem_writepage() could h << 2452 * folio_alloc_swap(), temporarily hi << 2453 * and robust (though cpu-intensive) << 2454 */ << 2455 if (READ_ONCE(si->inuse_pages)) { << 2456 if (!signal_pending(current)) << 2457 goto retry; << 2458 return -EINTR; << 2459 } 1589 } 2460 1590 2461 success: !! 1591 mmput(start_mm); 2462 /* !! 1592 return retval; 2463 * Make sure that further cleanups af << 2464 * after swap_range_free() reduces si << 2465 */ << 2466 smp_mb(); << 2467 return 0; << 2468 } 1593 } 2469 1594 2470 /* 1595 /* 2471 * After a successful try_to_unuse, if no swa 1596 * After a successful try_to_unuse, if no swap is now in use, we know 2472 * we can empty the mmlist. swap_lock must b 1597 * we can empty the mmlist. swap_lock must be held on entry and exit. 2473 * Note that mmlist_lock nests inside swap_lo 1598 * Note that mmlist_lock nests inside swap_lock, and an mm must be 2474 * added to the mmlist just after page_duplic 1599 * added to the mmlist just after page_duplicate - before would be racy. 
2475 */ 1600 */ 2476 static void drain_mmlist(void) 1601 static void drain_mmlist(void) 2477 { 1602 { 2478 struct list_head *p, *next; 1603 struct list_head *p, *next; 2479 unsigned int type; 1604 unsigned int type; 2480 1605 2481 for (type = 0; type < nr_swapfiles; t 1606 for (type = 0; type < nr_swapfiles; type++) 2482 if (swap_info[type]->inuse_pa 1607 if (swap_info[type]->inuse_pages) 2483 return; 1608 return; 2484 spin_lock(&mmlist_lock); 1609 spin_lock(&mmlist_lock); 2485 list_for_each_safe(p, next, &init_mm. 1610 list_for_each_safe(p, next, &init_mm.mmlist) 2486 list_del_init(p); 1611 list_del_init(p); 2487 spin_unlock(&mmlist_lock); 1612 spin_unlock(&mmlist_lock); 2488 } 1613 } 2489 1614 2490 /* 1615 /* >> 1616 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which >> 1617 * corresponds to page offset for the specified swap entry. >> 1618 * Note that the type of this function is sector_t, but it returns page offset >> 1619 * into the bdev, not sector offset. >> 1620 */ >> 1621 static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) >> 1622 { >> 1623 struct swap_info_struct *sis; >> 1624 struct swap_extent *start_se; >> 1625 struct swap_extent *se; >> 1626 pgoff_t offset; >> 1627 >> 1628 sis = swap_info[swp_type(entry)]; >> 1629 *bdev = sis->bdev; >> 1630 >> 1631 offset = swp_offset(entry); >> 1632 start_se = sis->curr_swap_extent; >> 1633 se = start_se; >> 1634 >> 1635 for ( ; ; ) { >> 1636 struct list_head *lh; >> 1637 >> 1638 if (se->start_page <= offset && >> 1639 offset < (se->start_page + se->nr_pages)) { >> 1640 return se->start_block + (offset - se->start_page); >> 1641 } >> 1642 lh = se->list.next; >> 1643 se = list_entry(lh, struct swap_extent, list); >> 1644 sis->curr_swap_extent = se; >> 1645 BUG_ON(se == start_se); /* It *must* be present */ >> 1646 } >> 1647 } >> 1648 >> 1649 /* >> 1650 * Returns the page offset into bdev for the specified page's swap entry. >> 1651 */ >> 1652 sector_t map_swap_page(struct page *page, struct block_device **bdev) >> 1653 { >> 1654 swp_entry_t entry; >> 1655 entry.val = page_private(page); >> 1656 return map_swap_entry(entry, bdev) << (PAGE_SHIFT - 9); >> 1657 } >> 1658 >> 1659 /* 2491 * Free all of a swapdev's extent information 1660 * Free all of a swapdev's extent information 2492 */ 1661 */ 2493 static void destroy_swap_extents(struct swap_ 1662 static void destroy_swap_extents(struct swap_info_struct *sis) 2494 { 1663 { 2495 while (!RB_EMPTY_ROOT(&sis->swap_exte !! 1664 while (!list_empty(&sis->first_swap_extent.list)) { 2496 struct rb_node *rb = sis->swa !! 1665 struct swap_extent *se; 2497 struct swap_extent *se = rb_e << 2498 1666 2499 rb_erase(rb, &sis->swap_exten !! 1667 se = list_entry(sis->first_swap_extent.list.next, >> 1668 struct swap_extent, list); >> 1669 list_del(&se->list); 2500 kfree(se); 1670 kfree(se); 2501 } 1671 } 2502 1672 2503 if (sis->flags & SWP_ACTIVATED) { !! 1673 if (sis->flags & SWP_FILE) { 2504 struct file *swap_file = sis- 1674 struct file *swap_file = sis->swap_file; 2505 struct address_space *mapping 1675 struct address_space *mapping = swap_file->f_mapping; 2506 1676 2507 sis->flags &= ~SWP_ACTIVATED; !! 1677 sis->flags &= ~SWP_FILE; 2508 if (mapping->a_ops->swap_deac !! 1678 mapping->a_ops->swap_deactivate(swap_file); 2509 mapping->a_ops->swap_ << 2510 } 1679 } 2511 } 1680 } 2512 1681 2513 /* 1682 /* 2514 * Add a block range (and the corresponding p 1683 * Add a block range (and the corresponding page range) into this swapdev's 2515 * extent tree. !! 
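/*
 * [Editor's aside -- illustrative sketch, not part of swapfile.c]
 *
 * The extent lookup shown in map_swap_entry() above (and in swapdev_block()
 * earlier) boils down to: find the extent whose page range contains the swap
 * offset, then compute start_block + (offset - start_page).  The old code
 * walked a cached extent list, the new code an rbtree, but the arithmetic is
 * the same.  The standalone userspace sketch below uses a plain linear scan
 * over hypothetical toy extents just to show that arithmetic.
 */
#include <stdio.h>

struct toy_extent { unsigned long start_page, nr_pages, start_block; };

static long toy_map(const struct toy_extent *ext, int n, unsigned long offset)
{
	int i;

	for (i = 0; i < n; i++)
		if (ext[i].start_page <= offset &&
		    offset < ext[i].start_page + ext[i].nr_pages)
			return ext[i].start_block + (offset - ext[i].start_page);
	return -1;	/* no extent covers this offset */
}

int main(void)
{
	/* pages 0..1023 -> blocks 8192.., pages 1024..2047 -> blocks 40960.. */
	struct toy_extent ext[] = { {0, 1024, 8192}, {1024, 1024, 40960} };

	printf("page 10   -> block %ld\n", toy_map(ext, 2, 10));	/* 8202 */
	printf("page 1500 -> block %ld\n", toy_map(ext, 2, 1500));	/* 41436 */
	return 0;
}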
1684 * extent list. The extent list is kept sorted in page order. 2516 * 1685 * 2517 * This function rather assumes that it is ca 1686 * This function rather assumes that it is called in ascending page order. 2518 */ 1687 */ 2519 int 1688 int 2520 add_swap_extent(struct swap_info_struct *sis, 1689 add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, 2521 unsigned long nr_pages, secto 1690 unsigned long nr_pages, sector_t start_block) 2522 { 1691 { 2523 struct rb_node **link = &sis->swap_ex << 2524 struct swap_extent *se; 1692 struct swap_extent *se; 2525 struct swap_extent *new_se; 1693 struct swap_extent *new_se; >> 1694 struct list_head *lh; 2526 1695 2527 /* !! 1696 if (start_page == 0) { 2528 * place the new node at the right mo !! 1697 se = &sis->first_swap_extent; 2529 * function is called in ascending pa !! 1698 sis->curr_swap_extent = se; 2530 */ !! 1699 se->start_page = 0; 2531 while (*link) { !! 1700 se->nr_pages = nr_pages; 2532 parent = *link; !! 1701 se->start_block = start_block; 2533 link = &parent->rb_right; !! 1702 return 1; 2534 } !! 1703 } else { 2535 !! 1704 lh = sis->first_swap_extent.list.prev; /* Highest extent */ 2536 if (parent) { !! 1705 se = list_entry(lh, struct swap_extent, list); 2537 se = rb_entry(parent, struct << 2538 BUG_ON(se->start_page + se->n 1706 BUG_ON(se->start_page + se->nr_pages != start_page); 2539 if (se->start_block + se->nr_ 1707 if (se->start_block + se->nr_pages == start_block) { 2540 /* Merge it */ 1708 /* Merge it */ 2541 se->nr_pages += nr_pa 1709 se->nr_pages += nr_pages; 2542 return 0; 1710 return 0; 2543 } 1711 } 2544 } 1712 } 2545 1713 2546 /* No merge, insert a new extent. */ !! 1714 /* >> 1715 * No merge. Insert a new extent, preserving ordering. >> 1716 */ 2547 new_se = kmalloc(sizeof(*se), GFP_KER 1717 new_se = kmalloc(sizeof(*se), GFP_KERNEL); 2548 if (new_se == NULL) 1718 if (new_se == NULL) 2549 return -ENOMEM; 1719 return -ENOMEM; 2550 new_se->start_page = start_page; 1720 new_se->start_page = start_page; 2551 new_se->nr_pages = nr_pages; 1721 new_se->nr_pages = nr_pages; 2552 new_se->start_block = start_block; 1722 new_se->start_block = start_block; 2553 1723 2554 rb_link_node(&new_se->rb_node, parent !! 1724 list_add_tail(&new_se->list, &sis->first_swap_extent.list); 2555 rb_insert_color(&new_se->rb_node, &si << 2556 return 1; 1725 return 1; 2557 } 1726 } 2558 EXPORT_SYMBOL_GPL(add_swap_extent); << 2559 1727 2560 /* 1728 /* 2561 * A `swap extent' is a simple thing which ma 1729 * A `swap extent' is a simple thing which maps a contiguous range of pages 2562 * onto a contiguous range of disk blocks. A !! 1730 * onto a contiguous range of disk blocks. An ordered list of swap extents 2563 * built at swapon time and is then used at s !! 1731 * is built at swapon time and is then used at swap_writepage/swap_readpage 2564 * time for locating where on disk a page bel 1732 * time for locating where on disk a page belongs. 2565 * 1733 * 2566 * If the swapfile is an S_ISBLK block device 1734 * If the swapfile is an S_ISBLK block device, a single extent is installed. 2567 * This is done so that the main operating co 1735 * This is done so that the main operating code can treat S_ISBLK and S_ISREG 2568 * swap files identically. 1736 * swap files identically. 2569 * 1737 * 2570 * Whether the swapdev is an S_ISREG file or 1738 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap 2571 * extent rbtree operates in PAGE_SIZE disk b !! 1739 * extent list operates in PAGE_SIZE disk blocks. 
Both S_ISREG and S_ISBLK 2572 * swapfiles are handled *identically* after 1740 * swapfiles are handled *identically* after swapon time. 2573 * 1741 * 2574 * For S_ISREG swapfiles, setup_swap_extents( 1742 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks 2575 * and will parse them into a rbtree, in PAGE !! 1743 * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If 2576 * blocks are found which do not fall within !! 1744 * some stray blocks are found which do not fall within the PAGE_SIZE alignment 2577 * requirements, they are simply tossed out - 1745 * requirements, they are simply tossed out - we will never use those blocks 2578 * for swapping. 1746 * for swapping. 2579 * 1747 * 2580 * For all swap devices we set S_SWAPFILE acr !! 1748 * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon. This 2581 * prevents users from writing to the swap de !! 1749 * prevents root from shooting her foot off by ftruncating an in-use swapfile, >> 1750 * which will scribble on the fs. 2582 * 1751 * 2583 * The amount of disk space which a single sw 1752 * The amount of disk space which a single swap extent represents varies. 2584 * Typically it is in the 1-4 megabyte range. 1753 * Typically it is in the 1-4 megabyte range. So we can have hundreds of 2585 * extents in the rbtree. - akpm. !! 1754 * extents in the list. To avoid much list walking, we cache the previous >> 1755 * search location in `curr_swap_extent', and start new searches from there. >> 1756 * This is extremely effective. The average number of iterations in >> 1757 * map_swap_page() has been measured at about 0.3 per page. - akpm. 2586 */ 1758 */ 2587 static int setup_swap_extents(struct swap_inf 1759 static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) 2588 { 1760 { 2589 struct file *swap_file = sis->swap_fi 1761 struct file *swap_file = sis->swap_file; 2590 struct address_space *mapping = swap_ 1762 struct address_space *mapping = swap_file->f_mapping; 2591 struct inode *inode = mapping->host; 1763 struct inode *inode = mapping->host; 2592 int ret; 1764 int ret; 2593 1765 2594 if (S_ISBLK(inode->i_mode)) { 1766 if (S_ISBLK(inode->i_mode)) { 2595 ret = add_swap_extent(sis, 0, 1767 ret = add_swap_extent(sis, 0, sis->max, 0); 2596 *span = sis->pages; 1768 *span = sis->pages; 2597 return ret; 1769 return ret; 2598 } 1770 } 2599 1771 2600 if (mapping->a_ops->swap_activate) { 1772 if (mapping->a_ops->swap_activate) { 2601 ret = mapping->a_ops->swap_ac 1773 ret = mapping->a_ops->swap_activate(sis, swap_file, span); 2602 if (ret < 0) !! 1774 if (!ret) { 2603 return ret; !! 1775 sis->flags |= SWP_FILE; 2604 sis->flags |= SWP_ACTIVATED; !! 1776 ret = add_swap_extent(sis, 0, sis->max, 0); 2605 if ((sis->flags & SWP_FS_OPS) !! 1777 *span = sis->pages; 2606 sio_pool_init() != 0) { << 2607 destroy_swap_extents( << 2608 return -ENOMEM; << 2609 } 1778 } 2610 return ret; 1779 return ret; 2611 } 1780 } 2612 1781 2613 return generic_swapfile_activate(sis, 1782 return generic_swapfile_activate(sis, swap_file, span); 2614 } 1783 } 2615 1784 2616 static int swap_node(struct swap_info_struct !! 1785 static void _enable_swap_info(struct swap_info_struct *p, int prio, 2617 { !! 1786 unsigned char *swap_map, 2618 struct block_device *bdev; !! 1787 struct swap_cluster_info *cluster_info) 2619 << 2620 if (si->bdev) << 2621 bdev = si->bdev; << 2622 else << 2623 bdev = si->swap_file->f_inode << 2624 << 2625 return bdev ? 
bdev->bd_disk->node_id << 2626 } << 2627 << 2628 static void setup_swap_info(struct swap_info_ << 2629 unsigned char *sw << 2630 struct swap_clust << 2631 unsigned long *ze << 2632 { 1788 { 2633 int i; << 2634 << 2635 if (prio >= 0) 1789 if (prio >= 0) 2636 si->prio = prio; !! 1790 p->prio = prio; 2637 else 1791 else 2638 si->prio = --least_priority; !! 1792 p->prio = --least_priority; 2639 /* 1793 /* 2640 * the plist prio is negated because 1794 * the plist prio is negated because plist ordering is 2641 * low-to-high, while swap ordering i 1795 * low-to-high, while swap ordering is high-to-low 2642 */ 1796 */ 2643 si->list.prio = -si->prio; !! 1797 p->list.prio = -p->prio; 2644 for_each_node(i) { !! 1798 p->avail_list.prio = -p->prio; 2645 if (si->prio >= 0) !! 1799 p->swap_map = swap_map; 2646 si->avail_lists[i].pr !! 1800 p->cluster_info = cluster_info; 2647 else { !! 1801 p->flags |= SWP_WRITEOK; 2648 if (swap_node(si) == !! 1802 atomic_long_add(p->pages, &nr_swap_pages); 2649 si->avail_lis !! 1803 total_swap_pages += p->pages; 2650 else << 2651 si->avail_lis << 2652 } << 2653 } << 2654 si->swap_map = swap_map; << 2655 si->cluster_info = cluster_info; << 2656 si->zeromap = zeromap; << 2657 } << 2658 << 2659 static void _enable_swap_info(struct swap_inf << 2660 { << 2661 si->flags |= SWP_WRITEOK; << 2662 atomic_long_add(si->pages, &nr_swap_p << 2663 total_swap_pages += si->pages; << 2664 1804 2665 assert_spin_locked(&swap_lock); 1805 assert_spin_locked(&swap_lock); 2666 /* 1806 /* 2667 * both lists are plists, and thus pr 1807 * both lists are plists, and thus priority ordered. 2668 * swap_active_head needs to be prior 1808 * swap_active_head needs to be priority ordered for swapoff(), 2669 * which on removal of any swap_info_ 1809 * which on removal of any swap_info_struct with an auto-assigned 2670 * (i.e. negative) priority increment 1810 * (i.e. negative) priority increments the auto-assigned priority 2671 * of any lower-priority swap_info_st 1811 * of any lower-priority swap_info_structs. 2672 * swap_avail_head needs to be priori !! 1812 * swap_avail_head needs to be priority ordered for get_swap_page(), 2673 * which allocates swap pages from th 1813 * which allocates swap pages from the highest available priority 2674 * swap_info_struct. 1814 * swap_info_struct. 2675 */ 1815 */ 2676 plist_add(&si->list, &swap_active_hea !! 1816 plist_add(&p->list, &swap_active_head); 2677 !! 1817 spin_lock(&swap_avail_lock); 2678 /* add to available list iff swap dev !! 1818 plist_add(&p->avail_list, &swap_avail_head); 2679 if (si->highest_bit) !! 1819 spin_unlock(&swap_avail_lock); 2680 add_to_avail_list(si); << 2681 } 1820 } 2682 1821 2683 static void enable_swap_info(struct swap_info !! 1822 static void enable_swap_info(struct swap_info_struct *p, int prio, 2684 unsigned char 1823 unsigned char *swap_map, 2685 struct swap_c 1824 struct swap_cluster_info *cluster_info, 2686 unsigned long !! 1825 unsigned long *frontswap_map) 2687 { 1826 { >> 1827 frontswap_init(p->type, frontswap_map); 2688 spin_lock(&swap_lock); 1828 spin_lock(&swap_lock); 2689 spin_lock(&si->lock); !! 1829 spin_lock(&p->lock); 2690 setup_swap_info(si, prio, swap_map, c !! 1830 _enable_swap_info(p, prio, swap_map, cluster_info); 2691 spin_unlock(&si->lock); !! 
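/*
 * [Editorial sketch, not part of swapfile.c]  setup_swap_info() (and the
 * older _enable_swap_info()) above stores the *negated* swap priority in
 * the plist node: plists order low-to-high, but swap devices must be
 * tried highest-priority-first.  The user-space sketch below shows the
 * same trick with qsort(); device names and priorities are made up.
 */
#include <stdio.h>
#include <stdlib.h>

struct toy_swapdev {
	const char *name;
	int prio;	/* user-visible swap priority (higher = preferred) */
	int key;	/* what the "plist" actually sorts on: -prio */
};

static int toy_cmp_key(const void *a, const void *b)
{
	const struct toy_swapdev *x = a, *y = b;

	return x->key - y->key;		/* ascending, like a plist */
}

int main(void)
{
	struct toy_swapdev devs[] = {
		{ "/dev/sdb2",    5, 0 },
		{ "/swapfile",   -2, 0 },	/* auto-assigned, like least_priority */
		{ "/dev/zram0", 100, 0 },
	};
	int n = sizeof(devs) / sizeof(devs[0]);

	for (int i = 0; i < n; i++)
		devs[i].key = -devs[i].prio;	/* the negation done above */

	qsort(devs, n, sizeof(devs[0]), toy_cmp_key);

	/* Prints zram0 (100), then sdb2 (5), then the -2 swapfile. */
	for (int i = 0; i < n; i++)
		printf("%-12s prio %d\n", devs[i].name, devs[i].prio);
	return 0;
}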
1831 spin_unlock(&p->lock); 2692 spin_unlock(&swap_lock); << 2693 /* << 2694 * Finished initializing swap device, << 2695 */ << 2696 percpu_ref_resurrect(&si->users); << 2697 spin_lock(&swap_lock); << 2698 spin_lock(&si->lock); << 2699 _enable_swap_info(si); << 2700 spin_unlock(&si->lock); << 2701 spin_unlock(&swap_lock); << 2702 } << 2703 << 2704 static void reinsert_swap_info(struct swap_in << 2705 { << 2706 spin_lock(&swap_lock); << 2707 spin_lock(&si->lock); << 2708 setup_swap_info(si, si->prio, si->swa << 2709 _enable_swap_info(si); << 2710 spin_unlock(&si->lock); << 2711 spin_unlock(&swap_lock); 1832 spin_unlock(&swap_lock); 2712 } 1833 } 2713 1834 2714 static bool __has_usable_swap(void) !! 1835 static void reinsert_swap_info(struct swap_info_struct *p) 2715 { << 2716 return !plist_head_empty(&swap_active << 2717 } << 2718 << 2719 bool has_usable_swap(void) << 2720 { 1836 { 2721 bool ret; << 2722 << 2723 spin_lock(&swap_lock); 1837 spin_lock(&swap_lock); 2724 ret = __has_usable_swap(); !! 1838 spin_lock(&p->lock); >> 1839 _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info); >> 1840 spin_unlock(&p->lock); 2725 spin_unlock(&swap_lock); 1841 spin_unlock(&swap_lock); 2726 return ret; << 2727 } 1842 } 2728 1843 2729 SYSCALL_DEFINE1(swapoff, const char __user *, 1844 SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) 2730 { 1845 { 2731 struct swap_info_struct *p = NULL; 1846 struct swap_info_struct *p = NULL; 2732 unsigned char *swap_map; 1847 unsigned char *swap_map; 2733 unsigned long *zeromap; << 2734 struct swap_cluster_info *cluster_inf 1848 struct swap_cluster_info *cluster_info; >> 1849 unsigned long *frontswap_map; 2735 struct file *swap_file, *victim; 1850 struct file *swap_file, *victim; 2736 struct address_space *mapping; 1851 struct address_space *mapping; 2737 struct inode *inode; 1852 struct inode *inode; 2738 struct filename *pathname; 1853 struct filename *pathname; 2739 int err, found = 0; 1854 int err, found = 0; >> 1855 unsigned int old_block_size; 2740 1856 2741 if (!capable(CAP_SYS_ADMIN)) 1857 if (!capable(CAP_SYS_ADMIN)) 2742 return -EPERM; 1858 return -EPERM; 2743 1859 2744 BUG_ON(!current->mm); 1860 BUG_ON(!current->mm); 2745 1861 2746 pathname = getname(specialfile); 1862 pathname = getname(specialfile); 2747 if (IS_ERR(pathname)) 1863 if (IS_ERR(pathname)) 2748 return PTR_ERR(pathname); 1864 return PTR_ERR(pathname); 2749 1865 2750 victim = file_open_name(pathname, O_R 1866 victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0); 2751 err = PTR_ERR(victim); 1867 err = PTR_ERR(victim); 2752 if (IS_ERR(victim)) 1868 if (IS_ERR(victim)) 2753 goto out; 1869 goto out; 2754 1870 2755 mapping = victim->f_mapping; 1871 mapping = victim->f_mapping; 2756 spin_lock(&swap_lock); 1872 spin_lock(&swap_lock); 2757 plist_for_each_entry(p, &swap_active_ 1873 plist_for_each_entry(p, &swap_active_head, list) { 2758 if (p->flags & SWP_WRITEOK) { 1874 if (p->flags & SWP_WRITEOK) { 2759 if (p->swap_file->f_m 1875 if (p->swap_file->f_mapping == mapping) { 2760 found = 1; 1876 found = 1; 2761 break; 1877 break; 2762 } 1878 } 2763 } 1879 } 2764 } 1880 } 2765 if (!found) { 1881 if (!found) { 2766 err = -EINVAL; 1882 err = -EINVAL; 2767 spin_unlock(&swap_lock); 1883 spin_unlock(&swap_lock); 2768 goto out_dput; 1884 goto out_dput; 2769 } 1885 } 2770 if (!security_vm_enough_memory_mm(cur 1886 if (!security_vm_enough_memory_mm(current->mm, p->pages)) 2771 vm_unacct_memory(p->pages); 1887 vm_unacct_memory(p->pages); 2772 else { 1888 else { 2773 err = -ENOMEM; 1889 err = 
-ENOMEM; 2774 spin_unlock(&swap_lock); 1890 spin_unlock(&swap_lock); 2775 goto out_dput; 1891 goto out_dput; 2776 } 1892 } >> 1893 spin_lock(&swap_avail_lock); >> 1894 plist_del(&p->avail_list, &swap_avail_head); >> 1895 spin_unlock(&swap_avail_lock); 2777 spin_lock(&p->lock); 1896 spin_lock(&p->lock); 2778 del_from_avail_list(p); << 2779 if (p->prio < 0) { 1897 if (p->prio < 0) { 2780 struct swap_info_struct *si = 1898 struct swap_info_struct *si = p; 2781 int nid; << 2782 1899 2783 plist_for_each_entry_continue 1900 plist_for_each_entry_continue(si, &swap_active_head, list) { 2784 si->prio++; 1901 si->prio++; 2785 si->list.prio--; 1902 si->list.prio--; 2786 for_each_node(nid) { !! 1903 si->avail_list.prio--; 2787 if (si->avail << 2788 si->a << 2789 } << 2790 } 1904 } 2791 least_priority++; 1905 least_priority++; 2792 } 1906 } 2793 plist_del(&p->list, &swap_active_head 1907 plist_del(&p->list, &swap_active_head); 2794 atomic_long_sub(p->pages, &nr_swap_pa 1908 atomic_long_sub(p->pages, &nr_swap_pages); 2795 total_swap_pages -= p->pages; 1909 total_swap_pages -= p->pages; 2796 p->flags &= ~SWP_WRITEOK; 1910 p->flags &= ~SWP_WRITEOK; 2797 spin_unlock(&p->lock); 1911 spin_unlock(&p->lock); 2798 spin_unlock(&swap_lock); 1912 spin_unlock(&swap_lock); 2799 1913 2800 disable_swap_slots_cache_lock(); << 2801 << 2802 set_current_oom_origin(); 1914 set_current_oom_origin(); 2803 err = try_to_unuse(p->type); !! 1915 err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ 2804 clear_current_oom_origin(); 1916 clear_current_oom_origin(); 2805 1917 2806 if (err) { 1918 if (err) { 2807 /* re-insert swap space back 1919 /* re-insert swap space back into swap_list */ 2808 reinsert_swap_info(p); 1920 reinsert_swap_info(p); 2809 reenable_swap_slots_cache_unl << 2810 goto out_dput; 1921 goto out_dput; 2811 } 1922 } 2812 1923 2813 reenable_swap_slots_cache_unlock(); << 2814 << 2815 /* << 2816 * Wait for swap operations protected << 2817 * to complete. Because of synchroni << 2818 * operations protected by RCU reader << 2819 * spinlock) will be waited too. Thi << 2820 * prevent folio_test_swapcache() and << 2821 * operations from racing with swapof << 2822 */ << 2823 percpu_ref_kill(&p->users); << 2824 synchronize_rcu(); << 2825 wait_for_completion(&p->comp); << 2826 << 2827 flush_work(&p->discard_work); 1924 flush_work(&p->discard_work); 2828 flush_work(&p->reclaim_work); << 2829 1925 2830 destroy_swap_extents(p); 1926 destroy_swap_extents(p); 2831 if (p->flags & SWP_CONTINUED) 1927 if (p->flags & SWP_CONTINUED) 2832 free_swap_count_continuations 1928 free_swap_count_continuations(p); 2833 1929 2834 if (!p->bdev || !bdev_nonrot(p->bdev) << 2835 atomic_dec(&nr_rotate_swap); << 2836 << 2837 mutex_lock(&swapon_mutex); 1930 mutex_lock(&swapon_mutex); 2838 spin_lock(&swap_lock); 1931 spin_lock(&swap_lock); 2839 spin_lock(&p->lock); 1932 spin_lock(&p->lock); 2840 drain_mmlist(); 1933 drain_mmlist(); 2841 1934 2842 /* wait for anyone still in scan_swap !! 
1935 /* wait for anyone still in scan_swap_map */ 2843 p->highest_bit = 0; /* cu 1936 p->highest_bit = 0; /* cuts scans short */ 2844 while (p->flags >= SWP_SCANNING) { 1937 while (p->flags >= SWP_SCANNING) { 2845 spin_unlock(&p->lock); 1938 spin_unlock(&p->lock); 2846 spin_unlock(&swap_lock); 1939 spin_unlock(&swap_lock); 2847 schedule_timeout_uninterrupti 1940 schedule_timeout_uninterruptible(1); 2848 spin_lock(&swap_lock); 1941 spin_lock(&swap_lock); 2849 spin_lock(&p->lock); 1942 spin_lock(&p->lock); 2850 } 1943 } 2851 1944 2852 swap_file = p->swap_file; 1945 swap_file = p->swap_file; >> 1946 old_block_size = p->old_block_size; 2853 p->swap_file = NULL; 1947 p->swap_file = NULL; 2854 p->max = 0; 1948 p->max = 0; 2855 swap_map = p->swap_map; 1949 swap_map = p->swap_map; 2856 p->swap_map = NULL; 1950 p->swap_map = NULL; 2857 zeromap = p->zeromap; << 2858 p->zeromap = NULL; << 2859 cluster_info = p->cluster_info; 1951 cluster_info = p->cluster_info; 2860 p->cluster_info = NULL; 1952 p->cluster_info = NULL; >> 1953 frontswap_map = frontswap_map_get(p); 2861 spin_unlock(&p->lock); 1954 spin_unlock(&p->lock); 2862 spin_unlock(&swap_lock); 1955 spin_unlock(&swap_lock); 2863 arch_swap_invalidate_area(p->type); !! 1956 frontswap_invalidate_area(p->type); 2864 zswap_swapoff(p->type); !! 1957 frontswap_map_set(p, NULL); 2865 mutex_unlock(&swapon_mutex); 1958 mutex_unlock(&swapon_mutex); 2866 free_percpu(p->percpu_cluster); 1959 free_percpu(p->percpu_cluster); 2867 p->percpu_cluster = NULL; 1960 p->percpu_cluster = NULL; 2868 free_percpu(p->cluster_next_cpu); << 2869 p->cluster_next_cpu = NULL; << 2870 vfree(swap_map); 1961 vfree(swap_map); 2871 kvfree(zeromap); !! 1962 vfree(cluster_info); 2872 kvfree(cluster_info); !! 1963 vfree(frontswap_map); 2873 /* Destroy swap account information * 1964 /* Destroy swap account information */ 2874 swap_cgroup_swapoff(p->type); 1965 swap_cgroup_swapoff(p->type); 2875 exit_swap_address_space(p->type); << 2876 1966 2877 inode = mapping->host; 1967 inode = mapping->host; 2878 !! 1968 if (S_ISBLK(inode->i_mode)) { 2879 inode_lock(inode); !! 1969 struct block_device *bdev = I_BDEV(inode); 2880 inode->i_flags &= ~S_SWAPFILE; !! 1970 set_blocksize(bdev, old_block_size); 2881 inode_unlock(inode); !! 1971 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); >> 1972 } else { >> 1973 mutex_lock(&inode->i_mutex); >> 1974 inode->i_flags &= ~S_SWAPFILE; >> 1975 mutex_unlock(&inode->i_mutex); >> 1976 } 2882 filp_close(swap_file, NULL); 1977 filp_close(swap_file, NULL); 2883 1978 2884 /* 1979 /* 2885 * Clear the SWP_USED flag after all 1980 * Clear the SWP_USED flag after all resources are freed so that swapon 2886 * can reuse this swap_info in alloc_ 1981 * can reuse this swap_info in alloc_swap_info() safely. It is ok to 2887 * not hold p->lock after we cleared 1982 * not hold p->lock after we cleared its SWP_WRITEOK. 
2888 */ 1983 */ 2889 spin_lock(&swap_lock); 1984 spin_lock(&swap_lock); 2890 p->flags = 0; 1985 p->flags = 0; 2891 spin_unlock(&swap_lock); 1986 spin_unlock(&swap_lock); 2892 1987 2893 err = 0; 1988 err = 0; 2894 atomic_inc(&proc_poll_event); 1989 atomic_inc(&proc_poll_event); 2895 wake_up_interruptible(&proc_poll_wait 1990 wake_up_interruptible(&proc_poll_wait); 2896 1991 2897 out_dput: 1992 out_dput: 2898 filp_close(victim, NULL); 1993 filp_close(victim, NULL); 2899 out: 1994 out: 2900 putname(pathname); 1995 putname(pathname); 2901 return err; 1996 return err; 2902 } 1997 } 2903 1998 2904 #ifdef CONFIG_PROC_FS 1999 #ifdef CONFIG_PROC_FS 2905 static __poll_t swaps_poll(struct file *file, !! 2000 static unsigned swaps_poll(struct file *file, poll_table *wait) 2906 { 2001 { 2907 struct seq_file *seq = file->private_ 2002 struct seq_file *seq = file->private_data; 2908 2003 2909 poll_wait(file, &proc_poll_wait, wait 2004 poll_wait(file, &proc_poll_wait, wait); 2910 2005 2911 if (seq->poll_event != atomic_read(&p 2006 if (seq->poll_event != atomic_read(&proc_poll_event)) { 2912 seq->poll_event = atomic_read 2007 seq->poll_event = atomic_read(&proc_poll_event); 2913 return EPOLLIN | EPOLLRDNORM !! 2008 return POLLIN | POLLRDNORM | POLLERR | POLLPRI; 2914 } 2009 } 2915 2010 2916 return EPOLLIN | EPOLLRDNORM; !! 2011 return POLLIN | POLLRDNORM; 2917 } 2012 } 2918 2013 2919 /* iterator */ 2014 /* iterator */ 2920 static void *swap_start(struct seq_file *swap 2015 static void *swap_start(struct seq_file *swap, loff_t *pos) 2921 { 2016 { 2922 struct swap_info_struct *si; 2017 struct swap_info_struct *si; 2923 int type; 2018 int type; 2924 loff_t l = *pos; 2019 loff_t l = *pos; 2925 2020 2926 mutex_lock(&swapon_mutex); 2021 mutex_lock(&swapon_mutex); 2927 2022 2928 if (!l) 2023 if (!l) 2929 return SEQ_START_TOKEN; 2024 return SEQ_START_TOKEN; 2930 2025 2931 for (type = 0; (si = swap_type_to_swa !! 2026 for (type = 0; type < nr_swapfiles; type++) { >> 2027 smp_rmb(); /* read nr_swapfiles before swap_info[type] */ >> 2028 si = swap_info[type]; 2932 if (!(si->flags & SWP_USED) | 2029 if (!(si->flags & SWP_USED) || !si->swap_map) 2933 continue; 2030 continue; 2934 if (!--l) 2031 if (!--l) 2935 return si; 2032 return si; 2936 } 2033 } 2937 2034 2938 return NULL; 2035 return NULL; 2939 } 2036 } 2940 2037 2941 static void *swap_next(struct seq_file *swap, 2038 static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) 2942 { 2039 { 2943 struct swap_info_struct *si = v; 2040 struct swap_info_struct *si = v; 2944 int type; 2041 int type; 2945 2042 2946 if (v == SEQ_START_TOKEN) 2043 if (v == SEQ_START_TOKEN) 2947 type = 0; 2044 type = 0; 2948 else 2045 else 2949 type = si->type + 1; 2046 type = si->type + 1; 2950 2047 2951 ++(*pos); !! 2048 for (; type < nr_swapfiles; type++) { 2952 for (; (si = swap_type_to_swap_info(t !! 
2049 smp_rmb(); /* read nr_swapfiles before swap_info[type] */ >> 2050 si = swap_info[type]; 2953 if (!(si->flags & SWP_USED) | 2051 if (!(si->flags & SWP_USED) || !si->swap_map) 2954 continue; 2052 continue; >> 2053 ++*pos; 2955 return si; 2054 return si; 2956 } 2055 } 2957 2056 2958 return NULL; 2057 return NULL; 2959 } 2058 } 2960 2059 2961 static void swap_stop(struct seq_file *swap, 2060 static void swap_stop(struct seq_file *swap, void *v) 2962 { 2061 { 2963 mutex_unlock(&swapon_mutex); 2062 mutex_unlock(&swapon_mutex); 2964 } 2063 } 2965 2064 2966 static int swap_show(struct seq_file *swap, v 2065 static int swap_show(struct seq_file *swap, void *v) 2967 { 2066 { 2968 struct swap_info_struct *si = v; 2067 struct swap_info_struct *si = v; 2969 struct file *file; 2068 struct file *file; 2970 int len; 2069 int len; 2971 unsigned long bytes, inuse; << 2972 2070 2973 if (si == SEQ_START_TOKEN) { 2071 if (si == SEQ_START_TOKEN) { 2974 seq_puts(swap, "Filename\t\t\ !! 2072 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); 2975 return 0; 2073 return 0; 2976 } 2074 } 2977 2075 2978 bytes = K(si->pages); << 2979 inuse = K(READ_ONCE(si->inuse_pages)) << 2980 << 2981 file = si->swap_file; 2076 file = si->swap_file; 2982 len = seq_file_path(swap, file, " \t\ 2077 len = seq_file_path(swap, file, " \t\n\\"); 2983 seq_printf(swap, "%*s%s\t%lu\t%s%lu\t !! 2078 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", 2984 len < 40 ? 40 - len : 2079 len < 40 ? 40 - len : 1, " ", 2985 S_ISBLK(file_inode(fi 2080 S_ISBLK(file_inode(file)->i_mode) ? 2986 "partition" : 2081 "partition" : "file\t", 2987 bytes, bytes < 100000 !! 2082 si->pages << (PAGE_SHIFT - 10), 2988 inuse, inuse < 100000 !! 2083 si->inuse_pages << (PAGE_SHIFT - 10), 2989 si->prio); 2084 si->prio); 2990 return 0; 2085 return 0; 2991 } 2086 } 2992 2087 2993 static const struct seq_operations swaps_op = 2088 static const struct seq_operations swaps_op = { 2994 .start = swap_start, 2089 .start = swap_start, 2995 .next = swap_next, 2090 .next = swap_next, 2996 .stop = swap_stop, 2091 .stop = swap_stop, 2997 .show = swap_show 2092 .show = swap_show 2998 }; 2093 }; 2999 2094 3000 static int swaps_open(struct inode *inode, st 2095 static int swaps_open(struct inode *inode, struct file *file) 3001 { 2096 { 3002 struct seq_file *seq; 2097 struct seq_file *seq; 3003 int ret; 2098 int ret; 3004 2099 3005 ret = seq_open(file, &swaps_op); 2100 ret = seq_open(file, &swaps_op); 3006 if (ret) 2101 if (ret) 3007 return ret; 2102 return ret; 3008 2103 3009 seq = file->private_data; 2104 seq = file->private_data; 3010 seq->poll_event = atomic_read(&proc_p 2105 seq->poll_event = atomic_read(&proc_poll_event); 3011 return 0; 2106 return 0; 3012 } 2107 } 3013 2108 3014 static const struct proc_ops swaps_proc_ops = !! 2109 static const struct file_operations proc_swaps_operations = { 3015 .proc_flags = PROC_ENTRY_PERMANEN !! 2110 .open = swaps_open, 3016 .proc_open = swaps_open, !! 2111 .read = seq_read, 3017 .proc_read = seq_read, !! 2112 .llseek = seq_lseek, 3018 .proc_lseek = seq_lseek, !! 2113 .release = seq_release, 3019 .proc_release = seq_release, !! 2114 .poll = swaps_poll, 3020 .proc_poll = swaps_poll, << 3021 }; 2115 }; 3022 2116 3023 static int __init procswaps_init(void) 2117 static int __init procswaps_init(void) 3024 { 2118 { 3025 proc_create("swaps", 0, NULL, &swaps_ !! 
2119 proc_create("swaps", 0, NULL, &proc_swaps_operations); 3026 return 0; 2120 return 0; 3027 } 2121 } 3028 __initcall(procswaps_init); 2122 __initcall(procswaps_init); 3029 #endif /* CONFIG_PROC_FS */ 2123 #endif /* CONFIG_PROC_FS */ 3030 2124 3031 #ifdef MAX_SWAPFILES_CHECK 2125 #ifdef MAX_SWAPFILES_CHECK 3032 static int __init max_swapfiles_check(void) 2126 static int __init max_swapfiles_check(void) 3033 { 2127 { 3034 MAX_SWAPFILES_CHECK(); 2128 MAX_SWAPFILES_CHECK(); 3035 return 0; 2129 return 0; 3036 } 2130 } 3037 late_initcall(max_swapfiles_check); 2131 late_initcall(max_swapfiles_check); 3038 #endif 2132 #endif 3039 2133 3040 static struct swap_info_struct *alloc_swap_in 2134 static struct swap_info_struct *alloc_swap_info(void) 3041 { 2135 { 3042 struct swap_info_struct *p; 2136 struct swap_info_struct *p; 3043 struct swap_info_struct *defer = NULL << 3044 unsigned int type; 2137 unsigned int type; 3045 int i; << 3046 2138 3047 p = kvzalloc(struct_size(p, avail_lis !! 2139 p = kzalloc(sizeof(*p), GFP_KERNEL); 3048 if (!p) 2140 if (!p) 3049 return ERR_PTR(-ENOMEM); 2141 return ERR_PTR(-ENOMEM); 3050 2142 3051 if (percpu_ref_init(&p->users, swap_u << 3052 PERCPU_REF_INIT_D << 3053 kvfree(p); << 3054 return ERR_PTR(-ENOMEM); << 3055 } << 3056 << 3057 spin_lock(&swap_lock); 2143 spin_lock(&swap_lock); 3058 for (type = 0; type < nr_swapfiles; t 2144 for (type = 0; type < nr_swapfiles; type++) { 3059 if (!(swap_info[type]->flags 2145 if (!(swap_info[type]->flags & SWP_USED)) 3060 break; 2146 break; 3061 } 2147 } 3062 if (type >= MAX_SWAPFILES) { 2148 if (type >= MAX_SWAPFILES) { 3063 spin_unlock(&swap_lock); 2149 spin_unlock(&swap_lock); 3064 percpu_ref_exit(&p->users); !! 2150 kfree(p); 3065 kvfree(p); << 3066 return ERR_PTR(-EPERM); 2151 return ERR_PTR(-EPERM); 3067 } 2152 } 3068 if (type >= nr_swapfiles) { 2153 if (type >= nr_swapfiles) { 3069 p->type = type; 2154 p->type = type; >> 2155 swap_info[type] = p; 3070 /* 2156 /* 3071 * Publish the swap_info_stru !! 2157 * Write swap_info[type] before nr_swapfiles, in case a 3072 * Note that kvzalloc() above !! 2158 * racing procfs swap_start() or swap_next() is reading them. >> 2159 * (We never shrink nr_swapfiles, we never free this entry.) 3073 */ 2160 */ 3074 smp_store_release(&swap_info[ !! 2161 smp_wmb(); 3075 nr_swapfiles++; 2162 nr_swapfiles++; 3076 } else { 2163 } else { 3077 defer = p; !! 2164 kfree(p); 3078 p = swap_info[type]; 2165 p = swap_info[type]; 3079 /* 2166 /* 3080 * Do not memset this entry: 2167 * Do not memset this entry: a racing procfs swap_next() 3081 * would be relying on p->typ 2168 * would be relying on p->type to remain valid. 3082 */ 2169 */ 3083 } 2170 } 3084 p->swap_extent_root = RB_ROOT; !! 2171 INIT_LIST_HEAD(&p->first_swap_extent.list); 3085 plist_node_init(&p->list, 0); 2172 plist_node_init(&p->list, 0); 3086 for_each_node(i) !! 2173 plist_node_init(&p->avail_list, 0); 3087 plist_node_init(&p->avail_lis << 3088 p->flags = SWP_USED; 2174 p->flags = SWP_USED; 3089 spin_unlock(&swap_lock); 2175 spin_unlock(&swap_lock); 3090 if (defer) { << 3091 percpu_ref_exit(&defer->users << 3092 kvfree(defer); << 3093 } << 3094 spin_lock_init(&p->lock); 2176 spin_lock_init(&p->lock); 3095 spin_lock_init(&p->cont_lock); << 3096 init_completion(&p->comp); << 3097 2177 3098 return p; 2178 return p; 3099 } 2179 } 3100 2180 3101 static int claim_swapfile(struct swap_info_st !! 
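/*
 * [Editorial sketch, not part of swapfile.c]  alloc_swap_info() above
 * publishes a new entry before bumping nr_swapfiles (smp_store_release()
 * in the newer code, a plain store plus smp_wmb() in the older code), and
 * the procfs iterators pair that with smp_rmb()/an acquire-style read, so
 * a reader that sees the new count also sees the pointer it indexes.  A
 * user-space rendering of that publish pattern with C11 atomics follows;
 * every name here is invented, and it assumes a single publisher (the
 * kernel serializes this path under swap_lock).
 */
#include <stdatomic.h>
#include <stddef.h>

#define TOY_MAX_SLOTS 32

struct toy_info { int type; };

static struct toy_info *toy_slot[TOY_MAX_SLOTS];
static atomic_uint toy_nr_slots;

/* Writer: make the slot visible before the count that licenses reading it. */
static void toy_publish(struct toy_info *p, unsigned int type)
{
	toy_slot[type] = p;				/* plain store */
	atomic_store_explicit(&toy_nr_slots, type + 1,
			      memory_order_release);	/* pairs with the acquire below */
}

/* Reader: acquire the count; every slot below it is then safe to read. */
static struct toy_info *toy_lookup(unsigned int type)
{
	unsigned int n = atomic_load_explicit(&toy_nr_slots,
					      memory_order_acquire);

	if (type >= n)
		return NULL;
	return toy_slot[type];
}

int main(void)
{
	static struct toy_info si = { .type = 0 };

	toy_publish(&si, 0);
	return toy_lookup(0) == &si ? 0 : 1;
}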
2181 static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) 3102 { 2182 { >> 2183 int error; >> 2184 3103 if (S_ISBLK(inode->i_mode)) { 2185 if (S_ISBLK(inode->i_mode)) { 3104 si->bdev = I_BDEV(inode); !! 2186 p->bdev = bdgrab(I_BDEV(inode)); 3105 /* !! 2187 error = blkdev_get(p->bdev, 3106 * Zoned block devices contai !! 2188 FMODE_READ | FMODE_WRITE | FMODE_EXCL, p); 3107 * write only restriction. H !! 2189 if (error < 0) { 3108 * suitable for swapping. Di !! 2190 p->bdev = NULL; 3109 */ !! 2191 return error; 3110 if (bdev_is_zoned(si->bdev)) !! 2192 } 3111 return -EINVAL; !! 2193 p->old_block_size = block_size(p->bdev); 3112 si->flags |= SWP_BLKDEV; !! 2194 error = set_blocksize(p->bdev, PAGE_SIZE); >> 2195 if (error < 0) >> 2196 return error; >> 2197 p->flags |= SWP_BLKDEV; 3113 } else if (S_ISREG(inode->i_mode)) { 2198 } else if (S_ISREG(inode->i_mode)) { 3114 si->bdev = inode->i_sb->s_bde !! 2199 p->bdev = inode->i_sb->s_bdev; 3115 } !! 2200 mutex_lock(&inode->i_mutex); >> 2201 if (IS_SWAPFILE(inode)) >> 2202 return -EBUSY; >> 2203 } else >> 2204 return -EINVAL; 3116 2205 3117 return 0; 2206 return 0; 3118 } 2207 } 3119 2208 3120 2209 3121 /* 2210 /* 3122 * Find out how many pages are allowed for a 2211 * Find out how many pages are allowed for a single swap device. There 3123 * are two limiting factors: 2212 * are two limiting factors: 3124 * 1) the number of bits for the swap offset 2213 * 1) the number of bits for the swap offset in the swp_entry_t type, and 3125 * 2) the number of bits in the swap pte, as 2214 * 2) the number of bits in the swap pte, as defined by the different 3126 * architectures. 2215 * architectures. 3127 * 2216 * 3128 * In order to find the largest possible bit 2217 * In order to find the largest possible bit mask, a swap entry with 3129 * swap type 0 and swap offset ~0UL is create 2218 * swap type 0 and swap offset ~0UL is created, encoded to a swap pte, 3130 * decoded to a swp_entry_t again, and finall 2219 * decoded to a swp_entry_t again, and finally the swap offset is 3131 * extracted. 2220 * extracted. 3132 * 2221 * 3133 * This will mask all the bits from the initi 2222 * This will mask all the bits from the initial ~0UL mask that can't 3134 * be encoded in either the swp_entry_t or th 2223 * be encoded in either the swp_entry_t or the architecture definition 3135 * of a swap pte. 2224 * of a swap pte. 3136 */ 2225 */ 3137 unsigned long generic_max_swapfile_size(void) 2226 unsigned long generic_max_swapfile_size(void) 3138 { 2227 { 3139 return swp_offset(pte_to_swp_entry( 2228 return swp_offset(pte_to_swp_entry( 3140 swp_entry_to_pte(swp_ 2229 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; 3141 } 2230 } 3142 2231 3143 /* Can be overridden by an architecture for a 2232 /* Can be overridden by an architecture for additional checks. */ 3144 __weak unsigned long arch_max_swapfile_size(v !! 2233 __weak unsigned long max_swapfile_size(void) 3145 { 2234 { 3146 return generic_max_swapfile_size(); 2235 return generic_max_swapfile_size(); 3147 } 2236 } 3148 2237 3149 static unsigned long read_swap_header(struct !! 
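/*
 * [Editorial sketch, not part of swapfile.c]  generic_max_swapfile_size()
 * above finds the largest usable swap offset by round-tripping an
 * all-ones offset through the architecture's swap pte encoding and
 * keeping whatever bits survive.  The toy encoding below (50 offset bits
 * at an arbitrary shift, both invented for illustration) shows the same
 * round-trip-and-mask idea.
 */
#include <stdio.h>

#define TOY_OFFSET_BITS  50	/* invented: offset bits the "pte" can hold */
#define TOY_OFFSET_SHIFT  9	/* invented: where they sit inside the "pte" */

static unsigned long long toy_entry_to_pte(unsigned long long offset)
{
	/* The encoding silently drops offset bits that do not fit. */
	return (offset & ((1ULL << TOY_OFFSET_BITS) - 1)) << TOY_OFFSET_SHIFT;
}

static unsigned long long toy_pte_to_offset(unsigned long long pte)
{
	return (pte >> TOY_OFFSET_SHIFT) & ((1ULL << TOY_OFFSET_BITS) - 1);
}

int main(void)
{
	/* Same trick as generic_max_swapfile_size(): feed ~0, see what survives. */
	unsigned long long max = toy_pte_to_offset(toy_entry_to_pte(~0ULL)) + 1;

	printf("max representable swap offsets: %llu (2^%d)\n",
	       max, TOY_OFFSET_BITS);
	return 0;
}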
2238 static unsigned long read_swap_header(struct swap_info_struct *p, 3150 union 2239 union swap_header *swap_header, 3151 struc 2240 struct inode *inode) 3152 { 2241 { 3153 int i; 2242 int i; 3154 unsigned long maxpages; 2243 unsigned long maxpages; 3155 unsigned long swapfilepages; 2244 unsigned long swapfilepages; 3156 unsigned long last_page; 2245 unsigned long last_page; 3157 2246 3158 if (memcmp("SWAPSPACE2", swap_header- 2247 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { 3159 pr_err("Unable to find swap-s 2248 pr_err("Unable to find swap-space signature\n"); 3160 return 0; 2249 return 0; 3161 } 2250 } 3162 2251 3163 /* swap partition endianness hack... !! 2252 /* swap partition endianess hack... */ 3164 if (swab32(swap_header->info.version) 2253 if (swab32(swap_header->info.version) == 1) { 3165 swab32s(&swap_header->info.ve 2254 swab32s(&swap_header->info.version); 3166 swab32s(&swap_header->info.la 2255 swab32s(&swap_header->info.last_page); 3167 swab32s(&swap_header->info.nr 2256 swab32s(&swap_header->info.nr_badpages); 3168 if (swap_header->info.nr_badp 2257 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 3169 return 0; 2258 return 0; 3170 for (i = 0; i < swap_header-> 2259 for (i = 0; i < swap_header->info.nr_badpages; i++) 3171 swab32s(&swap_header- 2260 swab32s(&swap_header->info.badpages[i]); 3172 } 2261 } 3173 /* Check the swap header's sub-versio 2262 /* Check the swap header's sub-version */ 3174 if (swap_header->info.version != 1) { 2263 if (swap_header->info.version != 1) { 3175 pr_warn("Unable to handle swa 2264 pr_warn("Unable to handle swap header version %d\n", 3176 swap_header->info.ver 2265 swap_header->info.version); 3177 return 0; 2266 return 0; 3178 } 2267 } 3179 2268 3180 si->lowest_bit = 1; !! 2269 p->lowest_bit = 1; 3181 si->cluster_next = 1; !! 2270 p->cluster_next = 1; 3182 si->cluster_nr = 0; !! 2271 p->cluster_nr = 0; 3183 2272 3184 maxpages = swapfile_maximum_size; !! 2273 maxpages = max_swapfile_size(); 3185 last_page = swap_header->info.last_pa 2274 last_page = swap_header->info.last_page; 3186 if (!last_page) { 2275 if (!last_page) { 3187 pr_warn("Empty swap-file\n"); 2276 pr_warn("Empty swap-file\n"); 3188 return 0; 2277 return 0; 3189 } 2278 } 3190 if (last_page > maxpages) { 2279 if (last_page > maxpages) { 3191 pr_warn("Truncating oversized 2280 pr_warn("Truncating oversized swap area, only using %luk out of %luk\n", 3192 K(maxpages), K(last_p !! 2281 maxpages << (PAGE_SHIFT - 10), >> 2282 last_page << (PAGE_SHIFT - 10)); 3193 } 2283 } 3194 if (maxpages > last_page) { 2284 if (maxpages > last_page) { 3195 maxpages = last_page + 1; 2285 maxpages = last_page + 1; 3196 /* p->max is an unsigned int: 2286 /* p->max is an unsigned int: don't overflow it */ 3197 if ((unsigned int)maxpages == 2287 if ((unsigned int)maxpages == 0) 3198 maxpages = UINT_MAX; 2288 maxpages = UINT_MAX; 3199 } 2289 } 3200 si->highest_bit = maxpages - 1; !! 
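/*
 * [Editorial sketch, not part of swapfile.c]  read_swap_header() above
 * accepts a header written on a machine of the other endianness: if the
 * version field only equals 1 after a 32-bit byte swap, every header
 * field is byte-swapped in place.  Stand-alone illustration of that
 * detect-and-fix step; toy_swab32() is a local helper, not the kernel's.
 */
#include <stdio.h>
#include <stdint.h>

static uint32_t toy_swab32(uint32_t x)
{
	return  (x >> 24) |
		((x >>  8) & 0x0000ff00u) |
		((x <<  8) & 0x00ff0000u) |
		(x << 24);
}

int main(void)
{
	uint32_t version = 0x01000000;	/* "1" as written by the other endianness */

	if (toy_swab32(version) == 1) {	/* same test read_swap_header() uses */
		version = toy_swab32(version);
		printf("byte-swapped header detected, version is now %u\n",
		       version);
	}
	return 0;
}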
2290 p->highest_bit = maxpages - 1; 3201 2291 3202 if (!maxpages) 2292 if (!maxpages) 3203 return 0; 2293 return 0; 3204 swapfilepages = i_size_read(inode) >> 2294 swapfilepages = i_size_read(inode) >> PAGE_SHIFT; 3205 if (swapfilepages && maxpages > swapf 2295 if (swapfilepages && maxpages > swapfilepages) { 3206 pr_warn("Swap area shorter th 2296 pr_warn("Swap area shorter than signature indicates\n"); 3207 return 0; 2297 return 0; 3208 } 2298 } 3209 if (swap_header->info.nr_badpages && 2299 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) 3210 return 0; 2300 return 0; 3211 if (swap_header->info.nr_badpages > M 2301 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 3212 return 0; 2302 return 0; 3213 2303 3214 return maxpages; 2304 return maxpages; 3215 } 2305 } 3216 2306 3217 #define SWAP_CLUSTER_INFO_COLS !! 2307 static int setup_swap_map_and_extents(struct swap_info_struct *p, 3218 DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(s << 3219 #define SWAP_CLUSTER_SPACE_COLS << 3220 DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES << 3221 #define SWAP_CLUSTER_COLS << 3222 max_t(unsigned int, SWAP_CLUSTER_INFO << 3223 << 3224 static int setup_swap_map_and_extents(struct << 3225 union 2308 union swap_header *swap_header, 3226 unsig 2309 unsigned char *swap_map, >> 2310 struct swap_cluster_info *cluster_info, 3227 unsig 2311 unsigned long maxpages, 3228 secto 2312 sector_t *span) 3229 { 2313 { >> 2314 int i; 3230 unsigned int nr_good_pages; 2315 unsigned int nr_good_pages; 3231 unsigned long i; << 3232 int nr_extents; 2316 int nr_extents; >> 2317 unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); >> 2318 unsigned long idx = p->cluster_next / SWAPFILE_CLUSTER; 3233 2319 3234 nr_good_pages = maxpages - 1; /* om 2320 nr_good_pages = maxpages - 1; /* omit header page */ 3235 2321 >> 2322 cluster_set_null(&p->free_cluster_head); >> 2323 cluster_set_null(&p->free_cluster_tail); >> 2324 cluster_set_null(&p->discard_cluster_head); >> 2325 cluster_set_null(&p->discard_cluster_tail); >> 2326 3236 for (i = 0; i < swap_header->info.nr_ 2327 for (i = 0; i < swap_header->info.nr_badpages; i++) { 3237 unsigned int page_nr = swap_h 2328 unsigned int page_nr = swap_header->info.badpages[i]; 3238 if (page_nr == 0 || page_nr > 2329 if (page_nr == 0 || page_nr > swap_header->info.last_page) 3239 return -EINVAL; 2330 return -EINVAL; 3240 if (page_nr < maxpages) { 2331 if (page_nr < maxpages) { 3241 swap_map[page_nr] = S 2332 swap_map[page_nr] = SWAP_MAP_BAD; 3242 nr_good_pages--; 2333 nr_good_pages--; >> 2334 /* >> 2335 * Haven't marked the cluster free yet, no list >> 2336 * operation involved >> 2337 */ >> 2338 inc_cluster_info_page(p, cluster_info, page_nr); 3243 } 2339 } 3244 } 2340 } 3245 2341 >> 2342 /* Haven't marked the cluster free yet, no list operation involved */ >> 2343 for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) >> 2344 inc_cluster_info_page(p, cluster_info, i); >> 2345 3246 if (nr_good_pages) { 2346 if (nr_good_pages) { 3247 swap_map[0] = SWAP_MAP_BAD; 2347 swap_map[0] = SWAP_MAP_BAD; 3248 si->max = maxpages; !! 2348 /* 3249 si->pages = nr_good_pages; !! 2349 * Not mark the cluster free yet, no list 3250 nr_extents = setup_swap_exten !! 2350 * operation involved >> 2351 */ >> 2352 inc_cluster_info_page(p, cluster_info, 0); >> 2353 p->max = maxpages; >> 2354 p->pages = nr_good_pages; >> 2355 nr_extents = setup_swap_extents(p, span); 3251 if (nr_extents < 0) 2356 if (nr_extents < 0) 3252 return nr_extents; 2357 return nr_extents; 3253 nr_good_pages = si->pages; !! 
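/*
 * [Editorial sketch, not part of swapfile.c]  setup_swap_map_and_extents()
 * above keeps one byte of state per swap page: slot 0 (the header page)
 * and every page listed in the header's bad-page table are marked
 * SWAP_MAP_BAD and excluded from the usable count.  Minimal stand-alone
 * version of that accounting; TOY_MAP_BAD and the bad-page list are
 * stand-ins chosen for illustration.
 */
#include <stdio.h>
#include <string.h>

#define TOY_MAP_BAD 0x3f	/* stand-in for SWAP_MAP_BAD */

int main(void)
{
	unsigned char swap_map[1024];			/* one byte per swap page */
	unsigned int badpages[] = { 7, 300, 999 };	/* as read from the header */
	unsigned int maxpages = 1024, nr_good = maxpages - 1;	/* omit header page */
	unsigned int i;

	memset(swap_map, 0, sizeof(swap_map));
	swap_map[0] = TOY_MAP_BAD;		/* the header page is never used */

	for (i = 0; i < sizeof(badpages) / sizeof(badpages[0]); i++) {
		if (badpages[i] == 0 || badpages[i] >= maxpages)
			continue;	/* the kernel rejects/ignores out-of-range entries */
		swap_map[badpages[i]] = TOY_MAP_BAD;
		nr_good--;
	}

	printf("%u of %u pages usable for swap\n", nr_good, maxpages);
	return 0;
}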
2358 nr_good_pages = p->pages; 3254 } 2359 } 3255 if (!nr_good_pages) { 2360 if (!nr_good_pages) { 3256 pr_warn("Empty swap-file\n"); 2361 pr_warn("Empty swap-file\n"); 3257 return -EINVAL; 2362 return -EINVAL; 3258 } 2363 } 3259 2364 3260 return nr_extents; << 3261 } << 3262 << 3263 static struct swap_cluster_info *setup_cluste << 3264 << 3265 << 3266 { << 3267 unsigned long nr_clusters = DIV_ROUND << 3268 unsigned long col = si->cluster_next << 3269 struct swap_cluster_info *cluster_inf << 3270 unsigned long i, j, k, idx; << 3271 int cpu, err = -ENOMEM; << 3272 << 3273 cluster_info = kvcalloc(nr_clusters, << 3274 if (!cluster_info) 2365 if (!cluster_info) 3275 goto err; !! 2366 return nr_extents; 3276 << 3277 for (i = 0; i < nr_clusters; i++) << 3278 spin_lock_init(&cluster_info[ << 3279 << 3280 si->cluster_next_cpu = alloc_percpu(u << 3281 if (!si->cluster_next_cpu) << 3282 goto err_free; << 3283 << 3284 /* Random start position to help with << 3285 for_each_possible_cpu(cpu) << 3286 per_cpu(*si->cluster_next_cpu << 3287 get_random_u32_inclusive(1, s << 3288 << 3289 si->percpu_cluster = alloc_percpu(str << 3290 if (!si->percpu_cluster) << 3291 goto err_free; << 3292 << 3293 for_each_possible_cpu(cpu) { << 3294 struct percpu_cluster *cluste << 3295 << 3296 cluster = per_cpu_ptr(si->per << 3297 for (i = 0; i < SWAP_NR_ORDER << 3298 cluster->next[i] = SW << 3299 } << 3300 << 3301 /* << 3302 * Mark unusable pages as unavailable << 3303 * marked free yet, so no list operat << 3304 * << 3305 * See setup_swap_map_and_extents(): << 3306 * and the EOF part of the last clust << 3307 */ << 3308 inc_cluster_info_page(si, cluster_inf << 3309 for (i = 0; i < swap_header->info.nr_ << 3310 inc_cluster_info_page(si, clu << 3311 swap_he << 3312 for (i = maxpages; i < round_up(maxpa << 3313 inc_cluster_info_page(si, clu << 3314 << 3315 INIT_LIST_HEAD(&si->free_clusters); << 3316 INIT_LIST_HEAD(&si->full_clusters); << 3317 INIT_LIST_HEAD(&si->discard_clusters) << 3318 << 3319 for (i = 0; i < SWAP_NR_ORDERS; i++) << 3320 INIT_LIST_HEAD(&si->nonfull_c << 3321 INIT_LIST_HEAD(&si->frag_clus << 3322 si->frag_cluster_nr[i] = 0; << 3323 } << 3324 2367 3325 /* !! 2368 for (i = 0; i < nr_clusters; i++) { 3326 * Reduce false cache line sharing be !! 2369 if (!cluster_count(&cluster_info[idx])) { 3327 * sharing same address space. !! 2370 cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); 3328 */ !! 2371 if (cluster_is_null(&p->free_cluster_head)) { 3329 for (k = 0; k < SWAP_CLUSTER_COLS; k+ !! 2372 cluster_set_next_flag(&p->free_cluster_head, 3330 j = (k + col) % SWAP_CLUSTER_ !! 2373 idx, 0); 3331 for (i = 0; i < DIV_ROUND_UP( !! 2374 cluster_set_next_flag(&p->free_cluster_tail, 3332 struct swap_cluster_i !! 2375 idx, 0); 3333 idx = i * SWAP_CLUSTE !! 2376 } else { 3334 ci = cluster_info + i !! 2377 unsigned int tail; 3335 if (idx >= nr_cluster !! 2378 3336 continue; !! 2379 tail = cluster_next(&p->free_cluster_tail); 3337 if (ci->count) { !! 2380 cluster_set_next(&cluster_info[tail], idx); 3338 ci->flags = C !! 2381 cluster_set_next_flag(&p->free_cluster_tail, 3339 list_add_tail !! 2382 idx, 0); 3340 continue; << 3341 } 2383 } 3342 ci->flags = CLUSTER_F << 3343 list_add_tail(&ci->li << 3344 } 2384 } >> 2385 idx++; >> 2386 if (idx == nr_clusters) >> 2387 idx = 0; 3345 } 2388 } >> 2389 return nr_extents; >> 2390 } >> 2391 >> 2392 /* >> 2393 * Helper to sys_swapon determining if a given swap >> 2394 * backing device queue supports DISCARD operations. 
>> 2395 */ >> 2396 static bool swap_discardable(struct swap_info_struct *si) >> 2397 { >> 2398 struct request_queue *q = bdev_get_queue(si->bdev); 3346 2399 3347 return cluster_info; !! 2400 if (!q || !blk_queue_discard(q)) >> 2401 return false; 3348 2402 3349 err_free: !! 2403 return true; 3350 kvfree(cluster_info); << 3351 err: << 3352 return ERR_PTR(err); << 3353 } 2404 } 3354 2405 3355 SYSCALL_DEFINE2(swapon, const char __user *, 2406 SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) 3356 { 2407 { 3357 struct swap_info_struct *si; !! 2408 struct swap_info_struct *p; 3358 struct filename *name; 2409 struct filename *name; 3359 struct file *swap_file = NULL; 2410 struct file *swap_file = NULL; 3360 struct address_space *mapping; 2411 struct address_space *mapping; 3361 struct dentry *dentry; << 3362 int prio; 2412 int prio; 3363 int error; 2413 int error; 3364 union swap_header *swap_header; 2414 union swap_header *swap_header; 3365 int nr_extents; 2415 int nr_extents; 3366 sector_t span; 2416 sector_t span; 3367 unsigned long maxpages; 2417 unsigned long maxpages; 3368 unsigned char *swap_map = NULL; 2418 unsigned char *swap_map = NULL; 3369 unsigned long *zeromap = NULL; << 3370 struct swap_cluster_info *cluster_inf 2419 struct swap_cluster_info *cluster_info = NULL; 3371 struct folio *folio = NULL; !! 2420 unsigned long *frontswap_map = NULL; >> 2421 struct page *page = NULL; 3372 struct inode *inode = NULL; 2422 struct inode *inode = NULL; 3373 bool inced_nr_rotate_swap = false; << 3374 2423 3375 if (swap_flags & ~SWAP_FLAGS_VALID) 2424 if (swap_flags & ~SWAP_FLAGS_VALID) 3376 return -EINVAL; 2425 return -EINVAL; 3377 2426 3378 if (!capable(CAP_SYS_ADMIN)) 2427 if (!capable(CAP_SYS_ADMIN)) 3379 return -EPERM; 2428 return -EPERM; 3380 2429 3381 if (!swap_avail_heads) !! 2430 p = alloc_swap_info(); 3382 return -ENOMEM; !! 2431 if (IS_ERR(p)) 3383 !! 2432 return PTR_ERR(p); 3384 si = alloc_swap_info(); << 3385 if (IS_ERR(si)) << 3386 return PTR_ERR(si); << 3387 2433 3388 INIT_WORK(&si->discard_work, swap_dis !! 2434 INIT_WORK(&p->discard_work, swap_discard_work); 3389 INIT_WORK(&si->reclaim_work, swap_rec << 3390 2435 3391 name = getname(specialfile); 2436 name = getname(specialfile); 3392 if (IS_ERR(name)) { 2437 if (IS_ERR(name)) { 3393 error = PTR_ERR(name); 2438 error = PTR_ERR(name); 3394 name = NULL; 2439 name = NULL; 3395 goto bad_swap; 2440 goto bad_swap; 3396 } 2441 } 3397 swap_file = file_open_name(name, O_RD !! 2442 swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0); 3398 if (IS_ERR(swap_file)) { 2443 if (IS_ERR(swap_file)) { 3399 error = PTR_ERR(swap_file); 2444 error = PTR_ERR(swap_file); 3400 swap_file = NULL; 2445 swap_file = NULL; 3401 goto bad_swap; 2446 goto bad_swap; 3402 } 2447 } 3403 2448 3404 si->swap_file = swap_file; !! 2449 p->swap_file = swap_file; 3405 mapping = swap_file->f_mapping; 2450 mapping = swap_file->f_mapping; 3406 dentry = swap_file->f_path.dentry; << 3407 inode = mapping->host; 2451 inode = mapping->host; 3408 2452 3409 error = claim_swapfile(si, inode); !! 
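/*
 * [Editorial sketch, not part of swapfile.c]  sys_swapon() above takes a
 * flags word in which user space packs the requested priority
 * (SWAP_FLAG_PREFER plus the value shifted into SWAP_FLAG_PRIO_MASK);
 * the kernel unpacks it again before enabling the device.  This is what
 * the packing looks like from user space via the swapon(2) wrapper in
 * <sys/swap.h>; the path is a placeholder and error handling is minimal.
 */
#include <stdio.h>
#include <sys/swap.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "/swapfile";
	int prio = 10;
	int flags = SWAP_FLAG_PREFER |
		    ((prio << SWAP_FLAG_PRIO_SHIFT) & SWAP_FLAG_PRIO_MASK);

	if (swapon(path, flags) != 0) {		/* needs CAP_SYS_ADMIN */
		perror("swapon");
		return 1;
	}
	printf("%s enabled at priority %d\n", path, prio);
	return 0;
}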
2453 /* If S_ISREG(inode->i_mode) will do mutex_lock(&inode->i_mutex); */ >> 2454 error = claim_swapfile(p, inode); 3410 if (unlikely(error)) 2455 if (unlikely(error)) 3411 goto bad_swap; 2456 goto bad_swap; 3412 2457 3413 inode_lock(inode); << 3414 if (d_unlinked(dentry) || cant_mount( << 3415 error = -ENOENT; << 3416 goto bad_swap_unlock_inode; << 3417 } << 3418 if (IS_SWAPFILE(inode)) { << 3419 error = -EBUSY; << 3420 goto bad_swap_unlock_inode; << 3421 } << 3422 << 3423 /* 2458 /* 3424 * Read the swap header. 2459 * Read the swap header. 3425 */ 2460 */ 3426 if (!mapping->a_ops->read_folio) { !! 2461 if (!mapping->a_ops->readpage) { 3427 error = -EINVAL; 2462 error = -EINVAL; 3428 goto bad_swap_unlock_inode; !! 2463 goto bad_swap; 3429 } 2464 } 3430 folio = read_mapping_folio(mapping, 0 !! 2465 page = read_mapping_page(mapping, 0, swap_file); 3431 if (IS_ERR(folio)) { !! 2466 if (IS_ERR(page)) { 3432 error = PTR_ERR(folio); !! 2467 error = PTR_ERR(page); 3433 goto bad_swap_unlock_inode; !! 2468 goto bad_swap; 3434 } 2469 } 3435 swap_header = kmap_local_folio(folio, !! 2470 swap_header = kmap(page); 3436 2471 3437 maxpages = read_swap_header(si, swap_ !! 2472 maxpages = read_swap_header(p, swap_header, inode); 3438 if (unlikely(!maxpages)) { 2473 if (unlikely(!maxpages)) { 3439 error = -EINVAL; 2474 error = -EINVAL; 3440 goto bad_swap_unlock_inode; !! 2475 goto bad_swap; 3441 } 2476 } 3442 2477 3443 /* OK, set up the swap map and apply 2478 /* OK, set up the swap map and apply the bad block list */ 3444 swap_map = vzalloc(maxpages); 2479 swap_map = vzalloc(maxpages); 3445 if (!swap_map) { 2480 if (!swap_map) { 3446 error = -ENOMEM; 2481 error = -ENOMEM; 3447 goto bad_swap_unlock_inode; !! 2482 goto bad_swap; 3448 } 2483 } >> 2484 if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { >> 2485 int cpu; 3449 2486 3450 error = swap_cgroup_swapon(si->type, !! 2487 p->flags |= SWP_SOLIDSTATE; 3451 if (error) !! 2488 /* 3452 goto bad_swap_unlock_inode; !! 2489 * select a random position to start with to help wear leveling 3453 !! 2490 * SSD 3454 nr_extents = setup_swap_map_and_exten !! 2491 */ 3455 !! 2492 p->cluster_next = 1 + (prandom_u32() % p->highest_bit); 3456 if (unlikely(nr_extents < 0)) { << 3457 error = nr_extents; << 3458 goto bad_swap_unlock_inode; << 3459 } << 3460 2493 3461 /* !! 2494 cluster_info = vzalloc(DIV_ROUND_UP(maxpages, 3462 * Use kvmalloc_array instead of bitm !! 2495 SWAPFILE_CLUSTER) * sizeof(*cluster_info)); 3463 * be above MAX_PAGE_ORDER incase of !! 2496 if (!cluster_info) { 3464 */ !! 2497 error = -ENOMEM; 3465 zeromap = kvmalloc_array(BITS_TO_LONG !! 2498 goto bad_swap; 3466 GFP_KERNE !! 2499 } 3467 if (!zeromap) { !! 2500 p->percpu_cluster = alloc_percpu(struct percpu_cluster); 3468 error = -ENOMEM; !! 2501 if (!p->percpu_cluster) { 3469 goto bad_swap_unlock_inode; !! 2502 error = -ENOMEM; >> 2503 goto bad_swap; >> 2504 } >> 2505 for_each_possible_cpu(cpu) { >> 2506 struct percpu_cluster *cluster; >> 2507 cluster = per_cpu_ptr(p->percpu_cluster, cpu); >> 2508 cluster_set_null(&cluster->index); >> 2509 } 3470 } 2510 } 3471 2511 3472 if (si->bdev && bdev_stable_writes(si !! 2512 error = swap_cgroup_swapon(p->type, maxpages); 3473 si->flags |= SWP_STABLE_WRITE !! 2513 if (error) 3474 !! 2514 goto bad_swap; 3475 if (si->bdev && bdev_synchronous(si-> << 3476 si->flags |= SWP_SYNCHRONOUS_ << 3477 << 3478 if (si->bdev && bdev_nonrot(si->bdev) << 3479 si->flags |= SWP_SOLIDSTATE; << 3480 2515 3481 cluster_info = setup_clusters !! 
2516 nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, 3482 if (IS_ERR(cluster_info)) { !! 2517 cluster_info, maxpages, &span); 3483 error = PTR_ERR(clust !! 2518 if (unlikely(nr_extents < 0)) { 3484 cluster_info = NULL; !! 2519 error = nr_extents; 3485 goto bad_swap_unlock_ !! 2520 goto bad_swap; 3486 } << 3487 } else { << 3488 atomic_inc(&nr_rotate_swap); << 3489 inced_nr_rotate_swap = true; << 3490 } 2521 } >> 2522 /* frontswap enabled? set up bit-per-page map for frontswap */ >> 2523 if (frontswap_enabled) >> 2524 frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long)); 3491 2525 3492 if ((swap_flags & SWAP_FLAG_DISCARD) !! 2526 if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { 3493 si->bdev && bdev_max_discard_sect << 3494 /* 2527 /* 3495 * When discard is enabled fo 2528 * When discard is enabled for swap with no particular 3496 * policy flagged, we set all 2529 * policy flagged, we set all swap discard flags here in 3497 * order to sustain backward 2530 * order to sustain backward compatibility with older 3498 * swapon(8) releases. 2531 * swapon(8) releases. 3499 */ 2532 */ 3500 si->flags |= (SWP_DISCARDABLE !! 2533 p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | 3501 SWP_PAGE_DISCARD 2534 SWP_PAGE_DISCARD); 3502 2535 3503 /* 2536 /* 3504 * By flagging sys_swapon, a 2537 * By flagging sys_swapon, a sysadmin can tell us to 3505 * either do single-time area 2538 * either do single-time area discards only, or to just 3506 * perform discards for relea 2539 * perform discards for released swap page-clusters. 3507 * Now it's time to adjust th 2540 * Now it's time to adjust the p->flags accordingly. 3508 */ 2541 */ 3509 if (swap_flags & SWAP_FLAG_DI 2542 if (swap_flags & SWAP_FLAG_DISCARD_ONCE) 3510 si->flags &= ~SWP_PAG !! 2543 p->flags &= ~SWP_PAGE_DISCARD; 3511 else if (swap_flags & SWAP_FL 2544 else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) 3512 si->flags &= ~SWP_ARE !! 2545 p->flags &= ~SWP_AREA_DISCARD; 3513 2546 3514 /* issue a swapon-time discar 2547 /* issue a swapon-time discard if it's still required */ 3515 if (si->flags & SWP_AREA_DISC !! 2548 if (p->flags & SWP_AREA_DISCARD) { 3516 int err = discard_swa !! 2549 int err = discard_swap(p); 3517 if (unlikely(err)) 2550 if (unlikely(err)) 3518 pr_err("swapo 2551 pr_err("swapon: discard_swap(%p): %d\n", 3519 si, e !! 2552 p, err); 3520 } 2553 } 3521 } 2554 } 3522 2555 3523 error = init_swap_address_space(si->t << 3524 if (error) << 3525 goto bad_swap_unlock_inode; << 3526 << 3527 error = zswap_swapon(si->type, maxpag << 3528 if (error) << 3529 goto free_swap_address_space; << 3530 << 3531 /* << 3532 * Flush any pending IO and dirty map << 3533 * swap device. << 3534 */ << 3535 inode->i_flags |= S_SWAPFILE; << 3536 error = inode_drain_writes(inode); << 3537 if (error) { << 3538 inode->i_flags &= ~S_SWAPFILE << 3539 goto free_swap_zswap; << 3540 } << 3541 << 3542 mutex_lock(&swapon_mutex); 2556 mutex_lock(&swapon_mutex); 3543 prio = -1; 2557 prio = -1; 3544 if (swap_flags & SWAP_FLAG_PREFER) 2558 if (swap_flags & SWAP_FLAG_PREFER) 3545 prio = 2559 prio = 3546 (swap_flags & SWAP_FLAG_PRI 2560 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; 3547 enable_swap_info(si, prio, swap_map, !! 2561 enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map); 3548 2562 3549 pr_info("Adding %uk swap on %s. Prio !! 2563 pr_info("Adding %uk swap on %s. " 3550 K(si->pages), name->name, si- !! 
2564 "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", 3551 K((unsigned long long)span), !! 2565 p->pages<<(PAGE_SHIFT-10), name->name, p->prio, 3552 (si->flags & SWP_SOLIDSTATE) !! 2566 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 3553 (si->flags & SWP_DISCARDABLE) !! 2567 (p->flags & SWP_SOLIDSTATE) ? "SS" : "", 3554 (si->flags & SWP_AREA_DISCARD !! 2568 (p->flags & SWP_DISCARDABLE) ? "D" : "", 3555 (si->flags & SWP_PAGE_DISCARD !! 2569 (p->flags & SWP_AREA_DISCARD) ? "s" : "", >> 2570 (p->flags & SWP_PAGE_DISCARD) ? "c" : "", >> 2571 (frontswap_map) ? "FS" : ""); 3556 2572 3557 mutex_unlock(&swapon_mutex); 2573 mutex_unlock(&swapon_mutex); 3558 atomic_inc(&proc_poll_event); 2574 atomic_inc(&proc_poll_event); 3559 wake_up_interruptible(&proc_poll_wait 2575 wake_up_interruptible(&proc_poll_wait); 3560 2576 >> 2577 if (S_ISREG(inode->i_mode)) >> 2578 inode->i_flags |= S_SWAPFILE; 3561 error = 0; 2579 error = 0; 3562 goto out; 2580 goto out; 3563 free_swap_zswap: << 3564 zswap_swapoff(si->type); << 3565 free_swap_address_space: << 3566 exit_swap_address_space(si->type); << 3567 bad_swap_unlock_inode: << 3568 inode_unlock(inode); << 3569 bad_swap: 2581 bad_swap: 3570 free_percpu(si->percpu_cluster); !! 2582 free_percpu(p->percpu_cluster); 3571 si->percpu_cluster = NULL; !! 2583 p->percpu_cluster = NULL; 3572 free_percpu(si->cluster_next_cpu); !! 2584 if (inode && S_ISBLK(inode->i_mode) && p->bdev) { 3573 si->cluster_next_cpu = NULL; !! 2585 set_blocksize(p->bdev, p->old_block_size); 3574 inode = NULL; !! 2586 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); 3575 destroy_swap_extents(si); !! 2587 } 3576 swap_cgroup_swapoff(si->type); !! 2588 destroy_swap_extents(p); >> 2589 swap_cgroup_swapoff(p->type); 3577 spin_lock(&swap_lock); 2590 spin_lock(&swap_lock); 3578 si->swap_file = NULL; !! 2591 p->swap_file = NULL; 3579 si->flags = 0; !! 2592 p->flags = 0; 3580 spin_unlock(&swap_lock); 2593 spin_unlock(&swap_lock); 3581 vfree(swap_map); 2594 vfree(swap_map); 3582 kvfree(zeromap); !! 2595 vfree(cluster_info); 3583 kvfree(cluster_info); !! 2596 if (swap_file) { 3584 if (inced_nr_rotate_swap) !! 2597 if (inode && S_ISREG(inode->i_mode)) { 3585 atomic_dec(&nr_rotate_swap); !! 2598 mutex_unlock(&inode->i_mutex); 3586 if (swap_file) !! 2599 inode = NULL; >> 2600 } 3587 filp_close(swap_file, NULL); 2601 filp_close(swap_file, NULL); >> 2602 } 3588 out: 2603 out: 3589 if (!IS_ERR_OR_NULL(folio)) !! 2604 if (page && !IS_ERR(page)) { 3590 folio_release_kmap(folio, swa !! 2605 kunmap(page); >> 2606 page_cache_release(page); >> 2607 } 3591 if (name) 2608 if (name) 3592 putname(name); 2609 putname(name); 3593 if (inode) !! 2610 if (inode && S_ISREG(inode->i_mode)) 3594 inode_unlock(inode); !! 2611 mutex_unlock(&inode->i_mutex); 3595 if (!error) << 3596 enable_swap_slots_cache(); << 3597 return error; 2612 return error; 3598 } 2613 } 3599 2614 3600 void si_swapinfo(struct sysinfo *val) 2615 void si_swapinfo(struct sysinfo *val) 3601 { 2616 { 3602 unsigned int type; 2617 unsigned int type; 3603 unsigned long nr_to_be_unused = 0; 2618 unsigned long nr_to_be_unused = 0; 3604 2619 3605 spin_lock(&swap_lock); 2620 spin_lock(&swap_lock); 3606 for (type = 0; type < nr_swapfiles; t 2621 for (type = 0; type < nr_swapfiles; type++) { 3607 struct swap_info_struct *si = 2622 struct swap_info_struct *si = swap_info[type]; 3608 2623 3609 if ((si->flags & SWP_USED) && 2624 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) 3610 nr_to_be_unused += RE !! 
2625 nr_to_be_unused += si->inuse_pages; 3611 } 2626 } 3612 val->freeswap = atomic_long_read(&nr_ 2627 val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; 3613 val->totalswap = total_swap_pages + n 2628 val->totalswap = total_swap_pages + nr_to_be_unused; 3614 spin_unlock(&swap_lock); 2629 spin_unlock(&swap_lock); 3615 } 2630 } 3616 2631 3617 /* 2632 /* 3618 * Verify that nr swap entries are valid and !! 2633 * Verify that a swap entry is valid and increment its swap map count. 3619 * 2634 * 3620 * Returns error code in following case. 2635 * Returns error code in following case. 3621 * - success -> 0 2636 * - success -> 0 3622 * - swp_entry is invalid -> EINVAL 2637 * - swp_entry is invalid -> EINVAL 3623 * - swp_entry is migration entry -> EINVAL 2638 * - swp_entry is migration entry -> EINVAL 3624 * - swap-cache reference is requested but th 2639 * - swap-cache reference is requested but there is already one. -> EEXIST 3625 * - swap-cache reference is requested but th 2640 * - swap-cache reference is requested but the entry is not used. -> ENOENT 3626 * - swap-mapped reference requested but need 2641 * - swap-mapped reference requested but needs continued swap count. -> ENOMEM 3627 */ 2642 */ 3628 static int __swap_duplicate(swp_entry_t entry !! 2643 static int __swap_duplicate(swp_entry_t entry, unsigned char usage) 3629 { 2644 { 3630 struct swap_info_struct *si; !! 2645 struct swap_info_struct *p; 3631 struct swap_cluster_info *ci; !! 2646 unsigned long offset, type; 3632 unsigned long offset; << 3633 unsigned char count; 2647 unsigned char count; 3634 unsigned char has_cache; 2648 unsigned char has_cache; 3635 int err, i; !! 2649 int err = -EINVAL; 3636 2650 3637 si = swp_swap_info(entry); !! 2651 if (non_swap_entry(entry)) >> 2652 goto out; 3638 2653 >> 2654 type = swp_type(entry); >> 2655 if (type >= nr_swapfiles) >> 2656 goto bad_file; >> 2657 p = swap_info[type]; 3639 offset = swp_offset(entry); 2658 offset = swp_offset(entry); 3640 VM_WARN_ON(nr > SWAPFILE_CLUSTER - of << 3641 VM_WARN_ON(usage == 1 && nr > 1); << 3642 ci = lock_cluster_or_swap_info(si, of << 3643 << 3644 err = 0; << 3645 for (i = 0; i < nr; i++) { << 3646 count = si->swap_map[offset + << 3647 2659 3648 /* !! 2660 spin_lock(&p->lock); 3649 * swapin_readahead() doesn't !! 2661 if (unlikely(offset >= p->max)) 3650 * swap entry could be SWAP_M !! 2662 goto unlock_out; 3651 */ << 3652 if (unlikely(swap_count(count << 3653 err = -ENOENT; << 3654 goto unlock_out; << 3655 } << 3656 << 3657 has_cache = count & SWAP_HAS_ << 3658 count &= ~SWAP_HAS_CACHE; << 3659 2663 3660 if (!count && !has_cache) { !! 2664 count = p->swap_map[offset]; 3661 err = -ENOENT; << 3662 } else if (usage == SWAP_HAS_ << 3663 if (has_cache) << 3664 err = -EEXIST << 3665 } else if ((count & ~COUNT_CO << 3666 err = -EINVAL; << 3667 } << 3668 2665 3669 if (err) !! 2666 /* 3670 goto unlock_out; !! 2667 * swapin_readahead() doesn't check if a swap entry is valid, so the >> 2668 * swap entry could be SWAP_MAP_BAD. Check here with lock held. >> 2669 */ >> 2670 if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { >> 2671 err = -ENOENT; >> 2672 goto unlock_out; 3671 } 2673 } 3672 2674 3673 for (i = 0; i < nr; i++) { !! 2675 has_cache = count & SWAP_HAS_CACHE; 3674 count = si->swap_map[offset + !! 2676 count &= ~SWAP_HAS_CACHE; 3675 has_cache = count & SWAP_HAS_ !! 2677 err = 0; 3676 count &= ~SWAP_HAS_CACHE; !! 2678 >> 2679 if (usage == SWAP_HAS_CACHE) { 3677 2680 3678 if (usage == SWAP_HAS_CACHE) !! 
2681 /* set SWAP_HAS_CACHE if there is no cache and entry is used */ >> 2682 if (!has_cache && count) 3679 has_cache = SWAP_HAS_ 2683 has_cache = SWAP_HAS_CACHE; 3680 else if ((count & ~COUNT_CONT !! 2684 else if (has_cache) /* someone else added cache */ >> 2685 err = -EEXIST; >> 2686 else /* no users remaining */ >> 2687 err = -ENOENT; >> 2688 >> 2689 } else if (count || has_cache) { >> 2690 >> 2691 if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) 3681 count += usage; 2692 count += usage; 3682 else if (swap_count_continued !! 2693 else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) >> 2694 err = -EINVAL; >> 2695 else if (swap_count_continued(p, offset, count)) 3683 count = COUNT_CONTINU 2696 count = COUNT_CONTINUED; 3684 else { !! 2697 else 3685 /* << 3686 * Don't need to roll << 3687 * usage == 1, there << 3688 */ << 3689 err = -ENOMEM; 2698 err = -ENOMEM; 3690 goto unlock_out; !! 2699 } else 3691 } !! 2700 err = -ENOENT; /* unused swap entry */ 3692 2701 3693 WRITE_ONCE(si->swap_map[offse !! 2702 p->swap_map[offset] = count | has_cache; 3694 } << 3695 2703 3696 unlock_out: 2704 unlock_out: 3697 unlock_cluster_or_swap_info(si, ci); !! 2705 spin_unlock(&p->lock); >> 2706 out: 3698 return err; 2707 return err; >> 2708 >> 2709 bad_file: >> 2710 pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val); >> 2711 goto out; 3699 } 2712 } 3700 2713 3701 /* 2714 /* 3702 * Help swapoff by noting that swap entry bel 2715 * Help swapoff by noting that swap entry belongs to shmem/tmpfs 3703 * (in which case its reference count is neve 2716 * (in which case its reference count is never incremented). 3704 */ 2717 */ 3705 void swap_shmem_alloc(swp_entry_t entry, int !! 2718 void swap_shmem_alloc(swp_entry_t entry) 3706 { 2719 { 3707 __swap_duplicate(entry, SWAP_MAP_SHME !! 2720 __swap_duplicate(entry, SWAP_MAP_SHMEM); 3708 } 2721 } 3709 2722 3710 /* 2723 /* 3711 * Increase reference count of swap entry by 2724 * Increase reference count of swap entry by 1. 3712 * Returns 0 for success, or -ENOMEM if a swa 2725 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required 3713 * but could not be atomically allocated. Re 2726 * but could not be atomically allocated. Returns 0, just as if it succeeded, 3714 * if __swap_duplicate() fails for another re 2727 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which 3715 * might occur if a page table entry has got 2728 * might occur if a page table entry has got corrupted. 3716 */ 2729 */ 3717 int swap_duplicate(swp_entry_t entry) 2730 int swap_duplicate(swp_entry_t entry) 3718 { 2731 { 3719 int err = 0; 2732 int err = 0; 3720 2733 3721 while (!err && __swap_duplicate(entry !! 2734 while (!err && __swap_duplicate(entry, 1) == -ENOMEM) 3722 err = add_swap_count_continua 2735 err = add_swap_count_continuation(entry, GFP_ATOMIC); 3723 return err; 2736 return err; 3724 } 2737 } 3725 2738 3726 /* 2739 /* 3727 * @entry: first swap entry from which we all !! 2740 * @entry: swap entry for which we allocate swap cache. 3728 * 2741 * 3729 * Called when allocating swap cache for exis !! 2742 * Called when allocating swap cache for existing swap entry, 3730 * This can return error codes. Returns 0 at 2743 * This can return error codes. Returns 0 at success. 3731 * -EEXIST means there is a swap cache. !! 2744 * -EBUSY means there is a swap cache. 3732 * Note: return code is different from swap_d 2745 * Note: return code is different from swap_duplicate(). 3733 */ 2746 */ 3734 int swapcache_prepare(swp_entry_t entry, int !! 
2747 int swapcache_prepare(swp_entry_t entry) 3735 { 2748 { 3736 return __swap_duplicate(entry, SWAP_H !! 2749 return __swap_duplicate(entry, SWAP_HAS_CACHE); 3737 } 2750 } 3738 2751 3739 void swapcache_clear(struct swap_info_struct !! 2752 struct swap_info_struct *page_swap_info(struct page *page) 3740 { 2753 { 3741 unsigned long offset = swp_offset(ent !! 2754 swp_entry_t swap = { .val = page_private(page) }; 3742 !! 2755 BUG_ON(!PageSwapCache(page)); 3743 cluster_swap_free_nr(si, offset, nr, !! 2756 return swap_info[swp_type(swap)]; 3744 } << 3745 << 3746 struct swap_info_struct *swp_swap_info(swp_en << 3747 { << 3748 return swap_type_to_swap_info(swp_typ << 3749 } 2757 } 3750 2758 3751 /* 2759 /* 3752 * out-of-line methods to avoid include hell. !! 2760 * out-of-line __page_file_ methods to avoid include hell. 3753 */ 2761 */ 3754 struct address_space *swapcache_mapping(struc !! 2762 struct address_space *__page_file_mapping(struct page *page) 3755 { 2763 { 3756 return swp_swap_info(folio->swap)->sw !! 2764 VM_BUG_ON_PAGE(!PageSwapCache(page), page); >> 2765 return page_swap_info(page)->swap_file->f_mapping; 3757 } 2766 } 3758 EXPORT_SYMBOL_GPL(swapcache_mapping); !! 2767 EXPORT_SYMBOL_GPL(__page_file_mapping); 3759 2768 3760 pgoff_t __folio_swap_cache_index(struct folio !! 2769 pgoff_t __page_file_index(struct page *page) 3761 { 2770 { 3762 return swap_cache_index(folio->swap); !! 2771 swp_entry_t swap = { .val = page_private(page) }; >> 2772 VM_BUG_ON_PAGE(!PageSwapCache(page), page); >> 2773 return swp_offset(swap); 3763 } 2774 } 3764 EXPORT_SYMBOL_GPL(__folio_swap_cache_index); !! 2775 EXPORT_SYMBOL_GPL(__page_file_index); 3765 2776 3766 /* 2777 /* 3767 * add_swap_count_continuation - called when 2778 * add_swap_count_continuation - called when a swap count is duplicated 3768 * beyond SWAP_MAP_MAX, it allocates a new pa 2779 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's 3769 * page of the original vmalloc'ed swap_map, 2780 * page of the original vmalloc'ed swap_map, to hold the continuation count 3770 * (for that entry and for its neighbouring P 2781 * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called 3771 * again when count is duplicated beyond SWAP 2782 * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc. 3772 * 2783 * 3773 * These continuation pages are seldom refere 2784 * These continuation pages are seldom referenced: the common paths all work 3774 * on the original swap_map, only referring t 2785 * on the original swap_map, only referring to a continuation page when the 3775 * low "digit" of a count is incremented or d 2786 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. 3776 * 2787 * 3777 * add_swap_count_continuation(, GFP_ATOMIC) 2788 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding 3778 * page table locks; if it fails, add_swap_co 2789 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL) 3779 * can be called after dropping locks. 2790 * can be called after dropping locks. 
3780 */ 2791 */ 3781 int add_swap_count_continuation(swp_entry_t e 2792 int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) 3782 { 2793 { 3783 struct swap_info_struct *si; 2794 struct swap_info_struct *si; 3784 struct swap_cluster_info *ci; << 3785 struct page *head; 2795 struct page *head; 3786 struct page *page; 2796 struct page *page; 3787 struct page *list_page; 2797 struct page *list_page; 3788 pgoff_t offset; 2798 pgoff_t offset; 3789 unsigned char count; 2799 unsigned char count; 3790 int ret = 0; << 3791 2800 3792 /* 2801 /* 3793 * When debugging, it's easier to use 2802 * When debugging, it's easier to use __GFP_ZERO here; but it's better 3794 * for latency not to zero a page whi 2803 * for latency not to zero a page while GFP_ATOMIC and holding locks. 3795 */ 2804 */ 3796 page = alloc_page(gfp_mask | __GFP_HI 2805 page = alloc_page(gfp_mask | __GFP_HIGHMEM); 3797 2806 3798 si = get_swap_device(entry); !! 2807 si = swap_info_get(entry); 3799 if (!si) { 2808 if (!si) { 3800 /* 2809 /* 3801 * An acceptable race has occ 2810 * An acceptable race has occurred since the failing 3802 * __swap_duplicate(): the sw !! 2811 * __swap_duplicate(): the swap entry has been freed, >> 2812 * perhaps even the whole swap_map cleared for swapoff. 3803 */ 2813 */ 3804 goto outer; 2814 goto outer; 3805 } 2815 } 3806 spin_lock(&si->lock); << 3807 2816 3808 offset = swp_offset(entry); 2817 offset = swp_offset(entry); 3809 !! 2818 count = si->swap_map[offset] & ~SWAP_HAS_CACHE; 3810 ci = lock_cluster(si, offset); << 3811 << 3812 count = swap_count(si->swap_map[offse << 3813 2819 3814 if ((count & ~COUNT_CONTINUED) != SWA 2820 if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { 3815 /* 2821 /* 3816 * The higher the swap count, 2822 * The higher the swap count, the more likely it is that tasks 3817 * will race to add swap coun 2823 * will race to add swap count continuation: we need to avoid 3818 * over-provisioning. 2824 * over-provisioning. 3819 */ 2825 */ 3820 goto out; 2826 goto out; 3821 } 2827 } 3822 2828 3823 if (!page) { 2829 if (!page) { 3824 ret = -ENOMEM; !! 2830 spin_unlock(&si->lock); 3825 goto out; !! 2831 return -ENOMEM; 3826 } 2832 } 3827 2833 >> 2834 /* >> 2835 * We are fortunate that although vmalloc_to_page uses pte_offset_map, >> 2836 * no architecture is using highmem pages for kernel page tables: so it >> 2837 * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps. >> 2838 */ 3828 head = vmalloc_to_page(si->swap_map + 2839 head = vmalloc_to_page(si->swap_map + offset); 3829 offset &= ~PAGE_MASK; 2840 offset &= ~PAGE_MASK; 3830 2841 3831 spin_lock(&si->cont_lock); << 3832 /* 2842 /* 3833 * Page allocation does not initializ 2843 * Page allocation does not initialize the page's lru field, 3834 * but it does always reset its priva 2844 * but it does always reset its private field. 
3835 */ 2845 */ 3836 if (!page_private(head)) { 2846 if (!page_private(head)) { 3837 BUG_ON(count & COUNT_CONTINUE 2847 BUG_ON(count & COUNT_CONTINUED); 3838 INIT_LIST_HEAD(&head->lru); 2848 INIT_LIST_HEAD(&head->lru); 3839 set_page_private(head, SWP_CO 2849 set_page_private(head, SWP_CONTINUED); 3840 si->flags |= SWP_CONTINUED; 2850 si->flags |= SWP_CONTINUED; 3841 } 2851 } 3842 2852 3843 list_for_each_entry(list_page, &head- 2853 list_for_each_entry(list_page, &head->lru, lru) { 3844 unsigned char *map; 2854 unsigned char *map; 3845 2855 3846 /* 2856 /* 3847 * If the previous map said n 2857 * If the previous map said no continuation, but we've found 3848 * a continuation page, free 2858 * a continuation page, free our allocation and use this one. 3849 */ 2859 */ 3850 if (!(count & COUNT_CONTINUED 2860 if (!(count & COUNT_CONTINUED)) 3851 goto out_unlock_cont; !! 2861 goto out; 3852 2862 3853 map = kmap_local_page(list_pa !! 2863 map = kmap_atomic(list_page) + offset; 3854 count = *map; 2864 count = *map; 3855 kunmap_local(map); !! 2865 kunmap_atomic(map); 3856 2866 3857 /* 2867 /* 3858 * If this continuation count 2868 * If this continuation count now has some space in it, 3859 * free our allocation and us 2869 * free our allocation and use this one. 3860 */ 2870 */ 3861 if ((count & ~COUNT_CONTINUED 2871 if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX) 3862 goto out_unlock_cont; !! 2872 goto out; 3863 } 2873 } 3864 2874 3865 list_add_tail(&page->lru, &head->lru) 2875 list_add_tail(&page->lru, &head->lru); 3866 page = NULL; /* no 2876 page = NULL; /* now it's attached, don't free it */ 3867 out_unlock_cont: << 3868 spin_unlock(&si->cont_lock); << 3869 out: 2877 out: 3870 unlock_cluster(ci); << 3871 spin_unlock(&si->lock); 2878 spin_unlock(&si->lock); 3872 put_swap_device(si); << 3873 outer: 2879 outer: 3874 if (page) 2880 if (page) 3875 __free_page(page); 2881 __free_page(page); 3876 return ret; !! 2882 return 0; 3877 } 2883 } 3878 2884 3879 /* 2885 /* 3880 * swap_count_continued - when the original s 2886 * swap_count_continued - when the original swap_map count is incremented 3881 * from SWAP_MAP_MAX, check if there is alrea 2887 * from SWAP_MAP_MAX, check if there is already a continuation page to carry 3882 * into, carry if so, or else fail until a ne 2888 * into, carry if so, or else fail until a new continuation page is allocated; 3883 * when the original swap_map count is decrem 2889 * when the original swap_map count is decremented from 0 with continuation, 3884 * borrow from the continuation and report wh 2890 * borrow from the continuation and report whether it still holds more. 3885 * Called while __swap_duplicate() or swap_en !! 2891 * Called while __swap_duplicate() or swap_entry_free() holds swap_lock. 3886 * lock. 
<< 3887 */ 2892 */ 3888 static bool swap_count_continued(struct swap_ 2893 static bool swap_count_continued(struct swap_info_struct *si, 3889 pgoff_t offs 2894 pgoff_t offset, unsigned char count) 3890 { 2895 { 3891 struct page *head; 2896 struct page *head; 3892 struct page *page; 2897 struct page *page; 3893 unsigned char *map; 2898 unsigned char *map; 3894 bool ret; << 3895 2899 3896 head = vmalloc_to_page(si->swap_map + 2900 head = vmalloc_to_page(si->swap_map + offset); 3897 if (page_private(head) != SWP_CONTINU 2901 if (page_private(head) != SWP_CONTINUED) { 3898 BUG_ON(count & COUNT_CONTINUE 2902 BUG_ON(count & COUNT_CONTINUED); 3899 return false; /* ne 2903 return false; /* need to add count continuation */ 3900 } 2904 } 3901 2905 3902 spin_lock(&si->cont_lock); << 3903 offset &= ~PAGE_MASK; 2906 offset &= ~PAGE_MASK; 3904 page = list_next_entry(head, lru); !! 2907 page = list_entry(head->lru.next, struct page, lru); 3905 map = kmap_local_page(page) + offset; !! 2908 map = kmap_atomic(page) + offset; 3906 2909 3907 if (count == SWAP_MAP_MAX) /* in 2910 if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ 3908 goto init_map; /* ju 2911 goto init_map; /* jump over SWAP_CONT_MAX checks */ 3909 2912 3910 if (count == (SWAP_MAP_MAX | COUNT_CO 2913 if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */ 3911 /* 2914 /* 3912 * Think of how you add 1 to 2915 * Think of how you add 1 to 999 3913 */ 2916 */ 3914 while (*map == (SWAP_CONT_MAX 2917 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { 3915 kunmap_local(map); !! 2918 kunmap_atomic(map); 3916 page = list_next_entr !! 2919 page = list_entry(page->lru.next, struct page, lru); 3917 BUG_ON(page == head); 2920 BUG_ON(page == head); 3918 map = kmap_local_page !! 2921 map = kmap_atomic(page) + offset; 3919 } 2922 } 3920 if (*map == SWAP_CONT_MAX) { 2923 if (*map == SWAP_CONT_MAX) { 3921 kunmap_local(map); !! 2924 kunmap_atomic(map); 3922 page = list_next_entr !! 2925 page = list_entry(page->lru.next, struct page, lru); 3923 if (page == head) { !! 2926 if (page == head) 3924 ret = false; !! 2927 return false; /* add count continuation */ 3925 goto out; !! 2928 map = kmap_atomic(page) + offset; 3926 } << 3927 map = kmap_local_page << 3928 init_map: *map = 0; 2929 init_map: *map = 0; /* we didn't zero the page */ 3929 } 2930 } 3930 *map += 1; 2931 *map += 1; 3931 kunmap_local(map); !! 2932 kunmap_atomic(map); 3932 while ((page = list_prev_entr !! 2933 page = list_entry(page->lru.prev, struct page, lru); 3933 map = kmap_local_page !! 2934 while (page != head) { >> 2935 map = kmap_atomic(page) + offset; 3934 *map = COUNT_CONTINUE 2936 *map = COUNT_CONTINUED; 3935 kunmap_local(map); !! 2937 kunmap_atomic(map); >> 2938 page = list_entry(page->lru.prev, struct page, lru); 3936 } 2939 } 3937 ret = true; !! 2940 return true; /* incremented */ 3938 2941 3939 } else { 2942 } else { /* decrementing */ 3940 /* 2943 /* 3941 * Think of how you subtract 2944 * Think of how you subtract 1 from 1000 3942 */ 2945 */ 3943 BUG_ON(count != COUNT_CONTINU 2946 BUG_ON(count != COUNT_CONTINUED); 3944 while (*map == COUNT_CONTINUE 2947 while (*map == COUNT_CONTINUED) { 3945 kunmap_local(map); !! 2948 kunmap_atomic(map); 3946 page = list_next_entr !! 2949 page = list_entry(page->lru.next, struct page, lru); 3947 BUG_ON(page == head); 2950 BUG_ON(page == head); 3948 map = kmap_local_page !! 
2951 map = kmap_atomic(page) + offset; 3949 } 2952 } 3950 BUG_ON(*map == 0); 2953 BUG_ON(*map == 0); 3951 *map -= 1; 2954 *map -= 1; 3952 if (*map == 0) 2955 if (*map == 0) 3953 count = 0; 2956 count = 0; 3954 kunmap_local(map); !! 2957 kunmap_atomic(map); 3955 while ((page = list_prev_entr !! 2958 page = list_entry(page->lru.prev, struct page, lru); 3956 map = kmap_local_page !! 2959 while (page != head) { >> 2960 map = kmap_atomic(page) + offset; 3957 *map = SWAP_CONT_MAX 2961 *map = SWAP_CONT_MAX | count; 3958 count = COUNT_CONTINU 2962 count = COUNT_CONTINUED; 3959 kunmap_local(map); !! 2963 kunmap_atomic(map); >> 2964 page = list_entry(page->lru.prev, struct page, lru); 3960 } 2965 } 3961 ret = count == COUNT_CONTINUE !! 2966 return count == COUNT_CONTINUED; 3962 } 2967 } 3963 out: << 3964 spin_unlock(&si->cont_lock); << 3965 return ret; << 3966 } 2968 } 3967 2969 3968 /* 2970 /* 3969 * free_swap_count_continuations - swapoff fr 2971 * free_swap_count_continuations - swapoff free all the continuation pages 3970 * appended to the swap_map, after swap_map i 2972 * appended to the swap_map, after swap_map is quiesced, before vfree'ing it. 3971 */ 2973 */ 3972 static void free_swap_count_continuations(str 2974 static void free_swap_count_continuations(struct swap_info_struct *si) 3973 { 2975 { 3974 pgoff_t offset; 2976 pgoff_t offset; 3975 2977 3976 for (offset = 0; offset < si->max; of 2978 for (offset = 0; offset < si->max; offset += PAGE_SIZE) { 3977 struct page *head; 2979 struct page *head; 3978 head = vmalloc_to_page(si->sw 2980 head = vmalloc_to_page(si->swap_map + offset); 3979 if (page_private(head)) { 2981 if (page_private(head)) { 3980 struct page *page, *n !! 2982 struct list_head *this, *next; 3981 !! 2983 list_for_each_safe(this, next, &head->lru) { 3982 list_for_each_entry_s !! 2984 struct page *page; 3983 list_del(&pag !! 2985 page = list_entry(this, struct page, lru); >> 2986 list_del(this); 3984 __free_page(p 2987 __free_page(page); 3985 } 2988 } 3986 } 2989 } 3987 } 2990 } 3988 } 2991 } 3989 << 3990 #if defined(CONFIG_MEMCG) && defined(CONFIG_B << 3991 void __folio_throttle_swaprate(struct folio * << 3992 { << 3993 struct swap_info_struct *si, *next; << 3994 int nid = folio_nid(folio); << 3995 << 3996 if (!(gfp & __GFP_IO)) << 3997 return; << 3998 << 3999 if (!__has_usable_swap()) << 4000 return; << 4001 << 4002 if (!blk_cgroup_congested()) << 4003 return; << 4004 << 4005 /* << 4006 * We've already scheduled a throttle << 4007 * lock. << 4008 */ << 4009 if (current->throttle_disk) << 4010 return; << 4011 << 4012 spin_lock(&swap_avail_lock); << 4013 plist_for_each_entry_safe(si, next, & << 4014 avail_lists << 4015 if (si->bdev) { << 4016 blkcg_schedule_thrott << 4017 break; << 4018 } << 4019 } << 4020 spin_unlock(&swap_avail_lock); << 4021 } << 4022 #endif << 4023 << 4024 static int __init swapfile_init(void) << 4025 { << 4026 int nid; << 4027 << 4028 swap_avail_heads = kmalloc_array(nr_n << 4029 GFP_ << 4030 if (!swap_avail_heads) { << 4031 pr_emerg("Not enough memory f << 4032 return -ENOMEM; << 4033 } << 4034 << 4035 for_each_node(nid) << 4036 plist_head_init(&swap_avail_h << 4037 << 4038 swapfile_maximum_size = arch_max_swap << 4039 << 4040 #ifdef CONFIG_MIGRATION << 4041 if (swapfile_maximum_size >= (1UL << << 4042 swap_migration_ad_supported = << 4043 #endif /* CONFIG_MIGRATION */ << 4044 << 4045 return 0; << 4046 } << 4047 subsys_initcall(swapfile_init); << 4048 2992
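The error table documented above __swap_duplicate() is what swap-cache callers key off when several tasks race over the same entry. The fragment below is a minimal, hypothetical caller of swapcache_prepare(); the helper name is invented, and it assumes the single-argument form shown in the right-hand column (the newer version in the left-hand column takes an extra count so a batch of entries can be prepared at once). It is loosely modelled on how the swap readahead path reacts, not copied from it: 0 means this task now owns SWAP_HAS_CACHE and may allocate a page for the swap cache, -EEXIST means another task claimed the slot first, and -ENOENT or -EINVAL means the entry has been freed or was never valid.

/*
 * Illustrative fragment only: try_claim_swapcache() is not an in-tree
 * function, and the error handling is reduced to its core.
 */
static int try_claim_swapcache(swp_entry_t entry)
{
	int err;

	for (;;) {
		err = swapcache_prepare(entry);
		if (!err)
			return 0;	/* we own SWAP_HAS_CACHE now */
		if (err != -EEXIST)
			return err;	/* freed (-ENOENT) or bad (-EINVAL) */
		/*
		 * Somebody else set SWAP_HAS_CACHE first; their page should
		 * show up in the swap cache shortly, so give them a moment
		 * and try again.
		 */
		schedule_timeout_uninterruptible(1);
	}
}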
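The comment above add_swap_count_continuation() spells out a two-step convention: while page table locks are held only GFP_ATOMIC is attempted (swap_duplicate() does that internally), and the GFP_KERNEL retry happens after the locks are dropped. The sketch below shows that convention; the single ptl spinlock and the -EAGAIN return are placeholders chosen for illustration, not lifted from mm/memory.c, although the fork path does essentially this: it remembers the entry that made swap_duplicate() fail, leaves the page table lock, allocates the continuation page with GFP_KERNEL, and re-walks the ptes.

/*
 * Hypothetical helper: shows the GFP_ATOMIC -> GFP_KERNEL fallback
 * documented above, with the caller's loop and pte revalidation left out.
 */
static int dup_one_swap_entry(swp_entry_t entry, spinlock_t *ptl)
{
	/*
	 * Under the page table lock only atomic allocations are possible,
	 * so swap_duplicate() may return -ENOMEM when the count needs a
	 * fresh continuation page.
	 */
	if (swap_duplicate(entry) != -ENOMEM)
		return 0;

	spin_unlock(ptl);
	if (add_swap_count_continuation(entry, GFP_KERNEL))
		return -ENOMEM;
	spin_lock(ptl);

	/* Caller must revalidate the pte and retry swap_duplicate(). */
	return -EAGAIN;
}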
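The "add 1 to 999" and "subtract 1 from 1000" comments in swap_count_continued() describe mixed-radix arithmetic: the swap_map byte is the lowest digit (capacity SWAP_MAP_MAX) and every continuation page contributes one more digit per entry (capacity SWAP_CONT_MAX), with COUNT_CONTINUED merely flagging that a higher digit is in use. The stand-alone program below models just that carry logic; the constants mirror include/linux/swap.h, while the flag bookkeeping, the kmap walking and the cont_lock serialization are deliberately left out.

/* User-space model of the swap count "digit" arithmetic. */
#include <assert.h>
#include <stdio.h>

#define SWAP_MAP_MAX	0x3e	/* capacity of the swap_map byte itself */
#define SWAP_CONT_MAX	0x7f	/* capacity of each continuation byte   */
#define NDIGITS		4	/* swap_map byte + three continuations  */

static unsigned char digit[NDIGITS];	/* digit[0] models swap_map[offset] */

/* Add one reference, carrying into the continuation digits when full. */
static void inc_count(void)
{
	int i;

	if (digit[0] < SWAP_MAP_MAX) {
		digit[0]++;
		return;
	}
	digit[0] = 0;			/* like 999 + 1 -> ...000 */
	for (i = 1; i < NDIGITS; i++) {
		if (digit[i] < SWAP_CONT_MAX) {
			digit[i]++;	/* carry absorbed here */
			return;
		}
		digit[i] = 0;		/* keep carrying upward */
	}
	assert(0);	/* would need another continuation page */
}

/* Recompute the represented count: radix 63 for digit 0, then 128. */
static unsigned long long total(void)
{
	unsigned long long sum = digit[0];
	unsigned long long weight = SWAP_MAP_MAX + 1;
	int i;

	for (i = 1; i < NDIGITS; i++) {
		sum += digit[i] * weight;
		weight *= SWAP_CONT_MAX + 1;
	}
	return sum;
}

int main(void)
{
	unsigned long long n;

	for (n = 1; n <= 1000000; n++) {
		inc_count();
		assert(total() == n);
	}
	printf("%llu references -> digits %d %d %d %d\n", total(),
	       digit[0], digit[1], digit[2], digit[3]);
	return 0;
}

Compiled with any C compiler, the assert verifies that filling the low digit and carrying into the continuation digits keeps the represented count in step, which is the same walk swap_count_continued() performs over its list of continuation pages.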