1 // SPDX-License-Identifier: GPL-2.0-only 1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 2 /* 3 * linux/mm/swapfile.c 3 * linux/mm/swapfile.c 4 * 4 * 5 * Copyright (C) 1991, 1992, 1993, 1994 Linu 5 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 6 * Swap reorganised 29.12.95, Stephen Tweedie 6 * Swap reorganised 29.12.95, Stephen Tweedie 7 */ 7 */ 8 8 9 #include <linux/blkdev.h> << 10 #include <linux/mm.h> 9 #include <linux/mm.h> 11 #include <linux/sched/mm.h> 10 #include <linux/sched/mm.h> 12 #include <linux/sched/task.h> 11 #include <linux/sched/task.h> 13 #include <linux/hugetlb.h> 12 #include <linux/hugetlb.h> 14 #include <linux/mman.h> 13 #include <linux/mman.h> 15 #include <linux/slab.h> 14 #include <linux/slab.h> 16 #include <linux/kernel_stat.h> 15 #include <linux/kernel_stat.h> 17 #include <linux/swap.h> 16 #include <linux/swap.h> 18 #include <linux/vmalloc.h> 17 #include <linux/vmalloc.h> 19 #include <linux/pagemap.h> 18 #include <linux/pagemap.h> 20 #include <linux/namei.h> 19 #include <linux/namei.h> 21 #include <linux/shmem_fs.h> 20 #include <linux/shmem_fs.h> 22 #include <linux/blk-cgroup.h> !! 21 #include <linux/blkdev.h> 23 #include <linux/random.h> 22 #include <linux/random.h> 24 #include <linux/writeback.h> 23 #include <linux/writeback.h> 25 #include <linux/proc_fs.h> 24 #include <linux/proc_fs.h> 26 #include <linux/seq_file.h> 25 #include <linux/seq_file.h> 27 #include <linux/init.h> 26 #include <linux/init.h> 28 #include <linux/ksm.h> 27 #include <linux/ksm.h> 29 #include <linux/rmap.h> 28 #include <linux/rmap.h> 30 #include <linux/security.h> 29 #include <linux/security.h> 31 #include <linux/backing-dev.h> 30 #include <linux/backing-dev.h> 32 #include <linux/mutex.h> 31 #include <linux/mutex.h> 33 #include <linux/capability.h> 32 #include <linux/capability.h> 34 #include <linux/syscalls.h> 33 #include <linux/syscalls.h> 35 #include <linux/memcontrol.h> 34 #include <linux/memcontrol.h> 36 #include <linux/poll.h> 35 #include <linux/poll.h> 37 #include <linux/oom.h> 36 #include <linux/oom.h> >> 37 #include <linux/frontswap.h> 38 #include <linux/swapfile.h> 38 #include <linux/swapfile.h> 39 #include <linux/export.h> 39 #include <linux/export.h> 40 #include <linux/swap_slots.h> 40 #include <linux/swap_slots.h> 41 #include <linux/sort.h> 41 #include <linux/sort.h> 42 #include <linux/completion.h> 42 #include <linux/completion.h> 43 #include <linux/suspend.h> << 44 #include <linux/zswap.h> << 45 #include <linux/plist.h> << 46 43 47 #include <asm/tlbflush.h> 44 #include <asm/tlbflush.h> 48 #include <linux/swapops.h> 45 #include <linux/swapops.h> 49 #include <linux/swap_cgroup.h> 46 #include <linux/swap_cgroup.h> 50 #include "internal.h" << 51 #include "swap.h" << 52 47 53 static bool swap_count_continued(struct swap_i 48 static bool swap_count_continued(struct swap_info_struct *, pgoff_t, 54 unsigned char 49 unsigned char); 55 static void free_swap_count_continuations(stru 50 static void free_swap_count_continuations(struct swap_info_struct *); 56 static void swap_entry_range_free(struct swap_ << 57 unsigned int << 58 static void swap_range_alloc(struct swap_info_ << 59 unsigned int nr_e << 60 static bool folio_swapcache_freeable(struct fo << 61 static struct swap_cluster_info *lock_cluster_ << 62 struct swap_info_struct *si, u << 63 static void unlock_cluster_or_swap_info(struct << 64 struct << 65 51 66 static DEFINE_SPINLOCK(swap_lock); !! 
52 DEFINE_SPINLOCK(swap_lock); 67 static unsigned int nr_swapfiles; 53 static unsigned int nr_swapfiles; 68 atomic_long_t nr_swap_pages; 54 atomic_long_t nr_swap_pages; 69 /* 55 /* 70 * Some modules use swappable objects and may 56 * Some modules use swappable objects and may try to swap them out under 71 * memory pressure (via the shrinker). Before 57 * memory pressure (via the shrinker). Before doing so, they may wish to 72 * check to see if any swap space is available 58 * check to see if any swap space is available. 73 */ 59 */ 74 EXPORT_SYMBOL_GPL(nr_swap_pages); 60 EXPORT_SYMBOL_GPL(nr_swap_pages); 75 /* protected with swap_lock. reading in vm_swa 61 /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ 76 long total_swap_pages; 62 long total_swap_pages; 77 static int least_priority = -1; 63 static int least_priority = -1; 78 unsigned long swapfile_maximum_size; << 79 #ifdef CONFIG_MIGRATION << 80 bool swap_migration_ad_supported; << 81 #endif /* CONFIG_MIGRATION */ << 82 64 83 static const char Bad_file[] = "Bad swap file 65 static const char Bad_file[] = "Bad swap file entry "; 84 static const char Unused_file[] = "Unused swap 66 static const char Unused_file[] = "Unused swap file entry "; 85 static const char Bad_offset[] = "Bad swap off 67 static const char Bad_offset[] = "Bad swap offset entry "; 86 static const char Unused_offset[] = "Unused sw 68 static const char Unused_offset[] = "Unused swap offset entry "; 87 69 88 /* 70 /* 89 * all active swap_info_structs 71 * all active swap_info_structs 90 * protected with swap_lock, and ordered by pr 72 * protected with swap_lock, and ordered by priority. 91 */ 73 */ 92 static PLIST_HEAD(swap_active_head); !! 74 PLIST_HEAD(swap_active_head); 93 75 94 /* 76 /* 95 * all available (active, not full) swap_info_ 77 * all available (active, not full) swap_info_structs 96 * protected with swap_avail_lock, ordered by 78 * protected with swap_avail_lock, ordered by priority. 97 * This is used by folio_alloc_swap() instead !! 79 * This is used by get_swap_page() instead of swap_active_head 98 * because swap_active_head includes all swap_ 80 * because swap_active_head includes all swap_info_structs, 99 * but folio_alloc_swap() doesn't need to look !! 81 * but get_swap_page() doesn't need to look at full ones. 100 * This uses its own lock instead of swap_lock 82 * This uses its own lock instead of swap_lock because when a 101 * swap_info_struct changes between not-full/f 83 * swap_info_struct changes between not-full/full, it needs to 102 * add/remove itself to/from this list, but th 84 * add/remove itself to/from this list, but the swap_info_struct->lock 103 * is held and the locking order requires swap 85 * is held and the locking order requires swap_lock to be taken 104 * before any swap_info_struct->lock. 86 * before any swap_info_struct->lock. 105 */ 87 */ 106 static struct plist_head *swap_avail_heads; 88 static struct plist_head *swap_avail_heads; 107 static DEFINE_SPINLOCK(swap_avail_lock); 89 static DEFINE_SPINLOCK(swap_avail_lock); 108 90 109 static struct swap_info_struct *swap_info[MAX_ !! 
91 struct swap_info_struct *swap_info[MAX_SWAPFILES]; 110 92 111 static DEFINE_MUTEX(swapon_mutex); 93 static DEFINE_MUTEX(swapon_mutex); 112 94 113 static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait) 95 static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); 114 /* Activity counter to indicate that a swapon 96 /* Activity counter to indicate that a swapon or swapoff has occurred */ 115 static atomic_t proc_poll_event = ATOMIC_INIT( 97 static atomic_t proc_poll_event = ATOMIC_INIT(0); 116 98 117 atomic_t nr_rotate_swap = ATOMIC_INIT(0); 99 atomic_t nr_rotate_swap = ATOMIC_INIT(0); 118 100 119 static struct swap_info_struct *swap_type_to_s 101 static struct swap_info_struct *swap_type_to_swap_info(int type) 120 { 102 { 121 if (type >= MAX_SWAPFILES) 103 if (type >= MAX_SWAPFILES) 122 return NULL; 104 return NULL; 123 105 124 return READ_ONCE(swap_info[type]); /* 106 return READ_ONCE(swap_info[type]); /* rcu_dereference() */ 125 } 107 } 126 108 127 static inline unsigned char swap_count(unsigne 109 static inline unsigned char swap_count(unsigned char ent) 128 { 110 { 129 return ent & ~SWAP_HAS_CACHE; /* may 111 return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ 130 } 112 } 131 113 132 /* Reclaim the swap entry anyway if possible * 114 /* Reclaim the swap entry anyway if possible */ 133 #define TTRS_ANYWAY 0x1 115 #define TTRS_ANYWAY 0x1 134 /* 116 /* 135 * Reclaim the swap entry if there are no more 117 * Reclaim the swap entry if there are no more mappings of the 136 * corresponding page 118 * corresponding page 137 */ 119 */ 138 #define TTRS_UNMAPPED 0x2 120 #define TTRS_UNMAPPED 0x2 139 /* Reclaim the swap entry if swap is getting f !! 121 /* Reclaim the swap entry if swap is getting full*/ 140 #define TTRS_FULL 0x4 122 #define TTRS_FULL 0x4 141 /* Reclaim directly, bypass the slot cache and << 142 #define TTRS_DIRECT 0x8 << 143 << 144 static bool swap_is_has_cache(struct swap_info << 145 unsigned long of << 146 { << 147 unsigned char *map = si->swap_map + of << 148 unsigned char *map_end = map + nr_page << 149 << 150 do { << 151 VM_BUG_ON(!(*map & SWAP_HAS_CA << 152 if (*map != SWAP_HAS_CACHE) << 153 return false; << 154 } while (++map < map_end); << 155 << 156 return true; << 157 } << 158 << 159 static bool swap_is_last_map(struct swap_info_ << 160 unsigned long offset, int nr_p << 161 { << 162 unsigned char *map = si->swap_map + of << 163 unsigned char *map_end = map + nr_page << 164 unsigned char count = *map; << 165 123 166 if (swap_count(count) != 1) !! 124 /* returns 1 if swap entry is freed */ 167 return false; << 168 << 169 while (++map < map_end) { << 170 if (*map != count) << 171 return false; << 172 } << 173 << 174 *has_cache = !!(count & SWAP_HAS_CACHE << 175 return true; << 176 } << 177 << 178 /* << 179 * returns number of pages in the folio that b << 180 * the folio was reclaimed. If negative, the f << 181 * folio was associated with the swap entry. << 182 */ << 183 static int __try_to_reclaim_swap(struct swap_i 125 static int __try_to_reclaim_swap(struct swap_info_struct *si, 184 unsigned long 126 unsigned long offset, unsigned long flags) 185 { 127 { 186 swp_entry_t entry = swp_entry(si->type 128 swp_entry_t entry = swp_entry(si->type, offset); 187 struct address_space *address_space = !! 129 struct page *page; 188 struct swap_cluster_info *ci; !! 130 int ret = 0; 189 struct folio *folio; << 190 int ret, nr_pages; << 191 bool need_reclaim; << 192 131 193 folio = filemap_get_folio(address_spac !! 
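Annotation: swap_count() is unchanged between the two columns above; each swap_map byte packs a usage count together with the SWAP_HAS_CACHE flag, and swap_count() simply masks the flag off (the result may still carry COUNT_CONTINUED). The standalone sketch below only illustrates that masking; the illust_* names and the numeric flag values are assumptions for the sketch, not taken from this hunk.

#include <stdio.h>

#define ILLUST_SWAP_HAS_CACHE   0x40  /* folio is (also) in the swap cache; value assumed */
#define ILLUST_COUNT_CONTINUED  0x80  /* count continued elsewhere; value assumed */

static unsigned char illust_swap_count(unsigned char ent)
{
        return ent & ~ILLUST_SWAP_HAS_CACHE;    /* same masking as swap_count() */
}

int main(void)
{
        unsigned char ent = 0x42;       /* count of 2 plus the swap-cache bit */

        printf("raw=0x%x count=%u cached=%s\n", (unsigned)ent,
               (unsigned)illust_swap_count(ent),
               (ent & ILLUST_SWAP_HAS_CACHE) ? "yes" : "no");
        return 0;
}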
132 page = find_get_page(swap_address_space(entry), offset); 194 if (IS_ERR(folio)) !! 133 if (!page) 195 return 0; 134 return 0; 196 << 197 nr_pages = folio_nr_pages(folio); << 198 ret = -nr_pages; << 199 << 200 /* 135 /* 201 * When this function is called from s 136 * When this function is called from scan_swap_map_slots() and it's 202 * called by vmscan.c at reclaiming fo !! 137 * called by vmscan.c at reclaiming pages. So, we hold a lock on a page, 203 * here. We have to use trylock for av 138 * here. We have to use trylock for avoiding deadlock. This is a special 204 * case and you should use folio_free_ !! 139 * case and you should use try_to_free_swap() with explicit lock_page() 205 * in usual operations. 140 * in usual operations. 206 */ 141 */ 207 if (!folio_trylock(folio)) !! 142 if (trylock_page(page)) { 208 goto out; !! 143 if ((flags & TTRS_ANYWAY) || 209 !! 144 ((flags & TTRS_UNMAPPED) && !page_mapped(page)) || 210 /* offset could point to the middle of !! 145 ((flags & TTRS_FULL) && mem_cgroup_swap_full(page))) 211 entry = folio->swap; !! 146 ret = try_to_free_swap(page); 212 offset = swp_offset(entry); !! 147 unlock_page(page); 213 !! 148 } 214 need_reclaim = ((flags & TTRS_ANYWAY) !! 149 put_page(page); 215 ((flags & TTRS_UNMAPPE << 216 ((flags & TTRS_FULL) & << 217 if (!need_reclaim || !folio_swapcache_ << 218 goto out_unlock; << 219 << 220 /* << 221 * It's safe to delete the folio from << 222 * swap_map is HAS_CACHE only, which m << 223 * reference or pending writeback, and << 224 */ << 225 ci = lock_cluster_or_swap_info(si, off << 226 need_reclaim = swap_is_has_cache(si, o << 227 unlock_cluster_or_swap_info(si, ci); << 228 if (!need_reclaim) << 229 goto out_unlock; << 230 << 231 if (!(flags & TTRS_DIRECT)) { << 232 /* Free through slot cache */ << 233 delete_from_swap_cache(folio); << 234 folio_set_dirty(folio); << 235 ret = nr_pages; << 236 goto out_unlock; << 237 } << 238 << 239 xa_lock_irq(&address_space->i_pages); << 240 __delete_from_swap_cache(folio, entry, << 241 xa_unlock_irq(&address_space->i_pages) << 242 folio_ref_sub(folio, nr_pages); << 243 folio_set_dirty(folio); << 244 << 245 spin_lock(&si->lock); << 246 /* Only sinple page folio can be backe << 247 if (nr_pages == 1) << 248 zswap_invalidate(entry); << 249 swap_entry_range_free(si, entry, nr_pa << 250 spin_unlock(&si->lock); << 251 ret = nr_pages; << 252 out_unlock: << 253 folio_unlock(folio); << 254 out: << 255 folio_put(folio); << 256 return ret; 150 return ret; 257 } 151 } 258 152 259 static inline struct swap_extent *first_se(str 153 static inline struct swap_extent *first_se(struct swap_info_struct *sis) 260 { 154 { 261 struct rb_node *rb = rb_first(&sis->sw 155 struct rb_node *rb = rb_first(&sis->swap_extent_root); 262 return rb_entry(rb, struct swap_extent 156 return rb_entry(rb, struct swap_extent, rb_node); 263 } 157 } 264 158 265 static inline struct swap_extent *next_se(stru 159 static inline struct swap_extent *next_se(struct swap_extent *se) 266 { 160 { 267 struct rb_node *rb = rb_next(&se->rb_n 161 struct rb_node *rb = rb_next(&se->rb_node); 268 return rb ? rb_entry(rb, struct swap_e 162 return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL; 269 } 163 } 270 164 271 /* 165 /* 272 * swapon tell device that all the old swap co 166 * swapon tell device that all the old swap contents can be discarded, 273 * to allow the swap device to optimize its we 167 * to allow the swap device to optimize its wear-levelling. 
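Annotation: in the rewritten __try_to_reclaim_swap() above, the TTRS_ANYWAY/TTRS_UNMAPPED/TTRS_FULL flags collapse into a single need_reclaim predicate (the older column applies the same test inline before try_to_free_swap()). The sketch restates just that predicate; the bool parameters standing in for folio_mapped() and mem_cgroup_swap_full() are placeholders, not kernel API.

#include <stdbool.h>

#define TTRS_ANYWAY    0x1   /* reclaim unconditionally */
#define TTRS_UNMAPPED  0x2   /* reclaim only if nothing maps the folio */
#define TTRS_FULL      0x4   /* reclaim only when swap is getting full */

/* Placeholder mirror of the need_reclaim test in the new code above. */
static bool illust_need_reclaim(int flags, bool folio_mapped, bool swap_full)
{
        return (flags & TTRS_ANYWAY) ||
               ((flags & TTRS_UNMAPPED) && !folio_mapped) ||
               ((flags & TTRS_FULL) && swap_full);
}

int main(void)
{
        /* vmscan-style caller: reclaim only if the folio is no longer mapped */
        return illust_need_reclaim(TTRS_UNMAPPED, /*folio_mapped=*/false,
                                   /*swap_full=*/false) ? 0 : 1;
}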
274 */ 168 */ 275 static int discard_swap(struct swap_info_struc 169 static int discard_swap(struct swap_info_struct *si) 276 { 170 { 277 struct swap_extent *se; 171 struct swap_extent *se; 278 sector_t start_block; 172 sector_t start_block; 279 sector_t nr_blocks; 173 sector_t nr_blocks; 280 int err = 0; 174 int err = 0; 281 175 282 /* Do not discard the swap header page 176 /* Do not discard the swap header page! */ 283 se = first_se(si); 177 se = first_se(si); 284 start_block = (se->start_block + 1) << 178 start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); 285 nr_blocks = ((sector_t)se->nr_pages - 179 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); 286 if (nr_blocks) { 180 if (nr_blocks) { 287 err = blkdev_issue_discard(si- 181 err = blkdev_issue_discard(si->bdev, start_block, 288 nr_blocks, GFP !! 182 nr_blocks, GFP_KERNEL, 0); 289 if (err) 183 if (err) 290 return err; 184 return err; 291 cond_resched(); 185 cond_resched(); 292 } 186 } 293 187 294 for (se = next_se(se); se; se = next_s 188 for (se = next_se(se); se; se = next_se(se)) { 295 start_block = se->start_block 189 start_block = se->start_block << (PAGE_SHIFT - 9); 296 nr_blocks = (sector_t)se->nr_p 190 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); 297 191 298 err = blkdev_issue_discard(si- 192 err = blkdev_issue_discard(si->bdev, start_block, 299 nr_blocks, GFP !! 193 nr_blocks, GFP_KERNEL, 0); 300 if (err) 194 if (err) 301 break; 195 break; 302 196 303 cond_resched(); 197 cond_resched(); 304 } 198 } 305 return err; /* That will o 199 return err; /* That will often be -EOPNOTSUPP */ 306 } 200 } 307 201 308 static struct swap_extent * 202 static struct swap_extent * 309 offset_to_swap_extent(struct swap_info_struct 203 offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset) 310 { 204 { 311 struct swap_extent *se; 205 struct swap_extent *se; 312 struct rb_node *rb; 206 struct rb_node *rb; 313 207 314 rb = sis->swap_extent_root.rb_node; 208 rb = sis->swap_extent_root.rb_node; 315 while (rb) { 209 while (rb) { 316 se = rb_entry(rb, struct swap_ 210 se = rb_entry(rb, struct swap_extent, rb_node); 317 if (offset < se->start_page) 211 if (offset < se->start_page) 318 rb = rb->rb_left; 212 rb = rb->rb_left; 319 else if (offset >= se->start_p 213 else if (offset >= se->start_page + se->nr_pages) 320 rb = rb->rb_right; 214 rb = rb->rb_right; 321 else 215 else 322 return se; 216 return se; 323 } 217 } 324 /* It *must* be present */ 218 /* It *must* be present */ 325 BUG(); 219 BUG(); 326 } 220 } 327 221 328 sector_t swap_folio_sector(struct folio *folio !! 222 sector_t swap_page_sector(struct page *page) 329 { 223 { 330 struct swap_info_struct *sis = swp_swa !! 224 struct swap_info_struct *sis = page_swap_info(page); 331 struct swap_extent *se; 225 struct swap_extent *se; 332 sector_t sector; 226 sector_t sector; 333 pgoff_t offset; 227 pgoff_t offset; 334 228 335 offset = swp_offset(folio->swap); !! 229 offset = __page_file_index(page); 336 se = offset_to_swap_extent(sis, offset 230 se = offset_to_swap_extent(sis, offset); 337 sector = se->start_block + (offset - s 231 sector = se->start_block + (offset - se->start_page); 338 return sector << (PAGE_SHIFT - 9); 232 return sector << (PAGE_SHIFT - 9); 339 } 233 } 340 234 341 /* 235 /* 342 * swap allocation tell device that a cluster 236 * swap allocation tell device that a cluster of swap can now be discarded, 343 * to allow the swap device to optimize its we 237 * to allow the swap device to optimize its wear-levelling. 
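Annotation: swap_folio_sector() above (swap_page_sector() in the older column) maps a swap offset to a 512-byte sector: locate the extent containing the offset, add the delta into the extent to start_block, then shift by PAGE_SHIFT - 9. A worked userspace example with 4 KiB pages (shift of 3, i.e. 8 sectors per backing page); the extent values and illust_* names are invented and the rbtree lookup is omitted.

#include <stdio.h>

#define ILLUST_PAGE_SHIFT 12    /* 4 KiB pages; sectors are 512 bytes (1 << 9) */

struct illust_extent {
        unsigned long start_page;           /* first swap offset covered */
        unsigned long nr_pages;
        unsigned long long start_block;     /* first backing page-sized block */
};

static unsigned long long illust_swap_sector(const struct illust_extent *se,
                                             unsigned long offset)
{
        /* same arithmetic as swap_folio_sector(), minus offset_to_swap_extent() */
        unsigned long long block = se->start_block + (offset - se->start_page);

        return block << (ILLUST_PAGE_SHIFT - 9);
}

int main(void)
{
        struct illust_extent se = { .start_page = 100, .nr_pages = 50,
                                    .start_block = 2048 };

        /* offset 103 -> block 2051 -> sector 16408 */
        printf("%llu\n", illust_swap_sector(&se, 103));
        return 0;
}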
344 */ 238 */ 345 static void discard_swap_cluster(struct swap_i 239 static void discard_swap_cluster(struct swap_info_struct *si, 346 pgoff_t start 240 pgoff_t start_page, pgoff_t nr_pages) 347 { 241 { 348 struct swap_extent *se = offset_to_swa 242 struct swap_extent *se = offset_to_swap_extent(si, start_page); 349 243 350 while (nr_pages) { 244 while (nr_pages) { 351 pgoff_t offset = start_page - 245 pgoff_t offset = start_page - se->start_page; 352 sector_t start_block = se->sta 246 sector_t start_block = se->start_block + offset; 353 sector_t nr_blocks = se->nr_pa 247 sector_t nr_blocks = se->nr_pages - offset; 354 248 355 if (nr_blocks > nr_pages) 249 if (nr_blocks > nr_pages) 356 nr_blocks = nr_pages; 250 nr_blocks = nr_pages; 357 start_page += nr_blocks; 251 start_page += nr_blocks; 358 nr_pages -= nr_blocks; 252 nr_pages -= nr_blocks; 359 253 360 start_block <<= PAGE_SHIFT - 9 254 start_block <<= PAGE_SHIFT - 9; 361 nr_blocks <<= PAGE_SHIFT - 9; 255 nr_blocks <<= PAGE_SHIFT - 9; 362 if (blkdev_issue_discard(si->b 256 if (blkdev_issue_discard(si->bdev, start_block, 363 nr_blo !! 257 nr_blocks, GFP_NOIO, 0)) 364 break; 258 break; 365 259 366 se = next_se(se); 260 se = next_se(se); 367 } 261 } 368 } 262 } 369 263 370 #ifdef CONFIG_THP_SWAP 264 #ifdef CONFIG_THP_SWAP 371 #define SWAPFILE_CLUSTER HPAGE_PMD_NR 265 #define SWAPFILE_CLUSTER HPAGE_PMD_NR 372 266 373 #define swap_entry_order(order) (order) !! 267 #define swap_entry_size(size) (size) 374 #else 268 #else 375 #define SWAPFILE_CLUSTER 256 269 #define SWAPFILE_CLUSTER 256 376 270 377 /* 271 /* 378 * Define swap_entry_order() as constant to le !! 272 * Define swap_entry_size() as constant to let compiler to optimize 379 * out some code if !CONFIG_THP_SWAP 273 * out some code if !CONFIG_THP_SWAP 380 */ 274 */ 381 #define swap_entry_order(order) 0 !! 275 #define swap_entry_size(size) 1 382 #endif 276 #endif 383 #define LATENCY_LIMIT 256 277 #define LATENCY_LIMIT 256 384 278 >> 279 static inline void cluster_set_flag(struct swap_cluster_info *info, >> 280 unsigned int flag) >> 281 { >> 282 info->flags = flag; >> 283 } >> 284 >> 285 static inline unsigned int cluster_count(struct swap_cluster_info *info) >> 286 { >> 287 return info->data; >> 288 } >> 289 >> 290 static inline void cluster_set_count(struct swap_cluster_info *info, >> 291 unsigned int c) >> 292 { >> 293 info->data = c; >> 294 } >> 295 >> 296 static inline void cluster_set_count_flag(struct swap_cluster_info *info, >> 297 unsigned int c, unsigned int f) >> 298 { >> 299 info->flags = f; >> 300 info->data = c; >> 301 } >> 302 >> 303 static inline unsigned int cluster_next(struct swap_cluster_info *info) >> 304 { >> 305 return info->data; >> 306 } >> 307 >> 308 static inline void cluster_set_next(struct swap_cluster_info *info, >> 309 unsigned int n) >> 310 { >> 311 info->data = n; >> 312 } >> 313 >> 314 static inline void cluster_set_next_flag(struct swap_cluster_info *info, >> 315 unsigned int n, unsigned int f) >> 316 { >> 317 info->flags = f; >> 318 info->data = n; >> 319 } >> 320 385 static inline bool cluster_is_free(struct swap 321 static inline bool cluster_is_free(struct swap_cluster_info *info) 386 { 322 { 387 return info->flags & CLUSTER_FLAG_FREE 323 return info->flags & CLUSTER_FLAG_FREE; 388 } 324 } 389 325 390 static inline unsigned int cluster_index(struc !! 326 static inline bool cluster_is_null(struct swap_cluster_info *info) 391 struc !! 
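Annotation: both columns group swap slots into clusters of SWAPFILE_CLUSTER entries (256, or HPAGE_PMD_NR under CONFIG_THP_SWAP, as defined above). The mapping between a slot offset and its cluster is plain division, as lock_cluster() does when indexing si->cluster_info, and the reverse is multiplication, as the new cluster_offset() helper does. A minimal sketch of that arithmetic only; the illust_* names are invented.

#include <stdio.h>

#define ILLUST_SWAPFILE_CLUSTER 256     /* the non-THP default above */

/* offset -> cluster index, as used when indexing si->cluster_info */
static unsigned long illust_cluster_index(unsigned long offset)
{
        return offset / ILLUST_SWAPFILE_CLUSTER;
}

/* cluster index -> first swap offset in that cluster (cf. cluster_offset()) */
static unsigned long illust_cluster_offset(unsigned long idx)
{
        return idx * ILLUST_SWAPFILE_CLUSTER;
}

int main(void)
{
        unsigned long offset = 1000;

        printf("offset %lu lives in cluster %lu starting at offset %lu\n",
               offset, illust_cluster_index(offset),
               illust_cluster_offset(illust_cluster_index(offset)));
        return 0;
}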
327 { >> 328 return info->flags & CLUSTER_FLAG_NEXT_NULL; >> 329 } >> 330 >> 331 static inline void cluster_set_null(struct swap_cluster_info *info) >> 332 { >> 333 info->flags = CLUSTER_FLAG_NEXT_NULL; >> 334 info->data = 0; >> 335 } >> 336 >> 337 static inline bool cluster_is_huge(struct swap_cluster_info *info) 392 { 338 { 393 return ci - si->cluster_info; !! 339 if (IS_ENABLED(CONFIG_THP_SWAP)) >> 340 return info->flags & CLUSTER_FLAG_HUGE; >> 341 return false; 394 } 342 } 395 343 396 static inline unsigned int cluster_offset(stru !! 344 static inline void cluster_clear_huge(struct swap_cluster_info *info) 397 stru << 398 { 345 { 399 return cluster_index(si, ci) * SWAPFIL !! 346 info->flags &= ~CLUSTER_FLAG_HUGE; 400 } 347 } 401 348 402 static inline struct swap_cluster_info *lock_c 349 static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, 403 350 unsigned long offset) 404 { 351 { 405 struct swap_cluster_info *ci; 352 struct swap_cluster_info *ci; 406 353 407 ci = si->cluster_info; 354 ci = si->cluster_info; 408 if (ci) { 355 if (ci) { 409 ci += offset / SWAPFILE_CLUSTE 356 ci += offset / SWAPFILE_CLUSTER; 410 spin_lock(&ci->lock); 357 spin_lock(&ci->lock); 411 } 358 } 412 return ci; 359 return ci; 413 } 360 } 414 361 415 static inline void unlock_cluster(struct swap_ 362 static inline void unlock_cluster(struct swap_cluster_info *ci) 416 { 363 { 417 if (ci) 364 if (ci) 418 spin_unlock(&ci->lock); 365 spin_unlock(&ci->lock); 419 } 366 } 420 367 421 /* 368 /* 422 * Determine the locking method in use for thi 369 * Determine the locking method in use for this device. Return 423 * swap_cluster_info if SSD-style cluster-base 370 * swap_cluster_info if SSD-style cluster-based locking is in place. 424 */ 371 */ 425 static inline struct swap_cluster_info *lock_c 372 static inline struct swap_cluster_info *lock_cluster_or_swap_info( 426 struct swap_info_struct *si, u 373 struct swap_info_struct *si, unsigned long offset) 427 { 374 { 428 struct swap_cluster_info *ci; 375 struct swap_cluster_info *ci; 429 376 430 /* Try to use fine-grained SSD-style l 377 /* Try to use fine-grained SSD-style locking if available: */ 431 ci = lock_cluster(si, offset); 378 ci = lock_cluster(si, offset); 432 /* Otherwise, fall back to traditional 379 /* Otherwise, fall back to traditional, coarse locking: */ 433 if (!ci) 380 if (!ci) 434 spin_lock(&si->lock); 381 spin_lock(&si->lock); 435 382 436 return ci; 383 return ci; 437 } 384 } 438 385 439 static inline void unlock_cluster_or_swap_info 386 static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si, 440 387 struct swap_cluster_info *ci) 441 { 388 { 442 if (ci) 389 if (ci) 443 unlock_cluster(ci); 390 unlock_cluster(ci); 444 else 391 else 445 spin_unlock(&si->lock); 392 spin_unlock(&si->lock); 446 } 393 } 447 394 >> 395 static inline bool cluster_list_empty(struct swap_cluster_list *list) >> 396 { >> 397 return cluster_is_null(&list->head); >> 398 } >> 399 >> 400 static inline unsigned int cluster_list_first(struct swap_cluster_list *list) >> 401 { >> 402 return cluster_next(&list->head); >> 403 } >> 404 >> 405 static void cluster_list_init(struct swap_cluster_list *list) >> 406 { >> 407 cluster_set_null(&list->head); >> 408 cluster_set_null(&list->tail); >> 409 } >> 410 >> 411 static void cluster_list_add_tail(struct swap_cluster_list *list, >> 412 struct swap_cluster_info *ci, >> 413 unsigned int idx) >> 414 { >> 415 if (cluster_list_empty(list)) { >> 416 cluster_set_next_flag(&list->head, idx, 0); >> 417 
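Annotation: lock_cluster_or_swap_info() above picks between the fine-grained per-cluster spinlock (only present when cluster_info was allocated, i.e. SSD-style swap) and the coarse si->lock, and the caller must release with the matching unlock helper. A userspace sketch of that "take the small lock if it exists, else the big one" pairing using pthread mutexes; the struct layout and illust_* names are invented for illustration.

#include <pthread.h>
#include <stddef.h>

struct illust_cluster { pthread_mutex_t lock; };

struct illust_swapinfo {
        pthread_mutex_t lock;               /* coarse, per-device */
        struct illust_cluster *clusters;    /* NULL on rotational devices */
        unsigned long nr_clusters;
};

/* Returns the cluster that was locked, or NULL if we fell back to si->lock. */
static struct illust_cluster *
illust_lock_cluster_or_info(struct illust_swapinfo *si, unsigned long idx)
{
        if (si->clusters) {
                pthread_mutex_lock(&si->clusters[idx].lock);
                return &si->clusters[idx];
        }
        pthread_mutex_lock(&si->lock);
        return NULL;
}

static void illust_unlock_cluster_or_info(struct illust_swapinfo *si,
                                          struct illust_cluster *ci)
{
        if (ci)
                pthread_mutex_unlock(&ci->lock);
        else
                pthread_mutex_unlock(&si->lock);
}

int main(void)
{
        struct illust_swapinfo si = { .lock = PTHREAD_MUTEX_INITIALIZER,
                                      .clusters = NULL, .nr_clusters = 0 };
        struct illust_cluster *ci = illust_lock_cluster_or_info(&si, 0);

        illust_unlock_cluster_or_info(&si, ci);     /* pairs with the lock above */
        return 0;
}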
cluster_set_next_flag(&list->tail, idx, 0); >> 418 } else { >> 419 struct swap_cluster_info *ci_tail; >> 420 unsigned int tail = cluster_next(&list->tail); >> 421 >> 422 /* >> 423 * Nested cluster lock, but both cluster locks are >> 424 * only acquired when we held swap_info_struct->lock >> 425 */ >> 426 ci_tail = ci + tail; >> 427 spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING); >> 428 cluster_set_next(ci_tail, idx); >> 429 spin_unlock(&ci_tail->lock); >> 430 cluster_set_next_flag(&list->tail, idx, 0); >> 431 } >> 432 } >> 433 >> 434 static unsigned int cluster_list_del_first(struct swap_cluster_list *list, >> 435 struct swap_cluster_info *ci) >> 436 { >> 437 unsigned int idx; >> 438 >> 439 idx = cluster_next(&list->head); >> 440 if (cluster_next(&list->tail) == idx) { >> 441 cluster_set_null(&list->head); >> 442 cluster_set_null(&list->tail); >> 443 } else >> 444 cluster_set_next_flag(&list->head, >> 445 cluster_next(&ci[idx]), 0); >> 446 >> 447 return idx; >> 448 } >> 449 448 /* Add a cluster to discard list and schedule 450 /* Add a cluster to discard list and schedule it to do discard */ 449 static void swap_cluster_schedule_discard(stru 451 static void swap_cluster_schedule_discard(struct swap_info_struct *si, 450 struct swap_cluster_info *ci) !! 452 unsigned int idx) 451 { 453 { 452 unsigned int idx = cluster_index(si, c << 453 /* 454 /* 454 * If scan_swap_map_slots() can't find 455 * If scan_swap_map_slots() can't find a free cluster, it will check 455 * si->swap_map directly. To make sure 456 * si->swap_map directly. To make sure the discarding cluster isn't 456 * taken by scan_swap_map_slots(), mar 457 * taken by scan_swap_map_slots(), mark the swap entries bad (occupied). 457 * It will be cleared after discard 458 * It will be cleared after discard 458 */ 459 */ 459 memset(si->swap_map + idx * SWAPFILE_C 460 memset(si->swap_map + idx * SWAPFILE_CLUSTER, 460 SWAP_MAP_BAD, SWAPFILE 461 SWAP_MAP_BAD, SWAPFILE_CLUSTER); 461 462 462 VM_BUG_ON(ci->flags & CLUSTER_FLAG_FRE !! 463 cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx); 463 list_move_tail(&ci->list, &si->discard !! 464 464 ci->flags = 0; << 465 schedule_work(&si->discard_work); 465 schedule_work(&si->discard_work); 466 } 466 } 467 467 468 static void __free_cluster(struct swap_info_st !! 468 static void __free_cluster(struct swap_info_struct *si, unsigned long idx) 469 { 469 { 470 lockdep_assert_held(&si->lock); !! 470 struct swap_cluster_info *ci = si->cluster_info; 471 lockdep_assert_held(&ci->lock); << 472 471 473 if (ci->flags) !! 472 cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE); 474 list_move_tail(&ci->list, &si- !! 473 cluster_list_add_tail(&si->free_clusters, ci, idx); 475 else << 476 list_add_tail(&ci->list, &si-> << 477 ci->flags = CLUSTER_FLAG_FREE; << 478 ci->order = 0; << 479 } 474 } 480 475 481 /* 476 /* 482 * Doing discard actually. After a cluster dis 477 * Doing discard actually. After a cluster discard is finished, the cluster 483 * will be added to free cluster list. caller 478 * will be added to free cluster list. caller should hold si->lock. 484 */ 479 */ 485 static void swap_do_scheduled_discard(struct s 480 static void swap_do_scheduled_discard(struct swap_info_struct *si) 486 { 481 { 487 struct swap_cluster_info *ci; !! 482 struct swap_cluster_info *info, *ci; 488 unsigned int idx; 483 unsigned int idx; 489 484 490 while (!list_empty(&si->discard_cluste !! 485 info = si->cluster_info; 491 ci = list_first_entry(&si->dis !! 486 492 list_del(&ci->list); !! 
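Annotation: swap_cluster_schedule_discard() above parks a whole cluster by marking its swap_map slots SWAP_MAP_BAD, so scan_swap_map_slots() skips them, then queues the cluster and kicks the discard worker with schedule_work(); once the block-layer discard finishes, swap_do_scheduled_discard() (continued below) zeroes the slots again. The sketch only shows those two memset steps; the SWAP_MAP_BAD stand-in value and illust_* names are assumptions.

#include <string.h>

#define ILLUST_CLUSTER  256
#define ILLUST_MAP_BAD  0x3f    /* SWAP_MAP_BAD stand-in; exact value assumed */

/* Step 1 (schedule): park the slots so the scanner treats them as occupied. */
static void illust_schedule_discard(unsigned char *swap_map, unsigned long idx)
{
        memset(swap_map + idx * ILLUST_CLUSTER, ILLUST_MAP_BAD, ILLUST_CLUSTER);
        /* ...the cluster is then queued and the discard worker is scheduled. */
}

/* Step 2 (worker): after the discard completes, the slots become free again. */
static void illust_finish_discard(unsigned char *swap_map, unsigned long idx)
{
        memset(swap_map + idx * ILLUST_CLUSTER, 0, ILLUST_CLUSTER);
}

int main(void)
{
        static unsigned char swap_map[4 * ILLUST_CLUSTER];

        illust_schedule_discard(swap_map, 2);
        illust_finish_discard(swap_map, 2);
        return 0;
}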
487 while (!cluster_list_empty(&si->discard_clusters)) { 493 idx = cluster_index(si, ci); !! 488 idx = cluster_list_del_first(&si->discard_clusters, info); 494 spin_unlock(&si->lock); 489 spin_unlock(&si->lock); 495 490 496 discard_swap_cluster(si, idx * 491 discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, 497 SWAPFILE_CLUST 492 SWAPFILE_CLUSTER); 498 493 499 spin_lock(&si->lock); 494 spin_lock(&si->lock); 500 spin_lock(&ci->lock); !! 495 ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); 501 __free_cluster(si, ci); !! 496 __free_cluster(si, idx); 502 memset(si->swap_map + idx * SW 497 memset(si->swap_map + idx * SWAPFILE_CLUSTER, 503 0, SWAPFILE_CL 498 0, SWAPFILE_CLUSTER); 504 spin_unlock(&ci->lock); !! 499 unlock_cluster(ci); 505 } 500 } 506 } 501 } 507 502 508 static void swap_discard_work(struct work_stru 503 static void swap_discard_work(struct work_struct *work) 509 { 504 { 510 struct swap_info_struct *si; 505 struct swap_info_struct *si; 511 506 512 si = container_of(work, struct swap_in 507 si = container_of(work, struct swap_info_struct, discard_work); 513 508 514 spin_lock(&si->lock); 509 spin_lock(&si->lock); 515 swap_do_scheduled_discard(si); 510 swap_do_scheduled_discard(si); 516 spin_unlock(&si->lock); 511 spin_unlock(&si->lock); 517 } 512 } 518 513 519 static void swap_users_ref_free(struct percpu_ 514 static void swap_users_ref_free(struct percpu_ref *ref) 520 { 515 { 521 struct swap_info_struct *si; 516 struct swap_info_struct *si; 522 517 523 si = container_of(ref, struct swap_inf 518 si = container_of(ref, struct swap_info_struct, users); 524 complete(&si->comp); 519 complete(&si->comp); 525 } 520 } 526 521 527 static void free_cluster(struct swap_info_stru !! 522 static void alloc_cluster(struct swap_info_struct *si, unsigned long idx) 528 { 523 { 529 VM_BUG_ON(ci->count != 0); !! 524 struct swap_cluster_info *ci = si->cluster_info; 530 lockdep_assert_held(&si->lock); << 531 lockdep_assert_held(&ci->lock); << 532 525 533 if (ci->flags & CLUSTER_FLAG_FRAG) !! 526 VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx); 534 si->frag_cluster_nr[ci->order] !! 527 cluster_list_del_first(&si->free_clusters, ci); >> 528 cluster_set_count_flag(ci + idx, 0, 0); >> 529 } >> 530 >> 531 static void free_cluster(struct swap_info_struct *si, unsigned long idx) >> 532 { >> 533 struct swap_cluster_info *ci = si->cluster_info + idx; 535 534 >> 535 VM_BUG_ON(cluster_count(ci) != 0); 536 /* 536 /* 537 * If the swap is discardable, prepare 537 * If the swap is discardable, prepare discard the cluster 538 * instead of free it immediately. The 538 * instead of free it immediately. The cluster will be freed 539 * after discard. 539 * after discard. 540 */ 540 */ 541 if ((si->flags & (SWP_WRITEOK | SWP_PA 541 if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == 542 (SWP_WRITEOK | SWP_PAGE_DISCARD)) 542 (SWP_WRITEOK | SWP_PAGE_DISCARD)) { 543 swap_cluster_schedule_discard( !! 543 swap_cluster_schedule_discard(si, idx); 544 return; 544 return; 545 } 545 } 546 546 547 __free_cluster(si, ci); !! 547 __free_cluster(si, idx); 548 } 548 } 549 549 550 /* 550 /* 551 * The cluster corresponding to page_nr will b !! 551 * The cluster corresponding to page_nr will be used. The cluster will be 552 * added to free cluster list and its usage co !! 552 * removed from free cluster list and its usage counter will be increased. 553 * Only used for initialization. << 554 */ 553 */ 555 static void inc_cluster_info_page(struct swap_ !! 
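Annotation: swap_users_ref_free() above is the release callback of the si->users percpu_ref; when the last user of the swap device drops its reference, it completes si->comp so swapoff can proceed. percpu_ref is kernel-only, so the sketch below only mirrors the "last put signals the waiter" shape with C11 atomics, purely as an analogy; all illust_* names are invented.

#include <stdatomic.h>
#include <stdbool.h>

struct illust_users {
        atomic_long count;
        atomic_bool released;       /* stands in for complete(&si->comp) */
};

static void illust_get(struct illust_users *u)
{
        atomic_fetch_add(&u->count, 1);
}

static void illust_put(struct illust_users *u)
{
        if (atomic_fetch_sub(&u->count, 1) == 1)
                atomic_store(&u->released, true);   /* last user: wake "swapoff" */
}

int main(void)
{
        struct illust_users u = { .count = 1, .released = false };

        illust_get(&u);
        illust_put(&u);
        illust_put(&u);             /* initial reference dropped by "swapoff" */
        return atomic_load(&u.released) ? 0 : 1;
}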
554 static void inc_cluster_info_page(struct swap_info_struct *p, 556 struct swap_cluster_info *cluster_info 555 struct swap_cluster_info *cluster_info, unsigned long page_nr) 557 { 556 { 558 unsigned long idx = page_nr / SWAPFILE 557 unsigned long idx = page_nr / SWAPFILE_CLUSTER; 559 struct swap_cluster_info *ci; << 560 558 561 if (!cluster_info) 559 if (!cluster_info) 562 return; 560 return; >> 561 if (cluster_is_free(&cluster_info[idx])) >> 562 alloc_cluster(p, idx); 563 563 564 ci = cluster_info + idx; !! 564 VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER); 565 ci->count++; !! 565 cluster_set_count(&cluster_info[idx], 566 !! 566 cluster_count(&cluster_info[idx]) + 1); 567 VM_BUG_ON(ci->count > SWAPFILE_CLUSTER << 568 VM_BUG_ON(ci->flags); << 569 } 567 } 570 568 571 /* 569 /* 572 * The cluster ci decreases @nr_pages usage. I !! 570 * The cluster corresponding to page_nr decreases one usage. If the usage 573 * which means no page in the cluster is in us !! 571 * counter becomes 0, which means no page in the cluster is in using, we can 574 * the cluster and add it to free cluster list !! 572 * optionally discard the cluster and add it to free cluster list. 575 */ 573 */ 576 static void dec_cluster_info_page(struct swap_ !! 574 static void dec_cluster_info_page(struct swap_info_struct *p, 577 struct swap_ !! 575 struct swap_cluster_info *cluster_info, unsigned long page_nr) 578 { 576 { 579 if (!si->cluster_info) !! 577 unsigned long idx = page_nr / SWAPFILE_CLUSTER; 580 return; << 581 << 582 VM_BUG_ON(ci->count < nr_pages); << 583 VM_BUG_ON(cluster_is_free(ci)); << 584 lockdep_assert_held(&si->lock); << 585 lockdep_assert_held(&ci->lock); << 586 ci->count -= nr_pages; << 587 578 588 if (!ci->count) { !! 579 if (!cluster_info) 589 free_cluster(si, ci); << 590 return; 580 return; 591 } << 592 << 593 if (!(ci->flags & CLUSTER_FLAG_NONFULL << 594 VM_BUG_ON(ci->flags & CLUSTER_ << 595 if (ci->flags & CLUSTER_FLAG_F << 596 si->frag_cluster_nr[ci << 597 list_move_tail(&ci->list, &si- << 598 ci->flags = CLUSTER_FLAG_NONFU << 599 } << 600 } << 601 << 602 static bool cluster_reclaim_range(struct swap_ << 603 struct swap_ << 604 unsigned lon << 605 { << 606 unsigned char *map = si->swap_map; << 607 unsigned long offset; << 608 581 609 spin_unlock(&ci->lock); !! 582 VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0); 610 spin_unlock(&si->lock); !! 583 cluster_set_count(&cluster_info[idx], >> 584 cluster_count(&cluster_info[idx]) - 1); 611 585 612 for (offset = start; offset < end; off !! 586 if (cluster_count(&cluster_info[idx]) == 0) 613 switch (READ_ONCE(map[offset]) !! 587 free_cluster(p, idx); 614 case 0: << 615 continue; << 616 case SWAP_HAS_CACHE: << 617 if (__try_to_reclaim_s << 618 continue; << 619 goto out; << 620 default: << 621 goto out; << 622 } << 623 } << 624 out: << 625 spin_lock(&si->lock); << 626 spin_lock(&ci->lock); << 627 << 628 /* << 629 * Recheck the range no matter reclaim << 630 * could have been be freed while we a << 631 */ << 632 for (offset = start; offset < end; off << 633 if (READ_ONCE(map[offset])) << 634 return false; << 635 << 636 return true; << 637 } 588 } 638 589 639 static bool cluster_scan_range(struct swap_inf !! 590 /* 640 struct swap_clu !! 591 * It's possible scan_swap_map_slots() uses a free cluster in the middle of free 641 unsigned long s !! 592 * cluster list. Avoiding such abuse to avoid list corruption. 
>> 593 */ >> 594 static bool >> 595 scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, >> 596 unsigned long offset) 642 { 597 { 643 unsigned long offset, end = start + nr !! 598 struct percpu_cluster *percpu_cluster; 644 unsigned char *map = si->swap_map; !! 599 bool conflict; 645 bool need_reclaim = false; << 646 600 647 for (offset = start; offset < end; off !! 601 offset /= SWAPFILE_CLUSTER; 648 switch (READ_ONCE(map[offset]) !! 602 conflict = !cluster_list_empty(&si->free_clusters) && 649 case 0: !! 603 offset != cluster_list_first(&si->free_clusters) && 650 continue; !! 604 cluster_is_free(&si->cluster_info[offset]); 651 case SWAP_HAS_CACHE: << 652 if (!vm_swap_full()) << 653 return false; << 654 need_reclaim = true; << 655 continue; << 656 default: << 657 return false; << 658 } << 659 } << 660 605 661 if (need_reclaim) !! 606 if (!conflict) 662 return cluster_reclaim_range(s !! 607 return false; 663 608 >> 609 percpu_cluster = this_cpu_ptr(si->percpu_cluster); >> 610 cluster_set_null(&percpu_cluster->index); 664 return true; 611 return true; 665 } 612 } 666 613 667 static void cluster_alloc_range(struct swap_in << 668 unsigned int s << 669 unsigned int o << 670 { << 671 unsigned int nr_pages = 1 << order; << 672 << 673 if (cluster_is_free(ci)) { << 674 if (nr_pages < SWAPFILE_CLUSTE << 675 list_move_tail(&ci->li << 676 ci->flags = CLUSTER_FL << 677 } << 678 ci->order = order; << 679 } << 680 << 681 memset(si->swap_map + start, usage, nr << 682 swap_range_alloc(si, start, nr_pages); << 683 ci->count += nr_pages; << 684 << 685 if (ci->count == SWAPFILE_CLUSTER) { << 686 VM_BUG_ON(!(ci->flags & << 687 (CLUSTER_FLAG_FREE | << 688 if (ci->flags & CLUSTER_FLAG_F << 689 si->frag_cluster_nr[ci << 690 list_move_tail(&ci->list, &si- << 691 ci->flags = CLUSTER_FLAG_FULL; << 692 } << 693 } << 694 << 695 static unsigned int alloc_swap_scan_cluster(st << 696 un << 697 un << 698 { << 699 unsigned long start = offset & ~(SWAPF << 700 unsigned long end = min(start + SWAPFI << 701 unsigned int nr_pages = 1 << order; << 702 struct swap_cluster_info *ci; << 703 << 704 if (end < nr_pages) << 705 return SWAP_NEXT_INVALID; << 706 end -= nr_pages; << 707 << 708 ci = lock_cluster(si, offset); << 709 if (ci->count + nr_pages > SWAPFILE_CL << 710 offset = SWAP_NEXT_INVALID; << 711 goto done; << 712 } << 713 << 714 while (offset <= end) { << 715 if (cluster_scan_range(si, ci, << 716 cluster_alloc_range(si << 717 *foundp = offset; << 718 if (ci->count == SWAPF << 719 offset = SWAP_ << 720 goto done; << 721 } << 722 offset += nr_pages; << 723 break; << 724 } << 725 offset += nr_pages; << 726 } << 727 if (offset > end) << 728 offset = SWAP_NEXT_INVALID; << 729 done: << 730 unlock_cluster(ci); << 731 return offset; << 732 } << 733 << 734 /* Return true if reclaimed a whole cluster */ << 735 static void swap_reclaim_full_clusters(struct << 736 { << 737 long to_scan = 1; << 738 unsigned long offset, end; << 739 struct swap_cluster_info *ci; << 740 unsigned char *map = si->swap_map; << 741 int nr_reclaim; << 742 << 743 if (force) << 744 to_scan = si->inuse_pages / SW << 745 << 746 while (!list_empty(&si->full_clusters) << 747 ci = list_first_entry(&si->ful << 748 list_move_tail(&ci->list, &si- << 749 offset = cluster_offset(si, ci << 750 end = min(si->max, offset + SW << 751 to_scan--; << 752 << 753 spin_unlock(&si->lock); << 754 while (offset < end) { << 755 if (READ_ONCE(map[offs << 756 nr_reclaim = _ << 757 << 758 if (nr_reclaim << 759 offset << 760 contin << 761 } << 762 } << 763 offset++; << 764 } << 
765 spin_lock(&si->lock); << 766 << 767 if (to_scan <= 0) << 768 break; << 769 } << 770 } << 771 << 772 static void swap_reclaim_work(struct work_stru << 773 { << 774 struct swap_info_struct *si; << 775 << 776 si = container_of(work, struct swap_in << 777 << 778 spin_lock(&si->lock); << 779 swap_reclaim_full_clusters(si, true); << 780 spin_unlock(&si->lock); << 781 } << 782 << 783 /* 614 /* 784 * Try to get swap entries with specified orde !! 615 * Try to get a swap entry from current cpu's swap entry pool (a cluster). This 785 * pool (a cluster). This might involve alloca !! 616 * might involve allocating a new cluster for current CPU too. 786 * too. << 787 */ 617 */ 788 static unsigned long cluster_alloc_swap_entry( !! 618 static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, 789 !! 619 unsigned long *offset, unsigned long *scan_base) 790 { 620 { 791 struct percpu_cluster *cluster; 621 struct percpu_cluster *cluster; 792 struct swap_cluster_info *ci; 622 struct swap_cluster_info *ci; 793 unsigned int offset, found = 0; !! 623 unsigned long tmp, max; 794 624 795 new_cluster: 625 new_cluster: 796 lockdep_assert_held(&si->lock); << 797 cluster = this_cpu_ptr(si->percpu_clus 626 cluster = this_cpu_ptr(si->percpu_cluster); 798 offset = cluster->next[order]; !! 627 if (cluster_is_null(&cluster->index)) { 799 if (offset) { !! 628 if (!cluster_list_empty(&si->free_clusters)) { 800 offset = alloc_swap_scan_clust !! 629 cluster->index = si->free_clusters.head; 801 if (found) !! 630 cluster->next = cluster_next(&cluster->index) * 802 goto done; !! 631 SWAPFILE_CLUSTER; 803 } !! 632 } else if (!cluster_list_empty(&si->discard_clusters)) { 804 !! 633 /* 805 if (!list_empty(&si->free_clusters)) { !! 634 * we don't have free cluster but have some clusters in 806 ci = list_first_entry(&si->fre !! 635 * discarding, do discard now and reclaim them, then 807 offset = alloc_swap_scan_clust !! 636 * reread cluster_next_cpu since we dropped si->lock 808 VM_BUG_ON(!found); !! 637 */ 809 goto done; !! 638 swap_do_scheduled_discard(si); >> 639 *scan_base = this_cpu_read(*si->cluster_next_cpu); >> 640 *offset = *scan_base; >> 641 goto new_cluster; >> 642 } else >> 643 return false; 810 } 644 } 811 645 812 /* Try reclaim from full clusters if f !! 646 /* 813 if (vm_swap_full()) !! 647 * Other CPUs can use our cluster if they can't find a free cluster, 814 swap_reclaim_full_clusters(si, !! 648 * check if there is still free entry in the cluster 815 !! 649 */ 816 if (order < PMD_ORDER) { !! 650 tmp = cluster->next; 817 unsigned int frags = 0; !! 651 max = min_t(unsigned long, si->max, 818 !! 652 (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER); 819 while (!list_empty(&si->nonful !! 653 if (tmp < max) { 820 ci = list_first_entry( !! 654 ci = lock_cluster(si, tmp); 821 !! 655 while (tmp < max) { 822 list_move_tail(&ci->li !! 656 if (!si->swap_map[tmp]) 823 ci->flags = CLUSTER_FL << 824 si->frag_cluster_nr[or << 825 offset = alloc_swap_sc << 826 << 827 frags++; << 828 if (found) << 829 break; 657 break; >> 658 tmp++; 830 } 659 } 831 !! 660 unlock_cluster(ci); 832 if (!found) { << 833 /* << 834 * Nonfull clusters ar << 835 * here, count them to << 836 */ << 837 while (frags < si->fra << 838 ci = list_firs << 839 << 840 /* << 841 * Rotate the << 842 * high order << 843 * this help k << 844 */ << 845 list_move_tail << 846 offset = alloc << 847 << 848 frags++; << 849 if (found) << 850 break; << 851 } << 852 } << 853 } 661 } 854 !! 662 if (tmp >= max) { 855 if (found) !! 
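Annotation: both allocators above keep a per-CPU "next" hint so consecutive allocations from one CPU stay sequential inside that CPU's current cluster; the older column scans from cluster->next to the end of the cluster for a zero swap_map byte, while the newer code keeps one hint per allocation order. The sketch reproduces the older, simpler loop shape in userspace; the illust_* names are invented and locking is omitted.

#include <stdbool.h>

#define ILLUST_CLUSTER 256

/* One hint per CPU: where that CPU should continue allocating. */
struct illust_percpu_cluster { unsigned long next; };

/* Resume scanning this CPU's current cluster from its saved hint; returns true
 * and advances the hint when a free slot (zero map byte) was found. */
static bool illust_alloc_from_hint(const unsigned char *swap_map,
                                   unsigned long max,
                                   struct illust_percpu_cluster *pcp,
                                   unsigned long *offset)
{
        unsigned long tmp = pcp->next;
        unsigned long end = (tmp / ILLUST_CLUSTER + 1) * ILLUST_CLUSTER;

        if (end > max)
                end = max;
        for (; tmp < end; tmp++) {
                if (!swap_map[tmp]) {
                        pcp->next = tmp + 1;
                        *offset = tmp;
                        return true;
                }
        }
        return false;   /* cluster exhausted: caller picks a new free cluster */
}

int main(void)
{
        static unsigned char swap_map[2 * ILLUST_CLUSTER];
        struct illust_percpu_cluster pcp = { .next = 10 };
        unsigned long off;

        swap_map[10] = 1;   /* slot 10 taken, so slot 11 should be handed out */
        return (illust_alloc_from_hint(swap_map, sizeof(swap_map), &pcp, &off)
                && off == 11) ? 0 : 1;
}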
663 cluster_set_null(&cluster->index); 856 goto done; << 857 << 858 if (!list_empty(&si->discard_clusters) << 859 /* << 860 * we don't have free cluster << 861 * discarding, do discard now << 862 * reread cluster_next_cpu sin << 863 */ << 864 swap_do_scheduled_discard(si); << 865 goto new_cluster; 664 goto new_cluster; 866 } 665 } 867 !! 666 cluster->next = tmp + 1; 868 if (order) !! 667 *offset = tmp; 869 goto done; !! 668 *scan_base = tmp; 870 !! 669 return true; 871 /* Order 0 stealing from higher order << 872 for (int o = 1; o < SWAP_NR_ORDERS; o+ << 873 /* << 874 * Clusters here have at least << 875 * allocation, but reclaim may << 876 */ << 877 while (!list_empty(&si->frag_c << 878 ci = list_first_entry( << 879 << 880 offset = alloc_swap_sc << 881 << 882 if (found) << 883 goto done; << 884 } << 885 << 886 while (!list_empty(&si->nonful << 887 ci = list_first_entry( << 888 << 889 offset = alloc_swap_sc << 890 << 891 if (found) << 892 goto done; << 893 } << 894 } << 895 << 896 done: << 897 cluster->next[order] = offset; << 898 return found; << 899 } 670 } 900 671 901 static void __del_from_avail_list(struct swap_ !! 672 static void __del_from_avail_list(struct swap_info_struct *p) 902 { 673 { 903 int nid; 674 int nid; 904 675 905 assert_spin_locked(&si->lock); !! 676 assert_spin_locked(&p->lock); 906 for_each_node(nid) 677 for_each_node(nid) 907 plist_del(&si->avail_lists[nid !! 678 plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]); 908 } 679 } 909 680 910 static void del_from_avail_list(struct swap_in !! 681 static void del_from_avail_list(struct swap_info_struct *p) 911 { 682 { 912 spin_lock(&swap_avail_lock); 683 spin_lock(&swap_avail_lock); 913 __del_from_avail_list(si); !! 684 __del_from_avail_list(p); 914 spin_unlock(&swap_avail_lock); 685 spin_unlock(&swap_avail_lock); 915 } 686 } 916 687 917 static void swap_range_alloc(struct swap_info_ 688 static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, 918 unsigned int nr_e 689 unsigned int nr_entries) 919 { 690 { 920 unsigned int end = offset + nr_entries 691 unsigned int end = offset + nr_entries - 1; 921 692 922 if (offset == si->lowest_bit) 693 if (offset == si->lowest_bit) 923 si->lowest_bit += nr_entries; 694 si->lowest_bit += nr_entries; 924 if (end == si->highest_bit) 695 if (end == si->highest_bit) 925 WRITE_ONCE(si->highest_bit, si 696 WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries); 926 WRITE_ONCE(si->inuse_pages, si->inuse_ !! 697 si->inuse_pages += nr_entries; 927 if (si->inuse_pages == si->pages) { 698 if (si->inuse_pages == si->pages) { 928 si->lowest_bit = si->max; 699 si->lowest_bit = si->max; 929 si->highest_bit = 0; 700 si->highest_bit = 0; 930 del_from_avail_list(si); 701 del_from_avail_list(si); 931 << 932 if (vm_swap_full()) << 933 schedule_work(&si->rec << 934 } 702 } 935 } 703 } 936 704 937 static void add_to_avail_list(struct swap_info !! 705 static void add_to_avail_list(struct swap_info_struct *p) 938 { 706 { 939 int nid; 707 int nid; 940 708 941 spin_lock(&swap_avail_lock); 709 spin_lock(&swap_avail_lock); 942 for_each_node(nid) !! 710 for_each_node(nid) { 943 plist_add(&si->avail_lists[nid !! 
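Annotation: swap_range_alloc() above tightens the lowest_bit/highest_bit free-range hints, charges inuse_pages, and drops the device from the per-node avail lists once it is completely full. A self-contained mirror of that bookkeeping; the on_avail_lists flag stands in for the plist manipulation and all illust_* names are invented.

#include <stdbool.h>

struct illust_si {
        unsigned long lowest_bit, highest_bit;  /* shrinking free-range hints */
        unsigned long inuse_pages, pages;
        bool on_avail_lists;                    /* stands in for the per-node plists */
};

/* Mirror of swap_range_alloc(): adjust the hints, then retire a full device. */
static void illust_range_alloc(struct illust_si *si, unsigned long offset,
                               unsigned int nr)
{
        unsigned long end = offset + nr - 1;

        if (offset == si->lowest_bit)
                si->lowest_bit += nr;
        if (end == si->highest_bit)
                si->highest_bit -= nr;
        si->inuse_pages += nr;
        if (si->inuse_pages == si->pages) {
                si->lowest_bit = si->pages;     /* nothing left to hand out */
                si->highest_bit = 0;
                si->on_avail_lists = false;     /* del_from_avail_list() */
        }
}

int main(void)
{
        struct illust_si si = { .lowest_bit = 1, .highest_bit = 8,
                                .inuse_pages = 0, .pages = 8,
                                .on_avail_lists = true };

        illust_range_alloc(&si, 1, 8);          /* take everything */
        return si.on_avail_lists ? 1 : 0;
}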
711 WARN_ON(!plist_node_empty(&p->avail_lists[nid])); >> 712 plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]); >> 713 } 944 spin_unlock(&swap_avail_lock); 714 spin_unlock(&swap_avail_lock); 945 } 715 } 946 716 947 static void swap_range_free(struct swap_info_s 717 static void swap_range_free(struct swap_info_struct *si, unsigned long offset, 948 unsigned int nr_en 718 unsigned int nr_entries) 949 { 719 { 950 unsigned long begin = offset; 720 unsigned long begin = offset; 951 unsigned long end = offset + nr_entrie 721 unsigned long end = offset + nr_entries - 1; 952 void (*swap_slot_free_notify)(struct b 722 void (*swap_slot_free_notify)(struct block_device *, unsigned long); 953 unsigned int i; << 954 << 955 /* << 956 * Use atomic clear_bit operations onl << 957 * bitmap_clear to prevent adjacent bi << 958 */ << 959 for (i = 0; i < nr_entries; i++) << 960 clear_bit(offset + i, si->zero << 961 723 962 if (offset < si->lowest_bit) 724 if (offset < si->lowest_bit) 963 si->lowest_bit = offset; 725 si->lowest_bit = offset; 964 if (end > si->highest_bit) { 726 if (end > si->highest_bit) { 965 bool was_full = !si->highest_b 727 bool was_full = !si->highest_bit; 966 728 967 WRITE_ONCE(si->highest_bit, en 729 WRITE_ONCE(si->highest_bit, end); 968 if (was_full && (si->flags & S 730 if (was_full && (si->flags & SWP_WRITEOK)) 969 add_to_avail_list(si); 731 add_to_avail_list(si); 970 } 732 } >> 733 atomic_long_add(nr_entries, &nr_swap_pages); >> 734 si->inuse_pages -= nr_entries; 971 if (si->flags & SWP_BLKDEV) 735 if (si->flags & SWP_BLKDEV) 972 swap_slot_free_notify = 736 swap_slot_free_notify = 973 si->bdev->bd_disk->fop 737 si->bdev->bd_disk->fops->swap_slot_free_notify; 974 else 738 else 975 swap_slot_free_notify = NULL; 739 swap_slot_free_notify = NULL; 976 while (offset <= end) { 740 while (offset <= end) { 977 arch_swap_invalidate_page(si-> 741 arch_swap_invalidate_page(si->type, offset); >> 742 frontswap_invalidate_page(si->type, offset); 978 if (swap_slot_free_notify) 743 if (swap_slot_free_notify) 979 swap_slot_free_notify( 744 swap_slot_free_notify(si->bdev, offset); 980 offset++; 745 offset++; 981 } 746 } 982 clear_shadow_from_swap_cache(si->type, 747 clear_shadow_from_swap_cache(si->type, begin, end); 983 << 984 /* << 985 * Make sure that try_to_unuse() obser << 986 * only after the above cleanups are d << 987 */ << 988 smp_wmb(); << 989 atomic_long_add(nr_entries, &nr_swap_p << 990 WRITE_ONCE(si->inuse_pages, si->inuse_ << 991 } 748 } 992 749 993 static void set_cluster_next(struct swap_info_ 750 static void set_cluster_next(struct swap_info_struct *si, unsigned long next) 994 { 751 { 995 unsigned long prev; 752 unsigned long prev; 996 753 997 if (!(si->flags & SWP_SOLIDSTATE)) { 754 if (!(si->flags & SWP_SOLIDSTATE)) { 998 si->cluster_next = next; 755 si->cluster_next = next; 999 return; 756 return; 1000 } 757 } 1001 758 1002 prev = this_cpu_read(*si->cluster_nex 759 prev = this_cpu_read(*si->cluster_next_cpu); 1003 /* 760 /* 1004 * Cross the swap address space size 761 * Cross the swap address space size aligned trunk, choose 1005 * another trunk randomly to avoid lo 762 * another trunk randomly to avoid lock contention on swap 1006 * address space if possible. 763 * address space if possible. 
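Annotation: swap_range_free() above walks the freed range and, per slot, invalidates architecture state and calls the optional block-device hook (bd_disk->fops->swap_slot_free_notify, used by zram-style backends); the newer column additionally defers the nr_swap_pages/inuse_pages update until after this cleanup, behind a write barrier. The sketch only shows the optional-callback walk; the function-pointer plumbing and illust_* names are illustrative.

#include <stdio.h>

/* Optional per-device hook, in the spirit of swap_slot_free_notify. */
typedef void (*illust_free_notify_t)(unsigned long offset);

static void illust_backend_notify(unsigned long offset)
{
        printf("slot %lu can be dropped by the backing device\n", offset);
}

static void illust_range_free(unsigned long offset, unsigned int nr,
                              illust_free_notify_t notify)
{
        unsigned long end = offset + nr - 1;

        while (offset <= end) {
                if (notify)
                        notify(offset);     /* only when the device provides it */
                offset++;
        }
        /* counters would only be published after the per-slot cleanup above */
}

int main(void)
{
        illust_range_free(42, 3, illust_backend_notify);
        illust_range_free(42, 3, NULL);     /* devices without the hook */
        return 0;
}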
1007 */ 764 */ 1008 if ((prev >> SWAP_ADDRESS_SPACE_SHIFT 765 if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) != 1009 (next >> SWAP_ADDRESS_SPACE_SHIFT 766 (next >> SWAP_ADDRESS_SPACE_SHIFT)) { 1010 /* No free swap slots availab 767 /* No free swap slots available */ 1011 if (si->highest_bit <= si->lo 768 if (si->highest_bit <= si->lowest_bit) 1012 return; 769 return; 1013 next = get_random_u32_inclusi !! 770 next = si->lowest_bit + >> 771 prandom_u32_max(si->highest_bit - si->lowest_bit + 1); 1014 next = ALIGN_DOWN(next, SWAP_ 772 next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES); 1015 next = max_t(unsigned int, ne 773 next = max_t(unsigned int, next, si->lowest_bit); 1016 } 774 } 1017 this_cpu_write(*si->cluster_next_cpu, 775 this_cpu_write(*si->cluster_next_cpu, next); 1018 } 776 } 1019 777 1020 static bool swap_offset_available_and_locked( << 1021 << 1022 { << 1023 if (data_race(!si->swap_map[offset])) << 1024 spin_lock(&si->lock); << 1025 return true; << 1026 } << 1027 << 1028 if (vm_swap_full() && READ_ONCE(si->s << 1029 spin_lock(&si->lock); << 1030 return true; << 1031 } << 1032 << 1033 return false; << 1034 } << 1035 << 1036 static int cluster_alloc_swap(struct swap_inf << 1037 unsigned char us << 1038 swp_entry_t slot << 1039 { << 1040 int n_ret = 0; << 1041 << 1042 VM_BUG_ON(!si->cluster_info); << 1043 << 1044 while (n_ret < nr) { << 1045 unsigned long offset = cluste << 1046 << 1047 if (!offset) << 1048 break; << 1049 slots[n_ret++] = swp_entry(si << 1050 } << 1051 << 1052 return n_ret; << 1053 } << 1054 << 1055 static int scan_swap_map_slots(struct swap_in 778 static int scan_swap_map_slots(struct swap_info_struct *si, 1056 unsigned char 779 unsigned char usage, int nr, 1057 swp_entry_t sl !! 780 swp_entry_t slots[]) 1058 { 781 { >> 782 struct swap_cluster_info *ci; 1059 unsigned long offset; 783 unsigned long offset; 1060 unsigned long scan_base; 784 unsigned long scan_base; 1061 unsigned long last_in_cluster = 0; 785 unsigned long last_in_cluster = 0; 1062 int latency_ration = LATENCY_LIMIT; 786 int latency_ration = LATENCY_LIMIT; 1063 unsigned int nr_pages = 1 << order; << 1064 int n_ret = 0; 787 int n_ret = 0; 1065 bool scanned_many = false; 788 bool scanned_many = false; 1066 789 1067 /* 790 /* 1068 * We try to cluster swap pages by al 791 * We try to cluster swap pages by allocating them sequentially 1069 * in swap. Once we've allocated SWA 792 * in swap. Once we've allocated SWAPFILE_CLUSTER pages this 1070 * way, however, we resort to first-f 793 * way, however, we resort to first-free allocation, starting 1071 * a new cluster. This prevents us f 794 * a new cluster. This prevents us from scattering swap pages 1072 * all over the entire swap partition 795 * all over the entire swap partition, so that we reduce 1073 * overall disk seek times between sw 796 * overall disk seek times between swap pages. -- sct 1074 * But we do now try to find an empty 797 * But we do now try to find an empty cluster. -Andrea 1075 * And we let swap pages go all over 798 * And we let swap pages go all over an SSD partition. Hugh 1076 */ 799 */ 1077 800 1078 if (order > 0) { << 1079 /* << 1080 * Should not even be attempt << 1081 * page swap is disabled. Wa << 1082 */ << 1083 if (!IS_ENABLED(CONFIG_THP_SW << 1084 nr_pages > SWAPFILE_CLUST << 1085 VM_WARN_ON_ONCE(1); << 1086 return 0; << 1087 } << 1088 << 1089 /* << 1090 * Swapfile is not block devi << 1091 * to allocate large entries. 
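Annotation: set_cluster_next() above keeps the next-allocation hint per CPU on SSDs and, when the hint crosses a SWAP_ADDRESS_SPACE-aligned trunk boundary, jumps to a randomly chosen trunk between lowest_bit and highest_bit (get_random_u32_inclusive() in the new column, prandom_u32_max() in the old) so CPUs spread across different swap-cache address spaces. A userspace sketch of that choice; the shift value of 14 and the illust_* names are assumptions, and rand() merely stands in for the kernel RNG.

#include <stdio.h>
#include <stdlib.h>

/* Slots are grouped into per-address-space trunks; the shift (14, i.e. 16384
 * slots per trunk) is assumed here for illustration. */
#define ILLUST_SPACE_SHIFT 14
#define ILLUST_SPACE_PAGES (1UL << ILLUST_SPACE_SHIFT)
#define ILLUST_ALIGN_DOWN(x, a) ((x) & ~((a) - 1))

static unsigned long illust_pick_next(unsigned long prev, unsigned long next,
                                      unsigned long lowest, unsigned long highest)
{
        if ((prev >> ILLUST_SPACE_SHIFT) != (next >> ILLUST_SPACE_SHIFT)) {
                if (highest <= lowest)
                        return next;    /* no free slots: keep the hint as-is */
                /* crossed a trunk boundary: jump to a random trunk instead */
                next = lowest + (unsigned long)rand() % (highest - lowest + 1);
                next = ILLUST_ALIGN_DOWN(next, ILLUST_SPACE_PAGES);
                if (next < lowest)
                        next = lowest;
        }
        return next;
}

int main(void)
{
        printf("%lu\n", illust_pick_next(16383, 16384, 1, 100000));
        return 0;
}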
<< 1092 */ << 1093 if (!(si->flags & SWP_BLKDEV) << 1094 return 0; << 1095 } << 1096 << 1097 if (si->cluster_info) << 1098 return cluster_alloc_swap(si, << 1099 << 1100 si->flags += SWP_SCANNING; 801 si->flags += SWP_SCANNING; 1101 !! 802 /* 1102 /* For HDD, sequential access is more !! 803 * Use percpu scan base for SSD to reduce lock contention on 1103 scan_base = si->cluster_next; !! 804 * cluster and swap cache. For HDD, sequential access is more >> 805 * important. >> 806 */ >> 807 if (si->flags & SWP_SOLIDSTATE) >> 808 scan_base = this_cpu_read(*si->cluster_next_cpu); >> 809 else >> 810 scan_base = si->cluster_next; 1104 offset = scan_base; 811 offset = scan_base; 1105 812 1106 if (unlikely(!si->cluster_nr--)) { !! 813 /* SSD algorithm */ >> 814 if (si->cluster_info) { >> 815 if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) >> 816 goto scan; >> 817 } else if (unlikely(!si->cluster_nr--)) { 1107 if (si->pages - si->inuse_pag 818 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { 1108 si->cluster_nr = SWAP 819 si->cluster_nr = SWAPFILE_CLUSTER - 1; 1109 goto checks; 820 goto checks; 1110 } 821 } 1111 822 1112 spin_unlock(&si->lock); 823 spin_unlock(&si->lock); 1113 824 1114 /* 825 /* 1115 * If seek is expensive, star 826 * If seek is expensive, start searching for new cluster from 1116 * start of partition, to min 827 * start of partition, to minimize the span of allocated swap. >> 828 * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info >> 829 * case, just handled by scan_swap_map_try_ssd_cluster() above. 1117 */ 830 */ 1118 scan_base = offset = si->lowe 831 scan_base = offset = si->lowest_bit; 1119 last_in_cluster = offset + SW 832 last_in_cluster = offset + SWAPFILE_CLUSTER - 1; 1120 833 1121 /* Locate the first empty (un 834 /* Locate the first empty (unaligned) cluster */ 1122 for (; last_in_cluster <= REA !! 
835 for (; last_in_cluster <= si->highest_bit; offset++) { 1123 if (si->swap_map[offs 836 if (si->swap_map[offset]) 1124 last_in_clust 837 last_in_cluster = offset + SWAPFILE_CLUSTER; 1125 else if (offset == la 838 else if (offset == last_in_cluster) { 1126 spin_lock(&si 839 spin_lock(&si->lock); 1127 offset -= SWA 840 offset -= SWAPFILE_CLUSTER - 1; 1128 si->cluster_n 841 si->cluster_next = offset; 1129 si->cluster_n 842 si->cluster_nr = SWAPFILE_CLUSTER - 1; 1130 goto checks; 843 goto checks; 1131 } 844 } 1132 if (unlikely(--latenc 845 if (unlikely(--latency_ration < 0)) { 1133 cond_resched( 846 cond_resched(); 1134 latency_ratio 847 latency_ration = LATENCY_LIMIT; 1135 } 848 } 1136 } 849 } 1137 850 1138 offset = scan_base; 851 offset = scan_base; 1139 spin_lock(&si->lock); 852 spin_lock(&si->lock); 1140 si->cluster_nr = SWAPFILE_CLU 853 si->cluster_nr = SWAPFILE_CLUSTER - 1; 1141 } 854 } 1142 855 1143 checks: 856 checks: >> 857 if (si->cluster_info) { >> 858 while (scan_swap_map_ssd_cluster_conflict(si, offset)) { >> 859 /* take a break if we already got some slots */ >> 860 if (n_ret) >> 861 goto done; >> 862 if (!scan_swap_map_try_ssd_cluster(si, &offset, >> 863 &scan_base)) >> 864 goto scan; >> 865 } >> 866 } 1144 if (!(si->flags & SWP_WRITEOK)) 867 if (!(si->flags & SWP_WRITEOK)) 1145 goto no_page; 868 goto no_page; 1146 if (!si->highest_bit) 869 if (!si->highest_bit) 1147 goto no_page; 870 goto no_page; 1148 if (offset > si->highest_bit) 871 if (offset > si->highest_bit) 1149 scan_base = offset = si->lowe 872 scan_base = offset = si->lowest_bit; 1150 873 >> 874 ci = lock_cluster(si, offset); 1151 /* reuse swap entry of cache-only swa 875 /* reuse swap entry of cache-only swap if not busy. */ 1152 if (vm_swap_full() && si->swap_map[of 876 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 1153 int swap_was_freed; 877 int swap_was_freed; >> 878 unlock_cluster(ci); 1154 spin_unlock(&si->lock); 879 spin_unlock(&si->lock); 1155 swap_was_freed = __try_to_rec !! 880 swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); 1156 spin_lock(&si->lock); 881 spin_lock(&si->lock); 1157 /* entry was freed successful 882 /* entry was freed successfully, try to use this again */ 1158 if (swap_was_freed > 0) !! 883 if (swap_was_freed) 1159 goto checks; 884 goto checks; 1160 goto scan; /* check next one 885 goto scan; /* check next one */ 1161 } 886 } 1162 887 1163 if (si->swap_map[offset]) { 888 if (si->swap_map[offset]) { >> 889 unlock_cluster(ci); 1164 if (!n_ret) 890 if (!n_ret) 1165 goto scan; 891 goto scan; 1166 else 892 else 1167 goto done; 893 goto done; 1168 } 894 } 1169 memset(si->swap_map + offset, usage, !! 895 WRITE_ONCE(si->swap_map[offset], usage); >> 896 inc_cluster_info_page(si, si->cluster_info, offset); >> 897 unlock_cluster(ci); 1170 898 1171 swap_range_alloc(si, offset, nr_pages !! 899 swap_range_alloc(si, offset, 1); 1172 slots[n_ret++] = swp_entry(si->type, 900 slots[n_ret++] = swp_entry(si->type, offset); 1173 901 1174 /* got enough slots or reach max slot 902 /* got enough slots or reach max slots? */ 1175 if ((n_ret == nr) || (offset >= si->h 903 if ((n_ret == nr) || (offset >= si->highest_bit)) 1176 goto done; 904 goto done; 1177 905 1178 /* search for next available slot */ 906 /* search for next available slot */ 1179 907 1180 /* time to take a break? */ 908 /* time to take a break? 
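Annotation: the scans above and below budget their work with latency_ration: every probe decrements it, and when it runs out the code drops si->lock, calls cond_resched(), and resumes with a fresh LATENCY_LIMIT budget so a long search never hogs a CPU with the lock held. A userspace sketch of that budgeted-loop pattern, with sched_yield() standing in for cond_resched(); the illust_* names are invented.

#include <sched.h>

#define ILLUST_LATENCY_LIMIT 256

/* Budgeted scan: after LATENCY_LIMIT probes, yield and refill the budget. */
static long illust_scan(const unsigned char *map, unsigned long start,
                        unsigned long end)
{
        int latency_ration = ILLUST_LATENCY_LIMIT;
        unsigned long off;

        for (off = start; off < end; off++) {
                if (!map[off])
                        return (long)off;       /* found a free slot */
                if (--latency_ration < 0) {
                        sched_yield();          /* cond_resched() stand-in */
                        latency_ration = ILLUST_LATENCY_LIMIT;
                }
        }
        return -1;
}

int main(void)
{
        static unsigned char map[1024];

        return illust_scan(map, 0, sizeof(map)) == 0 ? 0 : 1;
}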
*/ 1181 if (unlikely(--latency_ration < 0)) { 909 if (unlikely(--latency_ration < 0)) { 1182 if (n_ret) 910 if (n_ret) 1183 goto done; 911 goto done; 1184 spin_unlock(&si->lock); 912 spin_unlock(&si->lock); 1185 cond_resched(); 913 cond_resched(); 1186 spin_lock(&si->lock); 914 spin_lock(&si->lock); 1187 latency_ration = LATENCY_LIMI 915 latency_ration = LATENCY_LIMIT; 1188 } 916 } 1189 917 1190 if (si->cluster_nr && !si->swap_map[+ !! 918 /* try to get more slots in cluster */ >> 919 if (si->cluster_info) { >> 920 if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) >> 921 goto checks; >> 922 } else if (si->cluster_nr && !si->swap_map[++offset]) { 1191 /* non-ssd case, still more s 923 /* non-ssd case, still more slots in cluster? */ 1192 --si->cluster_nr; 924 --si->cluster_nr; 1193 goto checks; 925 goto checks; 1194 } 926 } 1195 927 1196 /* 928 /* 1197 * Even if there's no free clusters a 929 * Even if there's no free clusters available (fragmented), 1198 * try to scan a little more quickly 930 * try to scan a little more quickly with lock held unless we 1199 * have scanned too many slots alread 931 * have scanned too many slots already. 1200 */ 932 */ 1201 if (!scanned_many) { 933 if (!scanned_many) { 1202 unsigned long scan_limit; 934 unsigned long scan_limit; 1203 935 1204 if (offset < scan_base) 936 if (offset < scan_base) 1205 scan_limit = scan_bas 937 scan_limit = scan_base; 1206 else 938 else 1207 scan_limit = si->high 939 scan_limit = si->highest_bit; 1208 for (; offset <= scan_limit & 940 for (; offset <= scan_limit && --latency_ration > 0; 1209 offset++) { 941 offset++) { 1210 if (!si->swap_map[off 942 if (!si->swap_map[offset]) 1211 goto checks; 943 goto checks; 1212 } 944 } 1213 } 945 } 1214 946 1215 done: 947 done: 1216 if (order == 0) !! 
948 set_cluster_next(si, offset + 1); 1217 set_cluster_next(si, offset + << 1218 si->flags -= SWP_SCANNING; 949 si->flags -= SWP_SCANNING; 1219 return n_ret; 950 return n_ret; 1220 951 1221 scan: 952 scan: 1222 VM_WARN_ON(order > 0); << 1223 spin_unlock(&si->lock); 953 spin_unlock(&si->lock); 1224 while (++offset <= READ_ONCE(si->high 954 while (++offset <= READ_ONCE(si->highest_bit)) { >> 955 if (data_race(!si->swap_map[offset])) { >> 956 spin_lock(&si->lock); >> 957 goto checks; >> 958 } >> 959 if (vm_swap_full() && >> 960 READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { >> 961 spin_lock(&si->lock); >> 962 goto checks; >> 963 } 1225 if (unlikely(--latency_ration 964 if (unlikely(--latency_ration < 0)) { 1226 cond_resched(); 965 cond_resched(); 1227 latency_ration = LATE 966 latency_ration = LATENCY_LIMIT; 1228 scanned_many = true; 967 scanned_many = true; 1229 } 968 } 1230 if (swap_offset_available_and << 1231 goto checks; << 1232 } 969 } 1233 offset = si->lowest_bit; 970 offset = si->lowest_bit; 1234 while (offset < scan_base) { 971 while (offset < scan_base) { >> 972 if (data_race(!si->swap_map[offset])) { >> 973 spin_lock(&si->lock); >> 974 goto checks; >> 975 } >> 976 if (vm_swap_full() && >> 977 READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { >> 978 spin_lock(&si->lock); >> 979 goto checks; >> 980 } 1235 if (unlikely(--latency_ration 981 if (unlikely(--latency_ration < 0)) { 1236 cond_resched(); 982 cond_resched(); 1237 latency_ration = LATE 983 latency_ration = LATENCY_LIMIT; 1238 scanned_many = true; 984 scanned_many = true; 1239 } 985 } 1240 if (swap_offset_available_and << 1241 goto checks; << 1242 offset++; 986 offset++; 1243 } 987 } 1244 spin_lock(&si->lock); 988 spin_lock(&si->lock); 1245 989 1246 no_page: 990 no_page: 1247 si->flags -= SWP_SCANNING; 991 si->flags -= SWP_SCANNING; 1248 return n_ret; 992 return n_ret; 1249 } 993 } 1250 994 1251 int get_swap_pages(int n_goal, swp_entry_t sw !! 995 static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) 1252 { 996 { 1253 int order = swap_entry_order(entry_or !! 997 unsigned long idx; 1254 unsigned long size = 1 << order; !! 998 struct swap_cluster_info *ci; >> 999 unsigned long offset; >> 1000 >> 1001 /* >> 1002 * Should not even be attempting cluster allocations when huge >> 1003 * page swap is disabled. Warn and fail the allocation. 
>> 1004 */ >> 1005 if (!IS_ENABLED(CONFIG_THP_SWAP)) { >> 1006 VM_WARN_ON_ONCE(1); >> 1007 return 0; >> 1008 } >> 1009 >> 1010 if (cluster_list_empty(&si->free_clusters)) >> 1011 return 0; >> 1012 >> 1013 idx = cluster_list_first(&si->free_clusters); >> 1014 offset = idx * SWAPFILE_CLUSTER; >> 1015 ci = lock_cluster(si, offset); >> 1016 alloc_cluster(si, idx); >> 1017 cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE); >> 1018 >> 1019 memset(si->swap_map + offset, SWAP_HAS_CACHE, SWAPFILE_CLUSTER); >> 1020 unlock_cluster(ci); >> 1021 swap_range_alloc(si, offset, SWAPFILE_CLUSTER); >> 1022 *slot = swp_entry(si->type, offset); >> 1023 >> 1024 return 1; >> 1025 } >> 1026 >> 1027 static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) >> 1028 { >> 1029 unsigned long offset = idx * SWAPFILE_CLUSTER; >> 1030 struct swap_cluster_info *ci; >> 1031 >> 1032 ci = lock_cluster(si, offset); >> 1033 memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); >> 1034 cluster_set_count_flag(ci, 0, 0); >> 1035 free_cluster(si, idx); >> 1036 unlock_cluster(ci); >> 1037 swap_range_free(si, offset, SWAPFILE_CLUSTER); >> 1038 } >> 1039 >> 1040 int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) >> 1041 { >> 1042 unsigned long size = swap_entry_size(entry_size); 1255 struct swap_info_struct *si, *next; 1043 struct swap_info_struct *si, *next; 1256 long avail_pgs; 1044 long avail_pgs; 1257 int n_ret = 0; 1045 int n_ret = 0; 1258 int node; 1046 int node; 1259 1047 >> 1048 /* Only single cluster request supported */ >> 1049 WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER); >> 1050 1260 spin_lock(&swap_avail_lock); 1051 spin_lock(&swap_avail_lock); 1261 1052 1262 avail_pgs = atomic_long_read(&nr_swap 1053 avail_pgs = atomic_long_read(&nr_swap_pages) / size; 1263 if (avail_pgs <= 0) { 1054 if (avail_pgs <= 0) { 1264 spin_unlock(&swap_avail_lock) 1055 spin_unlock(&swap_avail_lock); 1265 goto noswap; 1056 goto noswap; 1266 } 1057 } 1267 1058 1268 n_goal = min3((long)n_goal, (long)SWA 1059 n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs); 1269 1060 1270 atomic_long_sub(n_goal * size, &nr_sw 1061 atomic_long_sub(n_goal * size, &nr_swap_pages); 1271 1062 1272 start_over: 1063 start_over: 1273 node = numa_node_id(); 1064 node = numa_node_id(); 1274 plist_for_each_entry_safe(si, next, & 1065 plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { 1275 /* requeue si to after same-p 1066 /* requeue si to after same-priority siblings */ 1276 plist_requeue(&si->avail_list 1067 plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); 1277 spin_unlock(&swap_avail_lock) 1068 spin_unlock(&swap_avail_lock); 1278 spin_lock(&si->lock); 1069 spin_lock(&si->lock); 1279 if (!si->highest_bit || !(si- 1070 if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { 1280 spin_lock(&swap_avail 1071 spin_lock(&swap_avail_lock); 1281 if (plist_node_empty( 1072 if (plist_node_empty(&si->avail_lists[node])) { 1282 spin_unlock(& 1073 spin_unlock(&si->lock); 1283 goto nextsi; 1074 goto nextsi; 1284 } 1075 } 1285 WARN(!si->highest_bit 1076 WARN(!si->highest_bit, 1286 "swap_info %d in 1077 "swap_info %d in list but !highest_bit\n", 1287 si->type); 1078 si->type); 1288 WARN(!(si->flags & SW 1079 WARN(!(si->flags & SWP_WRITEOK), 1289 "swap_info %d in 1080 "swap_info %d in list but !SWP_WRITEOK\n", 1290 si->type); 1081 si->type); 1291 __del_from_avail_list 1082 __del_from_avail_list(si); 1292 spin_unlock(&si->lock 1083 spin_unlock(&si->lock); 1293 goto nextsi; 1084 
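
/*
 * [Illustrative aside, not part of swapfile.c] swap_alloc_cluster() and
 * swap_free_cluster() above treat a whole cluster as one huge-page sized
 * allocation: a cluster index and its first slot offset are related by a
 * plain multiply/divide with SWAPFILE_CLUSTER.  The helper names below are
 * hypothetical and only spell that arithmetic out.
 */
static inline unsigned long example_cluster_to_offset(unsigned long idx)
{
        return idx * SWAPFILE_CLUSTER;          /* first slot of cluster idx */
}

static inline unsigned long example_offset_to_cluster(unsigned long offset)
{
        return offset / SWAPFILE_CLUSTER;       /* cluster that owns offset */
}
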
goto nextsi; 1294 } 1085 } 1295 n_ret = scan_swap_map_slots(s !! 1086 if (size == SWAPFILE_CLUSTER) { 1296 n !! 1087 if (si->flags & SWP_BLKDEV) >> 1088 n_ret = swap_alloc_cluster(si, swp_entries); >> 1089 } else >> 1090 n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, >> 1091 n_goal, swp_entries); 1297 spin_unlock(&si->lock); 1092 spin_unlock(&si->lock); 1298 if (n_ret || size > 1) !! 1093 if (n_ret || size == SWAPFILE_CLUSTER) 1299 goto check_out; 1094 goto check_out; >> 1095 pr_debug("scan_swap_map of si %d failed to find offset\n", >> 1096 si->type); 1300 cond_resched(); 1097 cond_resched(); 1301 1098 1302 spin_lock(&swap_avail_lock); 1099 spin_lock(&swap_avail_lock); 1303 nextsi: 1100 nextsi: 1304 /* 1101 /* 1305 * if we got here, it's likel 1102 * if we got here, it's likely that si was almost full before, 1306 * and since scan_swap_map_sl 1103 * and since scan_swap_map_slots() can drop the si->lock, 1307 * multiple callers probably 1104 * multiple callers probably all tried to get a page from the 1308 * same si and it filled up b 1105 * same si and it filled up before we could get one; or, the si 1309 * filled up between us dropp 1106 * filled up between us dropping swap_avail_lock and taking 1310 * si->lock. Since we dropped 1107 * si->lock. Since we dropped the swap_avail_lock, the 1311 * swap_avail_head list may h 1108 * swap_avail_head list may have been modified; so if next is 1312 * still in the swap_avail_he 1109 * still in the swap_avail_head list then try it, otherwise 1313 * start over if we have not 1110 * start over if we have not gotten any slots. 1314 */ 1111 */ 1315 if (plist_node_empty(&next->a 1112 if (plist_node_empty(&next->avail_lists[node])) 1316 goto start_over; 1113 goto start_over; 1317 } 1114 } 1318 1115 1319 spin_unlock(&swap_avail_lock); 1116 spin_unlock(&swap_avail_lock); 1320 1117 1321 check_out: 1118 check_out: 1322 if (n_ret < n_goal) 1119 if (n_ret < n_goal) 1323 atomic_long_add((long)(n_goal 1120 atomic_long_add((long)(n_goal - n_ret) * size, 1324 &nr_swap_page 1121 &nr_swap_pages); 1325 noswap: 1122 noswap: 1326 return n_ret; 1123 return n_ret; 1327 } 1124 } 1328 1125 1329 static struct swap_info_struct *_swap_info_ge !! 1126 static struct swap_info_struct *__swap_info_get(swp_entry_t entry) 1330 { 1127 { 1331 struct swap_info_struct *si; !! 1128 struct swap_info_struct *p; 1332 unsigned long offset; 1129 unsigned long offset; 1333 1130 1334 if (!entry.val) 1131 if (!entry.val) 1335 goto out; 1132 goto out; 1336 si = swp_swap_info(entry); !! 1133 p = swp_swap_info(entry); 1337 if (!si) !! 1134 if (!p) 1338 goto bad_nofile; 1135 goto bad_nofile; 1339 if (data_race(!(si->flags & SWP_USED) !! 1136 if (data_race(!(p->flags & SWP_USED))) 1340 goto bad_device; 1137 goto bad_device; 1341 offset = swp_offset(entry); 1138 offset = swp_offset(entry); 1342 if (offset >= si->max) !! 1139 if (offset >= p->max) 1343 goto bad_offset; 1140 goto bad_offset; 1344 if (data_race(!si->swap_map[swp_offse !! 
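
/*
 * [Illustrative aside, not part of swapfile.c] A minimal sketch of how a
 * caller batches allocations through get_swap_pages() above, in the style of
 * the per-CPU swap slot cache in mm/swap_slots.c.  The last argument is
 * assumed to be the allocation order of the newer signature shown on the
 * left (0 = order-0 pages); the older signature on the right takes a size in
 * pages instead.  Real callers add locking and cache bookkeeping.
 */
static int example_refill_slots(swp_entry_t *slots, int nr_wanted)
{
        /* May return fewer than nr_wanted, or 0 if swap is exhausted. */
        return get_swap_pages(nr_wanted, slots, 0);
}
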
1141 return p; 1345 goto bad_free; << 1346 return si; << 1347 1142 1348 bad_free: << 1349 pr_err("%s: %s%08lx\n", __func__, Unu << 1350 goto out; << 1351 bad_offset: 1143 bad_offset: 1352 pr_err("%s: %s%08lx\n", __func__, Bad 1144 pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val); 1353 goto out; 1145 goto out; 1354 bad_device: 1146 bad_device: 1355 pr_err("%s: %s%08lx\n", __func__, Unu 1147 pr_err("%s: %s%08lx\n", __func__, Unused_file, entry.val); 1356 goto out; 1148 goto out; 1357 bad_nofile: 1149 bad_nofile: 1358 pr_err("%s: %s%08lx\n", __func__, Bad 1150 pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); 1359 out: 1151 out: 1360 return NULL; 1152 return NULL; 1361 } 1153 } 1362 1154 >> 1155 static struct swap_info_struct *_swap_info_get(swp_entry_t entry) >> 1156 { >> 1157 struct swap_info_struct *p; >> 1158 >> 1159 p = __swap_info_get(entry); >> 1160 if (!p) >> 1161 goto out; >> 1162 if (data_race(!p->swap_map[swp_offset(entry)])) >> 1163 goto bad_free; >> 1164 return p; >> 1165 >> 1166 bad_free: >> 1167 pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val); >> 1168 out: >> 1169 return NULL; >> 1170 } >> 1171 >> 1172 static struct swap_info_struct *swap_info_get(swp_entry_t entry) >> 1173 { >> 1174 struct swap_info_struct *p; >> 1175 >> 1176 p = _swap_info_get(entry); >> 1177 if (p) >> 1178 spin_lock(&p->lock); >> 1179 return p; >> 1180 } >> 1181 1363 static struct swap_info_struct *swap_info_get 1182 static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, 1364 struc 1183 struct swap_info_struct *q) 1365 { 1184 { 1366 struct swap_info_struct *p; 1185 struct swap_info_struct *p; 1367 1186 1368 p = _swap_info_get(entry); 1187 p = _swap_info_get(entry); 1369 1188 1370 if (p != q) { 1189 if (p != q) { 1371 if (q != NULL) 1190 if (q != NULL) 1372 spin_unlock(&q->lock) 1191 spin_unlock(&q->lock); 1373 if (p != NULL) 1192 if (p != NULL) 1374 spin_lock(&p->lock); 1193 spin_lock(&p->lock); 1375 } 1194 } 1376 return p; 1195 return p; 1377 } 1196 } 1378 1197 1379 static unsigned char __swap_entry_free_locked !! 1198 static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, 1380 1199 unsigned long offset, 1381 1200 unsigned char usage) 1382 { 1201 { 1383 unsigned char count; 1202 unsigned char count; 1384 unsigned char has_cache; 1203 unsigned char has_cache; 1385 1204 1386 count = si->swap_map[offset]; !! 1205 count = p->swap_map[offset]; 1387 1206 1388 has_cache = count & SWAP_HAS_CACHE; 1207 has_cache = count & SWAP_HAS_CACHE; 1389 count &= ~SWAP_HAS_CACHE; 1208 count &= ~SWAP_HAS_CACHE; 1390 1209 1391 if (usage == SWAP_HAS_CACHE) { 1210 if (usage == SWAP_HAS_CACHE) { 1392 VM_BUG_ON(!has_cache); 1211 VM_BUG_ON(!has_cache); 1393 has_cache = 0; 1212 has_cache = 0; 1394 } else if (count == SWAP_MAP_SHMEM) { 1213 } else if (count == SWAP_MAP_SHMEM) { 1395 /* 1214 /* 1396 * Or we could insist on shme 1215 * Or we could insist on shmem.c using a special 1397 * swap_shmem_free() and free 1216 * swap_shmem_free() and free_shmem_swap_and_cache()... 1398 */ 1217 */ 1399 count = 0; 1218 count = 0; 1400 } else if ((count & ~COUNT_CONTINUED) 1219 } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { 1401 if (count == COUNT_CONTINUED) 1220 if (count == COUNT_CONTINUED) { 1402 if (swap_count_contin !! 
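
/*
 * [Illustrative aside, not part of swapfile.c] How a swap_map byte is read
 * by __swap_entry_free_locked(): SWAP_HAS_CACHE flags that the slot is also
 * held by the swap cache, while the remaining bits are the reference count
 * (possibly SWAP_MAP_SHMEM for shmem, or capped at SWAP_MAP_MAX with
 * COUNT_CONTINUED pointing to continuation pages).  The helper itself is
 * only a sketch.
 */
static inline void example_decode_swap_map(unsigned char ent,
                                           bool *has_cache, int *count)
{
        *has_cache = ent & SWAP_HAS_CACHE;      /* slot pinned by swap cache */
        *count = swap_count(ent);               /* remaining references */
}
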
1221 if (swap_count_continued(p, offset, count)) 1403 count = SWAP_ 1222 count = SWAP_MAP_MAX | COUNT_CONTINUED; 1404 else 1223 else 1405 count = SWAP_ 1224 count = SWAP_MAP_MAX; 1406 } else 1225 } else 1407 count--; 1226 count--; 1408 } 1227 } 1409 1228 1410 usage = count | has_cache; 1229 usage = count | has_cache; 1411 if (usage) 1230 if (usage) 1412 WRITE_ONCE(si->swap_map[offse !! 1231 WRITE_ONCE(p->swap_map[offset], usage); 1413 else 1232 else 1414 WRITE_ONCE(si->swap_map[offse !! 1233 WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE); 1415 1234 1416 return usage; 1235 return usage; 1417 } 1236 } 1418 1237 1419 /* 1238 /* 1420 * When we get a swap entry, if there aren't 1239 * When we get a swap entry, if there aren't some other ways to 1421 * prevent swapoff, such as the folio in swap !! 1240 * prevent swapoff, such as the folio in swap cache is locked, page 1422 * reader side is locked, etc., the swap entr !! 1241 * table lock is held, etc., the swap entry may become invalid because 1423 * because of swapoff. Then, we need to encl !! 1242 * of swapoff. Then, we need to enclose all swap related functions 1424 * functions with get_swap_device() and put_s !! 1243 * with get_swap_device() and put_swap_device(), unless the swap 1425 * swap functions call get/put_swap_device() !! 1244 * functions call get/put_swap_device() by themselves. 1426 * !! 1245 * 1427 * RCU reader side lock (including any spinlo !! 1246 * Note that when only holding the PTL, swapoff might succeed immediately 1428 * prevent swapoff, because synchronize_rcu() !! 1247 * after freeing a swap entry. Therefore, immediately after 1429 * before freeing data structures. !! 1248 * __swap_entry_free(), the swap info might become stale and should not >> 1249 * be touched without a prior get_swap_device(). 1430 * 1250 * 1431 * Check whether swap entry is valid in the s 1251 * Check whether swap entry is valid in the swap device. If so, 1432 * return pointer to swap_info_struct, and ke 1252 * return pointer to swap_info_struct, and keep the swap entry valid 1433 * via preventing the swap device from being 1253 * via preventing the swap device from being swapoff, until 1434 * put_swap_device() is called. Otherwise re 1254 * put_swap_device() is called. Otherwise return NULL. 1435 * 1255 * 1436 * Notice that swapoff or swapoff+swapon can 1256 * Notice that swapoff or swapoff+swapon can still happen before the 1437 * percpu_ref_tryget_live() in get_swap_devic 1257 * percpu_ref_tryget_live() in get_swap_device() or after the 1438 * percpu_ref_put() in put_swap_device() if t 1258 * percpu_ref_put() in put_swap_device() if there isn't any other way 1439 * to prevent swapoff. The caller must be pr 1259 * to prevent swapoff. The caller must be prepared for that. For 1440 * example, the following situation is possib 1260 * example, the following situation is possible. 1441 * 1261 * 1442 * CPU1 CPU2 1262 * CPU1 CPU2 1443 * do_swap_page() 1263 * do_swap_page() 1444 * ... swapo 1264 * ... 
swapoff+swapon 1445 * __read_swap_cache_async() 1265 * __read_swap_cache_async() 1446 * swapcache_prepare() 1266 * swapcache_prepare() 1447 * __swap_duplicate() 1267 * __swap_duplicate() 1448 * // check swap_map 1268 * // check swap_map 1449 * // verify PTE not changed 1269 * // verify PTE not changed 1450 * 1270 * 1451 * In __swap_duplicate(), the swap_map need t 1271 * In __swap_duplicate(), the swap_map need to be checked before 1452 * changing partly because the specified swap 1272 * changing partly because the specified swap entry may be for another 1453 * swap device which has been swapoff. And i 1273 * swap device which has been swapoff. And in do_swap_page(), after 1454 * the page is read from the swap device, the 1274 * the page is read from the swap device, the PTE is verified not 1455 * changed with the page table locked to chec 1275 * changed with the page table locked to check whether the swap device 1456 * has been swapoff or swapoff+swapon. 1276 * has been swapoff or swapoff+swapon. 1457 */ 1277 */ 1458 struct swap_info_struct *get_swap_device(swp_ 1278 struct swap_info_struct *get_swap_device(swp_entry_t entry) 1459 { 1279 { 1460 struct swap_info_struct *si; 1280 struct swap_info_struct *si; 1461 unsigned long offset; 1281 unsigned long offset; 1462 1282 1463 if (!entry.val) 1283 if (!entry.val) 1464 goto out; 1284 goto out; 1465 si = swp_swap_info(entry); 1285 si = swp_swap_info(entry); 1466 if (!si) 1286 if (!si) 1467 goto bad_nofile; 1287 goto bad_nofile; 1468 if (!percpu_ref_tryget_live(&si->user 1288 if (!percpu_ref_tryget_live(&si->users)) 1469 goto out; 1289 goto out; 1470 /* 1290 /* 1471 * Guarantee the si->users are checke 1291 * Guarantee the si->users are checked before accessing other 1472 * fields of swap_info_struct. 1292 * fields of swap_info_struct. 1473 * 1293 * 1474 * Paired with the spin_unlock() afte 1294 * Paired with the spin_unlock() after setup_swap_info() in 1475 * enable_swap_info(). 1295 * enable_swap_info(). 1476 */ 1296 */ 1477 smp_rmb(); 1297 smp_rmb(); 1478 offset = swp_offset(entry); 1298 offset = swp_offset(entry); 1479 if (offset >= si->max) 1299 if (offset >= si->max) 1480 goto put_out; 1300 goto put_out; 1481 1301 1482 return si; 1302 return si; 1483 bad_nofile: 1303 bad_nofile: 1484 pr_err("%s: %s%08lx\n", __func__, Bad 1304 pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); 1485 out: 1305 out: 1486 return NULL; 1306 return NULL; 1487 put_out: 1307 put_out: 1488 pr_err("%s: %s%08lx\n", __func__, Bad << 1489 percpu_ref_put(&si->users); 1308 percpu_ref_put(&si->users); 1490 return NULL; 1309 return NULL; 1491 } 1310 } 1492 1311 1493 static unsigned char __swap_entry_free(struct !! 1312 static unsigned char __swap_entry_free(struct swap_info_struct *p, 1494 swp_en 1313 swp_entry_t entry) 1495 { 1314 { 1496 struct swap_cluster_info *ci; 1315 struct swap_cluster_info *ci; 1497 unsigned long offset = swp_offset(ent 1316 unsigned long offset = swp_offset(entry); 1498 unsigned char usage; 1317 unsigned char usage; 1499 1318 1500 ci = lock_cluster_or_swap_info(si, of !! 1319 ci = lock_cluster_or_swap_info(p, offset); 1501 usage = __swap_entry_free_locked(si, !! 1320 usage = __swap_entry_free_locked(p, offset, 1); 1502 unlock_cluster_or_swap_info(si, ci); !! 1321 unlock_cluster_or_swap_info(p, ci); 1503 if (!usage) 1322 if (!usage) 1504 free_swap_slot(entry); 1323 free_swap_slot(entry); 1505 1324 1506 return usage; 1325 return usage; 1507 } 1326 } 1508 1327 1509 static bool __swap_entries_free(struct swap_i !! 
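
/*
 * [Illustrative aside, not part of swapfile.c] The caller pattern described
 * by the get_swap_device() comment above: pin the device, do the racy read
 * or revalidation you need, then drop the pin.  It mirrors what the swap
 * count helpers further down do; a sketch, not verbatim kernel code.
 */
static int example_peek_swap_count(swp_entry_t entry)
{
        struct swap_info_struct *si;
        int count;

        si = get_swap_device(entry);            /* keeps swapoff from freeing si */
        if (!si)
                return 0;                       /* device gone or bad entry */
        count = swap_count(READ_ONCE(si->swap_map[swp_offset(entry)]));
        put_swap_device(si);                    /* drop the percpu_ref pin */
        return count;
}
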
1328 static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) 1510 swp_entry_t entry, int nr) << 1511 { 1329 { 1512 unsigned long offset = swp_offset(ent << 1513 unsigned int type = swp_type(entry); << 1514 struct swap_cluster_info *ci; 1330 struct swap_cluster_info *ci; 1515 bool has_cache = false; << 1516 unsigned char count; << 1517 int i; << 1518 << 1519 if (nr <= 1 || swap_count(data_race(s << 1520 goto fallback; << 1521 /* cross into another cluster */ << 1522 if (nr > SWAPFILE_CLUSTER - offset % << 1523 goto fallback; << 1524 << 1525 ci = lock_cluster_or_swap_info(si, of << 1526 if (!swap_is_last_map(si, offset, nr, << 1527 unlock_cluster_or_swap_info(s << 1528 goto fallback; << 1529 } << 1530 for (i = 0; i < nr; i++) << 1531 WRITE_ONCE(si->swap_map[offse << 1532 unlock_cluster_or_swap_info(si, ci); << 1533 << 1534 if (!has_cache) { << 1535 for (i = 0; i < nr; i++) << 1536 zswap_invalidate(swp_ << 1537 spin_lock(&si->lock); << 1538 swap_entry_range_free(si, ent << 1539 spin_unlock(&si->lock); << 1540 } << 1541 return has_cache; << 1542 << 1543 fallback: << 1544 for (i = 0; i < nr; i++) { << 1545 if (data_race(si->swap_map[of << 1546 count = __swap_entry_ << 1547 if (count == SWAP_HAS << 1548 has_cache = t << 1549 } else { << 1550 WARN_ON_ONCE(1); << 1551 } << 1552 } << 1553 return has_cache; << 1554 } << 1555 << 1556 /* << 1557 * Drop the last HAS_CACHE flag of swap entri << 1558 * ensure all entries belong to the same cgro << 1559 */ << 1560 static void swap_entry_range_free(struct swap << 1561 unsigned in << 1562 { << 1563 unsigned long offset = swp_offset(ent 1331 unsigned long offset = swp_offset(entry); 1564 unsigned char *map = si->swap_map + o !! 1332 unsigned char count; 1565 unsigned char *map_end = map + nr_pag << 1566 struct swap_cluster_info *ci; << 1567 1333 1568 ci = lock_cluster(si, offset); !! 1334 ci = lock_cluster(p, offset); 1569 do { !! 1335 count = p->swap_map[offset]; 1570 VM_BUG_ON(*map != SWAP_HAS_CA !! 1336 VM_BUG_ON(count != SWAP_HAS_CACHE); 1571 *map = 0; !! 1337 p->swap_map[offset] = 0; 1572 } while (++map < map_end); !! 1338 dec_cluster_info_page(p, p->cluster_info, offset); 1573 dec_cluster_info_page(si, ci, nr_page << 1574 unlock_cluster(ci); 1339 unlock_cluster(ci); 1575 1340 1576 mem_cgroup_uncharge_swap(entry, nr_pa !! 1341 mem_cgroup_uncharge_swap(entry, 1); 1577 swap_range_free(si, offset, nr_pages) !! 1342 swap_range_free(p, offset, 1); 1578 } << 1579 << 1580 static void cluster_swap_free_nr(struct swap_ << 1581 unsigned long offset, int nr_ << 1582 unsigned char usage) << 1583 { << 1584 struct swap_cluster_info *ci; << 1585 DECLARE_BITMAP(to_free, BITS_PER_LONG << 1586 int i, nr; << 1587 << 1588 ci = lock_cluster_or_swap_info(si, of << 1589 while (nr_pages) { << 1590 nr = min(BITS_PER_LONG, nr_pa << 1591 for (i = 0; i < nr; i++) { << 1592 if (!__swap_entry_fre << 1593 bitmap_set(to << 1594 } << 1595 if (!bitmap_empty(to_free, BI << 1596 unlock_cluster_or_swa << 1597 for_each_set_bit(i, t << 1598 free_swap_slo << 1599 if (nr == nr_pages) << 1600 return; << 1601 bitmap_clear(to_free, << 1602 ci = lock_cluster_or_ << 1603 } << 1604 offset += nr; << 1605 nr_pages -= nr; << 1606 } << 1607 unlock_cluster_or_swap_info(si, ci); << 1608 } 1343 } 1609 1344 1610 /* 1345 /* 1611 * Caller has made sure that the swap device 1346 * Caller has made sure that the swap device corresponding to entry 1612 * is still around or has not been recycled. 1347 * is still around or has not been recycled. 
1613 */ 1348 */ 1614 void swap_free_nr(swp_entry_t entry, int nr_p !! 1349 void swap_free(swp_entry_t entry) 1615 { 1350 { 1616 int nr; !! 1351 struct swap_info_struct *p; 1617 struct swap_info_struct *sis; << 1618 unsigned long offset = swp_offset(ent << 1619 << 1620 sis = _swap_info_get(entry); << 1621 if (!sis) << 1622 return; << 1623 1352 1624 while (nr_pages) { !! 1353 p = _swap_info_get(entry); 1625 nr = min_t(int, nr_pages, SWA !! 1354 if (p) 1626 cluster_swap_free_nr(sis, off !! 1355 __swap_entry_free(p, entry); 1627 offset += nr; << 1628 nr_pages -= nr; << 1629 } << 1630 } 1356 } 1631 1357 1632 /* 1358 /* 1633 * Called after dropping swapcache to decreas 1359 * Called after dropping swapcache to decrease refcnt to swap entries. 1634 */ 1360 */ 1635 void put_swap_folio(struct folio *folio, swp_ !! 1361 void put_swap_page(struct page *page, swp_entry_t entry) 1636 { 1362 { 1637 unsigned long offset = swp_offset(ent 1363 unsigned long offset = swp_offset(entry); >> 1364 unsigned long idx = offset / SWAPFILE_CLUSTER; 1638 struct swap_cluster_info *ci; 1365 struct swap_cluster_info *ci; 1639 struct swap_info_struct *si; 1366 struct swap_info_struct *si; 1640 int size = 1 << swap_entry_order(foli !! 1367 unsigned char *map; >> 1368 unsigned int i, free_entries = 0; >> 1369 unsigned char val; >> 1370 int size = swap_entry_size(thp_nr_pages(page)); 1641 1371 1642 si = _swap_info_get(entry); 1372 si = _swap_info_get(entry); 1643 if (!si) 1373 if (!si) 1644 return; 1374 return; 1645 1375 1646 ci = lock_cluster_or_swap_info(si, of 1376 ci = lock_cluster_or_swap_info(si, offset); 1647 if (size > 1 && swap_is_has_cache(si, !! 1377 if (size == SWAPFILE_CLUSTER) { 1648 unlock_cluster_or_swap_info(s !! 1378 VM_BUG_ON(!cluster_is_huge(ci)); 1649 spin_lock(&si->lock); !! 1379 map = si->swap_map + offset; 1650 swap_entry_range_free(si, ent !! 1380 for (i = 0; i < SWAPFILE_CLUSTER; i++) { 1651 spin_unlock(&si->lock); !! 1381 val = map[i]; 1652 return; !! 1382 VM_BUG_ON(!(val & SWAP_HAS_CACHE)); >> 1383 if (val == SWAP_HAS_CACHE) >> 1384 free_entries++; >> 1385 } >> 1386 cluster_clear_huge(ci); >> 1387 if (free_entries == SWAPFILE_CLUSTER) { >> 1388 unlock_cluster_or_swap_info(si, ci); >> 1389 spin_lock(&si->lock); >> 1390 mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); >> 1391 swap_free_cluster(si, idx); >> 1392 spin_unlock(&si->lock); >> 1393 return; >> 1394 } 1653 } 1395 } 1654 for (int i = 0; i < size; i++, entry. !! 
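
/*
 * [Illustrative aside, not part of swapfile.c] put_swap_folio(), defined
 * here, is the tail of swap cache removal: once the folio has been taken out
 * of the swap cache, the SWAP_HAS_CACHE reference on each of its slots is
 * dropped.  Roughly (swap cache locking and refcounting elided; not the
 * verbatim mm/swap_state.c code):
 */
static void example_remove_from_swap_cache(struct folio *folio)
{
        swp_entry_t entry = folio->swap;        /* entry the folio was cached at */

        /* ... delete the folio from the swap cache xarray ... */
        put_swap_folio(folio, entry);           /* release SWAP_HAS_CACHE refs */
}
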
1396 for (i = 0; i < size; i++, entry.val++) { 1655 if (!__swap_entry_free_locked 1397 if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { 1656 unlock_cluster_or_swa 1398 unlock_cluster_or_swap_info(si, ci); 1657 free_swap_slot(entry) 1399 free_swap_slot(entry); 1658 if (i == size - 1) 1400 if (i == size - 1) 1659 return; 1401 return; 1660 lock_cluster_or_swap_ 1402 lock_cluster_or_swap_info(si, offset); 1661 } 1403 } 1662 } 1404 } 1663 unlock_cluster_or_swap_info(si, ci); 1405 unlock_cluster_or_swap_info(si, ci); 1664 } 1406 } 1665 1407 >> 1408 #ifdef CONFIG_THP_SWAP >> 1409 int split_swap_cluster(swp_entry_t entry) >> 1410 { >> 1411 struct swap_info_struct *si; >> 1412 struct swap_cluster_info *ci; >> 1413 unsigned long offset = swp_offset(entry); >> 1414 >> 1415 si = _swap_info_get(entry); >> 1416 if (!si) >> 1417 return -EBUSY; >> 1418 ci = lock_cluster(si, offset); >> 1419 cluster_clear_huge(ci); >> 1420 unlock_cluster(ci); >> 1421 return 0; >> 1422 } >> 1423 #endif >> 1424 1666 static int swp_entry_cmp(const void *ent1, co 1425 static int swp_entry_cmp(const void *ent1, const void *ent2) 1667 { 1426 { 1668 const swp_entry_t *e1 = ent1, *e2 = e 1427 const swp_entry_t *e1 = ent1, *e2 = ent2; 1669 1428 1670 return (int)swp_type(*e1) - (int)swp_ 1429 return (int)swp_type(*e1) - (int)swp_type(*e2); 1671 } 1430 } 1672 1431 1673 void swapcache_free_entries(swp_entry_t *entr 1432 void swapcache_free_entries(swp_entry_t *entries, int n) 1674 { 1433 { 1675 struct swap_info_struct *p, *prev; 1434 struct swap_info_struct *p, *prev; 1676 int i; 1435 int i; 1677 1436 1678 if (n <= 0) 1437 if (n <= 0) 1679 return; 1438 return; 1680 1439 1681 prev = NULL; 1440 prev = NULL; 1682 p = NULL; 1441 p = NULL; 1683 1442 1684 /* 1443 /* 1685 * Sort swap entries by swap device, 1444 * Sort swap entries by swap device, so each lock is only taken once. 1686 * nr_swapfiles isn't absolutely corr 1445 * nr_swapfiles isn't absolutely correct, but the overhead of sort() is 1687 * so low that it isn't necessary to 1446 * so low that it isn't necessary to optimize further. 1688 */ 1447 */ 1689 if (nr_swapfiles > 1) 1448 if (nr_swapfiles > 1) 1690 sort(entries, n, sizeof(entri 1449 sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL); 1691 for (i = 0; i < n; ++i) { 1450 for (i = 0; i < n; ++i) { 1692 p = swap_info_get_cont(entrie 1451 p = swap_info_get_cont(entries[i], prev); 1693 if (p) 1452 if (p) 1694 swap_entry_range_free !! 1453 swap_entry_free(p, entries[i]); 1695 prev = p; 1454 prev = p; 1696 } 1455 } 1697 if (p) 1456 if (p) 1698 spin_unlock(&p->lock); 1457 spin_unlock(&p->lock); 1699 } 1458 } 1700 1459 >> 1460 /* >> 1461 * How many references to page are currently swapped out? >> 1462 * This does not give an exact answer when swap count is continued, >> 1463 * but does include the high COUNT_CONTINUED flag to allow for that. 
>> 1464 */ >> 1465 int page_swapcount(struct page *page) >> 1466 { >> 1467 int count = 0; >> 1468 struct swap_info_struct *p; >> 1469 struct swap_cluster_info *ci; >> 1470 swp_entry_t entry; >> 1471 unsigned long offset; >> 1472 >> 1473 entry.val = page_private(page); >> 1474 p = _swap_info_get(entry); >> 1475 if (p) { >> 1476 offset = swp_offset(entry); >> 1477 ci = lock_cluster_or_swap_info(p, offset); >> 1478 count = swap_count(p->swap_map[offset]); >> 1479 unlock_cluster_or_swap_info(p, ci); >> 1480 } >> 1481 return count; >> 1482 } >> 1483 1701 int __swap_count(swp_entry_t entry) 1484 int __swap_count(swp_entry_t entry) 1702 { 1485 { 1703 struct swap_info_struct *si = swp_swa !! 1486 struct swap_info_struct *si; 1704 pgoff_t offset = swp_offset(entry); 1487 pgoff_t offset = swp_offset(entry); >> 1488 int count = 0; 1705 1489 1706 return swap_count(si->swap_map[offset !! 1490 si = get_swap_device(entry); >> 1491 if (si) { >> 1492 count = swap_count(si->swap_map[offset]); >> 1493 put_swap_device(si); >> 1494 } >> 1495 return count; 1707 } 1496 } 1708 1497 1709 /* !! 1498 static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) 1710 * How many references to @entry are currentl << 1711 * This does not give an exact answer when sw << 1712 * but does include the high COUNT_CONTINUED << 1713 */ << 1714 int swap_swapcount(struct swap_info_struct *s << 1715 { 1499 { >> 1500 int count = 0; 1716 pgoff_t offset = swp_offset(entry); 1501 pgoff_t offset = swp_offset(entry); 1717 struct swap_cluster_info *ci; 1502 struct swap_cluster_info *ci; 1718 int count; << 1719 1503 1720 ci = lock_cluster_or_swap_info(si, of 1504 ci = lock_cluster_or_swap_info(si, offset); 1721 count = swap_count(si->swap_map[offse 1505 count = swap_count(si->swap_map[offset]); 1722 unlock_cluster_or_swap_info(si, ci); 1506 unlock_cluster_or_swap_info(si, ci); 1723 return count; 1507 return count; 1724 } 1508 } 1725 1509 1726 /* 1510 /* 1727 * How many references to @entry are currentl 1511 * How many references to @entry are currently swapped out? >> 1512 * This does not give an exact answer when swap count is continued, >> 1513 * but does include the high COUNT_CONTINUED flag to allow for that. >> 1514 */ >> 1515 int __swp_swapcount(swp_entry_t entry) >> 1516 { >> 1517 int count = 0; >> 1518 struct swap_info_struct *si; >> 1519 >> 1520 si = get_swap_device(entry); >> 1521 if (si) { >> 1522 count = swap_swapcount(si, entry); >> 1523 put_swap_device(si); >> 1524 } >> 1525 return count; >> 1526 } >> 1527 >> 1528 /* >> 1529 * How many references to @entry are currently swapped out? 1728 * This considers COUNT_CONTINUED so it retur 1530 * This considers COUNT_CONTINUED so it returns exact answer. 1729 */ 1531 */ 1730 int swp_swapcount(swp_entry_t entry) 1532 int swp_swapcount(swp_entry_t entry) 1731 { 1533 { 1732 int count, tmp_count, n; 1534 int count, tmp_count, n; 1733 struct swap_info_struct *si; !! 1535 struct swap_info_struct *p; 1734 struct swap_cluster_info *ci; 1536 struct swap_cluster_info *ci; 1735 struct page *page; 1537 struct page *page; 1736 pgoff_t offset; 1538 pgoff_t offset; 1737 unsigned char *map; 1539 unsigned char *map; 1738 1540 1739 si = _swap_info_get(entry); !! 1541 p = _swap_info_get(entry); 1740 if (!si) !! 1542 if (!p) 1741 return 0; 1543 return 0; 1742 1544 1743 offset = swp_offset(entry); 1545 offset = swp_offset(entry); 1744 1546 1745 ci = lock_cluster_or_swap_info(si, of !! 1547 ci = lock_cluster_or_swap_info(p, offset); 1746 1548 1747 count = swap_count(si->swap_map[offse !! 
1549 count = swap_count(p->swap_map[offset]); 1748 if (!(count & COUNT_CONTINUED)) 1550 if (!(count & COUNT_CONTINUED)) 1749 goto out; 1551 goto out; 1750 1552 1751 count &= ~COUNT_CONTINUED; 1553 count &= ~COUNT_CONTINUED; 1752 n = SWAP_MAP_MAX + 1; 1554 n = SWAP_MAP_MAX + 1; 1753 1555 1754 page = vmalloc_to_page(si->swap_map + !! 1556 page = vmalloc_to_page(p->swap_map + offset); 1755 offset &= ~PAGE_MASK; 1557 offset &= ~PAGE_MASK; 1756 VM_BUG_ON(page_private(page) != SWP_C 1558 VM_BUG_ON(page_private(page) != SWP_CONTINUED); 1757 1559 1758 do { 1560 do { 1759 page = list_next_entry(page, 1561 page = list_next_entry(page, lru); 1760 map = kmap_local_page(page); !! 1562 map = kmap_atomic(page); 1761 tmp_count = map[offset]; 1563 tmp_count = map[offset]; 1762 kunmap_local(map); !! 1564 kunmap_atomic(map); 1763 1565 1764 count += (tmp_count & ~COUNT_ 1566 count += (tmp_count & ~COUNT_CONTINUED) * n; 1765 n *= (SWAP_CONT_MAX + 1); 1567 n *= (SWAP_CONT_MAX + 1); 1766 } while (tmp_count & COUNT_CONTINUED) 1568 } while (tmp_count & COUNT_CONTINUED); 1767 out: 1569 out: 1768 unlock_cluster_or_swap_info(si, ci); !! 1570 unlock_cluster_or_swap_info(p, ci); 1769 return count; 1571 return count; 1770 } 1572 } 1771 1573 1772 static bool swap_page_trans_huge_swapped(stru 1574 static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, 1773 swp_ !! 1575 swp_entry_t entry) 1774 { 1576 { 1775 struct swap_cluster_info *ci; 1577 struct swap_cluster_info *ci; 1776 unsigned char *map = si->swap_map; 1578 unsigned char *map = si->swap_map; 1777 unsigned int nr_pages = 1 << order; << 1778 unsigned long roffset = swp_offset(en 1579 unsigned long roffset = swp_offset(entry); 1779 unsigned long offset = round_down(rof !! 1580 unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER); 1780 int i; 1581 int i; 1781 bool ret = false; 1582 bool ret = false; 1782 1583 1783 ci = lock_cluster_or_swap_info(si, of 1584 ci = lock_cluster_or_swap_info(si, offset); 1784 if (!ci || nr_pages == 1) { !! 1585 if (!ci || !cluster_is_huge(ci)) { 1785 if (swap_count(map[roffset])) 1586 if (swap_count(map[roffset])) 1786 ret = true; 1587 ret = true; 1787 goto unlock_out; 1588 goto unlock_out; 1788 } 1589 } 1789 for (i = 0; i < nr_pages; i++) { !! 1590 for (i = 0; i < SWAPFILE_CLUSTER; i++) { 1790 if (swap_count(map[offset + i 1591 if (swap_count(map[offset + i])) { 1791 ret = true; 1592 ret = true; 1792 break; 1593 break; 1793 } 1594 } 1794 } 1595 } 1795 unlock_out: 1596 unlock_out: 1796 unlock_cluster_or_swap_info(si, ci); 1597 unlock_cluster_or_swap_info(si, ci); 1797 return ret; 1598 return ret; 1798 } 1599 } 1799 1600 1800 static bool folio_swapped(struct folio *folio !! 1601 static bool page_swapped(struct page *page) 1801 { 1602 { 1802 swp_entry_t entry = folio->swap; !! 1603 swp_entry_t entry; 1803 struct swap_info_struct *si = _swap_i !! 1604 struct swap_info_struct *si; 1804 1605 1805 if (!si) !! 1606 if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) 1806 return false; !! 1607 return page_swapcount(page) != 0; >> 1608 >> 1609 page = compound_head(page); >> 1610 entry.val = page_private(page); >> 1611 si = _swap_info_get(entry); >> 1612 if (si) >> 1613 return swap_page_trans_huge_swapped(si, entry); >> 1614 return false; >> 1615 } 1807 1616 1808 if (!IS_ENABLED(CONFIG_THP_SWAP) || l !! 1617 static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, 1809 return swap_swapcount(si, ent !! 
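
/*
 * [Illustrative aside, not part of swapfile.c] The positional arithmetic
 * behind swp_swapcount() above: the swap_map byte is the lowest "digit", the
 * first continuation page is weighted by SWAP_MAP_MAX + 1, and every further
 * continuation page multiplies the weight by SWAP_CONT_MAX + 1.  The
 * array-based helper is hypothetical; @first is assumed to already have
 * SWAP_HAS_CACHE stripped (i.e. swap_count() was applied).
 */
static int example_total_swap_count(unsigned char first,
                                    const unsigned char *cont, int nr_cont)
{
        int total = first & ~COUNT_CONTINUED;
        int weight = SWAP_MAP_MAX + 1;
        int i;

        for (i = 0; i < nr_cont; i++) {
                total += (cont[i] & ~COUNT_CONTINUED) * weight;
                weight *= SWAP_CONT_MAX + 1;
        }
        return total;
}
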
1618 int *total_swapcount) >> 1619 { >> 1620 int i, map_swapcount, _total_mapcount, _total_swapcount; >> 1621 unsigned long offset = 0; >> 1622 struct swap_info_struct *si; >> 1623 struct swap_cluster_info *ci = NULL; >> 1624 unsigned char *map = NULL; >> 1625 int mapcount, swapcount = 0; >> 1626 >> 1627 /* hugetlbfs shouldn't call it */ >> 1628 VM_BUG_ON_PAGE(PageHuge(page), page); >> 1629 >> 1630 if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) { >> 1631 mapcount = page_trans_huge_mapcount(page, total_mapcount); >> 1632 if (PageSwapCache(page)) >> 1633 swapcount = page_swapcount(page); >> 1634 if (total_swapcount) >> 1635 *total_swapcount = swapcount; >> 1636 return mapcount + swapcount; >> 1637 } >> 1638 >> 1639 page = compound_head(page); >> 1640 >> 1641 _total_mapcount = _total_swapcount = map_swapcount = 0; >> 1642 if (PageSwapCache(page)) { >> 1643 swp_entry_t entry; 1810 1644 1811 return swap_page_trans_huge_swapped(s !! 1645 entry.val = page_private(page); >> 1646 si = _swap_info_get(entry); >> 1647 if (si) { >> 1648 map = si->swap_map; >> 1649 offset = swp_offset(entry); >> 1650 } >> 1651 } >> 1652 if (map) >> 1653 ci = lock_cluster(si, offset); >> 1654 for (i = 0; i < HPAGE_PMD_NR; i++) { >> 1655 mapcount = atomic_read(&page[i]._mapcount) + 1; >> 1656 _total_mapcount += mapcount; >> 1657 if (map) { >> 1658 swapcount = swap_count(map[offset + i]); >> 1659 _total_swapcount += swapcount; >> 1660 } >> 1661 map_swapcount = max(map_swapcount, mapcount + swapcount); >> 1662 } >> 1663 unlock_cluster(ci); >> 1664 if (PageDoubleMap(page)) { >> 1665 map_swapcount -= 1; >> 1666 _total_mapcount -= HPAGE_PMD_NR; >> 1667 } >> 1668 mapcount = compound_mapcount(page); >> 1669 map_swapcount += mapcount; >> 1670 _total_mapcount += mapcount; >> 1671 if (total_mapcount) >> 1672 *total_mapcount = _total_mapcount; >> 1673 if (total_swapcount) >> 1674 *total_swapcount = _total_swapcount; >> 1675 >> 1676 return map_swapcount; 1812 } 1677 } 1813 1678 1814 static bool folio_swapcache_freeable(struct f !! 1679 /* >> 1680 * We can write to an anon page without COW if there are no other references >> 1681 * to it. And as a side-effect, free up its swap: because the old content >> 1682 * on disk will never be read, and seeking back there to write new content >> 1683 * later would only waste time away from clustering. >> 1684 * >> 1685 * NOTE: total_map_swapcount should not be relied upon by the caller if >> 1686 * reuse_swap_page() returns false, but it may be always overwritten >> 1687 * (see the other implementation for CONFIG_SWAP=n). >> 1688 */ >> 1689 bool reuse_swap_page(struct page *page, int *total_map_swapcount) 1815 { 1690 { 1816 VM_BUG_ON_FOLIO(!folio_test_locked(fo !! 1691 int count, total_mapcount, total_swapcount; 1817 1692 1818 if (!folio_test_swapcache(folio)) !! 1693 VM_BUG_ON_PAGE(!PageLocked(page), page); 1819 return false; !! 
1694 if (unlikely(PageKsm(page))) 1820 if (folio_test_writeback(folio)) << 1821 return false; 1695 return false; >> 1696 count = page_trans_huge_map_swapcount(page, &total_mapcount, >> 1697 &total_swapcount); >> 1698 if (total_map_swapcount) >> 1699 *total_map_swapcount = total_mapcount + total_swapcount; >> 1700 if (count == 1 && PageSwapCache(page) && >> 1701 (likely(!PageTransCompound(page)) || >> 1702 /* The remaining swap count will be freed soon */ >> 1703 total_swapcount == page_swapcount(page))) { >> 1704 if (!PageWriteback(page)) { >> 1705 page = compound_head(page); >> 1706 delete_from_swap_cache(page); >> 1707 SetPageDirty(page); >> 1708 } else { >> 1709 swp_entry_t entry; >> 1710 struct swap_info_struct *p; >> 1711 >> 1712 entry.val = page_private(page); >> 1713 p = swap_info_get(entry); >> 1714 if (p->flags & SWP_STABLE_WRITES) { >> 1715 spin_unlock(&p->lock); >> 1716 return false; >> 1717 } >> 1718 spin_unlock(&p->lock); >> 1719 } >> 1720 } >> 1721 >> 1722 return count <= 1; >> 1723 } >> 1724 >> 1725 /* >> 1726 * If swap is getting full, or if there are no more mappings of this page, >> 1727 * then try_to_free_swap is called to free its swap space. >> 1728 */ >> 1729 int try_to_free_swap(struct page *page) >> 1730 { >> 1731 VM_BUG_ON_PAGE(!PageLocked(page), page); >> 1732 >> 1733 if (!PageSwapCache(page)) >> 1734 return 0; >> 1735 if (PageWriteback(page)) >> 1736 return 0; >> 1737 if (page_swapped(page)) >> 1738 return 0; 1822 1739 1823 /* 1740 /* 1824 * Once hibernation has begun to crea 1741 * Once hibernation has begun to create its image of memory, 1825 * there's a danger that one of the c !! 1742 * there's a danger that one of the calls to try_to_free_swap() 1826 * - most probably a call from __try_ 1743 * - most probably a call from __try_to_reclaim_swap() while 1827 * hibernation is allocating its own 1744 * hibernation is allocating its own swap pages for the image, 1828 * but conceivably even a call from m 1745 * but conceivably even a call from memory reclaim - will free 1829 * the swap from a folio which has al !! 1746 * the swap from a page which has already been recorded in the 1830 * image as a clean swapcache folio, !! 1747 * image as a clean swapcache page, and then reuse its swap for 1831 * another page of the image. On wak 1748 * another page of the image. On waking from hibernation, the 1832 * original folio might be freed unde !! 1749 * original page might be freed under memory pressure, then 1833 * later read back in from swap, now 1750 * later read back in from swap, now with the wrong data. 1834 * 1751 * 1835 * Hibernation suspends storage while 1752 * Hibernation suspends storage while it is writing the image 1836 * to disk so check that here. 1753 * to disk so check that here. 1837 */ 1754 */ 1838 if (pm_suspended_storage()) 1755 if (pm_suspended_storage()) 1839 return false; !! 1756 return 0; 1840 << 1841 return true; << 1842 } << 1843 << 1844 /** << 1845 * folio_free_swap() - Free the swap space us << 1846 * @folio: The folio to remove. << 1847 * << 1848 * If swap is getting full, or if there are n << 1849 * then call folio_free_swap to free its swap << 1850 * << 1851 * Return: true if we were able to release th << 1852 */ << 1853 bool folio_free_swap(struct folio *folio) << 1854 { << 1855 if (!folio_swapcache_freeable(folio)) << 1856 return false; << 1857 if (folio_swapped(folio)) << 1858 return false; << 1859 1757 1860 delete_from_swap_cache(folio); !! 1758 page = compound_head(page); 1861 folio_set_dirty(folio); !! 
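
/*
 * [Illustrative aside, not part of swapfile.c] A typical folio_free_swap()
 * call site: with the folio locked after swapin, the caller opportunistically
 * gives the swap slot back when swap space is tight.  Real callers such as
 * do_swap_page() use a more refined test; this only sketches the pattern
 * (folio_free_swap() itself rechecks that freeing is safe).
 */
static void example_maybe_free_swap(struct folio *folio)
{
        VM_WARN_ON_ONCE(!folio_test_locked(folio));

        if (vm_swap_full())                     /* swap nearly full: reclaim slot */
                folio_free_swap(folio);
}
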
1759 delete_from_swap_cache(page); 1862 return true; !! 1760 SetPageDirty(page); >> 1761 return 1; 1863 } 1762 } 1864 1763 1865 /** !! 1764 /* 1866 * free_swap_and_cache_nr() - Release referen !! 1765 * Free the swap entry like above, but also try to 1867 * reclaim their c !! 1766 * free the page cache entry if it is the last user. 1868 * @entry: First entry of range. << 1869 * @nr: Number of entries in range. << 1870 * << 1871 * For each swap entry in the contiguous rang << 1872 * entries become free, try to reclaim their << 1873 * offset range is defined by [entry.offset, << 1874 */ 1767 */ 1875 void free_swap_and_cache_nr(swp_entry_t entry !! 1768 int free_swap_and_cache(swp_entry_t entry) 1876 { 1769 { 1877 const unsigned long start_offset = sw !! 1770 struct swap_info_struct *p; 1878 const unsigned long end_offset = star !! 1771 unsigned char count; 1879 struct swap_info_struct *si; << 1880 bool any_only_cache = false; << 1881 unsigned long offset; << 1882 1772 1883 if (non_swap_entry(entry)) 1773 if (non_swap_entry(entry)) 1884 return; !! 1774 return 1; 1885 << 1886 si = get_swap_device(entry); << 1887 if (!si) << 1888 return; << 1889 1775 1890 if (WARN_ON(end_offset > si->max)) !! 1776 p = get_swap_device(entry); 1891 goto out; !! 1777 if (p) { 1892 !! 1778 if (WARN_ON(data_race(!p->swap_map[swp_offset(entry)]))) { 1893 /* !! 1779 put_swap_device(p); 1894 * First free all entries in the rang !! 1780 return 0; 1895 */ << 1896 any_only_cache = __swap_entries_free( << 1897 << 1898 /* << 1899 * Short-circuit the below loop if no << 1900 * reference drop to zero. << 1901 */ << 1902 if (!any_only_cache) << 1903 goto out; << 1904 << 1905 /* << 1906 * Now go back over the range trying << 1907 * more efficient for large folios be << 1908 * the swap once per folio in the com << 1909 * __swap_entry_free() and __try_to_r << 1910 * latter will get a reference and lo << 1911 * page but will only succeed once th << 1912 * zero. << 1913 */ << 1914 for (offset = start_offset; offset < << 1915 nr = 1; << 1916 if (READ_ONCE(si->swap_map[of << 1917 /* << 1918 * Folios are always << 1919 * advance forward to << 1920 * folio was found fo << 1921 * in this case. Nega << 1922 * but could not be r << 1923 * to the next bounda << 1924 */ << 1925 nr = __try_to_reclaim << 1926 << 1927 if (nr == 0) << 1928 nr = 1; << 1929 else if (nr < 0) << 1930 nr = -nr; << 1931 nr = ALIGN(offset + 1 << 1932 } 1781 } 1933 } << 1934 1782 1935 out: !! 1783 count = __swap_entry_free(p, entry); 1936 put_swap_device(si); !! 1784 if (count == SWAP_HAS_CACHE && >> 1785 !swap_page_trans_huge_swapped(p, entry)) >> 1786 __try_to_reclaim_swap(p, swp_offset(entry), >> 1787 TTRS_UNMAPPED | TTRS_FULL); >> 1788 put_swap_device(p); >> 1789 } >> 1790 return p != NULL; 1937 } 1791 } 1938 1792 1939 #ifdef CONFIG_HIBERNATION 1793 #ifdef CONFIG_HIBERNATION 1940 1794 1941 swp_entry_t get_swap_page_of_type(int type) 1795 swp_entry_t get_swap_page_of_type(int type) 1942 { 1796 { 1943 struct swap_info_struct *si = swap_ty 1797 struct swap_info_struct *si = swap_type_to_swap_info(type); 1944 swp_entry_t entry = {0}; 1798 swp_entry_t entry = {0}; 1945 1799 1946 if (!si) 1800 if (!si) 1947 goto fail; 1801 goto fail; 1948 1802 1949 /* This is called for allocating swap 1803 /* This is called for allocating swap entry, not cache */ 1950 spin_lock(&si->lock); 1804 spin_lock(&si->lock); 1951 if ((si->flags & SWP_WRITEOK) && scan !! 
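
/*
 * [Illustrative aside, not part of swapfile.c] free_swap_and_cache_nr()
 * above is what the unmapping side calls once it has cleared a run of swap
 * PTEs referencing consecutive entries of one device (e.g. while zapping a
 * range).  A rough sketch of that pattern, with PTE batching and page table
 * locking elided:
 */
static void example_zap_swap_ptes(swp_entry_t first_entry, int nr)
{
        /*
         * nr swap PTEs for first_entry .. first_entry + nr - 1 have just
         * been cleared: drop their references and let any now-unused swap
         * cache folios be reclaimed.
         */
        free_swap_and_cache_nr(first_entry, nr);
}
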
1805 if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry)) 1952 atomic_long_dec(&nr_swap_page 1806 atomic_long_dec(&nr_swap_pages); 1953 spin_unlock(&si->lock); 1807 spin_unlock(&si->lock); 1954 fail: 1808 fail: 1955 return entry; 1809 return entry; 1956 } 1810 } 1957 1811 1958 /* 1812 /* 1959 * Find the swap type that corresponds to giv 1813 * Find the swap type that corresponds to given device (if any). 1960 * 1814 * 1961 * @offset - number of the PAGE_SIZE-sized bl 1815 * @offset - number of the PAGE_SIZE-sized block of the device, starting 1962 * from 0, in which the swap header is expect 1816 * from 0, in which the swap header is expected to be located. 1963 * 1817 * 1964 * This is needed for the suspend to disk (ak 1818 * This is needed for the suspend to disk (aka swsusp). 1965 */ 1819 */ 1966 int swap_type_of(dev_t device, sector_t offse 1820 int swap_type_of(dev_t device, sector_t offset) 1967 { 1821 { 1968 int type; 1822 int type; 1969 1823 1970 if (!device) 1824 if (!device) 1971 return -1; 1825 return -1; 1972 1826 1973 spin_lock(&swap_lock); 1827 spin_lock(&swap_lock); 1974 for (type = 0; type < nr_swapfiles; t 1828 for (type = 0; type < nr_swapfiles; type++) { 1975 struct swap_info_struct *sis 1829 struct swap_info_struct *sis = swap_info[type]; 1976 1830 1977 if (!(sis->flags & SWP_WRITEO 1831 if (!(sis->flags & SWP_WRITEOK)) 1978 continue; 1832 continue; 1979 1833 1980 if (device == sis->bdev->bd_d 1834 if (device == sis->bdev->bd_dev) { 1981 struct swap_extent *s 1835 struct swap_extent *se = first_se(sis); 1982 1836 1983 if (se->start_block = 1837 if (se->start_block == offset) { 1984 spin_unlock(& 1838 spin_unlock(&swap_lock); 1985 return type; 1839 return type; 1986 } 1840 } 1987 } 1841 } 1988 } 1842 } 1989 spin_unlock(&swap_lock); 1843 spin_unlock(&swap_lock); 1990 return -ENODEV; 1844 return -ENODEV; 1991 } 1845 } 1992 1846 1993 int find_first_swap(dev_t *device) 1847 int find_first_swap(dev_t *device) 1994 { 1848 { 1995 int type; 1849 int type; 1996 1850 1997 spin_lock(&swap_lock); 1851 spin_lock(&swap_lock); 1998 for (type = 0; type < nr_swapfiles; t 1852 for (type = 0; type < nr_swapfiles; type++) { 1999 struct swap_info_struct *sis 1853 struct swap_info_struct *sis = swap_info[type]; 2000 1854 2001 if (!(sis->flags & SWP_WRITEO 1855 if (!(sis->flags & SWP_WRITEOK)) 2002 continue; 1856 continue; 2003 *device = sis->bdev->bd_dev; 1857 *device = sis->bdev->bd_dev; 2004 spin_unlock(&swap_lock); 1858 spin_unlock(&swap_lock); 2005 return type; 1859 return type; 2006 } 1860 } 2007 spin_unlock(&swap_lock); 1861 spin_unlock(&swap_lock); 2008 return -ENODEV; 1862 return -ENODEV; 2009 } 1863 } 2010 1864 2011 /* 1865 /* 2012 * Get the (PAGE_SIZE) block corresponding to 1866 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev 2013 * corresponding to given index in swap_info 1867 * corresponding to given index in swap_info (swap type). 
2014 */ 1868 */ 2015 sector_t swapdev_block(int type, pgoff_t offs 1869 sector_t swapdev_block(int type, pgoff_t offset) 2016 { 1870 { 2017 struct swap_info_struct *si = swap_ty 1871 struct swap_info_struct *si = swap_type_to_swap_info(type); 2018 struct swap_extent *se; 1872 struct swap_extent *se; 2019 1873 2020 if (!si || !(si->flags & SWP_WRITEOK) 1874 if (!si || !(si->flags & SWP_WRITEOK)) 2021 return 0; 1875 return 0; 2022 se = offset_to_swap_extent(si, offset 1876 se = offset_to_swap_extent(si, offset); 2023 return se->start_block + (offset - se 1877 return se->start_block + (offset - se->start_page); 2024 } 1878 } 2025 1879 2026 /* 1880 /* 2027 * Return either the total number of swap pag 1881 * Return either the total number of swap pages of given type, or the number 2028 * of free pages of that type (depending on @ 1882 * of free pages of that type (depending on @free) 2029 * 1883 * 2030 * This is needed for software suspend 1884 * This is needed for software suspend 2031 */ 1885 */ 2032 unsigned int count_swap_pages(int type, int f 1886 unsigned int count_swap_pages(int type, int free) 2033 { 1887 { 2034 unsigned int n = 0; 1888 unsigned int n = 0; 2035 1889 2036 spin_lock(&swap_lock); 1890 spin_lock(&swap_lock); 2037 if ((unsigned int)type < nr_swapfiles 1891 if ((unsigned int)type < nr_swapfiles) { 2038 struct swap_info_struct *sis 1892 struct swap_info_struct *sis = swap_info[type]; 2039 1893 2040 spin_lock(&sis->lock); 1894 spin_lock(&sis->lock); 2041 if (sis->flags & SWP_WRITEOK) 1895 if (sis->flags & SWP_WRITEOK) { 2042 n = sis->pages; 1896 n = sis->pages; 2043 if (free) 1897 if (free) 2044 n -= sis->inu 1898 n -= sis->inuse_pages; 2045 } 1899 } 2046 spin_unlock(&sis->lock); 1900 spin_unlock(&sis->lock); 2047 } 1901 } 2048 spin_unlock(&swap_lock); 1902 spin_unlock(&swap_lock); 2049 return n; 1903 return n; 2050 } 1904 } 2051 #endif /* CONFIG_HIBERNATION */ 1905 #endif /* CONFIG_HIBERNATION */ 2052 1906 2053 static inline int pte_same_as_swp(pte_t pte, 1907 static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte) 2054 { 1908 { 2055 return pte_same(pte_swp_clear_flags(p 1909 return pte_same(pte_swp_clear_flags(pte), swp_pte); 2056 } 1910 } 2057 1911 2058 /* 1912 /* 2059 * No need to decide whether this PTE shares 1913 * No need to decide whether this PTE shares the swap entry with others, 2060 * just let do_wp_page work it out if a write 1914 * just let do_wp_page work it out if a write is requested later - to 2061 * force COW, vm_page_prot omits write permis 1915 * force COW, vm_page_prot omits write permission from any private vma. 2062 */ 1916 */ 2063 static int unuse_pte(struct vm_area_struct *v 1917 static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, 2064 unsigned long addr, swp_entry !! 1918 unsigned long addr, swp_entry_t entry, struct page *page) 2065 { 1919 { 2066 struct page *page; !! 1920 struct page *swapcache; 2067 struct folio *swapcache; << 2068 spinlock_t *ptl; 1921 spinlock_t *ptl; 2069 pte_t *pte, new_pte, old_pte; !! 1922 pte_t *pte; 2070 bool hwpoisoned = false; << 2071 int ret = 1; 1923 int ret = 1; 2072 1924 2073 swapcache = folio; !! 1925 swapcache = page; 2074 folio = ksm_might_need_to_copy(folio, !! 1926 page = ksm_might_need_to_copy(page, vma, addr); 2075 if (unlikely(!folio)) !! 
1927 if (unlikely(!page)) 2076 return -ENOMEM; 1928 return -ENOMEM; 2077 else if (unlikely(folio == ERR_PTR(-E << 2078 hwpoisoned = true; << 2079 folio = swapcache; << 2080 } << 2081 << 2082 page = folio_file_page(folio, swp_off << 2083 if (PageHWPoison(page)) << 2084 hwpoisoned = true; << 2085 1929 2086 pte = pte_offset_map_lock(vma->vm_mm, 1930 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 2087 if (unlikely(!pte || !pte_same_as_swp !! 1931 if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) { 2088 << 2089 ret = 0; 1932 ret = 0; 2090 goto out; 1933 goto out; 2091 } 1934 } 2092 1935 2093 old_pte = ptep_get(pte); << 2094 << 2095 if (unlikely(hwpoisoned || !folio_tes << 2096 swp_entry_t swp_entry; << 2097 << 2098 dec_mm_counter(vma->vm_mm, MM << 2099 if (hwpoisoned) { << 2100 swp_entry = make_hwpo << 2101 } else { << 2102 swp_entry = make_pois << 2103 } << 2104 new_pte = swp_entry_to_pte(sw << 2105 ret = 0; << 2106 goto setpte; << 2107 } << 2108 << 2109 /* << 2110 * Some architectures may have to res << 2111 * when reading from swap. This metad << 2112 * so this must be called before swap << 2113 */ << 2114 arch_swap_restore(folio_swap(entry, f << 2115 << 2116 dec_mm_counter(vma->vm_mm, MM_SWAPENT 1936 dec_mm_counter(vma->vm_mm, MM_SWAPENTS); 2117 inc_mm_counter(vma->vm_mm, MM_ANONPAG 1937 inc_mm_counter(vma->vm_mm, MM_ANONPAGES); 2118 folio_get(folio); !! 1938 get_page(page); 2119 if (folio == swapcache) { !! 1939 set_pte_at(vma->vm_mm, addr, pte, 2120 rmap_t rmap_flags = RMAP_NONE !! 1940 pte_mkold(mk_pte(page, vma->vm_page_prot))); 2121 !! 1941 if (page == swapcache) { 2122 /* !! 1942 page_add_anon_rmap(page, vma, addr, false); 2123 * See do_swap_page(): writeb << 2124 * However, we do a folio_wai << 2125 * call and have the folio lo << 2126 */ << 2127 VM_BUG_ON_FOLIO(folio_test_wr << 2128 if (pte_swp_exclusive(old_pte << 2129 rmap_flags |= RMAP_EX << 2130 /* << 2131 * We currently only expect s << 2132 * fully exclusive or fully s << 2133 * here, we have to be carefu << 2134 */ << 2135 if (!folio_test_anon(folio)) << 2136 VM_WARN_ON_ONCE(folio << 2137 VM_WARN_ON_FOLIO(!fol << 2138 folio_add_new_anon_rm << 2139 } else { << 2140 folio_add_anon_rmap_p << 2141 } << 2142 } else { /* ksm created a completely 1943 } else { /* ksm created a completely new copy */ 2143 folio_add_new_anon_rmap(folio !! 1944 page_add_new_anon_rmap(page, vma, addr, false); 2144 folio_add_lru_vma(folio, vma) !! 1945 lru_cache_add_inactive_or_unevictable(page, vma); 2145 } 1946 } 2146 new_pte = pte_mkold(mk_pte(page, vma- << 2147 if (pte_swp_soft_dirty(old_pte)) << 2148 new_pte = pte_mksoft_dirty(ne << 2149 if (pte_swp_uffd_wp(old_pte)) << 2150 new_pte = pte_mkuffd_wp(new_p << 2151 setpte: << 2152 set_pte_at(vma->vm_mm, addr, pte, new << 2153 swap_free(entry); 1947 swap_free(entry); 2154 out: 1948 out: 2155 if (pte) !! 1949 pte_unmap_unlock(pte, ptl); 2156 pte_unmap_unlock(pte, ptl); !! 1950 if (page != swapcache) { 2157 if (folio != swapcache) { !! 1951 unlock_page(page); 2158 folio_unlock(folio); !! 1952 put_page(page); 2159 folio_put(folio); << 2160 } 1953 } 2161 return ret; 1954 return ret; 2162 } 1955 } 2163 1956 2164 static int unuse_pte_range(struct vm_area_str 1957 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 2165 unsigned long addr, u 1958 unsigned long addr, unsigned long end, 2166 unsigned int type) !! 1959 unsigned int type, bool frontswap, >> 1960 unsigned long *fs_pages_to_unuse) 2167 { 1961 { 2168 pte_t *pte = NULL; !! 
1962 struct page *page; >> 1963 swp_entry_t entry; >> 1964 pte_t *pte; 2169 struct swap_info_struct *si; 1965 struct swap_info_struct *si; >> 1966 unsigned long offset; >> 1967 int ret = 0; >> 1968 volatile unsigned char *swap_map; 2170 1969 2171 si = swap_info[type]; 1970 si = swap_info[type]; >> 1971 pte = pte_offset_map(pmd, addr); 2172 do { 1972 do { 2173 struct folio *folio; !! 1973 if (!is_swap_pte(*pte)) 2174 unsigned long offset; << 2175 unsigned char swp_count; << 2176 swp_entry_t entry; << 2177 int ret; << 2178 pte_t ptent; << 2179 << 2180 if (!pte++) { << 2181 pte = pte_offset_map( << 2182 if (!pte) << 2183 break; << 2184 } << 2185 << 2186 ptent = ptep_get_lockless(pte << 2187 << 2188 if (!is_swap_pte(ptent)) << 2189 continue; 1974 continue; 2190 1975 2191 entry = pte_to_swp_entry(pten !! 1976 entry = pte_to_swp_entry(*pte); 2192 if (swp_type(entry) != type) 1977 if (swp_type(entry) != type) 2193 continue; 1978 continue; 2194 1979 2195 offset = swp_offset(entry); 1980 offset = swp_offset(entry); 2196 pte_unmap(pte); !! 1981 if (frontswap && !frontswap_test(si, offset)) 2197 pte = NULL; !! 1982 continue; 2198 1983 2199 folio = swap_cache_get_folio( !! 1984 pte_unmap(pte); 2200 if (!folio) { !! 1985 swap_map = &si->swap_map[offset]; >> 1986 page = lookup_swap_cache(entry, vma, addr); >> 1987 if (!page) { 2201 struct vm_fault vmf = 1988 struct vm_fault vmf = { 2202 .vma = vma, 1989 .vma = vma, 2203 .address = ad 1990 .address = addr, 2204 .real_address << 2205 .pmd = pmd, 1991 .pmd = pmd, 2206 }; 1992 }; 2207 1993 2208 folio = swapin_readah !! 1994 page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, 2209 1995 &vmf); 2210 } 1996 } 2211 if (!folio) { !! 1997 if (!page) { 2212 swp_count = READ_ONCE !! 1998 if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD) 2213 if (swp_count == 0 || !! 1999 goto try_next; 2214 continue; << 2215 return -ENOMEM; 2000 return -ENOMEM; 2216 } 2001 } 2217 2002 2218 folio_lock(folio); !! 2003 lock_page(page); 2219 folio_wait_writeback(folio); !! 2004 wait_on_page_writeback(page); 2220 ret = unuse_pte(vma, pmd, add !! 2005 ret = unuse_pte(vma, pmd, addr, entry, page); 2221 if (ret < 0) { 2006 if (ret < 0) { 2222 folio_unlock(folio); !! 2007 unlock_page(page); 2223 folio_put(folio); !! 2008 put_page(page); 2224 return ret; !! 2009 goto out; 2225 } 2010 } 2226 2011 2227 folio_free_swap(folio); !! 2012 try_to_free_swap(page); 2228 folio_unlock(folio); !! 2013 unlock_page(page); 2229 folio_put(folio); !! 2014 put_page(page); 2230 } while (addr += PAGE_SIZE, addr != e << 2231 2015 2232 if (pte) !! 2016 if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) { 2233 pte_unmap(pte); !! 2017 ret = FRONTSWAP_PAGES_UNUSED; 2234 return 0; !! 2018 goto out; >> 2019 } >> 2020 try_next: >> 2021 pte = pte_offset_map(pmd, addr); >> 2022 } while (pte++, addr += PAGE_SIZE, addr != end); >> 2023 pte_unmap(pte - 1); >> 2024 >> 2025 ret = 0; >> 2026 out: >> 2027 return ret; 2235 } 2028 } 2236 2029 2237 static inline int unuse_pmd_range(struct vm_a 2030 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, 2238 unsigned long 2031 unsigned long addr, unsigned long end, 2239 unsigned int !! 
2032 unsigned int type, bool frontswap, >> 2033 unsigned long *fs_pages_to_unuse) 2240 { 2034 { 2241 pmd_t *pmd; 2035 pmd_t *pmd; 2242 unsigned long next; 2036 unsigned long next; 2243 int ret; 2037 int ret; 2244 2038 2245 pmd = pmd_offset(pud, addr); 2039 pmd = pmd_offset(pud, addr); 2246 do { 2040 do { 2247 cond_resched(); 2041 cond_resched(); 2248 next = pmd_addr_end(addr, end 2042 next = pmd_addr_end(addr, end); 2249 ret = unuse_pte_range(vma, pm !! 2043 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) >> 2044 continue; >> 2045 ret = unuse_pte_range(vma, pmd, addr, next, type, >> 2046 frontswap, fs_pages_to_unuse); 2250 if (ret) 2047 if (ret) 2251 return ret; 2048 return ret; 2252 } while (pmd++, addr = next, addr != 2049 } while (pmd++, addr = next, addr != end); 2253 return 0; 2050 return 0; 2254 } 2051 } 2255 2052 2256 static inline int unuse_pud_range(struct vm_a 2053 static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, 2257 unsigned long 2054 unsigned long addr, unsigned long end, 2258 unsigned int !! 2055 unsigned int type, bool frontswap, >> 2056 unsigned long *fs_pages_to_unuse) 2259 { 2057 { 2260 pud_t *pud; 2058 pud_t *pud; 2261 unsigned long next; 2059 unsigned long next; 2262 int ret; 2060 int ret; 2263 2061 2264 pud = pud_offset(p4d, addr); 2062 pud = pud_offset(p4d, addr); 2265 do { 2063 do { 2266 next = pud_addr_end(addr, end 2064 next = pud_addr_end(addr, end); 2267 if (pud_none_or_clear_bad(pud 2065 if (pud_none_or_clear_bad(pud)) 2268 continue; 2066 continue; 2269 ret = unuse_pmd_range(vma, pu !! 2067 ret = unuse_pmd_range(vma, pud, addr, next, type, >> 2068 frontswap, fs_pages_to_unuse); 2270 if (ret) 2069 if (ret) 2271 return ret; 2070 return ret; 2272 } while (pud++, addr = next, addr != 2071 } while (pud++, addr = next, addr != end); 2273 return 0; 2072 return 0; 2274 } 2073 } 2275 2074 2276 static inline int unuse_p4d_range(struct vm_a 2075 static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, 2277 unsigned long 2076 unsigned long addr, unsigned long end, 2278 unsigned int !! 2077 unsigned int type, bool frontswap, >> 2078 unsigned long *fs_pages_to_unuse) 2279 { 2079 { 2280 p4d_t *p4d; 2080 p4d_t *p4d; 2281 unsigned long next; 2081 unsigned long next; 2282 int ret; 2082 int ret; 2283 2083 2284 p4d = p4d_offset(pgd, addr); 2084 p4d = p4d_offset(pgd, addr); 2285 do { 2085 do { 2286 next = p4d_addr_end(addr, end 2086 next = p4d_addr_end(addr, end); 2287 if (p4d_none_or_clear_bad(p4d 2087 if (p4d_none_or_clear_bad(p4d)) 2288 continue; 2088 continue; 2289 ret = unuse_pud_range(vma, p4 !! 2089 ret = unuse_pud_range(vma, p4d, addr, next, type, >> 2090 frontswap, fs_pages_to_unuse); 2290 if (ret) 2091 if (ret) 2291 return ret; 2092 return ret; 2292 } while (p4d++, addr = next, addr != 2093 } while (p4d++, addr = next, addr != end); 2293 return 0; 2094 return 0; 2294 } 2095 } 2295 2096 2296 static int unuse_vma(struct vm_area_struct *v !! 
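
/*
 * Standalone sketch of the *_addr_end() walking idiom that unuse_pmd_range(),
 * unuse_pud_range() and unuse_p4d_range() above all share: clamp each step to
 * the end of the current table's coverage, never past the overall end, then
 * advance.  The 2 MiB "block" size and the callback are invented here.
 */
#include <stdio.h>

#define BLOCK_SIZE      (2UL << 20)
#define BLOCK_MASK      (~(BLOCK_SIZE - 1))

static unsigned long block_addr_end(unsigned long addr, unsigned long end)
{
        unsigned long boundary = (addr + BLOCK_SIZE) & BLOCK_MASK;

        return boundary - 1 < end - 1 ? boundary : end;
}

static int walk_range(unsigned long addr, unsigned long end,
                      int (*fn)(unsigned long, unsigned long))
{
        unsigned long next;
        int ret;

        do {
                next = block_addr_end(addr, end);
                ret = fn(addr, next);           /* one aligned chunk at a time */
                if (ret)
                        return ret;
        } while (addr = next, addr != end);
        return 0;
}

static int show(unsigned long addr, unsigned long end)
{
        printf("chunk [%#lx, %#lx)\n", addr, end);
        return 0;
}

int main(void)
{
        return walk_range(0x1ff000, 0x601000, show);
}
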
2097 static int unuse_vma(struct vm_area_struct *vma, unsigned int type, >> 2098 bool frontswap, unsigned long *fs_pages_to_unuse) 2297 { 2099 { 2298 pgd_t *pgd; 2100 pgd_t *pgd; 2299 unsigned long addr, end, next; 2101 unsigned long addr, end, next; 2300 int ret; 2102 int ret; 2301 2103 2302 addr = vma->vm_start; 2104 addr = vma->vm_start; 2303 end = vma->vm_end; 2105 end = vma->vm_end; 2304 2106 2305 pgd = pgd_offset(vma->vm_mm, addr); 2107 pgd = pgd_offset(vma->vm_mm, addr); 2306 do { 2108 do { 2307 next = pgd_addr_end(addr, end 2109 next = pgd_addr_end(addr, end); 2308 if (pgd_none_or_clear_bad(pgd 2110 if (pgd_none_or_clear_bad(pgd)) 2309 continue; 2111 continue; 2310 ret = unuse_p4d_range(vma, pg !! 2112 ret = unuse_p4d_range(vma, pgd, addr, next, type, >> 2113 frontswap, fs_pages_to_unuse); 2311 if (ret) 2114 if (ret) 2312 return ret; 2115 return ret; 2313 } while (pgd++, addr = next, addr != 2116 } while (pgd++, addr = next, addr != end); 2314 return 0; 2117 return 0; 2315 } 2118 } 2316 2119 2317 static int unuse_mm(struct mm_struct *mm, uns !! 2120 static int unuse_mm(struct mm_struct *mm, unsigned int type, >> 2121 bool frontswap, unsigned long *fs_pages_to_unuse) 2318 { 2122 { 2319 struct vm_area_struct *vma; 2123 struct vm_area_struct *vma; 2320 int ret = 0; 2124 int ret = 0; 2321 VMA_ITERATOR(vmi, mm, 0); << 2322 2125 2323 mmap_read_lock(mm); 2126 mmap_read_lock(mm); 2324 for_each_vma(vmi, vma) { !! 2127 for (vma = mm->mmap; vma; vma = vma->vm_next) { 2325 if (vma->anon_vma && !is_vm_h 2128 if (vma->anon_vma && !is_vm_hugetlb_page(vma)) { 2326 ret = unuse_vma(vma, !! 2129 ret = unuse_vma(vma, type, frontswap, >> 2130 fs_pages_to_unuse); 2327 if (ret) 2131 if (ret) 2328 break; 2132 break; 2329 } 2133 } 2330 << 2331 cond_resched(); 2134 cond_resched(); 2332 } 2135 } 2333 mmap_read_unlock(mm); 2136 mmap_read_unlock(mm); 2334 return ret; 2137 return ret; 2335 } 2138 } 2336 2139 2337 /* 2140 /* 2338 * Scan swap_map from current position to nex !! 2141 * Scan swap_map (or frontswap_map if frontswap parameter is true) 2339 * Return 0 if there are no inuse entries aft !! 2142 * from current position to next entry still in use. Return 0 2340 * the map. !! 2143 * if there are no inuse entries after prev till end of the map. 2341 */ 2144 */ 2342 static unsigned int find_next_to_unuse(struct 2145 static unsigned int find_next_to_unuse(struct swap_info_struct *si, 2343 unsig !! 2146 unsigned int prev, bool frontswap) 2344 { 2147 { 2345 unsigned int i; 2148 unsigned int i; 2346 unsigned char count; 2149 unsigned char count; 2347 2150 2348 /* 2151 /* 2349 * No need for swap_lock here: we're 2152 * No need for swap_lock here: we're just looking 2350 * for whether an entry is in use, no 2153 * for whether an entry is in use, not modifying it; false 2351 * hits are okay, and sys_swapoff() h 2154 * hits are okay, and sys_swapoff() has already prevented new 2352 * allocations from this area (while 2155 * allocations from this area (while holding swap_lock). 2353 */ 2156 */ 2354 for (i = prev + 1; i < si->max; i++) 2157 for (i = prev + 1; i < si->max; i++) { 2355 count = READ_ONCE(si->swap_ma 2158 count = READ_ONCE(si->swap_map[i]); 2356 if (count && swap_count(count 2159 if (count && swap_count(count) != SWAP_MAP_BAD) 2357 break; !! 
2160 if (!frontswap || frontswap_test(si, i)) >> 2161 break; 2358 if ((i % LATENCY_LIMIT) == 0) 2162 if ((i % LATENCY_LIMIT) == 0) 2359 cond_resched(); 2163 cond_resched(); 2360 } 2164 } 2361 2165 2362 if (i == si->max) 2166 if (i == si->max) 2363 i = 0; 2167 i = 0; 2364 2168 2365 return i; 2169 return i; 2366 } 2170 } 2367 2171 2368 static int try_to_unuse(unsigned int type) !! 2172 /* >> 2173 * If the boolean frontswap is true, only unuse pages_to_unuse pages; >> 2174 * pages_to_unuse==0 means all pages; ignored if frontswap is false >> 2175 */ >> 2176 int try_to_unuse(unsigned int type, bool frontswap, >> 2177 unsigned long pages_to_unuse) 2369 { 2178 { 2370 struct mm_struct *prev_mm; 2179 struct mm_struct *prev_mm; 2371 struct mm_struct *mm; 2180 struct mm_struct *mm; 2372 struct list_head *p; 2181 struct list_head *p; 2373 int retval = 0; 2182 int retval = 0; 2374 struct swap_info_struct *si = swap_in 2183 struct swap_info_struct *si = swap_info[type]; 2375 struct folio *folio; !! 2184 struct page *page; 2376 swp_entry_t entry; 2185 swp_entry_t entry; 2377 unsigned int i; 2186 unsigned int i; 2378 2187 2379 if (!READ_ONCE(si->inuse_pages)) 2188 if (!READ_ONCE(si->inuse_pages)) 2380 goto success; !! 2189 return 0; >> 2190 >> 2191 if (!frontswap) >> 2192 pages_to_unuse = 0; 2381 2193 2382 retry: 2194 retry: 2383 retval = shmem_unuse(type); !! 2195 retval = shmem_unuse(type, frontswap, &pages_to_unuse); 2384 if (retval) 2196 if (retval) 2385 return retval; !! 2197 goto out; 2386 2198 2387 prev_mm = &init_mm; 2199 prev_mm = &init_mm; 2388 mmget(prev_mm); 2200 mmget(prev_mm); 2389 2201 2390 spin_lock(&mmlist_lock); 2202 spin_lock(&mmlist_lock); 2391 p = &init_mm.mmlist; 2203 p = &init_mm.mmlist; 2392 while (READ_ONCE(si->inuse_pages) && 2204 while (READ_ONCE(si->inuse_pages) && 2393 !signal_pending(current) && 2205 !signal_pending(current) && 2394 (p = p->next) != &init_mm.mmli 2206 (p = p->next) != &init_mm.mmlist) { 2395 2207 2396 mm = list_entry(p, struct mm_ 2208 mm = list_entry(p, struct mm_struct, mmlist); 2397 if (!mmget_not_zero(mm)) 2209 if (!mmget_not_zero(mm)) 2398 continue; 2210 continue; 2399 spin_unlock(&mmlist_lock); 2211 spin_unlock(&mmlist_lock); 2400 mmput(prev_mm); 2212 mmput(prev_mm); 2401 prev_mm = mm; 2213 prev_mm = mm; 2402 retval = unuse_mm(mm, type); !! 2214 retval = unuse_mm(mm, type, frontswap, &pages_to_unuse); >> 2215 2403 if (retval) { 2216 if (retval) { 2404 mmput(prev_mm); 2217 mmput(prev_mm); 2405 return retval; !! 2218 goto out; 2406 } 2219 } 2407 2220 2408 /* 2221 /* 2409 * Make sure that we aren't c 2222 * Make sure that we aren't completely killing 2410 * interactive performance. 2223 * interactive performance. 2411 */ 2224 */ 2412 cond_resched(); 2225 cond_resched(); 2413 spin_lock(&mmlist_lock); 2226 spin_lock(&mmlist_lock); 2414 } 2227 } 2415 spin_unlock(&mmlist_lock); 2228 spin_unlock(&mmlist_lock); 2416 2229 2417 mmput(prev_mm); 2230 mmput(prev_mm); 2418 2231 2419 i = 0; 2232 i = 0; 2420 while (READ_ONCE(si->inuse_pages) && 2233 while (READ_ONCE(si->inuse_pages) && 2421 !signal_pending(current) && 2234 !signal_pending(current) && 2422 (i = find_next_to_unuse(si, i) !! 2235 (i = find_next_to_unuse(si, i, frontswap)) != 0) { 2423 2236 2424 entry = swp_entry(type, i); 2237 entry = swp_entry(type, i); 2425 folio = filemap_get_folio(swa !! 2238 page = find_get_page(swap_address_space(entry), i); 2426 if (IS_ERR(folio)) !! 2239 if (!page) 2427 continue; 2240 continue; 2428 2241 2429 /* 2242 /* 2430 * It is conceivable that a r !! 
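
/*
 * Standalone sketch of the find_next_to_unuse() scan above: start just after
 * 'prev', skip free (0) and bad slots, tolerate racy reads (the caller
 * revalidates under the page lock), and return 0 at the end of the map so the
 * caller's loop terminates.  The map contents and the bad-slot marker value
 * are invented; the real loop also drops a cond_resched() every LATENCY_LIMIT
 * entries.
 */
#include <stdio.h>

#define TOY_MAP_BAD  0x3f               /* stand-in for SWAP_MAP_BAD */

static unsigned int next_in_use(const unsigned char *map, unsigned int max,
                                unsigned int prev)
{
        unsigned int i;

        for (i = prev + 1; i < max; i++) {
                unsigned char count = map[i];   /* racy read is fine here */

                if (count && count != TOY_MAP_BAD)
                        break;                  /* caller revalidates later */
        }
        return i == max ? 0 : i;
}

int main(void)
{
        unsigned char map[8] = { TOY_MAP_BAD, 0, 1, 0, 2, TOY_MAP_BAD, 0, 0 };
        unsigned int i = 0;

        while ((i = next_in_use(map, 8, i)) != 0)
                printf("offset %u still in use\n", i);  /* prints 2, then 4 */
        return 0;
}
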
2243 * It is conceivable that a racing task removed this page from 2431 * swap cache just before we !! 2244 * swap cache just before we acquired the page lock. The page 2432 * might even be back in swap 2245 * might even be back in swap cache on another swap area. But 2433 * that is okay, folio_free_s !! 2246 * that is okay, try_to_free_swap() only removes stale pages. >> 2247 */ >> 2248 lock_page(page); >> 2249 wait_on_page_writeback(page); >> 2250 try_to_free_swap(page); >> 2251 unlock_page(page); >> 2252 put_page(page); >> 2253 >> 2254 /* >> 2255 * For frontswap, we just need to unuse pages_to_unuse, if >> 2256 * it was specified. Need not check frontswap again here as >> 2257 * we already zeroed out pages_to_unuse if not frontswap. 2434 */ 2258 */ 2435 folio_lock(folio); !! 2259 if (pages_to_unuse && --pages_to_unuse == 0) 2436 folio_wait_writeback(folio); !! 2260 goto out; 2437 folio_free_swap(folio); << 2438 folio_unlock(folio); << 2439 folio_put(folio); << 2440 } 2261 } 2441 2262 2442 /* 2263 /* 2443 * Lets check again to see if there a 2264 * Lets check again to see if there are still swap entries in the map. 2444 * If yes, we would need to do retry 2265 * If yes, we would need to do retry the unuse logic again. 2445 * Under global memory pressure, swap 2266 * Under global memory pressure, swap entries can be reinserted back 2446 * into process space after the mmlis 2267 * into process space after the mmlist loop above passes over them. 2447 * 2268 * 2448 * Limit the number of retries? No: w !! 2269 * Limit the number of retries? No: when mmget_not_zero() above fails, 2449 * above fails, that mm is likely to !! 2270 * that mm is likely to be freeing swap from exit_mmap(), which proceeds 2450 * exit_mmap(), which proceeds at its !! 2271 * at its own independent pace; and even shmem_writepage() could have 2451 * and even shmem_writepage() could h !! 2272 * been preempted after get_swap_page(), temporarily hiding that swap. 2452 * folio_alloc_swap(), temporarily hi !! 2273 * It's easy and robust (though cpu-intensive) just to keep retrying. 2453 * and robust (though cpu-intensive) << 2454 */ 2274 */ 2455 if (READ_ONCE(si->inuse_pages)) { 2275 if (READ_ONCE(si->inuse_pages)) { 2456 if (!signal_pending(current)) 2276 if (!signal_pending(current)) 2457 goto retry; 2277 goto retry; 2458 return -EINTR; !! 2278 retval = -EINTR; 2459 } 2279 } 2460 !! 2280 out: 2461 success: !! 2281 return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval; 2462 /* << 2463 * Make sure that further cleanups af << 2464 * after swap_range_free() reduces si << 2465 */ << 2466 smp_mb(); << 2467 return 0; << 2468 } 2282 } 2469 2283 2470 /* 2284 /* 2471 * After a successful try_to_unuse, if no swa 2285 * After a successful try_to_unuse, if no swap is now in use, we know 2472 * we can empty the mmlist. swap_lock must b 2286 * we can empty the mmlist. swap_lock must be held on entry and exit. 2473 * Note that mmlist_lock nests inside swap_lo 2287 * Note that mmlist_lock nests inside swap_lock, and an mm must be 2474 * added to the mmlist just after page_duplic 2288 * added to the mmlist just after page_duplicate - before would be racy. 
2475 */ 2289 */ 2476 static void drain_mmlist(void) 2290 static void drain_mmlist(void) 2477 { 2291 { 2478 struct list_head *p, *next; 2292 struct list_head *p, *next; 2479 unsigned int type; 2293 unsigned int type; 2480 2294 2481 for (type = 0; type < nr_swapfiles; t 2295 for (type = 0; type < nr_swapfiles; type++) 2482 if (swap_info[type]->inuse_pa 2296 if (swap_info[type]->inuse_pages) 2483 return; 2297 return; 2484 spin_lock(&mmlist_lock); 2298 spin_lock(&mmlist_lock); 2485 list_for_each_safe(p, next, &init_mm. 2299 list_for_each_safe(p, next, &init_mm.mmlist) 2486 list_del_init(p); 2300 list_del_init(p); 2487 spin_unlock(&mmlist_lock); 2301 spin_unlock(&mmlist_lock); 2488 } 2302 } 2489 2303 2490 /* 2304 /* 2491 * Free all of a swapdev's extent information 2305 * Free all of a swapdev's extent information 2492 */ 2306 */ 2493 static void destroy_swap_extents(struct swap_ 2307 static void destroy_swap_extents(struct swap_info_struct *sis) 2494 { 2308 { 2495 while (!RB_EMPTY_ROOT(&sis->swap_exte 2309 while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) { 2496 struct rb_node *rb = sis->swa 2310 struct rb_node *rb = sis->swap_extent_root.rb_node; 2497 struct swap_extent *se = rb_e 2311 struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node); 2498 2312 2499 rb_erase(rb, &sis->swap_exten 2313 rb_erase(rb, &sis->swap_extent_root); 2500 kfree(se); 2314 kfree(se); 2501 } 2315 } 2502 2316 2503 if (sis->flags & SWP_ACTIVATED) { 2317 if (sis->flags & SWP_ACTIVATED) { 2504 struct file *swap_file = sis- 2318 struct file *swap_file = sis->swap_file; 2505 struct address_space *mapping 2319 struct address_space *mapping = swap_file->f_mapping; 2506 2320 2507 sis->flags &= ~SWP_ACTIVATED; 2321 sis->flags &= ~SWP_ACTIVATED; 2508 if (mapping->a_ops->swap_deac 2322 if (mapping->a_ops->swap_deactivate) 2509 mapping->a_ops->swap_ 2323 mapping->a_ops->swap_deactivate(swap_file); 2510 } 2324 } 2511 } 2325 } 2512 2326 2513 /* 2327 /* 2514 * Add a block range (and the corresponding p 2328 * Add a block range (and the corresponding page range) into this swapdev's 2515 * extent tree. 2329 * extent tree. 2516 * 2330 * 2517 * This function rather assumes that it is ca 2331 * This function rather assumes that it is called in ascending page order. 2518 */ 2332 */ 2519 int 2333 int 2520 add_swap_extent(struct swap_info_struct *sis, 2334 add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, 2521 unsigned long nr_pages, secto 2335 unsigned long nr_pages, sector_t start_block) 2522 { 2336 { 2523 struct rb_node **link = &sis->swap_ex 2337 struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL; 2524 struct swap_extent *se; 2338 struct swap_extent *se; 2525 struct swap_extent *new_se; 2339 struct swap_extent *new_se; 2526 2340 2527 /* 2341 /* 2528 * place the new node at the right mo 2342 * place the new node at the right most since the 2529 * function is called in ascending pa 2343 * function is called in ascending page order. 
2530 */ 2344 */ 2531 while (*link) { 2345 while (*link) { 2532 parent = *link; 2346 parent = *link; 2533 link = &parent->rb_right; 2347 link = &parent->rb_right; 2534 } 2348 } 2535 2349 2536 if (parent) { 2350 if (parent) { 2537 se = rb_entry(parent, struct 2351 se = rb_entry(parent, struct swap_extent, rb_node); 2538 BUG_ON(se->start_page + se->n 2352 BUG_ON(se->start_page + se->nr_pages != start_page); 2539 if (se->start_block + se->nr_ 2353 if (se->start_block + se->nr_pages == start_block) { 2540 /* Merge it */ 2354 /* Merge it */ 2541 se->nr_pages += nr_pa 2355 se->nr_pages += nr_pages; 2542 return 0; 2356 return 0; 2543 } 2357 } 2544 } 2358 } 2545 2359 2546 /* No merge, insert a new extent. */ 2360 /* No merge, insert a new extent. */ 2547 new_se = kmalloc(sizeof(*se), GFP_KER 2361 new_se = kmalloc(sizeof(*se), GFP_KERNEL); 2548 if (new_se == NULL) 2362 if (new_se == NULL) 2549 return -ENOMEM; 2363 return -ENOMEM; 2550 new_se->start_page = start_page; 2364 new_se->start_page = start_page; 2551 new_se->nr_pages = nr_pages; 2365 new_se->nr_pages = nr_pages; 2552 new_se->start_block = start_block; 2366 new_se->start_block = start_block; 2553 2367 2554 rb_link_node(&new_se->rb_node, parent 2368 rb_link_node(&new_se->rb_node, parent, link); 2555 rb_insert_color(&new_se->rb_node, &si 2369 rb_insert_color(&new_se->rb_node, &sis->swap_extent_root); 2556 return 1; 2370 return 1; 2557 } 2371 } 2558 EXPORT_SYMBOL_GPL(add_swap_extent); 2372 EXPORT_SYMBOL_GPL(add_swap_extent); 2559 2373 2560 /* 2374 /* 2561 * A `swap extent' is a simple thing which ma 2375 * A `swap extent' is a simple thing which maps a contiguous range of pages 2562 * onto a contiguous range of disk blocks. A !! 2376 * onto a contiguous range of disk blocks. An ordered list of swap extents 2563 * built at swapon time and is then used at s !! 2377 * is built at swapon time and is then used at swap_writepage/swap_readpage 2564 * time for locating where on disk a page bel 2378 * time for locating where on disk a page belongs. 2565 * 2379 * 2566 * If the swapfile is an S_ISBLK block device 2380 * If the swapfile is an S_ISBLK block device, a single extent is installed. 2567 * This is done so that the main operating co 2381 * This is done so that the main operating code can treat S_ISBLK and S_ISREG 2568 * swap files identically. 2382 * swap files identically. 2569 * 2383 * 2570 * Whether the swapdev is an S_ISREG file or 2384 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap 2571 * extent rbtree operates in PAGE_SIZE disk b !! 2385 * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK 2572 * swapfiles are handled *identically* after 2386 * swapfiles are handled *identically* after swapon time. 2573 * 2387 * 2574 * For S_ISREG swapfiles, setup_swap_extents( 2388 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks 2575 * and will parse them into a rbtree, in PAGE !! 2389 * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If 2576 * blocks are found which do not fall within !! 2390 * some stray blocks are found which do not fall within the PAGE_SIZE alignment 2577 * requirements, they are simply tossed out - 2391 * requirements, they are simply tossed out - we will never use those blocks 2578 * for swapping. 2392 * for swapping. 2579 * 2393 * 2580 * For all swap devices we set S_SWAPFILE acr 2394 * For all swap devices we set S_SWAPFILE across the life of the swapon. 
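
/*
 * Standalone sketch of the merge rule add_swap_extent() above applies:
 * extents arrive in ascending page order, so a new (start_page, nr_pages,
 * start_block) range either continues the right-most extent's disk blocks and
 * is merged into it, or becomes a new extent.  A fixed array stands in for
 * the kernel's rbtree here.
 */
#include <stdio.h>

struct extent {
        unsigned long start_page;
        unsigned long nr_pages;
        unsigned long start_block;      /* disk block backing start_page */
};

static int add_extent(struct extent *tbl, int *n, unsigned long start_page,
                      unsigned long nr_pages, unsigned long start_block)
{
        if (*n) {
                struct extent *last = &tbl[*n - 1];

                /* caller guarantees page-contiguity with the previous call */
                if (last->start_block + last->nr_pages == start_block) {
                        last->nr_pages += nr_pages;     /* merge */
                        return 0;
                }
        }
        tbl[*n] = (struct extent){ start_page, nr_pages, start_block };
        (*n)++;
        return 1;                               /* new extent added */
}

int main(void)
{
        struct extent tbl[8];
        int n = 0;

        add_extent(tbl, &n, 0, 4, 100);
        add_extent(tbl, &n, 4, 4, 104);         /* blocks continue: merges  */
        add_extent(tbl, &n, 8, 4, 300);         /* gap on disk: new extent  */
        for (int i = 0; i < n; i++)
                printf("pages %lu+%lu -> block %lu\n", tbl[i].start_page,
                       tbl[i].nr_pages, tbl[i].start_block);
        return 0;
}
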
This 2581 * prevents users from writing to the swap de 2395 * prevents users from writing to the swap device, which will corrupt memory. 2582 * 2396 * 2583 * The amount of disk space which a single sw 2397 * The amount of disk space which a single swap extent represents varies. 2584 * Typically it is in the 1-4 megabyte range. 2398 * Typically it is in the 1-4 megabyte range. So we can have hundreds of 2585 * extents in the rbtree. - akpm. !! 2399 * extents in the list. To avoid much list walking, we cache the previous >> 2400 * search location in `curr_swap_extent', and start new searches from there. >> 2401 * This is extremely effective. The average number of iterations in >> 2402 * map_swap_page() has been measured at about 0.3 per page. - akpm. 2586 */ 2403 */ 2587 static int setup_swap_extents(struct swap_inf 2404 static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) 2588 { 2405 { 2589 struct file *swap_file = sis->swap_fi 2406 struct file *swap_file = sis->swap_file; 2590 struct address_space *mapping = swap_ 2407 struct address_space *mapping = swap_file->f_mapping; 2591 struct inode *inode = mapping->host; 2408 struct inode *inode = mapping->host; 2592 int ret; 2409 int ret; 2593 2410 2594 if (S_ISBLK(inode->i_mode)) { 2411 if (S_ISBLK(inode->i_mode)) { 2595 ret = add_swap_extent(sis, 0, 2412 ret = add_swap_extent(sis, 0, sis->max, 0); 2596 *span = sis->pages; 2413 *span = sis->pages; 2597 return ret; 2414 return ret; 2598 } 2415 } 2599 2416 2600 if (mapping->a_ops->swap_activate) { 2417 if (mapping->a_ops->swap_activate) { 2601 ret = mapping->a_ops->swap_ac 2418 ret = mapping->a_ops->swap_activate(sis, swap_file, span); 2602 if (ret < 0) !! 2419 if (ret >= 0) 2603 return ret; !! 2420 sis->flags |= SWP_ACTIVATED; 2604 sis->flags |= SWP_ACTIVATED; !! 2421 if (!ret) { 2605 if ((sis->flags & SWP_FS_OPS) !! 2422 sis->flags |= SWP_FS_OPS; 2606 sio_pool_init() != 0) { !! 2423 ret = add_swap_extent(sis, 0, sis->max, 0); 2607 destroy_swap_extents( !! 2424 *span = sis->pages; 2608 return -ENOMEM; << 2609 } 2425 } 2610 return ret; 2426 return ret; 2611 } 2427 } 2612 2428 2613 return generic_swapfile_activate(sis, 2429 return generic_swapfile_activate(sis, swap_file, span); 2614 } 2430 } 2615 2431 2616 static int swap_node(struct swap_info_struct !! 2432 static int swap_node(struct swap_info_struct *p) 2617 { 2433 { 2618 struct block_device *bdev; 2434 struct block_device *bdev; 2619 2435 2620 if (si->bdev) !! 2436 if (p->bdev) 2621 bdev = si->bdev; !! 2437 bdev = p->bdev; 2622 else 2438 else 2623 bdev = si->swap_file->f_inode !! 2439 bdev = p->swap_file->f_inode->i_sb->s_bdev; 2624 2440 2625 return bdev ? bdev->bd_disk->node_id 2441 return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; 2626 } 2442 } 2627 2443 2628 static void setup_swap_info(struct swap_info_ !! 2444 static void setup_swap_info(struct swap_info_struct *p, int prio, 2629 unsigned char *sw 2445 unsigned char *swap_map, 2630 struct swap_clust !! 2446 struct swap_cluster_info *cluster_info) 2631 unsigned long *ze << 2632 { 2447 { 2633 int i; 2448 int i; 2634 2449 2635 if (prio >= 0) 2450 if (prio >= 0) 2636 si->prio = prio; !! 2451 p->prio = prio; 2637 else 2452 else 2638 si->prio = --least_priority; !! 2453 p->prio = --least_priority; 2639 /* 2454 /* 2640 * the plist prio is negated because 2455 * the plist prio is negated because plist ordering is 2641 * low-to-high, while swap ordering i 2456 * low-to-high, while swap ordering is high-to-low 2642 */ 2457 */ 2643 si->list.prio = -si->prio; !! 
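
/*
 * Standalone sketch of the lookup the extent tree described above exists for:
 * at swap I/O time, translate a swap page offset into the disk block that
 * backs it.  A sorted array plus binary search stands in for the kernel's
 * rbtree walk; the table contents are invented.
 */
#include <stdio.h>

struct extent {
        unsigned long start_page;
        unsigned long nr_pages;
        unsigned long start_block;
};

static long page_to_block(const struct extent *tbl, int n, unsigned long page)
{
        int lo = 0, hi = n - 1;

        while (lo <= hi) {
                int mid = lo + (hi - lo) / 2;
                const struct extent *se = &tbl[mid];

                if (page < se->start_page)
                        hi = mid - 1;
                else if (page >= se->start_page + se->nr_pages)
                        lo = mid + 1;
                else
                        return se->start_block + (page - se->start_page);
        }
        return -1;                      /* offset not covered by any extent */
}

int main(void)
{
        const struct extent tbl[] = {
                { 0, 8, 100 }, { 8, 4, 300 }, { 12, 16, 500 },
        };

        printf("page 10 -> block %ld\n", page_to_block(tbl, 3, 10)); /* 302 */
        return 0;
}
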
2458 p->list.prio = -p->prio; 2644 for_each_node(i) { 2459 for_each_node(i) { 2645 if (si->prio >= 0) !! 2460 if (p->prio >= 0) 2646 si->avail_lists[i].pr !! 2461 p->avail_lists[i].prio = -p->prio; 2647 else { 2462 else { 2648 if (swap_node(si) == !! 2463 if (swap_node(p) == i) 2649 si->avail_lis !! 2464 p->avail_lists[i].prio = 1; 2650 else 2465 else 2651 si->avail_lis !! 2466 p->avail_lists[i].prio = -p->prio; 2652 } 2467 } 2653 } 2468 } 2654 si->swap_map = swap_map; !! 2469 p->swap_map = swap_map; 2655 si->cluster_info = cluster_info; !! 2470 p->cluster_info = cluster_info; 2656 si->zeromap = zeromap; << 2657 } 2471 } 2658 2472 2659 static void _enable_swap_info(struct swap_inf !! 2473 static void _enable_swap_info(struct swap_info_struct *p) 2660 { 2474 { 2661 si->flags |= SWP_WRITEOK; !! 2475 p->flags |= SWP_WRITEOK; 2662 atomic_long_add(si->pages, &nr_swap_p !! 2476 atomic_long_add(p->pages, &nr_swap_pages); 2663 total_swap_pages += si->pages; !! 2477 total_swap_pages += p->pages; 2664 2478 2665 assert_spin_locked(&swap_lock); 2479 assert_spin_locked(&swap_lock); 2666 /* 2480 /* 2667 * both lists are plists, and thus pr 2481 * both lists are plists, and thus priority ordered. 2668 * swap_active_head needs to be prior 2482 * swap_active_head needs to be priority ordered for swapoff(), 2669 * which on removal of any swap_info_ 2483 * which on removal of any swap_info_struct with an auto-assigned 2670 * (i.e. negative) priority increment 2484 * (i.e. negative) priority increments the auto-assigned priority 2671 * of any lower-priority swap_info_st 2485 * of any lower-priority swap_info_structs. 2672 * swap_avail_head needs to be priori !! 2486 * swap_avail_head needs to be priority ordered for get_swap_page(), 2673 * which allocates swap pages from th 2487 * which allocates swap pages from the highest available priority 2674 * swap_info_struct. 2488 * swap_info_struct. 2675 */ 2489 */ 2676 plist_add(&si->list, &swap_active_hea !! 2490 plist_add(&p->list, &swap_active_head); 2677 !! 2491 add_to_avail_list(p); 2678 /* add to available list iff swap dev << 2679 if (si->highest_bit) << 2680 add_to_avail_list(si); << 2681 } 2492 } 2682 2493 2683 static void enable_swap_info(struct swap_info !! 2494 static void enable_swap_info(struct swap_info_struct *p, int prio, 2684 unsigned char 2495 unsigned char *swap_map, 2685 struct swap_c 2496 struct swap_cluster_info *cluster_info, 2686 unsigned long !! 2497 unsigned long *frontswap_map) 2687 { 2498 { >> 2499 frontswap_init(p->type, frontswap_map); 2688 spin_lock(&swap_lock); 2500 spin_lock(&swap_lock); 2689 spin_lock(&si->lock); !! 2501 spin_lock(&p->lock); 2690 setup_swap_info(si, prio, swap_map, c !! 2502 setup_swap_info(p, prio, swap_map, cluster_info); 2691 spin_unlock(&si->lock); !! 2503 spin_unlock(&p->lock); 2692 spin_unlock(&swap_lock); 2504 spin_unlock(&swap_lock); 2693 /* 2505 /* 2694 * Finished initializing swap device, 2506 * Finished initializing swap device, now it's safe to reference it. 2695 */ 2507 */ 2696 percpu_ref_resurrect(&si->users); !! 2508 percpu_ref_resurrect(&p->users); 2697 spin_lock(&swap_lock); 2509 spin_lock(&swap_lock); 2698 spin_lock(&si->lock); !! 2510 spin_lock(&p->lock); 2699 _enable_swap_info(si); !! 2511 _enable_swap_info(p); 2700 spin_unlock(&si->lock); !! 2512 spin_unlock(&p->lock); 2701 spin_unlock(&swap_lock); 2513 spin_unlock(&swap_lock); 2702 } 2514 } 2703 2515 2704 static void reinsert_swap_info(struct swap_in !! 
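
/*
 * Standalone sketch of the priority mapping in setup_swap_info() above:
 * plists sort ascending while swap picks the highest priority first, so the
 * plist key is the negated swap priority; an auto-assigned (negative)
 * priority gets key 1 on the device's own NUMA node so local devices win
 * there.  The node numbers and priorities below are examples only.
 */
#include <stdio.h>

static int plist_key(int swap_prio, int list_node, int device_node)
{
        if (swap_prio >= 0)
                return -swap_prio;              /* explicit priority */
        if (list_node == device_node)
                return 1;                       /* prefer the local device */
        return -swap_prio;                      /* e.g. prio -2 -> key 2 */
}

int main(void)
{
        printf("prio 5  on any node:          key %d\n", plist_key(5, 0, 1));
        printf("prio -2 on node 1 (local):    key %d\n", plist_key(-2, 1, 1));
        printf("prio -2 on node 0 (remote):   key %d\n", plist_key(-2, 0, 1));
        return 0;
}
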
2516 static void reinsert_swap_info(struct swap_info_struct *p) 2705 { 2517 { 2706 spin_lock(&swap_lock); 2518 spin_lock(&swap_lock); 2707 spin_lock(&si->lock); !! 2519 spin_lock(&p->lock); 2708 setup_swap_info(si, si->prio, si->swa !! 2520 setup_swap_info(p, p->prio, p->swap_map, p->cluster_info); 2709 _enable_swap_info(si); !! 2521 _enable_swap_info(p); 2710 spin_unlock(&si->lock); !! 2522 spin_unlock(&p->lock); 2711 spin_unlock(&swap_lock); 2523 spin_unlock(&swap_lock); 2712 } 2524 } 2713 2525 2714 static bool __has_usable_swap(void) << 2715 { << 2716 return !plist_head_empty(&swap_active << 2717 } << 2718 << 2719 bool has_usable_swap(void) 2526 bool has_usable_swap(void) 2720 { 2527 { 2721 bool ret; !! 2528 bool ret = true; 2722 2529 2723 spin_lock(&swap_lock); 2530 spin_lock(&swap_lock); 2724 ret = __has_usable_swap(); !! 2531 if (plist_head_empty(&swap_active_head)) >> 2532 ret = false; 2725 spin_unlock(&swap_lock); 2533 spin_unlock(&swap_lock); 2726 return ret; 2534 return ret; 2727 } 2535 } 2728 2536 2729 SYSCALL_DEFINE1(swapoff, const char __user *, 2537 SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) 2730 { 2538 { 2731 struct swap_info_struct *p = NULL; 2539 struct swap_info_struct *p = NULL; 2732 unsigned char *swap_map; 2540 unsigned char *swap_map; 2733 unsigned long *zeromap; << 2734 struct swap_cluster_info *cluster_inf 2541 struct swap_cluster_info *cluster_info; >> 2542 unsigned long *frontswap_map; 2735 struct file *swap_file, *victim; 2543 struct file *swap_file, *victim; 2736 struct address_space *mapping; 2544 struct address_space *mapping; 2737 struct inode *inode; 2545 struct inode *inode; 2738 struct filename *pathname; 2546 struct filename *pathname; 2739 int err, found = 0; 2547 int err, found = 0; >> 2548 unsigned int old_block_size; 2740 2549 2741 if (!capable(CAP_SYS_ADMIN)) 2550 if (!capable(CAP_SYS_ADMIN)) 2742 return -EPERM; 2551 return -EPERM; 2743 2552 2744 BUG_ON(!current->mm); 2553 BUG_ON(!current->mm); 2745 2554 2746 pathname = getname(specialfile); 2555 pathname = getname(specialfile); 2747 if (IS_ERR(pathname)) 2556 if (IS_ERR(pathname)) 2748 return PTR_ERR(pathname); 2557 return PTR_ERR(pathname); 2749 2558 2750 victim = file_open_name(pathname, O_R 2559 victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0); 2751 err = PTR_ERR(victim); 2560 err = PTR_ERR(victim); 2752 if (IS_ERR(victim)) 2561 if (IS_ERR(victim)) 2753 goto out; 2562 goto out; 2754 2563 2755 mapping = victim->f_mapping; 2564 mapping = victim->f_mapping; 2756 spin_lock(&swap_lock); 2565 spin_lock(&swap_lock); 2757 plist_for_each_entry(p, &swap_active_ 2566 plist_for_each_entry(p, &swap_active_head, list) { 2758 if (p->flags & SWP_WRITEOK) { 2567 if (p->flags & SWP_WRITEOK) { 2759 if (p->swap_file->f_m 2568 if (p->swap_file->f_mapping == mapping) { 2760 found = 1; 2569 found = 1; 2761 break; 2570 break; 2762 } 2571 } 2763 } 2572 } 2764 } 2573 } 2765 if (!found) { 2574 if (!found) { 2766 err = -EINVAL; 2575 err = -EINVAL; 2767 spin_unlock(&swap_lock); 2576 spin_unlock(&swap_lock); 2768 goto out_dput; 2577 goto out_dput; 2769 } 2578 } 2770 if (!security_vm_enough_memory_mm(cur 2579 if (!security_vm_enough_memory_mm(current->mm, p->pages)) 2771 vm_unacct_memory(p->pages); 2580 vm_unacct_memory(p->pages); 2772 else { 2581 else { 2773 err = -ENOMEM; 2582 err = -ENOMEM; 2774 spin_unlock(&swap_lock); 2583 spin_unlock(&swap_lock); 2775 goto out_dput; 2584 goto out_dput; 2776 } 2585 } 2777 spin_lock(&p->lock); 2586 spin_lock(&p->lock); 2778 del_from_avail_list(p); 2587 
del_from_avail_list(p); 2779 if (p->prio < 0) { 2588 if (p->prio < 0) { 2780 struct swap_info_struct *si = 2589 struct swap_info_struct *si = p; 2781 int nid; 2590 int nid; 2782 2591 2783 plist_for_each_entry_continue 2592 plist_for_each_entry_continue(si, &swap_active_head, list) { 2784 si->prio++; 2593 si->prio++; 2785 si->list.prio--; 2594 si->list.prio--; 2786 for_each_node(nid) { 2595 for_each_node(nid) { 2787 if (si->avail 2596 if (si->avail_lists[nid].prio != 1) 2788 si->a 2597 si->avail_lists[nid].prio--; 2789 } 2598 } 2790 } 2599 } 2791 least_priority++; 2600 least_priority++; 2792 } 2601 } 2793 plist_del(&p->list, &swap_active_head 2602 plist_del(&p->list, &swap_active_head); 2794 atomic_long_sub(p->pages, &nr_swap_pa 2603 atomic_long_sub(p->pages, &nr_swap_pages); 2795 total_swap_pages -= p->pages; 2604 total_swap_pages -= p->pages; 2796 p->flags &= ~SWP_WRITEOK; 2605 p->flags &= ~SWP_WRITEOK; 2797 spin_unlock(&p->lock); 2606 spin_unlock(&p->lock); 2798 spin_unlock(&swap_lock); 2607 spin_unlock(&swap_lock); 2799 2608 2800 disable_swap_slots_cache_lock(); 2609 disable_swap_slots_cache_lock(); 2801 2610 2802 set_current_oom_origin(); 2611 set_current_oom_origin(); 2803 err = try_to_unuse(p->type); !! 2612 err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ 2804 clear_current_oom_origin(); 2613 clear_current_oom_origin(); 2805 2614 2806 if (err) { 2615 if (err) { 2807 /* re-insert swap space back 2616 /* re-insert swap space back into swap_list */ 2808 reinsert_swap_info(p); 2617 reinsert_swap_info(p); 2809 reenable_swap_slots_cache_unl 2618 reenable_swap_slots_cache_unlock(); 2810 goto out_dput; 2619 goto out_dput; 2811 } 2620 } 2812 2621 2813 reenable_swap_slots_cache_unlock(); 2622 reenable_swap_slots_cache_unlock(); 2814 2623 2815 /* 2624 /* 2816 * Wait for swap operations protected 2625 * Wait for swap operations protected by get/put_swap_device() 2817 * to complete. Because of synchroni !! 2626 * to complete. 2818 * operations protected by RCU reader !! 2627 * 2819 * spinlock) will be waited too. Thi !! 2628 * We need synchronize_rcu() here to protect the accessing to 2820 * prevent folio_test_swapcache() and !! 2629 * the swap cache data structure. 2821 * operations from racing with swapof << 2822 */ 2630 */ 2823 percpu_ref_kill(&p->users); 2631 percpu_ref_kill(&p->users); 2824 synchronize_rcu(); 2632 synchronize_rcu(); 2825 wait_for_completion(&p->comp); 2633 wait_for_completion(&p->comp); 2826 2634 2827 flush_work(&p->discard_work); 2635 flush_work(&p->discard_work); 2828 flush_work(&p->reclaim_work); << 2829 2636 2830 destroy_swap_extents(p); 2637 destroy_swap_extents(p); 2831 if (p->flags & SWP_CONTINUED) 2638 if (p->flags & SWP_CONTINUED) 2832 free_swap_count_continuations 2639 free_swap_count_continuations(p); 2833 2640 2834 if (!p->bdev || !bdev_nonrot(p->bdev) !! 
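
/*
 * Standalone sketch of the readjustment swapoff does above when it removes a
 * device with an auto-assigned (negative) priority: every remaining
 * lower-priority device moves up by one so the auto priorities stay dense
 * (-1, -2, -3, ...), while per-node availability keys holding the local-node
 * bonus of 1 are left alone.  The two-node array and values are invented.
 */
#include <stdio.h>

struct dev {
        int prio;               /* swap priority, negative = auto-assigned */
        int node_key[2];        /* one availability-list key per NUMA node */
};

static void removed_auto_prio_above(struct dev *d)
{
        d->prio++;                              /* e.g. -3 becomes -2 */
        for (int nid = 0; nid < 2; nid++)
                if (d->node_key[nid] != 1)      /* keep the local-node bonus */
                        d->node_key[nid]--;
}

int main(void)
{
        /* a device with auto priority -3, local to node 1 */
        struct dev d = { .prio = -3, .node_key = { 3, 1 } };

        removed_auto_prio_above(&d);
        printf("prio %d, keys {%d, %d}\n", d.prio, d.node_key[0], d.node_key[1]);
        return 0;
}
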
2641 if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev))) 2835 atomic_dec(&nr_rotate_swap); 2642 atomic_dec(&nr_rotate_swap); 2836 2643 2837 mutex_lock(&swapon_mutex); 2644 mutex_lock(&swapon_mutex); 2838 spin_lock(&swap_lock); 2645 spin_lock(&swap_lock); 2839 spin_lock(&p->lock); 2646 spin_lock(&p->lock); 2840 drain_mmlist(); 2647 drain_mmlist(); 2841 2648 2842 /* wait for anyone still in scan_swap 2649 /* wait for anyone still in scan_swap_map_slots */ 2843 p->highest_bit = 0; /* cu 2650 p->highest_bit = 0; /* cuts scans short */ 2844 while (p->flags >= SWP_SCANNING) { 2651 while (p->flags >= SWP_SCANNING) { 2845 spin_unlock(&p->lock); 2652 spin_unlock(&p->lock); 2846 spin_unlock(&swap_lock); 2653 spin_unlock(&swap_lock); 2847 schedule_timeout_uninterrupti 2654 schedule_timeout_uninterruptible(1); 2848 spin_lock(&swap_lock); 2655 spin_lock(&swap_lock); 2849 spin_lock(&p->lock); 2656 spin_lock(&p->lock); 2850 } 2657 } 2851 2658 2852 swap_file = p->swap_file; 2659 swap_file = p->swap_file; >> 2660 old_block_size = p->old_block_size; 2853 p->swap_file = NULL; 2661 p->swap_file = NULL; 2854 p->max = 0; 2662 p->max = 0; 2855 swap_map = p->swap_map; 2663 swap_map = p->swap_map; 2856 p->swap_map = NULL; 2664 p->swap_map = NULL; 2857 zeromap = p->zeromap; << 2858 p->zeromap = NULL; << 2859 cluster_info = p->cluster_info; 2665 cluster_info = p->cluster_info; 2860 p->cluster_info = NULL; 2666 p->cluster_info = NULL; >> 2667 frontswap_map = frontswap_map_get(p); 2861 spin_unlock(&p->lock); 2668 spin_unlock(&p->lock); 2862 spin_unlock(&swap_lock); 2669 spin_unlock(&swap_lock); 2863 arch_swap_invalidate_area(p->type); 2670 arch_swap_invalidate_area(p->type); 2864 zswap_swapoff(p->type); !! 2671 frontswap_invalidate_area(p->type); >> 2672 frontswap_map_set(p, NULL); 2865 mutex_unlock(&swapon_mutex); 2673 mutex_unlock(&swapon_mutex); 2866 free_percpu(p->percpu_cluster); 2674 free_percpu(p->percpu_cluster); 2867 p->percpu_cluster = NULL; 2675 p->percpu_cluster = NULL; 2868 free_percpu(p->cluster_next_cpu); 2676 free_percpu(p->cluster_next_cpu); 2869 p->cluster_next_cpu = NULL; 2677 p->cluster_next_cpu = NULL; 2870 vfree(swap_map); 2678 vfree(swap_map); 2871 kvfree(zeromap); << 2872 kvfree(cluster_info); 2679 kvfree(cluster_info); >> 2680 kvfree(frontswap_map); 2873 /* Destroy swap account information * 2681 /* Destroy swap account information */ 2874 swap_cgroup_swapoff(p->type); 2682 swap_cgroup_swapoff(p->type); 2875 exit_swap_address_space(p->type); 2683 exit_swap_address_space(p->type); 2876 2684 2877 inode = mapping->host; 2685 inode = mapping->host; >> 2686 if (S_ISBLK(inode->i_mode)) { >> 2687 struct block_device *bdev = I_BDEV(inode); >> 2688 >> 2689 set_blocksize(bdev, old_block_size); >> 2690 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); >> 2691 } 2878 2692 2879 inode_lock(inode); 2693 inode_lock(inode); 2880 inode->i_flags &= ~S_SWAPFILE; 2694 inode->i_flags &= ~S_SWAPFILE; 2881 inode_unlock(inode); 2695 inode_unlock(inode); 2882 filp_close(swap_file, NULL); 2696 filp_close(swap_file, NULL); 2883 2697 2884 /* 2698 /* 2885 * Clear the SWP_USED flag after all 2699 * Clear the SWP_USED flag after all resources are freed so that swapon 2886 * can reuse this swap_info in alloc_ 2700 * can reuse this swap_info in alloc_swap_info() safely. It is ok to 2887 * not hold p->lock after we cleared 2701 * not hold p->lock after we cleared its SWP_WRITEOK. 
2888 */ 2702 */ 2889 spin_lock(&swap_lock); 2703 spin_lock(&swap_lock); 2890 p->flags = 0; 2704 p->flags = 0; 2891 spin_unlock(&swap_lock); 2705 spin_unlock(&swap_lock); 2892 2706 2893 err = 0; 2707 err = 0; 2894 atomic_inc(&proc_poll_event); 2708 atomic_inc(&proc_poll_event); 2895 wake_up_interruptible(&proc_poll_wait 2709 wake_up_interruptible(&proc_poll_wait); 2896 2710 2897 out_dput: 2711 out_dput: 2898 filp_close(victim, NULL); 2712 filp_close(victim, NULL); 2899 out: 2713 out: 2900 putname(pathname); 2714 putname(pathname); 2901 return err; 2715 return err; 2902 } 2716 } 2903 2717 2904 #ifdef CONFIG_PROC_FS 2718 #ifdef CONFIG_PROC_FS 2905 static __poll_t swaps_poll(struct file *file, 2719 static __poll_t swaps_poll(struct file *file, poll_table *wait) 2906 { 2720 { 2907 struct seq_file *seq = file->private_ 2721 struct seq_file *seq = file->private_data; 2908 2722 2909 poll_wait(file, &proc_poll_wait, wait 2723 poll_wait(file, &proc_poll_wait, wait); 2910 2724 2911 if (seq->poll_event != atomic_read(&p 2725 if (seq->poll_event != atomic_read(&proc_poll_event)) { 2912 seq->poll_event = atomic_read 2726 seq->poll_event = atomic_read(&proc_poll_event); 2913 return EPOLLIN | EPOLLRDNORM 2727 return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI; 2914 } 2728 } 2915 2729 2916 return EPOLLIN | EPOLLRDNORM; 2730 return EPOLLIN | EPOLLRDNORM; 2917 } 2731 } 2918 2732 2919 /* iterator */ 2733 /* iterator */ 2920 static void *swap_start(struct seq_file *swap 2734 static void *swap_start(struct seq_file *swap, loff_t *pos) 2921 { 2735 { 2922 struct swap_info_struct *si; 2736 struct swap_info_struct *si; 2923 int type; 2737 int type; 2924 loff_t l = *pos; 2738 loff_t l = *pos; 2925 2739 2926 mutex_lock(&swapon_mutex); 2740 mutex_lock(&swapon_mutex); 2927 2741 2928 if (!l) 2742 if (!l) 2929 return SEQ_START_TOKEN; 2743 return SEQ_START_TOKEN; 2930 2744 2931 for (type = 0; (si = swap_type_to_swa 2745 for (type = 0; (si = swap_type_to_swap_info(type)); type++) { 2932 if (!(si->flags & SWP_USED) | 2746 if (!(si->flags & SWP_USED) || !si->swap_map) 2933 continue; 2747 continue; 2934 if (!--l) 2748 if (!--l) 2935 return si; 2749 return si; 2936 } 2750 } 2937 2751 2938 return NULL; 2752 return NULL; 2939 } 2753 } 2940 2754 2941 static void *swap_next(struct seq_file *swap, 2755 static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) 2942 { 2756 { 2943 struct swap_info_struct *si = v; 2757 struct swap_info_struct *si = v; 2944 int type; 2758 int type; 2945 2759 2946 if (v == SEQ_START_TOKEN) 2760 if (v == SEQ_START_TOKEN) 2947 type = 0; 2761 type = 0; 2948 else 2762 else 2949 type = si->type + 1; 2763 type = si->type + 1; 2950 2764 2951 ++(*pos); 2765 ++(*pos); 2952 for (; (si = swap_type_to_swap_info(t 2766 for (; (si = swap_type_to_swap_info(type)); type++) { 2953 if (!(si->flags & SWP_USED) | 2767 if (!(si->flags & SWP_USED) || !si->swap_map) 2954 continue; 2768 continue; 2955 return si; 2769 return si; 2956 } 2770 } 2957 2771 2958 return NULL; 2772 return NULL; 2959 } 2773 } 2960 2774 2961 static void swap_stop(struct seq_file *swap, 2775 static void swap_stop(struct seq_file *swap, void *v) 2962 { 2776 { 2963 mutex_unlock(&swapon_mutex); 2777 mutex_unlock(&swapon_mutex); 2964 } 2778 } 2965 2779 2966 static int swap_show(struct seq_file *swap, v 2780 static int swap_show(struct seq_file *swap, void *v) 2967 { 2781 { 2968 struct swap_info_struct *si = v; 2782 struct swap_info_struct *si = v; 2969 struct file *file; 2783 struct file *file; 2970 int len; 2784 int len; 2971 unsigned long bytes, 
inuse; !! 2785 unsigned int bytes, inuse; 2972 2786 2973 if (si == SEQ_START_TOKEN) { 2787 if (si == SEQ_START_TOKEN) { 2974 seq_puts(swap, "Filename\t\t\ 2788 seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); 2975 return 0; 2789 return 0; 2976 } 2790 } 2977 2791 2978 bytes = K(si->pages); !! 2792 bytes = si->pages << (PAGE_SHIFT - 10); 2979 inuse = K(READ_ONCE(si->inuse_pages)) !! 2793 inuse = si->inuse_pages << (PAGE_SHIFT - 10); 2980 2794 2981 file = si->swap_file; 2795 file = si->swap_file; 2982 len = seq_file_path(swap, file, " \t\ 2796 len = seq_file_path(swap, file, " \t\n\\"); 2983 seq_printf(swap, "%*s%s\t%lu\t%s%lu\t !! 2797 seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n", 2984 len < 40 ? 40 - len : 2798 len < 40 ? 40 - len : 1, " ", 2985 S_ISBLK(file_inode(fi 2799 S_ISBLK(file_inode(file)->i_mode) ? 2986 "partition" : 2800 "partition" : "file\t", 2987 bytes, bytes < 100000 2801 bytes, bytes < 10000000 ? "\t" : "", 2988 inuse, inuse < 100000 2802 inuse, inuse < 10000000 ? "\t" : "", 2989 si->prio); 2803 si->prio); 2990 return 0; 2804 return 0; 2991 } 2805 } 2992 2806 2993 static const struct seq_operations swaps_op = 2807 static const struct seq_operations swaps_op = { 2994 .start = swap_start, 2808 .start = swap_start, 2995 .next = swap_next, 2809 .next = swap_next, 2996 .stop = swap_stop, 2810 .stop = swap_stop, 2997 .show = swap_show 2811 .show = swap_show 2998 }; 2812 }; 2999 2813 3000 static int swaps_open(struct inode *inode, st 2814 static int swaps_open(struct inode *inode, struct file *file) 3001 { 2815 { 3002 struct seq_file *seq; 2816 struct seq_file *seq; 3003 int ret; 2817 int ret; 3004 2818 3005 ret = seq_open(file, &swaps_op); 2819 ret = seq_open(file, &swaps_op); 3006 if (ret) 2820 if (ret) 3007 return ret; 2821 return ret; 3008 2822 3009 seq = file->private_data; 2823 seq = file->private_data; 3010 seq->poll_event = atomic_read(&proc_p 2824 seq->poll_event = atomic_read(&proc_poll_event); 3011 return 0; 2825 return 0; 3012 } 2826 } 3013 2827 3014 static const struct proc_ops swaps_proc_ops = 2828 static const struct proc_ops swaps_proc_ops = { 3015 .proc_flags = PROC_ENTRY_PERMANEN 2829 .proc_flags = PROC_ENTRY_PERMANENT, 3016 .proc_open = swaps_open, 2830 .proc_open = swaps_open, 3017 .proc_read = seq_read, 2831 .proc_read = seq_read, 3018 .proc_lseek = seq_lseek, 2832 .proc_lseek = seq_lseek, 3019 .proc_release = seq_release, 2833 .proc_release = seq_release, 3020 .proc_poll = swaps_poll, 2834 .proc_poll = swaps_poll, 3021 }; 2835 }; 3022 2836 3023 static int __init procswaps_init(void) 2837 static int __init procswaps_init(void) 3024 { 2838 { 3025 proc_create("swaps", 0, NULL, &swaps_ 2839 proc_create("swaps", 0, NULL, &swaps_proc_ops); 3026 return 0; 2840 return 0; 3027 } 2841 } 3028 __initcall(procswaps_init); 2842 __initcall(procswaps_init); 3029 #endif /* CONFIG_PROC_FS */ 2843 #endif /* CONFIG_PROC_FS */ 3030 2844 3031 #ifdef MAX_SWAPFILES_CHECK 2845 #ifdef MAX_SWAPFILES_CHECK 3032 static int __init max_swapfiles_check(void) 2846 static int __init max_swapfiles_check(void) 3033 { 2847 { 3034 MAX_SWAPFILES_CHECK(); 2848 MAX_SWAPFILES_CHECK(); 3035 return 0; 2849 return 0; 3036 } 2850 } 3037 late_initcall(max_swapfiles_check); 2851 late_initcall(max_swapfiles_check); 3038 #endif 2852 #endif 3039 2853 3040 static struct swap_info_struct *alloc_swap_in 2854 static struct swap_info_struct *alloc_swap_info(void) 3041 { 2855 { 3042 struct swap_info_struct *p; 2856 struct swap_info_struct *p; 3043 struct swap_info_struct *defer = NULL 2857 struct 
swap_info_struct *defer = NULL; 3044 unsigned int type; 2858 unsigned int type; 3045 int i; 2859 int i; 3046 2860 3047 p = kvzalloc(struct_size(p, avail_lis 2861 p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL); 3048 if (!p) 2862 if (!p) 3049 return ERR_PTR(-ENOMEM); 2863 return ERR_PTR(-ENOMEM); 3050 2864 3051 if (percpu_ref_init(&p->users, swap_u 2865 if (percpu_ref_init(&p->users, swap_users_ref_free, 3052 PERCPU_REF_INIT_D 2866 PERCPU_REF_INIT_DEAD, GFP_KERNEL)) { 3053 kvfree(p); 2867 kvfree(p); 3054 return ERR_PTR(-ENOMEM); 2868 return ERR_PTR(-ENOMEM); 3055 } 2869 } 3056 2870 3057 spin_lock(&swap_lock); 2871 spin_lock(&swap_lock); 3058 for (type = 0; type < nr_swapfiles; t 2872 for (type = 0; type < nr_swapfiles; type++) { 3059 if (!(swap_info[type]->flags 2873 if (!(swap_info[type]->flags & SWP_USED)) 3060 break; 2874 break; 3061 } 2875 } 3062 if (type >= MAX_SWAPFILES) { 2876 if (type >= MAX_SWAPFILES) { 3063 spin_unlock(&swap_lock); 2877 spin_unlock(&swap_lock); 3064 percpu_ref_exit(&p->users); 2878 percpu_ref_exit(&p->users); 3065 kvfree(p); 2879 kvfree(p); 3066 return ERR_PTR(-EPERM); 2880 return ERR_PTR(-EPERM); 3067 } 2881 } 3068 if (type >= nr_swapfiles) { 2882 if (type >= nr_swapfiles) { 3069 p->type = type; 2883 p->type = type; 3070 /* 2884 /* 3071 * Publish the swap_info_stru 2885 * Publish the swap_info_struct after initializing it. 3072 * Note that kvzalloc() above 2886 * Note that kvzalloc() above zeroes all its fields. 3073 */ 2887 */ 3074 smp_store_release(&swap_info[ 2888 smp_store_release(&swap_info[type], p); /* rcu_assign_pointer() */ 3075 nr_swapfiles++; 2889 nr_swapfiles++; 3076 } else { 2890 } else { 3077 defer = p; 2891 defer = p; 3078 p = swap_info[type]; 2892 p = swap_info[type]; 3079 /* 2893 /* 3080 * Do not memset this entry: 2894 * Do not memset this entry: a racing procfs swap_next() 3081 * would be relying on p->typ 2895 * would be relying on p->type to remain valid. 3082 */ 2896 */ 3083 } 2897 } 3084 p->swap_extent_root = RB_ROOT; 2898 p->swap_extent_root = RB_ROOT; 3085 plist_node_init(&p->list, 0); 2899 plist_node_init(&p->list, 0); 3086 for_each_node(i) 2900 for_each_node(i) 3087 plist_node_init(&p->avail_lis 2901 plist_node_init(&p->avail_lists[i], 0); 3088 p->flags = SWP_USED; 2902 p->flags = SWP_USED; 3089 spin_unlock(&swap_lock); 2903 spin_unlock(&swap_lock); 3090 if (defer) { 2904 if (defer) { 3091 percpu_ref_exit(&defer->users 2905 percpu_ref_exit(&defer->users); 3092 kvfree(defer); 2906 kvfree(defer); 3093 } 2907 } 3094 spin_lock_init(&p->lock); 2908 spin_lock_init(&p->lock); 3095 spin_lock_init(&p->cont_lock); 2909 spin_lock_init(&p->cont_lock); 3096 init_completion(&p->comp); 2910 init_completion(&p->comp); 3097 2911 3098 return p; 2912 return p; 3099 } 2913 } 3100 2914 3101 static int claim_swapfile(struct swap_info_st !! 2915 static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) 3102 { 2916 { >> 2917 int error; >> 2918 3103 if (S_ISBLK(inode->i_mode)) { 2919 if (S_ISBLK(inode->i_mode)) { 3104 si->bdev = I_BDEV(inode); !! 
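
/*
 * Standalone C11 sketch of the publication pattern in alloc_swap_info()
 * above: fully initialise the object, then publish the pointer with a release
 * store (smp_store_release() in the kernel); lock-free readers pair it with
 * an acquire load so they never observe a half-initialised object.  The types
 * and field names here are invented.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct info { int type; int flags; };

static _Atomic(struct info *) slot;

static void publish(int type)
{
        struct info *p = calloc(1, sizeof(*p));

        if (!p)
                return;
        p->type = type;                 /* initialise everything first ... */
        p->flags = 0x1;
        atomic_store_explicit(&slot, p, memory_order_release); /* ... then publish */
}

static struct info *lookup(void)
{
        return atomic_load_explicit(&slot, memory_order_acquire);
}

int main(void)
{
        publish(0);
        struct info *p = lookup();

        if (p)
                printf("type %d flags %#x\n", p->type, (unsigned)p->flags);
        return 0;
}
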
2920 p->bdev = blkdev_get_by_dev(inode->i_rdev, >> 2921 FMODE_READ | FMODE_WRITE | FMODE_EXCL, p); >> 2922 if (IS_ERR(p->bdev)) { >> 2923 error = PTR_ERR(p->bdev); >> 2924 p->bdev = NULL; >> 2925 return error; >> 2926 } >> 2927 p->old_block_size = block_size(p->bdev); >> 2928 error = set_blocksize(p->bdev, PAGE_SIZE); >> 2929 if (error < 0) >> 2930 return error; 3105 /* 2931 /* 3106 * Zoned block devices contai 2932 * Zoned block devices contain zones that have a sequential 3107 * write only restriction. H 2933 * write only restriction. Hence zoned block devices are not 3108 * suitable for swapping. Di 2934 * suitable for swapping. Disallow them here. 3109 */ 2935 */ 3110 if (bdev_is_zoned(si->bdev)) !! 2936 if (blk_queue_is_zoned(p->bdev->bd_disk->queue)) 3111 return -EINVAL; 2937 return -EINVAL; 3112 si->flags |= SWP_BLKDEV; !! 2938 p->flags |= SWP_BLKDEV; 3113 } else if (S_ISREG(inode->i_mode)) { 2939 } else if (S_ISREG(inode->i_mode)) { 3114 si->bdev = inode->i_sb->s_bde !! 2940 p->bdev = inode->i_sb->s_bdev; 3115 } 2941 } 3116 2942 3117 return 0; 2943 return 0; 3118 } 2944 } 3119 2945 3120 2946 3121 /* 2947 /* 3122 * Find out how many pages are allowed for a 2948 * Find out how many pages are allowed for a single swap device. There 3123 * are two limiting factors: 2949 * are two limiting factors: 3124 * 1) the number of bits for the swap offset 2950 * 1) the number of bits for the swap offset in the swp_entry_t type, and 3125 * 2) the number of bits in the swap pte, as 2951 * 2) the number of bits in the swap pte, as defined by the different 3126 * architectures. 2952 * architectures. 3127 * 2953 * 3128 * In order to find the largest possible bit 2954 * In order to find the largest possible bit mask, a swap entry with 3129 * swap type 0 and swap offset ~0UL is create 2955 * swap type 0 and swap offset ~0UL is created, encoded to a swap pte, 3130 * decoded to a swp_entry_t again, and finall 2956 * decoded to a swp_entry_t again, and finally the swap offset is 3131 * extracted. 2957 * extracted. 3132 * 2958 * 3133 * This will mask all the bits from the initi 2959 * This will mask all the bits from the initial ~0UL mask that can't 3134 * be encoded in either the swp_entry_t or th 2960 * be encoded in either the swp_entry_t or the architecture definition 3135 * of a swap pte. 2961 * of a swap pte. 3136 */ 2962 */ 3137 unsigned long generic_max_swapfile_size(void) 2963 unsigned long generic_max_swapfile_size(void) 3138 { 2964 { 3139 return swp_offset(pte_to_swp_entry( 2965 return swp_offset(pte_to_swp_entry( 3140 swp_entry_to_pte(swp_ 2966 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; 3141 } 2967 } 3142 2968 3143 /* Can be overridden by an architecture for a 2969 /* Can be overridden by an architecture for additional checks. */ 3144 __weak unsigned long arch_max_swapfile_size(v !! 2970 __weak unsigned long max_swapfile_size(void) 3145 { 2971 { 3146 return generic_max_swapfile_size(); 2972 return generic_max_swapfile_size(); 3147 } 2973 } 3148 2974 3149 static unsigned long read_swap_header(struct !! 
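
/*
 * Standalone sketch of the trick the comment above generic_max_swapfile_size()
 * describes: encode a swap entry with offset ~0UL into a (toy) swap pte,
 * decode it back, and the offset bits that survive the round trip bound the
 * device size.  The 20-bit pte offset field below is an invented example, not
 * any real architecture's layout.
 */
#include <stdio.h>

#define TOY_OFFSET_BITS  20
#define TOY_OFFSET_MASK  ((1UL << TOY_OFFSET_BITS) - 1)

/* toy "architecture": the pte can only hold 20 bits of swap offset */
static unsigned long toy_swp_entry_to_pte(unsigned long offset)
{
        return (offset & TOY_OFFSET_MASK) << 5;
}

static unsigned long toy_pte_to_swp_offset(unsigned long pte)
{
        return (pte >> 5) & TOY_OFFSET_MASK;
}

int main(void)
{
        unsigned long max_pages =
                toy_pte_to_swp_offset(toy_swp_entry_to_pte(~0UL)) + 1;

        printf("max swapfile size: %lu pages\n", max_pages);    /* 2^20 */
        return 0;
}
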
2975 static unsigned long read_swap_header(struct swap_info_struct *p, 3150 union 2976 union swap_header *swap_header, 3151 struc 2977 struct inode *inode) 3152 { 2978 { 3153 int i; 2979 int i; 3154 unsigned long maxpages; 2980 unsigned long maxpages; 3155 unsigned long swapfilepages; 2981 unsigned long swapfilepages; 3156 unsigned long last_page; 2982 unsigned long last_page; 3157 2983 3158 if (memcmp("SWAPSPACE2", swap_header- 2984 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { 3159 pr_err("Unable to find swap-s 2985 pr_err("Unable to find swap-space signature\n"); 3160 return 0; 2986 return 0; 3161 } 2987 } 3162 2988 3163 /* swap partition endianness hack... 2989 /* swap partition endianness hack... */ 3164 if (swab32(swap_header->info.version) 2990 if (swab32(swap_header->info.version) == 1) { 3165 swab32s(&swap_header->info.ve 2991 swab32s(&swap_header->info.version); 3166 swab32s(&swap_header->info.la 2992 swab32s(&swap_header->info.last_page); 3167 swab32s(&swap_header->info.nr 2993 swab32s(&swap_header->info.nr_badpages); 3168 if (swap_header->info.nr_badp 2994 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 3169 return 0; 2995 return 0; 3170 for (i = 0; i < swap_header-> 2996 for (i = 0; i < swap_header->info.nr_badpages; i++) 3171 swab32s(&swap_header- 2997 swab32s(&swap_header->info.badpages[i]); 3172 } 2998 } 3173 /* Check the swap header's sub-versio 2999 /* Check the swap header's sub-version */ 3174 if (swap_header->info.version != 1) { 3000 if (swap_header->info.version != 1) { 3175 pr_warn("Unable to handle swa 3001 pr_warn("Unable to handle swap header version %d\n", 3176 swap_header->info.ver 3002 swap_header->info.version); 3177 return 0; 3003 return 0; 3178 } 3004 } 3179 3005 3180 si->lowest_bit = 1; !! 3006 p->lowest_bit = 1; 3181 si->cluster_next = 1; !! 3007 p->cluster_next = 1; 3182 si->cluster_nr = 0; !! 3008 p->cluster_nr = 0; 3183 3009 3184 maxpages = swapfile_maximum_size; !! 3010 maxpages = max_swapfile_size(); 3185 last_page = swap_header->info.last_pa 3011 last_page = swap_header->info.last_page; 3186 if (!last_page) { 3012 if (!last_page) { 3187 pr_warn("Empty swap-file\n"); 3013 pr_warn("Empty swap-file\n"); 3188 return 0; 3014 return 0; 3189 } 3015 } 3190 if (last_page > maxpages) { 3016 if (last_page > maxpages) { 3191 pr_warn("Truncating oversized 3017 pr_warn("Truncating oversized swap area, only using %luk out of %luk\n", 3192 K(maxpages), K(last_p !! 3018 maxpages << (PAGE_SHIFT - 10), >> 3019 last_page << (PAGE_SHIFT - 10)); 3193 } 3020 } 3194 if (maxpages > last_page) { 3021 if (maxpages > last_page) { 3195 maxpages = last_page + 1; 3022 maxpages = last_page + 1; 3196 /* p->max is an unsigned int: 3023 /* p->max is an unsigned int: don't overflow it */ 3197 if ((unsigned int)maxpages == 3024 if ((unsigned int)maxpages == 0) 3198 maxpages = UINT_MAX; 3025 maxpages = UINT_MAX; 3199 } 3026 } 3200 si->highest_bit = maxpages - 1; !! 
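
/*
 * Standalone sketch of the endianness fix-up in read_swap_header() above: if
 * the version field only makes sense byte-swapped, the header was written by
 * an other-endian kernel, so the integer fields are swapped before use and
 * only sub-version 1 is accepted.  The struct below is a trimmed-down
 * stand-in for union swap_header.
 */
#include <stdint.h>
#include <stdio.h>

struct toy_header {
        uint32_t version;
        uint32_t last_page;
        uint32_t nr_badpages;
};

static uint32_t swab32(uint32_t x)
{
        return (x >> 24) | ((x >> 8) & 0xff00) |
               ((x << 8) & 0xff0000) | ((x & 0xffu) << 24);
}

static int fixup_header(struct toy_header *h)
{
        if (swab32(h->version) == 1) {          /* written other-endian */
                h->version = swab32(h->version);
                h->last_page = swab32(h->last_page);
                h->nr_badpages = swab32(h->nr_badpages);
        }
        return h->version == 1 ? 0 : -1;        /* only sub-version 1 handled */
}

int main(void)
{
        struct toy_header h = { swab32(1), swab32(1000), swab32(2) };

        printf("ok=%d last_page=%u\n", fixup_header(&h) == 0, h.last_page);
        return 0;
}
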
3027 p->highest_bit = maxpages - 1; 3201 3028 3202 if (!maxpages) 3029 if (!maxpages) 3203 return 0; 3030 return 0; 3204 swapfilepages = i_size_read(inode) >> 3031 swapfilepages = i_size_read(inode) >> PAGE_SHIFT; 3205 if (swapfilepages && maxpages > swapf 3032 if (swapfilepages && maxpages > swapfilepages) { 3206 pr_warn("Swap area shorter th 3033 pr_warn("Swap area shorter than signature indicates\n"); 3207 return 0; 3034 return 0; 3208 } 3035 } 3209 if (swap_header->info.nr_badpages && 3036 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) 3210 return 0; 3037 return 0; 3211 if (swap_header->info.nr_badpages > M 3038 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 3212 return 0; 3039 return 0; 3213 3040 3214 return maxpages; 3041 return maxpages; 3215 } 3042 } 3216 3043 3217 #define SWAP_CLUSTER_INFO_COLS 3044 #define SWAP_CLUSTER_INFO_COLS \ 3218 DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(s 3045 DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info)) 3219 #define SWAP_CLUSTER_SPACE_COLS 3046 #define SWAP_CLUSTER_SPACE_COLS \ 3220 DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES 3047 DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER) 3221 #define SWAP_CLUSTER_COLS 3048 #define SWAP_CLUSTER_COLS \ 3222 max_t(unsigned int, SWAP_CLUSTER_INFO 3049 max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS) 3223 3050 3224 static int setup_swap_map_and_extents(struct !! 3051 static int setup_swap_map_and_extents(struct swap_info_struct *p, 3225 union 3052 union swap_header *swap_header, 3226 unsig 3053 unsigned char *swap_map, >> 3054 struct swap_cluster_info *cluster_info, 3227 unsig 3055 unsigned long maxpages, 3228 secto 3056 sector_t *span) 3229 { 3057 { >> 3058 unsigned int j, k; 3230 unsigned int nr_good_pages; 3059 unsigned int nr_good_pages; 3231 unsigned long i; << 3232 int nr_extents; 3060 int nr_extents; >> 3061 unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); >> 3062 unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS; >> 3063 unsigned long i, idx; 3233 3064 3234 nr_good_pages = maxpages - 1; /* om 3065 nr_good_pages = maxpages - 1; /* omit header page */ 3235 3066 >> 3067 cluster_list_init(&p->free_clusters); >> 3068 cluster_list_init(&p->discard_clusters); >> 3069 3236 for (i = 0; i < swap_header->info.nr_ 3070 for (i = 0; i < swap_header->info.nr_badpages; i++) { 3237 unsigned int page_nr = swap_h 3071 unsigned int page_nr = swap_header->info.badpages[i]; 3238 if (page_nr == 0 || page_nr > 3072 if (page_nr == 0 || page_nr > swap_header->info.last_page) 3239 return -EINVAL; 3073 return -EINVAL; 3240 if (page_nr < maxpages) { 3074 if (page_nr < maxpages) { 3241 swap_map[page_nr] = S 3075 swap_map[page_nr] = SWAP_MAP_BAD; 3242 nr_good_pages--; 3076 nr_good_pages--; >> 3077 /* >> 3078 * Haven't marked the cluster free yet, no list >> 3079 * operation involved >> 3080 */ >> 3081 inc_cluster_info_page(p, cluster_info, page_nr); 3243 } 3082 } 3244 } 3083 } 3245 3084 >> 3085 /* Haven't marked the cluster free yet, no list operation involved */ >> 3086 for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) >> 3087 inc_cluster_info_page(p, cluster_info, i); >> 3088 3246 if (nr_good_pages) { 3089 if (nr_good_pages) { 3247 swap_map[0] = SWAP_MAP_BAD; 3090 swap_map[0] = SWAP_MAP_BAD; 3248 si->max = maxpages; !! 3091 /* 3249 si->pages = nr_good_pages; !! 3092 * Not mark the cluster free yet, no list 3250 nr_extents = setup_swap_exten !! 
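
/*
 * Standalone sketch of the swap_map setup in setup_swap_map_and_extents()
 * above: page 0 (the header) and every page in the header's bad-page list are
 * marked unusable, out-of-range entries make the whole header invalid, and
 * whatever remains is the usable page count.  SWAP_MAP_BAD's value is taken
 * as 0x3f here purely for illustration.
 */
#include <stdio.h>
#include <string.h>

#define TOY_MAP_BAD 0x3f

static long apply_badpages(unsigned char *map, unsigned long maxpages,
                           const unsigned int *bad, unsigned int nr_bad,
                           unsigned long last_page)
{
        unsigned long good = maxpages - 1;      /* omit the header page */

        memset(map, 0, maxpages);
        map[0] = TOY_MAP_BAD;
        for (unsigned int i = 0; i < nr_bad; i++) {
                unsigned int pg = bad[i];

                if (pg == 0 || pg > last_page)
                        return -1;              /* corrupt header */
                if (pg < maxpages) {
                        map[pg] = TOY_MAP_BAD;
                        good--;
                }
        }
        return good ? (long)good : -1;          /* nothing usable left */
}

int main(void)
{
        unsigned char map[16];
        unsigned int bad[] = { 3, 9 };

        printf("usable pages: %ld\n", apply_badpages(map, 16, bad, 2, 15));
        return 0;
}
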
3093 * operation involved >> 3094 */ >> 3095 inc_cluster_info_page(p, cluster_info, 0); >> 3096 p->max = maxpages; >> 3097 p->pages = nr_good_pages; >> 3098 nr_extents = setup_swap_extents(p, span); 3251 if (nr_extents < 0) 3099 if (nr_extents < 0) 3252 return nr_extents; 3100 return nr_extents; 3253 nr_good_pages = si->pages; !! 3101 nr_good_pages = p->pages; 3254 } 3102 } 3255 if (!nr_good_pages) { 3103 if (!nr_good_pages) { 3256 pr_warn("Empty swap-file\n"); 3104 pr_warn("Empty swap-file\n"); 3257 return -EINVAL; 3105 return -EINVAL; 3258 } 3106 } 3259 3107 3260 return nr_extents; << 3261 } << 3262 << 3263 static struct swap_cluster_info *setup_cluste << 3264 << 3265 << 3266 { << 3267 unsigned long nr_clusters = DIV_ROUND << 3268 unsigned long col = si->cluster_next << 3269 struct swap_cluster_info *cluster_inf << 3270 unsigned long i, j, k, idx; << 3271 int cpu, err = -ENOMEM; << 3272 << 3273 cluster_info = kvcalloc(nr_clusters, << 3274 if (!cluster_info) 3108 if (!cluster_info) 3275 goto err; !! 3109 return nr_extents; 3276 << 3277 for (i = 0; i < nr_clusters; i++) << 3278 spin_lock_init(&cluster_info[ << 3279 << 3280 si->cluster_next_cpu = alloc_percpu(u << 3281 if (!si->cluster_next_cpu) << 3282 goto err_free; << 3283 << 3284 /* Random start position to help with << 3285 for_each_possible_cpu(cpu) << 3286 per_cpu(*si->cluster_next_cpu << 3287 get_random_u32_inclusive(1, s << 3288 << 3289 si->percpu_cluster = alloc_percpu(str << 3290 if (!si->percpu_cluster) << 3291 goto err_free; << 3292 << 3293 for_each_possible_cpu(cpu) { << 3294 struct percpu_cluster *cluste << 3295 << 3296 cluster = per_cpu_ptr(si->per << 3297 for (i = 0; i < SWAP_NR_ORDER << 3298 cluster->next[i] = SW << 3299 } << 3300 3110 3301 /* << 3302 * Mark unusable pages as unavailable << 3303 * marked free yet, so no list operat << 3304 * << 3305 * See setup_swap_map_and_extents(): << 3306 * and the EOF part of the last clust << 3307 */ << 3308 inc_cluster_info_page(si, cluster_inf << 3309 for (i = 0; i < swap_header->info.nr_ << 3310 inc_cluster_info_page(si, clu << 3311 swap_he << 3312 for (i = maxpages; i < round_up(maxpa << 3313 inc_cluster_info_page(si, clu << 3314 << 3315 INIT_LIST_HEAD(&si->free_clusters); << 3316 INIT_LIST_HEAD(&si->full_clusters); << 3317 INIT_LIST_HEAD(&si->discard_clusters) << 3318 << 3319 for (i = 0; i < SWAP_NR_ORDERS; i++) << 3320 INIT_LIST_HEAD(&si->nonfull_c << 3321 INIT_LIST_HEAD(&si->frag_clus << 3322 si->frag_cluster_nr[i] = 0; << 3323 } << 3324 3111 3325 /* 3112 /* 3326 * Reduce false cache line sharing be 3113 * Reduce false cache line sharing between cluster_info and 3327 * sharing same address space. 3114 * sharing same address space. 3328 */ 3115 */ 3329 for (k = 0; k < SWAP_CLUSTER_COLS; k+ 3116 for (k = 0; k < SWAP_CLUSTER_COLS; k++) { 3330 j = (k + col) % SWAP_CLUSTER_ 3117 j = (k + col) % SWAP_CLUSTER_COLS; 3331 for (i = 0; i < DIV_ROUND_UP( 3118 for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { 3332 struct swap_cluster_i << 3333 idx = i * SWAP_CLUSTE 3119 idx = i * SWAP_CLUSTER_COLS + j; 3334 ci = cluster_info + i << 3335 if (idx >= nr_cluster 3120 if (idx >= nr_clusters) 3336 continue; 3121 continue; 3337 if (ci->count) { !! 3122 if (cluster_count(&cluster_info[idx])) 3338 ci->flags = C << 3339 list_add_tail << 3340 continue; 3123 continue; 3341 } !! 3124 cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); 3342 ci->flags = CLUSTER_F !! 3125 cluster_list_add_tail(&p->free_clusters, cluster_info, 3343 list_add_tail(&ci->li !! 
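
/*
 * Standalone sketch of the column-interleaved order used above when the free
 * cluster list is built, meant only to show which indices the loop visits:
 * idx = i * COLS + (k + col) % COLS, skipping indices past nr_clusters, so
 * that clusters handed out early are spread across cache-line-sized columns.
 * COLS corresponds to SWAP_CLUSTER_COLS; the values below are arbitrary.
 */
#include <stdio.h>

int main(void)
{
        const unsigned long cols = 4, nr_clusters = 10, col = 1;
        const unsigned long rows = (nr_clusters + cols - 1) / cols; /* DIV_ROUND_UP */

        for (unsigned long k = 0; k < cols; k++) {
                unsigned long j = (k + col) % cols;

                for (unsigned long i = 0; i < rows; i++) {
                        unsigned long idx = i * cols + j;

                        if (idx >= nr_clusters)
                                continue;
                        printf("visit cluster %lu\n", idx);
                }
        }
        return 0;
}
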
3126 idx); 3344 } 3127 } 3345 } 3128 } >> 3129 return nr_extents; >> 3130 } >> 3131 >> 3132 /* >> 3133 * Helper to sys_swapon determining if a given swap >> 3134 * backing device queue supports DISCARD operations. >> 3135 */ >> 3136 static bool swap_discardable(struct swap_info_struct *si) >> 3137 { >> 3138 struct request_queue *q = bdev_get_queue(si->bdev); 3346 3139 3347 return cluster_info; !! 3140 if (!q || !blk_queue_discard(q)) >> 3141 return false; 3348 3142 3349 err_free: !! 3143 return true; 3350 kvfree(cluster_info); << 3351 err: << 3352 return ERR_PTR(err); << 3353 } 3144 } 3354 3145 3355 SYSCALL_DEFINE2(swapon, const char __user *, 3146 SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) 3356 { 3147 { 3357 struct swap_info_struct *si; !! 3148 struct swap_info_struct *p; 3358 struct filename *name; 3149 struct filename *name; 3359 struct file *swap_file = NULL; 3150 struct file *swap_file = NULL; 3360 struct address_space *mapping; 3151 struct address_space *mapping; 3361 struct dentry *dentry; 3152 struct dentry *dentry; 3362 int prio; 3153 int prio; 3363 int error; 3154 int error; 3364 union swap_header *swap_header; 3155 union swap_header *swap_header; 3365 int nr_extents; 3156 int nr_extents; 3366 sector_t span; 3157 sector_t span; 3367 unsigned long maxpages; 3158 unsigned long maxpages; 3368 unsigned char *swap_map = NULL; 3159 unsigned char *swap_map = NULL; 3369 unsigned long *zeromap = NULL; << 3370 struct swap_cluster_info *cluster_inf 3160 struct swap_cluster_info *cluster_info = NULL; 3371 struct folio *folio = NULL; !! 3161 unsigned long *frontswap_map = NULL; >> 3162 struct page *page = NULL; 3372 struct inode *inode = NULL; 3163 struct inode *inode = NULL; 3373 bool inced_nr_rotate_swap = false; 3164 bool inced_nr_rotate_swap = false; 3374 3165 3375 if (swap_flags & ~SWAP_FLAGS_VALID) 3166 if (swap_flags & ~SWAP_FLAGS_VALID) 3376 return -EINVAL; 3167 return -EINVAL; 3377 3168 3378 if (!capable(CAP_SYS_ADMIN)) 3169 if (!capable(CAP_SYS_ADMIN)) 3379 return -EPERM; 3170 return -EPERM; 3380 3171 3381 if (!swap_avail_heads) 3172 if (!swap_avail_heads) 3382 return -ENOMEM; 3173 return -ENOMEM; 3383 3174 3384 si = alloc_swap_info(); !! 3175 p = alloc_swap_info(); 3385 if (IS_ERR(si)) !! 3176 if (IS_ERR(p)) 3386 return PTR_ERR(si); !! 3177 return PTR_ERR(p); 3387 3178 3388 INIT_WORK(&si->discard_work, swap_dis !! 3179 INIT_WORK(&p->discard_work, swap_discard_work); 3389 INIT_WORK(&si->reclaim_work, swap_rec << 3390 3180 3391 name = getname(specialfile); 3181 name = getname(specialfile); 3392 if (IS_ERR(name)) { 3182 if (IS_ERR(name)) { 3393 error = PTR_ERR(name); 3183 error = PTR_ERR(name); 3394 name = NULL; 3184 name = NULL; 3395 goto bad_swap; 3185 goto bad_swap; 3396 } 3186 } 3397 swap_file = file_open_name(name, O_RD !! 3187 swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0); 3398 if (IS_ERR(swap_file)) { 3188 if (IS_ERR(swap_file)) { 3399 error = PTR_ERR(swap_file); 3189 error = PTR_ERR(swap_file); 3400 swap_file = NULL; 3190 swap_file = NULL; 3401 goto bad_swap; 3191 goto bad_swap; 3402 } 3192 } 3403 3193 3404 si->swap_file = swap_file; !! 3194 p->swap_file = swap_file; 3405 mapping = swap_file->f_mapping; 3195 mapping = swap_file->f_mapping; 3406 dentry = swap_file->f_path.dentry; 3196 dentry = swap_file->f_path.dentry; 3407 inode = mapping->host; 3197 inode = mapping->host; 3408 3198 3409 error = claim_swapfile(si, inode); !! 
3199 error = claim_swapfile(p, inode); 3410 if (unlikely(error)) 3200 if (unlikely(error)) 3411 goto bad_swap; 3201 goto bad_swap; 3412 3202 3413 inode_lock(inode); 3203 inode_lock(inode); 3414 if (d_unlinked(dentry) || cant_mount( 3204 if (d_unlinked(dentry) || cant_mount(dentry)) { 3415 error = -ENOENT; 3205 error = -ENOENT; 3416 goto bad_swap_unlock_inode; 3206 goto bad_swap_unlock_inode; 3417 } 3207 } 3418 if (IS_SWAPFILE(inode)) { 3208 if (IS_SWAPFILE(inode)) { 3419 error = -EBUSY; 3209 error = -EBUSY; 3420 goto bad_swap_unlock_inode; 3210 goto bad_swap_unlock_inode; 3421 } 3211 } 3422 3212 3423 /* 3213 /* 3424 * Read the swap header. 3214 * Read the swap header. 3425 */ 3215 */ 3426 if (!mapping->a_ops->read_folio) { !! 3216 if (!mapping->a_ops->readpage) { 3427 error = -EINVAL; 3217 error = -EINVAL; 3428 goto bad_swap_unlock_inode; 3218 goto bad_swap_unlock_inode; 3429 } 3219 } 3430 folio = read_mapping_folio(mapping, 0 !! 3220 page = read_mapping_page(mapping, 0, swap_file); 3431 if (IS_ERR(folio)) { !! 3221 if (IS_ERR(page)) { 3432 error = PTR_ERR(folio); !! 3222 error = PTR_ERR(page); 3433 goto bad_swap_unlock_inode; 3223 goto bad_swap_unlock_inode; 3434 } 3224 } 3435 swap_header = kmap_local_folio(folio, !! 3225 swap_header = kmap(page); 3436 3226 3437 maxpages = read_swap_header(si, swap_ !! 3227 maxpages = read_swap_header(p, swap_header, inode); 3438 if (unlikely(!maxpages)) { 3228 if (unlikely(!maxpages)) { 3439 error = -EINVAL; 3229 error = -EINVAL; 3440 goto bad_swap_unlock_inode; 3230 goto bad_swap_unlock_inode; 3441 } 3231 } 3442 3232 3443 /* OK, set up the swap map and apply 3233 /* OK, set up the swap map and apply the bad block list */ 3444 swap_map = vzalloc(maxpages); 3234 swap_map = vzalloc(maxpages); 3445 if (!swap_map) { 3235 if (!swap_map) { 3446 error = -ENOMEM; 3236 error = -ENOMEM; 3447 goto bad_swap_unlock_inode; 3237 goto bad_swap_unlock_inode; 3448 } 3238 } 3449 3239 3450 error = swap_cgroup_swapon(si->type, !! 3240 if (p->bdev && blk_queue_stable_writes(p->bdev->bd_disk->queue)) 3451 if (error) !! 3241 p->flags |= SWP_STABLE_WRITES; 3452 goto bad_swap_unlock_inode; << 3453 3242 3454 nr_extents = setup_swap_map_and_exten !! 3243 if (p->bdev && p->bdev->bd_disk->fops->rw_page) 3455 !! 3244 p->flags |= SWP_SYNCHRONOUS_IO; 3456 if (unlikely(nr_extents < 0)) { << 3457 error = nr_extents; << 3458 goto bad_swap_unlock_inode; << 3459 } << 3460 3245 3461 /* !! 3246 if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { 3462 * Use kvmalloc_array instead of bitm !! 3247 int cpu; 3463 * be above MAX_PAGE_ORDER incase of !! 3248 unsigned long ci, nr_cluster; 3464 */ !! 3249 3465 zeromap = kvmalloc_array(BITS_TO_LONG !! 3250 p->flags |= SWP_SOLIDSTATE; 3466 GFP_KERNE !! 3251 p->cluster_next_cpu = alloc_percpu(unsigned int); 3467 if (!zeromap) { !! 3252 if (!p->cluster_next_cpu) { 3468 error = -ENOMEM; !! 3253 error = -ENOMEM; 3469 goto bad_swap_unlock_inode; !! 3254 goto bad_swap_unlock_inode; 3470 } !! 3255 } 3471 !! 3256 /* 3472 if (si->bdev && bdev_stable_writes(si !! 3257 * select a random position to start with to help wear leveling 3473 si->flags |= SWP_STABLE_WRITE !! 3258 * SSD 3474 !! 3259 */ 3475 if (si->bdev && bdev_synchronous(si-> !! 3260 for_each_possible_cpu(cpu) { 3476 si->flags |= SWP_SYNCHRONOUS_ !! 
3261 per_cpu(*p->cluster_next_cpu, cpu) = >> 3262 1 + prandom_u32_max(p->highest_bit); >> 3263 } >> 3264 nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); >> 3265 >> 3266 cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info), >> 3267 GFP_KERNEL); >> 3268 if (!cluster_info) { >> 3269 error = -ENOMEM; >> 3270 goto bad_swap_unlock_inode; >> 3271 } 3477 3272 3478 if (si->bdev && bdev_nonrot(si->bdev) !! 3273 for (ci = 0; ci < nr_cluster; ci++) 3479 si->flags |= SWP_SOLIDSTATE; !! 3274 spin_lock_init(&((cluster_info + ci)->lock)); 3480 3275 3481 cluster_info = setup_clusters !! 3276 p->percpu_cluster = alloc_percpu(struct percpu_cluster); 3482 if (IS_ERR(cluster_info)) { !! 3277 if (!p->percpu_cluster) { 3483 error = PTR_ERR(clust !! 3278 error = -ENOMEM; 3484 cluster_info = NULL; << 3485 goto bad_swap_unlock_ 3279 goto bad_swap_unlock_inode; 3486 } 3280 } >> 3281 for_each_possible_cpu(cpu) { >> 3282 struct percpu_cluster *cluster; >> 3283 cluster = per_cpu_ptr(p->percpu_cluster, cpu); >> 3284 cluster_set_null(&cluster->index); >> 3285 } 3487 } else { 3286 } else { 3488 atomic_inc(&nr_rotate_swap); 3287 atomic_inc(&nr_rotate_swap); 3489 inced_nr_rotate_swap = true; 3288 inced_nr_rotate_swap = true; 3490 } 3289 } 3491 3290 3492 if ((swap_flags & SWAP_FLAG_DISCARD) !! 3291 error = swap_cgroup_swapon(p->type, maxpages); 3493 si->bdev && bdev_max_discard_sect !! 3292 if (error) >> 3293 goto bad_swap_unlock_inode; >> 3294 >> 3295 nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, >> 3296 cluster_info, maxpages, &span); >> 3297 if (unlikely(nr_extents < 0)) { >> 3298 error = nr_extents; >> 3299 goto bad_swap_unlock_inode; >> 3300 } >> 3301 /* frontswap enabled? set up bit-per-page map for frontswap */ >> 3302 if (IS_ENABLED(CONFIG_FRONTSWAP)) >> 3303 frontswap_map = kvcalloc(BITS_TO_LONGS(maxpages), >> 3304 sizeof(long), >> 3305 GFP_KERNEL); >> 3306 >> 3307 if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { 3494 /* 3308 /* 3495 * When discard is enabled fo 3309 * When discard is enabled for swap with no particular 3496 * policy flagged, we set all 3310 * policy flagged, we set all swap discard flags here in 3497 * order to sustain backward 3311 * order to sustain backward compatibility with older 3498 * swapon(8) releases. 3312 * swapon(8) releases. 3499 */ 3313 */ 3500 si->flags |= (SWP_DISCARDABLE !! 3314 p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | 3501 SWP_PAGE_DISCARD 3315 SWP_PAGE_DISCARD); 3502 3316 3503 /* 3317 /* 3504 * By flagging sys_swapon, a 3318 * By flagging sys_swapon, a sysadmin can tell us to 3505 * either do single-time area 3319 * either do single-time area discards only, or to just 3506 * perform discards for relea 3320 * perform discards for released swap page-clusters. 3507 * Now it's time to adjust th 3321 * Now it's time to adjust the p->flags accordingly. 3508 */ 3322 */ 3509 if (swap_flags & SWAP_FLAG_DI 3323 if (swap_flags & SWAP_FLAG_DISCARD_ONCE) 3510 si->flags &= ~SWP_PAG !! 3324 p->flags &= ~SWP_PAGE_DISCARD; 3511 else if (swap_flags & SWAP_FL 3325 else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) 3512 si->flags &= ~SWP_ARE !! 3326 p->flags &= ~SWP_AREA_DISCARD; 3513 3327 3514 /* issue a swapon-time discar 3328 /* issue a swapon-time discard if it's still required */ 3515 if (si->flags & SWP_AREA_DISC !! 3329 if (p->flags & SWP_AREA_DISCARD) { 3516 int err = discard_swa !! 
3330 int err = discard_swap(p); 3517 if (unlikely(err)) 3331 if (unlikely(err)) 3518 pr_err("swapo 3332 pr_err("swapon: discard_swap(%p): %d\n", 3519 si, e !! 3333 p, err); 3520 } 3334 } 3521 } 3335 } 3522 3336 3523 error = init_swap_address_space(si->t !! 3337 error = init_swap_address_space(p->type, maxpages); 3524 if (error) 3338 if (error) 3525 goto bad_swap_unlock_inode; 3339 goto bad_swap_unlock_inode; 3526 3340 3527 error = zswap_swapon(si->type, maxpag << 3528 if (error) << 3529 goto free_swap_address_space; << 3530 << 3531 /* 3341 /* 3532 * Flush any pending IO and dirty map 3342 * Flush any pending IO and dirty mappings before we start using this 3533 * swap device. 3343 * swap device. 3534 */ 3344 */ 3535 inode->i_flags |= S_SWAPFILE; 3345 inode->i_flags |= S_SWAPFILE; 3536 error = inode_drain_writes(inode); 3346 error = inode_drain_writes(inode); 3537 if (error) { 3347 if (error) { 3538 inode->i_flags &= ~S_SWAPFILE 3348 inode->i_flags &= ~S_SWAPFILE; 3539 goto free_swap_zswap; !! 3349 goto free_swap_address_space; 3540 } 3350 } 3541 3351 3542 mutex_lock(&swapon_mutex); 3352 mutex_lock(&swapon_mutex); 3543 prio = -1; 3353 prio = -1; 3544 if (swap_flags & SWAP_FLAG_PREFER) 3354 if (swap_flags & SWAP_FLAG_PREFER) 3545 prio = 3355 prio = 3546 (swap_flags & SWAP_FLAG_PRI 3356 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; 3547 enable_swap_info(si, prio, swap_map, !! 3357 enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map); 3548 3358 3549 pr_info("Adding %uk swap on %s. Prio !! 3359 pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", 3550 K(si->pages), name->name, si- !! 3360 p->pages<<(PAGE_SHIFT-10), name->name, p->prio, 3551 K((unsigned long long)span), !! 3361 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 3552 (si->flags & SWP_SOLIDSTATE) !! 3362 (p->flags & SWP_SOLIDSTATE) ? "SS" : "", 3553 (si->flags & SWP_DISCARDABLE) !! 3363 (p->flags & SWP_DISCARDABLE) ? "D" : "", 3554 (si->flags & SWP_AREA_DISCARD !! 3364 (p->flags & SWP_AREA_DISCARD) ? "s" : "", 3555 (si->flags & SWP_PAGE_DISCARD !! 3365 (p->flags & SWP_PAGE_DISCARD) ? "c" : "", >> 3366 (frontswap_map) ? "FS" : ""); 3556 3367 3557 mutex_unlock(&swapon_mutex); 3368 mutex_unlock(&swapon_mutex); 3558 atomic_inc(&proc_poll_event); 3369 atomic_inc(&proc_poll_event); 3559 wake_up_interruptible(&proc_poll_wait 3370 wake_up_interruptible(&proc_poll_wait); 3560 3371 3561 error = 0; 3372 error = 0; 3562 goto out; 3373 goto out; 3563 free_swap_zswap: << 3564 zswap_swapoff(si->type); << 3565 free_swap_address_space: 3374 free_swap_address_space: 3566 exit_swap_address_space(si->type); !! 3375 exit_swap_address_space(p->type); 3567 bad_swap_unlock_inode: 3376 bad_swap_unlock_inode: 3568 inode_unlock(inode); 3377 inode_unlock(inode); 3569 bad_swap: 3378 bad_swap: 3570 free_percpu(si->percpu_cluster); !! 3379 free_percpu(p->percpu_cluster); 3571 si->percpu_cluster = NULL; !! 3380 p->percpu_cluster = NULL; 3572 free_percpu(si->cluster_next_cpu); !! 3381 free_percpu(p->cluster_next_cpu); 3573 si->cluster_next_cpu = NULL; !! 3382 p->cluster_next_cpu = NULL; >> 3383 if (inode && S_ISBLK(inode->i_mode) && p->bdev) { >> 3384 set_blocksize(p->bdev, p->old_block_size); >> 3385 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); >> 3386 } 3574 inode = NULL; 3387 inode = NULL; 3575 destroy_swap_extents(si); !! 3388 destroy_swap_extents(p); 3576 swap_cgroup_swapoff(si->type); !! 
3389 swap_cgroup_swapoff(p->type); 3577 spin_lock(&swap_lock); 3390 spin_lock(&swap_lock); 3578 si->swap_file = NULL; !! 3391 p->swap_file = NULL; 3579 si->flags = 0; !! 3392 p->flags = 0; 3580 spin_unlock(&swap_lock); 3393 spin_unlock(&swap_lock); 3581 vfree(swap_map); 3394 vfree(swap_map); 3582 kvfree(zeromap); << 3583 kvfree(cluster_info); 3395 kvfree(cluster_info); >> 3396 kvfree(frontswap_map); 3584 if (inced_nr_rotate_swap) 3397 if (inced_nr_rotate_swap) 3585 atomic_dec(&nr_rotate_swap); 3398 atomic_dec(&nr_rotate_swap); 3586 if (swap_file) 3399 if (swap_file) 3587 filp_close(swap_file, NULL); 3400 filp_close(swap_file, NULL); 3588 out: 3401 out: 3589 if (!IS_ERR_OR_NULL(folio)) !! 3402 if (page && !IS_ERR(page)) { 3590 folio_release_kmap(folio, swa !! 3403 kunmap(page); >> 3404 put_page(page); >> 3405 } 3591 if (name) 3406 if (name) 3592 putname(name); 3407 putname(name); 3593 if (inode) 3408 if (inode) 3594 inode_unlock(inode); 3409 inode_unlock(inode); 3595 if (!error) 3410 if (!error) 3596 enable_swap_slots_cache(); 3411 enable_swap_slots_cache(); 3597 return error; 3412 return error; 3598 } 3413 } 3599 3414 3600 void si_swapinfo(struct sysinfo *val) 3415 void si_swapinfo(struct sysinfo *val) 3601 { 3416 { 3602 unsigned int type; 3417 unsigned int type; 3603 unsigned long nr_to_be_unused = 0; 3418 unsigned long nr_to_be_unused = 0; 3604 3419 3605 spin_lock(&swap_lock); 3420 spin_lock(&swap_lock); 3606 for (type = 0; type < nr_swapfiles; t 3421 for (type = 0; type < nr_swapfiles; type++) { 3607 struct swap_info_struct *si = 3422 struct swap_info_struct *si = swap_info[type]; 3608 3423 3609 if ((si->flags & SWP_USED) && 3424 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) 3610 nr_to_be_unused += RE !! 3425 nr_to_be_unused += si->inuse_pages; 3611 } 3426 } 3612 val->freeswap = atomic_long_read(&nr_ 3427 val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; 3613 val->totalswap = total_swap_pages + n 3428 val->totalswap = total_swap_pages + nr_to_be_unused; 3614 spin_unlock(&swap_lock); 3429 spin_unlock(&swap_lock); 3615 } 3430 } 3616 3431 3617 /* 3432 /* 3618 * Verify that nr swap entries are valid and !! 3433 * Verify that a swap entry is valid and increment its swap map count. 3619 * 3434 * 3620 * Returns error code in following case. 3435 * Returns error code in following case. 3621 * - success -> 0 3436 * - success -> 0 3622 * - swp_entry is invalid -> EINVAL 3437 * - swp_entry is invalid -> EINVAL 3623 * - swp_entry is migration entry -> EINVAL 3438 * - swp_entry is migration entry -> EINVAL 3624 * - swap-cache reference is requested but th 3439 * - swap-cache reference is requested but there is already one. -> EEXIST 3625 * - swap-cache reference is requested but th 3440 * - swap-cache reference is requested but the entry is not used. -> ENOENT 3626 * - swap-mapped reference requested but need 3441 * - swap-mapped reference requested but needs continued swap count. -> ENOMEM 3627 */ 3442 */ 3628 static int __swap_duplicate(swp_entry_t entry !! 3443 static int __swap_duplicate(swp_entry_t entry, unsigned char usage) 3629 { 3444 { 3630 struct swap_info_struct *si; !! 3445 struct swap_info_struct *p; 3631 struct swap_cluster_info *ci; 3446 struct swap_cluster_info *ci; 3632 unsigned long offset; 3447 unsigned long offset; 3633 unsigned char count; 3448 unsigned char count; 3634 unsigned char has_cache; 3449 unsigned char has_cache; 3635 int err, i; !! 3450 int err; 3636 3451 3637 si = swp_swap_info(entry); !! 
3452 p = get_swap_device(entry); >> 3453 if (!p) >> 3454 return -EINVAL; 3638 3455 3639 offset = swp_offset(entry); 3456 offset = swp_offset(entry); 3640 VM_WARN_ON(nr > SWAPFILE_CLUSTER - of !! 3457 ci = lock_cluster_or_swap_info(p, offset); 3641 VM_WARN_ON(usage == 1 && nr > 1); << 3642 ci = lock_cluster_or_swap_info(si, of << 3643 3458 3644 err = 0; !! 3459 count = p->swap_map[offset]; 3645 for (i = 0; i < nr; i++) { << 3646 count = si->swap_map[offset + << 3647 3460 3648 /* !! 3461 /* 3649 * swapin_readahead() doesn't !! 3462 * swapin_readahead() doesn't check if a swap entry is valid, so the 3650 * swap entry could be SWAP_M !! 3463 * swap entry could be SWAP_MAP_BAD. Check here with lock held. 3651 */ !! 3464 */ 3652 if (unlikely(swap_count(count !! 3465 if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { 3653 err = -ENOENT; !! 3466 err = -ENOENT; 3654 goto unlock_out; !! 3467 goto unlock_out; 3655 } !! 3468 } 3656 3469 3657 has_cache = count & SWAP_HAS_ !! 3470 has_cache = count & SWAP_HAS_CACHE; 3658 count &= ~SWAP_HAS_CACHE; !! 3471 count &= ~SWAP_HAS_CACHE; >> 3472 err = 0; 3659 3473 3660 if (!count && !has_cache) { !! 3474 if (usage == SWAP_HAS_CACHE) { 3661 err = -ENOENT; << 3662 } else if (usage == SWAP_HAS_ << 3663 if (has_cache) << 3664 err = -EEXIST << 3665 } else if ((count & ~COUNT_CO << 3666 err = -EINVAL; << 3667 } << 3668 3475 3669 if (err) !! 3476 /* set SWAP_HAS_CACHE if there is no cache and entry is used */ 3670 goto unlock_out; !! 3477 if (!has_cache && count) 3671 } !! 3478 has_cache = SWAP_HAS_CACHE; >> 3479 else if (has_cache) /* someone else added cache */ >> 3480 err = -EEXIST; >> 3481 else /* no users remaining */ >> 3482 err = -ENOENT; 3672 3483 3673 for (i = 0; i < nr; i++) { !! 3484 } else if (count || has_cache) { 3674 count = si->swap_map[offset + << 3675 has_cache = count & SWAP_HAS_ << 3676 count &= ~SWAP_HAS_CACHE; << 3677 3485 3678 if (usage == SWAP_HAS_CACHE) !! 3486 if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) 3679 has_cache = SWAP_HAS_ << 3680 else if ((count & ~COUNT_CONT << 3681 count += usage; 3487 count += usage; 3682 else if (swap_count_continued !! 3488 else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) >> 3489 err = -EINVAL; >> 3490 else if (swap_count_continued(p, offset, count)) 3683 count = COUNT_CONTINU 3491 count = COUNT_CONTINUED; 3684 else { !! 3492 else 3685 /* << 3686 * Don't need to roll << 3687 * usage == 1, there << 3688 */ << 3689 err = -ENOMEM; 3493 err = -ENOMEM; 3690 goto unlock_out; !! 3494 } else 3691 } !! 3495 err = -ENOENT; /* unused swap entry */ 3692 3496 3693 WRITE_ONCE(si->swap_map[offse !! 3497 WRITE_ONCE(p->swap_map[offset], count | has_cache); 3694 } << 3695 3498 3696 unlock_out: 3499 unlock_out: 3697 unlock_cluster_or_swap_info(si, ci); !! 3500 unlock_cluster_or_swap_info(p, ci); >> 3501 if (p) >> 3502 put_swap_device(p); 3698 return err; 3503 return err; 3699 } 3504 } 3700 3505 3701 /* 3506 /* 3702 * Help swapoff by noting that swap entry bel 3507 * Help swapoff by noting that swap entry belongs to shmem/tmpfs 3703 * (in which case its reference count is neve 3508 * (in which case its reference count is never incremented). 3704 */ 3509 */ 3705 void swap_shmem_alloc(swp_entry_t entry, int !! 3510 void swap_shmem_alloc(swp_entry_t entry) 3706 { 3511 { 3707 __swap_duplicate(entry, SWAP_MAP_SHME !! 3512 __swap_duplicate(entry, SWAP_MAP_SHMEM); 3708 } 3513 } 3709 3514 3710 /* 3515 /* 3711 * Increase reference count of swap entry by 3516 * Increase reference count of swap entry by 1. 
3712 * Returns 0 for success, or -ENOMEM if a swa 3517 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required 3713 * but could not be atomically allocated. Re 3518 * but could not be atomically allocated. Returns 0, just as if it succeeded, 3714 * if __swap_duplicate() fails for another re 3519 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which 3715 * might occur if a page table entry has got 3520 * might occur if a page table entry has got corrupted. 3716 */ 3521 */ 3717 int swap_duplicate(swp_entry_t entry) 3522 int swap_duplicate(swp_entry_t entry) 3718 { 3523 { 3719 int err = 0; 3524 int err = 0; 3720 3525 3721 while (!err && __swap_duplicate(entry !! 3526 while (!err && __swap_duplicate(entry, 1) == -ENOMEM) 3722 err = add_swap_count_continua 3527 err = add_swap_count_continuation(entry, GFP_ATOMIC); 3723 return err; 3528 return err; 3724 } 3529 } 3725 3530 3726 /* 3531 /* 3727 * @entry: first swap entry from which we all !! 3532 * @entry: swap entry for which we allocate swap cache. 3728 * 3533 * 3729 * Called when allocating swap cache for exis !! 3534 * Called when allocating swap cache for existing swap entry, 3730 * This can return error codes. Returns 0 at 3535 * This can return error codes. Returns 0 at success. 3731 * -EEXIST means there is a swap cache. 3536 * -EEXIST means there is a swap cache. 3732 * Note: return code is different from swap_d 3537 * Note: return code is different from swap_duplicate(). 3733 */ 3538 */ 3734 int swapcache_prepare(swp_entry_t entry, int !! 3539 int swapcache_prepare(swp_entry_t entry) 3735 { 3540 { 3736 return __swap_duplicate(entry, SWAP_H !! 3541 return __swap_duplicate(entry, SWAP_HAS_CACHE); 3737 } 3542 } 3738 3543 3739 void swapcache_clear(struct swap_info_struct !! 3544 struct swap_info_struct *swp_swap_info(swp_entry_t entry) 3740 { 3545 { 3741 unsigned long offset = swp_offset(ent !! 3546 return swap_type_to_swap_info(swp_type(entry)); 3742 << 3743 cluster_swap_free_nr(si, offset, nr, << 3744 } 3547 } 3745 3548 3746 struct swap_info_struct *swp_swap_info(swp_en !! 3549 struct swap_info_struct *page_swap_info(struct page *page) 3747 { 3550 { 3748 return swap_type_to_swap_info(swp_typ !! 3551 swp_entry_t entry = { .val = page_private(page) }; >> 3552 return swp_swap_info(entry); 3749 } 3553 } 3750 3554 3751 /* 3555 /* 3752 * out-of-line methods to avoid include hell. !! 3556 * out-of-line __page_file_ methods to avoid include hell. 3753 */ 3557 */ 3754 struct address_space *swapcache_mapping(struc !! 3558 struct address_space *__page_file_mapping(struct page *page) 3755 { 3559 { 3756 return swp_swap_info(folio->swap)->sw !! 3560 return page_swap_info(page)->swap_file->f_mapping; 3757 } 3561 } 3758 EXPORT_SYMBOL_GPL(swapcache_mapping); !! 3562 EXPORT_SYMBOL_GPL(__page_file_mapping); 3759 3563 3760 pgoff_t __folio_swap_cache_index(struct folio !! 3564 pgoff_t __page_file_index(struct page *page) 3761 { 3565 { 3762 return swap_cache_index(folio->swap); !! 3566 swp_entry_t swap = { .val = page_private(page) }; >> 3567 return swp_offset(swap); 3763 } 3568 } 3764 EXPORT_SYMBOL_GPL(__folio_swap_cache_index); !! 
3569 EXPORT_SYMBOL_GPL(__page_file_index); 3765 3570 3766 /* 3571 /* 3767 * add_swap_count_continuation - called when 3572 * add_swap_count_continuation - called when a swap count is duplicated 3768 * beyond SWAP_MAP_MAX, it allocates a new pa 3573 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's 3769 * page of the original vmalloc'ed swap_map, 3574 * page of the original vmalloc'ed swap_map, to hold the continuation count 3770 * (for that entry and for its neighbouring P 3575 * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called 3771 * again when count is duplicated beyond SWAP 3576 * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc. 3772 * 3577 * 3773 * These continuation pages are seldom refere 3578 * These continuation pages are seldom referenced: the common paths all work 3774 * on the original swap_map, only referring t 3579 * on the original swap_map, only referring to a continuation page when the 3775 * low "digit" of a count is incremented or d 3580 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. 3776 * 3581 * 3777 * add_swap_count_continuation(, GFP_ATOMIC) 3582 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding 3778 * page table locks; if it fails, add_swap_co 3583 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL) 3779 * can be called after dropping locks. 3584 * can be called after dropping locks. 3780 */ 3585 */ 3781 int add_swap_count_continuation(swp_entry_t e 3586 int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) 3782 { 3587 { 3783 struct swap_info_struct *si; 3588 struct swap_info_struct *si; 3784 struct swap_cluster_info *ci; 3589 struct swap_cluster_info *ci; 3785 struct page *head; 3590 struct page *head; 3786 struct page *page; 3591 struct page *page; 3787 struct page *list_page; 3592 struct page *list_page; 3788 pgoff_t offset; 3593 pgoff_t offset; 3789 unsigned char count; 3594 unsigned char count; 3790 int ret = 0; 3595 int ret = 0; 3791 3596 3792 /* 3597 /* 3793 * When debugging, it's easier to use 3598 * When debugging, it's easier to use __GFP_ZERO here; but it's better 3794 * for latency not to zero a page whi 3599 * for latency not to zero a page while GFP_ATOMIC and holding locks. 3795 */ 3600 */ 3796 page = alloc_page(gfp_mask | __GFP_HI 3601 page = alloc_page(gfp_mask | __GFP_HIGHMEM); 3797 3602 3798 si = get_swap_device(entry); 3603 si = get_swap_device(entry); 3799 if (!si) { 3604 if (!si) { 3800 /* 3605 /* 3801 * An acceptable race has occ 3606 * An acceptable race has occurred since the failing 3802 * __swap_duplicate(): the sw 3607 * __swap_duplicate(): the swap device may be swapoff 3803 */ 3608 */ 3804 goto outer; 3609 goto outer; 3805 } 3610 } 3806 spin_lock(&si->lock); 3611 spin_lock(&si->lock); 3807 3612 3808 offset = swp_offset(entry); 3613 offset = swp_offset(entry); 3809 3614 3810 ci = lock_cluster(si, offset); 3615 ci = lock_cluster(si, offset); 3811 3616 3812 count = swap_count(si->swap_map[offse 3617 count = swap_count(si->swap_map[offset]); 3813 3618 3814 if ((count & ~COUNT_CONTINUED) != SWA 3619 if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { 3815 /* 3620 /* 3816 * The higher the swap count, 3621 * The higher the swap count, the more likely it is that tasks 3817 * will race to add swap coun 3622 * will race to add swap count continuation: we need to avoid 3818 * over-provisioning. 3623 * over-provisioning. 
3819 */ 3624 */ 3820 goto out; 3625 goto out; 3821 } 3626 } 3822 3627 3823 if (!page) { 3628 if (!page) { 3824 ret = -ENOMEM; 3629 ret = -ENOMEM; 3825 goto out; 3630 goto out; 3826 } 3631 } 3827 3632 >> 3633 /* >> 3634 * We are fortunate that although vmalloc_to_page uses pte_offset_map, >> 3635 * no architecture is using highmem pages for kernel page tables: so it >> 3636 * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps. >> 3637 */ 3828 head = vmalloc_to_page(si->swap_map + 3638 head = vmalloc_to_page(si->swap_map + offset); 3829 offset &= ~PAGE_MASK; 3639 offset &= ~PAGE_MASK; 3830 3640 3831 spin_lock(&si->cont_lock); 3641 spin_lock(&si->cont_lock); 3832 /* 3642 /* 3833 * Page allocation does not initializ 3643 * Page allocation does not initialize the page's lru field, 3834 * but it does always reset its priva 3644 * but it does always reset its private field. 3835 */ 3645 */ 3836 if (!page_private(head)) { 3646 if (!page_private(head)) { 3837 BUG_ON(count & COUNT_CONTINUE 3647 BUG_ON(count & COUNT_CONTINUED); 3838 INIT_LIST_HEAD(&head->lru); 3648 INIT_LIST_HEAD(&head->lru); 3839 set_page_private(head, SWP_CO 3649 set_page_private(head, SWP_CONTINUED); 3840 si->flags |= SWP_CONTINUED; 3650 si->flags |= SWP_CONTINUED; 3841 } 3651 } 3842 3652 3843 list_for_each_entry(list_page, &head- 3653 list_for_each_entry(list_page, &head->lru, lru) { 3844 unsigned char *map; 3654 unsigned char *map; 3845 3655 3846 /* 3656 /* 3847 * If the previous map said n 3657 * If the previous map said no continuation, but we've found 3848 * a continuation page, free 3658 * a continuation page, free our allocation and use this one. 3849 */ 3659 */ 3850 if (!(count & COUNT_CONTINUED 3660 if (!(count & COUNT_CONTINUED)) 3851 goto out_unlock_cont; 3661 goto out_unlock_cont; 3852 3662 3853 map = kmap_local_page(list_pa !! 3663 map = kmap_atomic(list_page) + offset; 3854 count = *map; 3664 count = *map; 3855 kunmap_local(map); !! 3665 kunmap_atomic(map); 3856 3666 3857 /* 3667 /* 3858 * If this continuation count 3668 * If this continuation count now has some space in it, 3859 * free our allocation and us 3669 * free our allocation and use this one. 
3860 */ 3670 */ 3861 if ((count & ~COUNT_CONTINUED 3671 if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX) 3862 goto out_unlock_cont; 3672 goto out_unlock_cont; 3863 } 3673 } 3864 3674 3865 list_add_tail(&page->lru, &head->lru) 3675 list_add_tail(&page->lru, &head->lru); 3866 page = NULL; /* no 3676 page = NULL; /* now it's attached, don't free it */ 3867 out_unlock_cont: 3677 out_unlock_cont: 3868 spin_unlock(&si->cont_lock); 3678 spin_unlock(&si->cont_lock); 3869 out: 3679 out: 3870 unlock_cluster(ci); 3680 unlock_cluster(ci); 3871 spin_unlock(&si->lock); 3681 spin_unlock(&si->lock); 3872 put_swap_device(si); 3682 put_swap_device(si); 3873 outer: 3683 outer: 3874 if (page) 3684 if (page) 3875 __free_page(page); 3685 __free_page(page); 3876 return ret; 3686 return ret; 3877 } 3687 } 3878 3688 3879 /* 3689 /* 3880 * swap_count_continued - when the original s 3690 * swap_count_continued - when the original swap_map count is incremented 3881 * from SWAP_MAP_MAX, check if there is alrea 3691 * from SWAP_MAP_MAX, check if there is already a continuation page to carry 3882 * into, carry if so, or else fail until a ne 3692 * into, carry if so, or else fail until a new continuation page is allocated; 3883 * when the original swap_map count is decrem 3693 * when the original swap_map count is decremented from 0 with continuation, 3884 * borrow from the continuation and report wh 3694 * borrow from the continuation and report whether it still holds more. 3885 * Called while __swap_duplicate() or swap_en 3695 * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster 3886 * lock. 3696 * lock. 3887 */ 3697 */ 3888 static bool swap_count_continued(struct swap_ 3698 static bool swap_count_continued(struct swap_info_struct *si, 3889 pgoff_t offs 3699 pgoff_t offset, unsigned char count) 3890 { 3700 { 3891 struct page *head; 3701 struct page *head; 3892 struct page *page; 3702 struct page *page; 3893 unsigned char *map; 3703 unsigned char *map; 3894 bool ret; 3704 bool ret; 3895 3705 3896 head = vmalloc_to_page(si->swap_map + 3706 head = vmalloc_to_page(si->swap_map + offset); 3897 if (page_private(head) != SWP_CONTINU 3707 if (page_private(head) != SWP_CONTINUED) { 3898 BUG_ON(count & COUNT_CONTINUE 3708 BUG_ON(count & COUNT_CONTINUED); 3899 return false; /* ne 3709 return false; /* need to add count continuation */ 3900 } 3710 } 3901 3711 3902 spin_lock(&si->cont_lock); 3712 spin_lock(&si->cont_lock); 3903 offset &= ~PAGE_MASK; 3713 offset &= ~PAGE_MASK; 3904 page = list_next_entry(head, lru); 3714 page = list_next_entry(head, lru); 3905 map = kmap_local_page(page) + offset; !! 3715 map = kmap_atomic(page) + offset; 3906 3716 3907 if (count == SWAP_MAP_MAX) /* in 3717 if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ 3908 goto init_map; /* ju 3718 goto init_map; /* jump over SWAP_CONT_MAX checks */ 3909 3719 3910 if (count == (SWAP_MAP_MAX | COUNT_CO 3720 if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */ 3911 /* 3721 /* 3912 * Think of how you add 1 to 3722 * Think of how you add 1 to 999 3913 */ 3723 */ 3914 while (*map == (SWAP_CONT_MAX 3724 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { 3915 kunmap_local(map); !! 3725 kunmap_atomic(map); 3916 page = list_next_entr 3726 page = list_next_entry(page, lru); 3917 BUG_ON(page == head); 3727 BUG_ON(page == head); 3918 map = kmap_local_page !! 3728 map = kmap_atomic(page) + offset; 3919 } 3729 } 3920 if (*map == SWAP_CONT_MAX) { 3730 if (*map == SWAP_CONT_MAX) { 3921 kunmap_local(map); !! 
3731 kunmap_atomic(map); 3922 page = list_next_entr 3732 page = list_next_entry(page, lru); 3923 if (page == head) { 3733 if (page == head) { 3924 ret = false; 3734 ret = false; /* add count continuation */ 3925 goto out; 3735 goto out; 3926 } 3736 } 3927 map = kmap_local_page !! 3737 map = kmap_atomic(page) + offset; 3928 init_map: *map = 0; 3738 init_map: *map = 0; /* we didn't zero the page */ 3929 } 3739 } 3930 *map += 1; 3740 *map += 1; 3931 kunmap_local(map); !! 3741 kunmap_atomic(map); 3932 while ((page = list_prev_entr 3742 while ((page = list_prev_entry(page, lru)) != head) { 3933 map = kmap_local_page !! 3743 map = kmap_atomic(page) + offset; 3934 *map = COUNT_CONTINUE 3744 *map = COUNT_CONTINUED; 3935 kunmap_local(map); !! 3745 kunmap_atomic(map); 3936 } 3746 } 3937 ret = true; 3747 ret = true; /* incremented */ 3938 3748 3939 } else { 3749 } else { /* decrementing */ 3940 /* 3750 /* 3941 * Think of how you subtract 3751 * Think of how you subtract 1 from 1000 3942 */ 3752 */ 3943 BUG_ON(count != COUNT_CONTINU 3753 BUG_ON(count != COUNT_CONTINUED); 3944 while (*map == COUNT_CONTINUE 3754 while (*map == COUNT_CONTINUED) { 3945 kunmap_local(map); !! 3755 kunmap_atomic(map); 3946 page = list_next_entr 3756 page = list_next_entry(page, lru); 3947 BUG_ON(page == head); 3757 BUG_ON(page == head); 3948 map = kmap_local_page !! 3758 map = kmap_atomic(page) + offset; 3949 } 3759 } 3950 BUG_ON(*map == 0); 3760 BUG_ON(*map == 0); 3951 *map -= 1; 3761 *map -= 1; 3952 if (*map == 0) 3762 if (*map == 0) 3953 count = 0; 3763 count = 0; 3954 kunmap_local(map); !! 3764 kunmap_atomic(map); 3955 while ((page = list_prev_entr 3765 while ((page = list_prev_entry(page, lru)) != head) { 3956 map = kmap_local_page !! 3766 map = kmap_atomic(page) + offset; 3957 *map = SWAP_CONT_MAX 3767 *map = SWAP_CONT_MAX | count; 3958 count = COUNT_CONTINU 3768 count = COUNT_CONTINUED; 3959 kunmap_local(map); !! 3769 kunmap_atomic(map); 3960 } 3770 } 3961 ret = count == COUNT_CONTINUE 3771 ret = count == COUNT_CONTINUED; 3962 } 3772 } 3963 out: 3773 out: 3964 spin_unlock(&si->cont_lock); 3774 spin_unlock(&si->cont_lock); 3965 return ret; 3775 return ret; 3966 } 3776 } 3967 3777 3968 /* 3778 /* 3969 * free_swap_count_continuations - swapoff fr 3779 * free_swap_count_continuations - swapoff free all the continuation pages 3970 * appended to the swap_map, after swap_map i 3780 * appended to the swap_map, after swap_map is quiesced, before vfree'ing it. 3971 */ 3781 */ 3972 static void free_swap_count_continuations(str 3782 static void free_swap_count_continuations(struct swap_info_struct *si) 3973 { 3783 { 3974 pgoff_t offset; 3784 pgoff_t offset; 3975 3785 3976 for (offset = 0; offset < si->max; of 3786 for (offset = 0; offset < si->max; offset += PAGE_SIZE) { 3977 struct page *head; 3787 struct page *head; 3978 head = vmalloc_to_page(si->sw 3788 head = vmalloc_to_page(si->swap_map + offset); 3979 if (page_private(head)) { 3789 if (page_private(head)) { 3980 struct page *page, *n 3790 struct page *page, *next; 3981 3791 3982 list_for_each_entry_s 3792 list_for_each_entry_safe(page, next, &head->lru, lru) { 3983 list_del(&pag 3793 list_del(&page->lru); 3984 __free_page(p 3794 __free_page(page); 3985 } 3795 } 3986 } 3796 } 3987 } 3797 } 3988 } 3798 } 3989 3799 3990 #if defined(CONFIG_MEMCG) && defined(CONFIG_B 3800 #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) 3991 void __folio_throttle_swaprate(struct folio * !! 
3801 void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) 3992 { 3802 { 3993 struct swap_info_struct *si, *next; 3803 struct swap_info_struct *si, *next; 3994 int nid = folio_nid(folio); !! 3804 int nid = page_to_nid(page); 3995 << 3996 if (!(gfp & __GFP_IO)) << 3997 return; << 3998 3805 3999 if (!__has_usable_swap()) !! 3806 if (!(gfp_mask & __GFP_IO)) 4000 return; 3807 return; 4001 3808 4002 if (!blk_cgroup_congested()) 3809 if (!blk_cgroup_congested()) 4003 return; 3810 return; 4004 3811 4005 /* 3812 /* 4006 * We've already scheduled a throttle 3813 * We've already scheduled a throttle, avoid taking the global swap 4007 * lock. 3814 * lock. 4008 */ 3815 */ 4009 if (current->throttle_disk) !! 3816 if (current->throttle_queue) 4010 return; 3817 return; 4011 3818 4012 spin_lock(&swap_avail_lock); 3819 spin_lock(&swap_avail_lock); 4013 plist_for_each_entry_safe(si, next, & 3820 plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], 4014 avail_lists 3821 avail_lists[nid]) { 4015 if (si->bdev) { 3822 if (si->bdev) { 4016 blkcg_schedule_thrott !! 3823 blkcg_schedule_throttle(bdev_get_queue(si->bdev), true); 4017 break; 3824 break; 4018 } 3825 } 4019 } 3826 } 4020 spin_unlock(&swap_avail_lock); 3827 spin_unlock(&swap_avail_lock); 4021 } 3828 } 4022 #endif 3829 #endif 4023 3830 4024 static int __init swapfile_init(void) 3831 static int __init swapfile_init(void) 4025 { 3832 { 4026 int nid; 3833 int nid; 4027 3834 4028 swap_avail_heads = kmalloc_array(nr_n 3835 swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head), 4029 GFP_ 3836 GFP_KERNEL); 4030 if (!swap_avail_heads) { 3837 if (!swap_avail_heads) { 4031 pr_emerg("Not enough memory f 3838 pr_emerg("Not enough memory for swap heads, swap is disabled\n"); 4032 return -ENOMEM; 3839 return -ENOMEM; 4033 } 3840 } 4034 3841 4035 for_each_node(nid) 3842 for_each_node(nid) 4036 plist_head_init(&swap_avail_h 3843 plist_head_init(&swap_avail_heads[nid]); 4037 << 4038 swapfile_maximum_size = arch_max_swap << 4039 << 4040 #ifdef CONFIG_MIGRATION << 4041 if (swapfile_maximum_size >= (1UL << << 4042 swap_migration_ad_supported = << 4043 #endif /* CONFIG_MIGRATION */ << 4044 3844 4045 return 0; 3845 return 0; 4046 } 3846 } 4047 subsys_initcall(swapfile_init); 3847 subsys_initcall(swapfile_init); 4048 3848
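/*
 * Editor's note -- illustrative sketch, not part of mm/swapfile.c.
 * The SYSCALL_DEFINE2(swapon, ...) entry point above is normally reached
 * via swapon(8); a minimal direct userspace caller, assuming "/dev/sdb1"
 * has already been prepared with mkswap(8), is sketched below.  The device
 * path and the priority value are placeholders; the SWAP_FLAG_* macros are
 * the UAPI flags decoded by sys_swapon() above, and glibc exposes them,
 * together with the swapon()/swapoff() wrappers, through <sys/swap.h>.
 */
#include <stdio.h>
#include <sys/swap.h>

int main(void)
{
	/* Request priority 5, encoded the same way sys_swapon() decodes it. */
	int flags = SWAP_FLAG_PREFER |
		    ((5 << SWAP_FLAG_PRIO_SHIFT) & SWAP_FLAG_PRIO_MASK);

	if (swapon("/dev/sdb1", flags) != 0) {	/* placeholder device */
		perror("swapon");
		return 1;
	}

	/* The area is now enabled; see enable_swap_info() in the code above. */

	if (swapoff("/dev/sdb1") != 0) {
		perror("swapoff");
		return 1;
	}
	return 0;
}
/*
 * Both calls require CAP_SYS_ADMIN, mirroring the capable(CAP_SYS_ADMIN)
 * check at the top of sys_swapon() above.
 */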